Diffstat (limited to 'ANDROID_3.4.5/fs/ext4')
36 files changed, 0 insertions, 38444 deletions
diff --git a/ANDROID_3.4.5/fs/ext4/Kconfig b/ANDROID_3.4.5/fs/ext4/Kconfig deleted file mode 100644 index 9ed1bb1f..00000000 --- a/ANDROID_3.4.5/fs/ext4/Kconfig +++ /dev/null @@ -1,85 +0,0 @@ -config EXT4_FS - tristate "The Extended 4 (ext4) filesystem" - select JBD2 - select CRC16 - help - This is the next generation of the ext3 filesystem. - - Unlike the change from ext2 filesystem to ext3 filesystem, - the on-disk format of ext4 is not forwards compatible with - ext3; it is based on extent maps and it supports 48-bit - physical block numbers. The ext4 filesystem also supports delayed - allocation, persistent preallocation, high resolution time stamps, - and a number of other features to improve performance and speed - up fsck time. For more information, please see the web pages at - http://ext4.wiki.kernel.org. - - The ext4 filesystem will support mounting an ext3 - filesystem; while there will be some performance gains from - the delayed allocation and inode table readahead, the best - performance gains will require enabling ext4 features in the - filesystem, or formatting a new filesystem as an ext4 - filesystem initially. - - To compile this file system support as a module, choose M here. The - module will be called ext4. - - If unsure, say N. - -config EXT4_USE_FOR_EXT23 - bool "Use ext4 for ext2/ext3 file systems" - depends on EXT4_FS - depends on EXT3_FS=n || EXT2_FS=n - default y - help - Allow the ext4 file system driver code to be used for ext2 or - ext3 file system mounts. This allows users to reduce their - compiled kernel size by using one file system driver for - ext2, ext3, and ext4 file systems. - -config EXT4_FS_XATTR - bool "Ext4 extended attributes" - depends on EXT4_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext4. - -config EXT4_FS_POSIX_ACL - bool "Ext4 POSIX Access Control Lists" - depends on EXT4_FS_XATTR - select FS_POSIX_ACL - help - POSIX Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the POSIX ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config EXT4_FS_SECURITY - bool "Ext4 Security Labels" - depends on EXT4_FS_XATTR - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the ext4 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config EXT4_DEBUG - bool "EXT4 debugging support" - depends on EXT4_FS - help - Enables run-time debugging support for the ext4 filesystem. - - If you select Y here, then you will be able to turn on debugging - with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" diff --git a/ANDROID_3.4.5/fs/ext4/Makefile b/ANDROID_3.4.5/fs/ext4/Makefile deleted file mode 100644 index 56fd8f86..00000000 --- a/ANDROID_3.4.5/fs/ext4/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -# -# Makefile for the linux ext4-filesystem routines. 
-# - -obj-$(CONFIG_EXT4_FS) += ext4.o - -ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o - -ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o -ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/ANDROID_3.4.5/fs/ext4/acl.c b/ANDROID_3.4.5/fs/ext4/acl.c deleted file mode 100644 index a5c29bb3..00000000 --- a/ANDROID_3.4.5/fs/ext4/acl.c +++ /dev/null @@ -1,439 +0,0 @@ -/* - * linux/fs/ext4/acl.c - * - * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - */ - -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include "ext4_jbd2.h" -#include "ext4.h" -#include "xattr.h" -#include "acl.h" - -/* - * Convert from filesystem to in-memory representation. - */ -static struct posix_acl * -ext4_acl_from_disk(const void *value, size_t size) -{ - const char *end = (char *)value + size; - int n, count; - struct posix_acl *acl; - - if (!value) - return NULL; - if (size < sizeof(ext4_acl_header)) - return ERR_PTR(-EINVAL); - if (((ext4_acl_header *)value)->a_version != - cpu_to_le32(EXT4_ACL_VERSION)) - return ERR_PTR(-EINVAL); - value = (char *)value + sizeof(ext4_acl_header); - count = ext4_acl_count(size); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; - acl = posix_acl_alloc(count, GFP_NOFS); - if (!acl) - return ERR_PTR(-ENOMEM); - for (n = 0; n < count; n++) { - ext4_acl_entry *entry = - (ext4_acl_entry *)value; - if ((char *)value + sizeof(ext4_acl_entry_short) > end) - goto fail; - acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); - acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - - switch (acl->a_entries[n].e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - value = (char *)value + - sizeof(ext4_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; - break; - - case ACL_USER: - case ACL_GROUP: - value = (char *)value + sizeof(ext4_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); - break; - - default: - goto fail; - } - } - if (value != end) - goto fail; - return acl; - -fail: - posix_acl_release(acl); - return ERR_PTR(-EINVAL); -} - -/* - * Convert from in-memory to filesystem representation. 
- */ -static void * -ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) -{ - ext4_acl_header *ext_acl; - char *e; - size_t n; - - *size = ext4_acl_size(acl->a_count); - ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * - sizeof(ext4_acl_entry), GFP_NOFS); - if (!ext_acl) - return ERR_PTR(-ENOMEM); - ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); - e = (char *)ext_acl + sizeof(ext4_acl_header); - for (n = 0; n < acl->a_count; n++) { - ext4_acl_entry *entry = (ext4_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch (acl->a_entries[n].e_tag) { - case ACL_USER: - case ACL_GROUP: - entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); - e += sizeof(ext4_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - e += sizeof(ext4_acl_entry_short); - break; - - default: - goto fail; - } - } - return (char *)ext_acl; - -fail: - kfree(ext_acl); - return ERR_PTR(-EINVAL); -} - -/* - * Inode operation get_posix_acl(). - * - * inode->i_mutex: don't care - */ -struct posix_acl * -ext4_get_acl(struct inode *inode, int type) -{ - int name_index; - char *value = NULL; - struct posix_acl *acl; - int retval; - - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - - switch (type) { - case ACL_TYPE_ACCESS: - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; - break; - default: - BUG(); - } - retval = ext4_xattr_get(inode, name_index, "", NULL, 0); - if (retval > 0) { - value = kmalloc(retval, GFP_NOFS); - if (!value) - return ERR_PTR(-ENOMEM); - retval = ext4_xattr_get(inode, name_index, "", value, retval); - } - if (retval > 0) - acl = ext4_acl_from_disk(value, retval); - else if (retval == -ENODATA || retval == -ENOSYS) - acl = NULL; - else - acl = ERR_PTR(retval); - kfree(value); - - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); - - return acl; -} - -/* - * Set the access or default ACL of an inode. - * - * inode->i_mutex: down unless called from ext4_new_inode - */ -static int -ext4_set_acl(handle_t *handle, struct inode *inode, int type, - struct posix_acl *acl) -{ - int name_index; - void *value = NULL; - size_t size = 0; - int error; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - switch (type) { - case ACL_TYPE_ACCESS: - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - return error; - else { - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } - } - break; - - case ACL_TYPE_DEFAULT: - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; - - default: - return -EINVAL; - } - if (acl) { - value = ext4_acl_to_disk(acl, &size); - if (IS_ERR(value)) - return (int)PTR_ERR(value); - } - - error = ext4_xattr_set_handle(handle, inode, name_index, "", - value, size, 0); - - kfree(value); - if (!error) - set_cached_acl(inode, type, acl); - - return error; -} - -/* - * Initialize the ACLs of a new inode. Called from ext4_new_inode. 
- * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ -int -ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) -{ - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext4_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (error < 0) - return error; - - if (error > 0) { - /* This is an extended ACL */ - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext4_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - handle_t *handle; - int retries = 0; - int error; - - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; -retry: - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext4_std_error(inode->i_sb, error); - goto out; - } - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - ext4_journal_stop(handle); - if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: - posix_acl_release(acl); - return error; -} - -/* - * Extended attribute handlers - */ -static size_t -ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext4_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); - posix_acl_release(acl); - - return error; -} - 
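
The conversion helpers above fix ext4's on-disk ACL layout: an ext4_acl_header (a 4-byte version word) followed by one record per POSIX ACL entry, where ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_MASK and ACL_OTHER are stored as 4-byte ext4_acl_entry_short records (no e_id field) and ACL_USER/ACL_GROUP as full 8-byte ext4_acl_entry records. Any valid ACL that carries named-user or named-group entries must also carry all four object entries plus a mask, so an ACL with more than four entries has exactly four short records; that invariant is what lets ext4_acl_size() and ext4_acl_count() (defined in acl.h below) invert each other. The following is a minimal userspace sketch of that size math, assuming the structs pack exactly as in acl.h; the acl_size()/acl_count() names are illustrative, not the kernel's:

#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>

/* Host-endian mirrors of the on-disk records from acl.h; the kernel
 * stores every field little-endian. */
typedef struct { uint32_t a_version; } acl_header;
typedef struct { uint16_t e_tag; uint16_t e_perm; } acl_entry_short;
typedef struct { uint16_t e_tag; uint16_t e_perm; uint32_t e_id; } acl_entry;

/* Same math as ext4_acl_size(): at most four entries are object tags
 * stored in short form; every further entry needs the e_id field. */
static size_t acl_size(int count)
{
	if (count <= 4)
		return sizeof(acl_header) + count * sizeof(acl_entry_short);
	return sizeof(acl_header) + 4 * sizeof(acl_entry_short) +
	       (count - 4) * sizeof(acl_entry);
}

/* Same math as ext4_acl_count(): invert acl_size(), returning -1 for
 * a byte length that no entry count could have produced. */
static int acl_count(size_t size)
{
	ssize_t s;

	size -= sizeof(acl_header);
	s = size - 4 * sizeof(acl_entry_short);
	if (s < 0) {
		if (size % sizeof(acl_entry_short))
			return -1;
		return size / sizeof(acl_entry_short);
	}
	if (s % sizeof(acl_entry))
		return -1;
	return s / sizeof(acl_entry) + 4;
}

int main(void)
{
	/* Round-trip a few entry counts, e.g. count 5 -> size 28 -> count 5. */
	for (int n = 1; n <= 8; n++)
		printf("count %d -> size %zu -> count %d\n",
		       n, acl_size(n), acl_count(acl_size(n)));
	return 0;
}

ext4_acl_from_disk() relies on the same inversion to reject xattr values whose length no entry count could have produced, before walking the records.
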
-static int -ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - -retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - error = ext4_set_acl(handle, inode, type, acl); - ext4_journal_stop(handle); - if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - -release_and_out: - posix_acl_release(acl); - return error; -} - -const struct xattr_handler ext4_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext4_xattr_list_acl_access, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; - -const struct xattr_handler ext4_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext4_xattr_list_acl_default, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; diff --git a/ANDROID_3.4.5/fs/ext4/acl.h b/ANDROID_3.4.5/fs/ext4/acl.h deleted file mode 100644 index 18cb39ed..00000000 --- a/ANDROID_3.4.5/fs/ext4/acl.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - File: fs/ext4/acl.h - - (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> -*/ - -#include <linux/posix_acl_xattr.h> - -#define EXT4_ACL_VERSION 0x0001 - -typedef struct { - __le16 e_tag; - __le16 e_perm; - __le32 e_id; -} ext4_acl_entry; - -typedef struct { - __le16 e_tag; - __le16 e_perm; -} ext4_acl_entry_short; - -typedef struct { - __le32 a_version; -} ext4_acl_header; - -static inline size_t ext4_acl_size(int count) -{ - if (count <= 4) { - return sizeof(ext4_acl_header) + - count * sizeof(ext4_acl_entry_short); - } else { - return sizeof(ext4_acl_header) + - 4 * sizeof(ext4_acl_entry_short) + - (count - 4) * sizeof(ext4_acl_entry); - } -} - -static inline int ext4_acl_count(size_t size) -{ - ssize_t s; - size -= sizeof(ext4_acl_header); - s = size - 4 * sizeof(ext4_acl_entry_short); - if (s < 0) { - if (size % sizeof(ext4_acl_entry_short)) - return -1; - return size / sizeof(ext4_acl_entry_short); - } else { - if (s % sizeof(ext4_acl_entry)) - return -1; - return s / sizeof(ext4_acl_entry) + 4; - } -} - -#ifdef CONFIG_EXT4_FS_POSIX_ACL - -/* acl.c */ -struct posix_acl *ext4_get_acl(struct inode *inode, int type); -extern int ext4_acl_chmod(struct inode *); -extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); - -#else /* CONFIG_EXT4_FS_POSIX_ACL */ -#include <linux/sched.h> -#define ext4_get_acl NULL - -static inline int -ext4_acl_chmod(struct inode *inode) -{ - return 0; -} - -static inline int -ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) -{ - return 0; -} -#endif /* CONFIG_EXT4_FS_POSIX_ACL */ - diff --git a/ANDROID_3.4.5/fs/ext4/balloc.c b/ANDROID_3.4.5/fs/ext4/balloc.c deleted file mode 100644 index 8da837be..00000000 --- a/ANDROID_3.4.5/fs/ext4/balloc.c +++ /dev/null @@ -1,766 +0,0 @@ -/* - * linux/fs/ext4/balloc.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - 
Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - */ - -#include <linux/time.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/jbd2.h> -#include <linux/quotaops.h> -#include <linux/buffer_head.h> -#include "ext4.h" -#include "ext4_jbd2.h" -#include "mballoc.h" - -#include <trace/events/ext4.h> - -static unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group); -/* - * balloc.c contains the blocks allocation and deallocation routines - */ - -/* - * Calculate the block group number and offset into the block/cluster - * allocation bitmap, given a block number - */ -void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) -{ - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - ext4_grpblk_t offset; - - blocknr = blocknr - le32_to_cpu(es->s_first_data_block); - offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> - EXT4_SB(sb)->s_cluster_bits; - if (offsetp) - *offsetp = offset; - if (blockgrpp) - *blockgrpp = blocknr; - -} - -static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, - ext4_group_t block_group) -{ - ext4_group_t actual_group; - ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); - if (actual_group == block_group) - return 1; - return 0; -} - -/* Return the number of clusters used for file system metadata; this - * represents the overhead needed by the file system. - */ -unsigned ext4_num_overhead_clusters(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) -{ - unsigned num_clusters; - int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; - ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); - ext4_fsblk_t itbl_blk; - struct ext4_sb_info *sbi = EXT4_SB(sb); - - /* This is the number of clusters used by the superblock, - * block group descriptors, and reserved block group - * descriptor blocks */ - num_clusters = ext4_num_base_meta_clusters(sb, block_group); - - /* - * For the allocation bitmaps and inode table, we first need - * to check to see if the block is in the block group. If it - * is, then check to see if the cluster is already accounted - * for in the clusters used for the base metadata cluster, or - * if we can increment the base metadata cluster to include - * that block. Otherwise, we will have to track the cluster - * used for the allocation bitmap or inode table explicitly. - * Normally all of these blocks are contiguous, so the special - * case handling shouldn't be necessary except for *very* - * unusual file system layouts. 
- */ - if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { - block_cluster = EXT4_B2C(sbi, - ext4_block_bitmap(sb, gdp) - start); - if (block_cluster < num_clusters) - block_cluster = -1; - else if (block_cluster == num_clusters) { - num_clusters++; - block_cluster = -1; - } - } - - if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { - inode_cluster = EXT4_B2C(sbi, - ext4_inode_bitmap(sb, gdp) - start); - if (inode_cluster < num_clusters) - inode_cluster = -1; - else if (inode_cluster == num_clusters) { - num_clusters++; - inode_cluster = -1; - } - } - - itbl_blk = ext4_inode_table(sb, gdp); - for (i = 0; i < sbi->s_itb_per_group; i++) { - if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { - c = EXT4_B2C(sbi, itbl_blk + i - start); - if ((c < num_clusters) || (c == inode_cluster) || - (c == block_cluster) || (c == itbl_cluster)) - continue; - if (c == num_clusters) { - num_clusters++; - continue; - } - num_clusters++; - itbl_cluster = c; - } - } - - if (block_cluster != -1) - num_clusters++; - if (inode_cluster != -1) - num_clusters++; - - return num_clusters; -} - -static unsigned int num_clusters_in_group(struct super_block *sb, - ext4_group_t block_group) -{ - unsigned int blocks; - - if (block_group == ext4_get_groups_count(sb) - 1) { - /* - * Even though mke2fs always initializes the first and - * last group, just in case some other tool was used, - * we need to make sure we calculate the right free - * blocks. - */ - blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - - ext4_group_first_block_no(sb, block_group); - } else - blocks = EXT4_BLOCKS_PER_GROUP(sb); - return EXT4_NUM_B2C(EXT4_SB(sb), blocks); -} - -/* Initializes an uninitialized block bitmap */ -void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) -{ - unsigned int bit, bit_max; - struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t start, tmp; - int flex_bg = 0; - - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks used to prevent allocation - * essentially implementing a per-group read-only flag. 
*/ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - return; - } - memset(bh->b_data, 0, sb->s_blocksize); - - bit_max = ext4_num_base_meta_clusters(sb, block_group); - for (bit = 0; bit < bit_max; bit++) - ext4_set_bit(bit, bh->b_data); - - start = ext4_group_first_block_no(sb, block_group); - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) - flex_bg = 1; - - /* Set bits for block and inode bitmaps, and inode table */ - tmp = ext4_block_bitmap(sb, gdp); - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); - - tmp = ext4_inode_bitmap(sb, gdp); - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); - - tmp = ext4_inode_table(sb, gdp); - for (; tmp < ext4_inode_table(sb, gdp) + - sbi->s_itb_per_group; tmp++) { - if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) - ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); - } - - /* - * Also if the number of blocks within the group is less than - * the blocksize * 8 ( which is the size of bitmap ), set rest - * of the block bitmap to 1 - */ - ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), - sb->s_blocksize * 8, bh->b_data); -} - -/* Return the number of free blocks in a block group. It is used when - * the block bitmap is uninitialized, so we can't just count the bits - * in the bitmap. */ -unsigned ext4_free_clusters_after_init(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp) -{ - return num_clusters_in_group(sb, block_group) - - ext4_num_overhead_clusters(sb, block_group, gdp); -} - -/* - * The free blocks are managed by bitmaps. A file system contains several - * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap - * block for inodes, N blocks for the inode table and data blocks. - * - * The file system contains group descriptors which are located after the - * super block. Each descriptor contains the number of the bitmap block and - * the free blocks count in the block. The descriptors are loaded in memory - * when a file system is mounted (see ext4_fill_super). 
- */ - -/** - * ext4_get_group_desc() -- load group descriptor from disk - * @sb: super block - * @block_group: given block group - * @bh: pointer to the buffer head to store the block - * group descriptor - */ -struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, - ext4_group_t block_group, - struct buffer_head **bh) -{ - unsigned int group_desc; - unsigned int offset; - ext4_group_t ngroups = ext4_get_groups_count(sb); - struct ext4_group_desc *desc; - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (block_group >= ngroups) { - ext4_error(sb, "block_group >= groups_count - block_group = %u," - " groups_count = %u", block_group, ngroups); - - return NULL; - } - - group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); - offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); - if (!sbi->s_group_desc[group_desc]) { - ext4_error(sb, "Group descriptor not loaded - " - "block_group = %u, group_desc = %u, desc = %u", - block_group, group_desc, offset); - return NULL; - } - - desc = (struct ext4_group_desc *)( - (__u8 *)sbi->s_group_desc[group_desc]->b_data + - offset * EXT4_DESC_SIZE(sb)); - if (bh) - *bh = sbi->s_group_desc[group_desc]; - return desc; -} - -static int ext4_valid_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - unsigned int block_group, - struct buffer_head *bh) -{ - ext4_grpblk_t offset; - ext4_grpblk_t next_zero_bit; - ext4_fsblk_t bitmap_blk; - ext4_fsblk_t group_first_block; - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { - /* with FLEX_BG, the inode/block bitmaps and itable - * blocks may not be in the group at all - * so the bitmap validation will be skipped for those groups - * or it has to also read the block group where the bitmaps - * are located to verify they are set. - */ - return 1; - } - group_first_block = ext4_group_first_block_no(sb, block_group); - - /* check whether block bitmap block number is set */ - bitmap_blk = ext4_block_bitmap(sb, desc); - offset = bitmap_blk - group_first_block; - if (!ext4_test_bit(offset, bh->b_data)) - /* bad block bitmap */ - goto err_out; - - /* check whether the inode bitmap block number is set */ - bitmap_blk = ext4_inode_bitmap(sb, desc); - offset = bitmap_blk - group_first_block; - if (!ext4_test_bit(offset, bh->b_data)) - /* bad block bitmap */ - goto err_out; - - /* check whether the inode table block number is set */ - bitmap_blk = ext4_inode_table(sb, desc); - offset = bitmap_blk - group_first_block; - next_zero_bit = ext4_find_next_zero_bit(bh->b_data, - offset + EXT4_SB(sb)->s_itb_per_group, - offset); - if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group) - /* good bitmap for inode tables */ - return 1; - -err_out: - ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu", - block_group, bitmap_blk); - return 0; -} -/** - * ext4_read_block_bitmap() - * @sb: super block - * @block_group: given block group - * - * Read the bitmap for a given block_group,and validate the - * bits for block/inode/inode tables are set in the bitmaps - * - * Return buffer_head on success or NULL in case of failure. 
- */ -struct buffer_head * -ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) -{ - struct ext4_group_desc *desc; - struct buffer_head *bh; - ext4_fsblk_t bitmap_blk; - - desc = ext4_get_group_desc(sb, block_group, NULL); - if (!desc) - return NULL; - bitmap_blk = ext4_block_bitmap(sb, desc); - bh = sb_getblk(sb, bitmap_blk); - if (unlikely(!bh)) { - ext4_error(sb, "Cannot get buffer for block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, bitmap_blk); - return NULL; - } - - if (bitmap_uptodate(bh)) - return bh; - - lock_buffer(bh); - if (bitmap_uptodate(bh)) { - unlock_buffer(bh); - return bh; - } - ext4_lock_group(sb, block_group); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh, block_group, desc); - set_bitmap_uptodate(bh); - set_buffer_uptodate(bh); - ext4_unlock_group(sb, block_group); - unlock_buffer(bh); - return bh; - } - ext4_unlock_group(sb, block_group); - if (buffer_uptodate(bh)) { - /* - * if not uninit if bh is uptodate, - * bitmap is also uptodate - */ - set_bitmap_uptodate(bh); - unlock_buffer(bh); - return bh; - } - /* - * submit the buffer_head for reading - */ - set_buffer_new(bh); - trace_ext4_read_block_bitmap_load(sb, block_group); - bh->b_end_io = ext4_end_bitmap_read; - get_bh(bh); - submit_bh(READ, bh); - return bh; -} - -/* Returns 0 on success, 1 on error */ -int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, - struct buffer_head *bh) -{ - struct ext4_group_desc *desc; - - if (!buffer_new(bh)) - return 0; - desc = ext4_get_group_desc(sb, block_group, NULL); - if (!desc) - return 1; - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, (unsigned long long) bh->b_blocknr); - return 1; - } - clear_buffer_new(bh); - /* Panic or remount fs read-only if block bitmap is invalid */ - ext4_valid_block_bitmap(sb, desc, block_group, bh); - return 0; -} - -struct buffer_head * -ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) -{ - struct buffer_head *bh; - - bh = ext4_read_block_bitmap_nowait(sb, block_group); - if (ext4_wait_block_bitmap(sb, block_group, bh)) { - put_bh(bh); - return NULL; - } - return bh; -} - -/** - * ext4_has_free_clusters() - * @sbi: in-core super block structure. - * @nclusters: number of needed blocks - * @flags: flags from ext4_mb_new_blocks() - * - * Check if filesystem has nclusters free & available for allocation. - * On success return 1, return 0 on failure. - */ -static int ext4_has_free_clusters(struct ext4_sb_info *sbi, - s64 nclusters, unsigned int flags) -{ - s64 free_clusters, dirty_clusters, root_clusters; - struct percpu_counter *fcc = &sbi->s_freeclusters_counter; - struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; - - free_clusters = percpu_counter_read_positive(fcc); - dirty_clusters = percpu_counter_read_positive(dcc); - root_clusters = EXT4_B2C(sbi, ext4_r_blocks_count(sbi->s_es)); - - if (free_clusters - (nclusters + root_clusters + dirty_clusters) < - EXT4_FREECLUSTERS_WATERMARK) { - free_clusters = EXT4_C2B(sbi, percpu_counter_sum_positive(fcc)); - dirty_clusters = percpu_counter_sum_positive(dcc); - } - /* Check whether we have space after accounting for current - * dirty clusters & root reserved clusters. - */ - if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters)) - return 1; - - /* Hm, nope. Are (enough) root reserved clusters available? 
*/ - if (sbi->s_resuid == current_fsuid() || - ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { - - if (free_clusters >= (nclusters + dirty_clusters)) - return 1; - } - - return 0; -} - -int ext4_claim_free_clusters(struct ext4_sb_info *sbi, - s64 nclusters, unsigned int flags) -{ - if (ext4_has_free_clusters(sbi, nclusters, flags)) { - percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); - return 0; - } else - return -ENOSPC; -} - -/** - * ext4_should_retry_alloc() - * @sb: super block - * @retries: number of attempts that have been made - * - * ext4_should_retry_alloc() is called when ENOSPC is returned, and if - * it is profitable to retry the operation, this function will wait - * for the current or committing transaction to complete, and then - * return TRUE. - * - * If the total number of retries exceeds three, return FALSE. - */ -int ext4_should_retry_alloc(struct super_block *sb, int *retries) -{ - if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || - (*retries)++ > 3 || - !EXT4_SB(sb)->s_journal) - return 0; - - jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - - return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); -} - -/* - * ext4_new_meta_blocks() -- allocate blocks for metadata (indexing) - * - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @count: pointer to total number of clusters needed - * @errp: error code - * - * Returns the first allocated block number on success; on return, *count - * holds the number of blocks actually allocated, and any error is stored - * in *errp. - */ -ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned int flags, - unsigned long *count, int *errp) -{ - struct ext4_allocation_request ar; - ext4_fsblk_t ret; - - memset(&ar, 0, sizeof(ar)); - /* Fill with neighbour allocated blocks */ - ar.inode = inode; - ar.goal = goal; - ar.len = count ? *count : 1; - ar.flags = flags; - - ret = ext4_mb_new_blocks(handle, &ar, errp); - if (count) - *count = ar.len; - /* - * Account for the allocated meta blocks. We will never - * fail EDQUOT for metadata, but we do account for it. - */ - if (!(*errp) && - ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - EXT4_I(inode)->i_allocated_meta_blocks += ar.len; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - dquot_alloc_block_nofail(inode, - EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); - } - return ret; -} - -/** - * ext4_count_free_clusters() -- count filesystem free clusters - * @sb: superblock - * - * Adds up the number of free clusters from each block group. 
- */ -ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) -{ - ext4_fsblk_t desc_count; - struct ext4_group_desc *gdp; - ext4_group_t i; - ext4_group_t ngroups = ext4_get_groups_count(sb); -#ifdef EXT4FS_DEBUG - struct ext4_super_block *es; - ext4_fsblk_t bitmap_count; - unsigned int x; - struct buffer_head *bitmap_bh = NULL; - - es = EXT4_SB(sb)->s_es; - desc_count = 0; - bitmap_count = 0; - gdp = NULL; - - for (i = 0; i < ngroups; i++) { - gdp = ext4_get_group_desc(sb, i, NULL); - if (!gdp) - continue; - desc_count += ext4_free_group_clusters(sb, gdp); - brelse(bitmap_bh); - bitmap_bh = ext4_read_block_bitmap(sb, i); - if (bitmap_bh == NULL) - continue; - - x = ext4_count_free(bitmap_bh, sb->s_blocksize); - printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", - i, ext4_free_group_clusters(sb, gdp), x); - bitmap_count += x; - } - brelse(bitmap_bh); - printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" - ", computed = %llu, %llu\n", - EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), - desc_count, bitmap_count); - return bitmap_count; -#else - desc_count = 0; - for (i = 0; i < ngroups; i++) { - gdp = ext4_get_group_desc(sb, i, NULL); - if (!gdp) - continue; - desc_count += ext4_free_group_clusters(sb, gdp); - } - - return desc_count; -#endif -} - -static inline int test_root(ext4_group_t a, int b) -{ - int num = b; - - while (a > num) - num *= b; - return num == a; -} - -static int ext4_group_sparse(ext4_group_t group) -{ - if (group <= 1) - return 1; - if (!(group & 1)) - return 0; - return (test_root(group, 7) || test_root(group, 5) || - test_root(group, 3)); -} - -/** - * ext4_bg_has_super - number of blocks used by the superblock in group - * @sb: superblock for filesystem - * @group: group number to check - * - * Return the number of blocks used by the superblock (primary or backup) - * in this group. Currently this will be only 0 or 1. - */ -int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) -{ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && - !ext4_group_sparse(group)) - return 0; - return 1; -} - -static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, - ext4_group_t group) -{ - unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); - ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); - ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; - - if (group == first || group == first + 1 || group == last) - return 1; - return 0; -} - -static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, - ext4_group_t group) -{ - if (!ext4_bg_has_super(sb, group)) - return 0; - - if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) - return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); - else - return EXT4_SB(sb)->s_gdb_count; -} - -/** - * ext4_bg_num_gdb - number of blocks used by the group table in group - * @sb: superblock for filesystem - * @group: group number to check - * - * Return the number of blocks used by the group descriptor table - * (primary or backup) in this group. In the future there may be a - * different number of descriptor blocks in each group. 
- */ -unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) -{ - unsigned long first_meta_bg = - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); - unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); - - if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || - metagroup < first_meta_bg) - return ext4_bg_num_gdb_nometa(sb, group); - - return ext4_bg_num_gdb_meta(sb,group); - -} - -/* - * This function returns the number of file system metadata clusters at - * the beginning of a block group, including the reserved gdt blocks. - */ -static unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned num; - - /* Check for superblock and gdt backups in this group */ - num = ext4_bg_has_super(sb, block_group); - - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || - block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * - sbi->s_desc_per_block) { - if (num) { - num += ext4_bg_num_gdb(sb, block_group); - num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); - } - } else { /* For META_BG_BLOCK_GROUPS */ - num += ext4_bg_num_gdb(sb, block_group); - } - return EXT4_NUM_B2C(sbi, num); -} -/** - * ext4_inode_to_goal_block - return a hint for block allocation - * @inode: inode for block allocation - * - * Return the ideal location to start allocating blocks for a - * newly created inode. - */ -ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - ext4_group_t block_group; - ext4_grpblk_t colour; - int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - - block_group = ei->i_block_group; - if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { - /* - * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME - * block groups per flexgroup, reserve the first block - * group for directories and special files. Regular - * files will start at the second block group. This - * tends to speed up directory access and improves - * fsck times. - */ - block_group &= ~(flex_size-1); - if (S_ISREG(inode->i_mode)) - block_group++; - } - bg_start = ext4_group_first_block_no(inode->i_sb, block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - /* - * If we are doing delayed allocation, we don't need to take - * colour into account. 
- */ - if (test_opt(inode->i_sb, DELALLOC)) - return bg_start; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour; -} - diff --git a/ANDROID_3.4.5/fs/ext4/bitmap.c b/ANDROID_3.4.5/fs/ext4/bitmap.c deleted file mode 100644 index fa3af81a..00000000 --- a/ANDROID_3.4.5/fs/ext4/bitmap.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * linux/fs/ext4/bitmap.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - */ - -#include <linux/buffer_head.h> -#include <linux/jbd2.h> -#include "ext4.h" - -#ifdef EXT4FS_DEBUG - -static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; - -unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) -{ - unsigned int i, sum = 0; - - if (!map) - return 0; - for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; - return sum; -} - -#endif /* EXT4FS_DEBUG */ - diff --git a/ANDROID_3.4.5/fs/ext4/block_validity.c b/ANDROID_3.4.5/fs/ext4/block_validity.c deleted file mode 100644 index 3f11656b..00000000 --- a/ANDROID_3.4.5/fs/ext4/block_validity.c +++ /dev/null @@ -1,268 +0,0 @@ -/* - * linux/fs/ext4/block_validity.c - * - * Copyright (C) 2009 - * Theodore Ts'o (tytso@mit.edu) - * - * Track which blocks in the filesystem are metadata blocks that - * should never be used as data blocks by files or directories. - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/namei.h> -#include <linux/quotaops.h> -#include <linux/buffer_head.h> -#include <linux/swap.h> -#include <linux/pagemap.h> -#include <linux/blkdev.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include "ext4.h" - -struct ext4_system_zone { - struct rb_node node; - ext4_fsblk_t start_blk; - unsigned int count; -}; - -static struct kmem_cache *ext4_system_zone_cachep; - -int __init ext4_init_system_zone(void) -{ - ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); - if (ext4_system_zone_cachep == NULL) - return -ENOMEM; - return 0; -} - -void ext4_exit_system_zone(void) -{ - kmem_cache_destroy(ext4_system_zone_cachep); -} - -static inline int can_merge(struct ext4_system_zone *entry1, - struct ext4_system_zone *entry2) -{ - if ((entry1->start_blk + entry1->count) == entry2->start_blk) - return 1; - return 0; -} - -/* - * Mark a range of blocks as belonging to the "system zone" --- that - * is, filesystem metadata blocks which should never be used by - * inodes. 
- */ -static int add_system_zone(struct ext4_sb_info *sbi, - ext4_fsblk_t start_blk, - unsigned int count) -{ - struct ext4_system_zone *new_entry = NULL, *entry; - struct rb_node **n = &sbi->system_blks.rb_node, *node; - struct rb_node *parent = NULL, *new_node = NULL; - - while (*n) { - parent = *n; - entry = rb_entry(parent, struct ext4_system_zone, node); - if (start_blk < entry->start_blk) - n = &(*n)->rb_left; - else if (start_blk >= (entry->start_blk + entry->count)) - n = &(*n)->rb_right; - else { - if (start_blk + count > (entry->start_blk + - entry->count)) - entry->count = (start_blk + count - - entry->start_blk); - new_node = *n; - new_entry = rb_entry(new_node, struct ext4_system_zone, - node); - break; - } - } - - if (!new_entry) { - new_entry = kmem_cache_alloc(ext4_system_zone_cachep, - GFP_KERNEL); - if (!new_entry) - return -ENOMEM; - new_entry->start_blk = start_blk; - new_entry->count = count; - new_node = &new_entry->node; - - rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &sbi->system_blks); - } - - /* Can we merge to the left? */ - node = rb_prev(new_node); - if (node) { - entry = rb_entry(node, struct ext4_system_zone, node); - if (can_merge(entry, new_entry)) { - new_entry->start_blk = entry->start_blk; - new_entry->count += entry->count; - rb_erase(node, &sbi->system_blks); - kmem_cache_free(ext4_system_zone_cachep, entry); - } - } - - /* Can we merge to the right? */ - node = rb_next(new_node); - if (node) { - entry = rb_entry(node, struct ext4_system_zone, node); - if (can_merge(new_entry, entry)) { - new_entry->count += entry->count; - rb_erase(node, &sbi->system_blks); - kmem_cache_free(ext4_system_zone_cachep, entry); - } - } - return 0; -} - -static void debug_print_tree(struct ext4_sb_info *sbi) -{ - struct rb_node *node; - struct ext4_system_zone *entry; - int first = 1; - - printk(KERN_INFO "System zones: "); - node = rb_first(&sbi->system_blks); - while (node) { - entry = rb_entry(node, struct ext4_system_zone, node); - printk("%s%llu-%llu", first ? 
"" : ", ", - entry->start_blk, entry->start_blk + entry->count - 1); - first = 0; - node = rb_next(node); - } - printk("\n"); -} - -int ext4_setup_system_zone(struct super_block *sb) -{ - ext4_group_t ngroups = ext4_get_groups_count(sb); - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_desc *gdp; - ext4_group_t i; - int flex_size = ext4_flex_bg_size(sbi); - int ret; - - if (!test_opt(sb, BLOCK_VALIDITY)) { - if (EXT4_SB(sb)->system_blks.rb_node) - ext4_release_system_zone(sb); - return 0; - } - if (EXT4_SB(sb)->system_blks.rb_node) - return 0; - - for (i=0; i < ngroups; i++) { - if (ext4_bg_has_super(sb, i) && - ((i < 5) || ((i % flex_size) == 0))) - add_system_zone(sbi, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1); - gdp = ext4_get_group_desc(sb, i, NULL); - ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); - if (ret) - return ret; - ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); - if (ret) - return ret; - ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), - sbi->s_itb_per_group); - if (ret) - return ret; - } - - if (test_opt(sb, DEBUG)) - debug_print_tree(EXT4_SB(sb)); - return 0; -} - -/* Called when the filesystem is unmounted */ -void ext4_release_system_zone(struct super_block *sb) -{ - struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; - struct rb_node *parent; - struct ext4_system_zone *entry; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - entry = rb_entry(n, struct ext4_system_zone, node); - kmem_cache_free(ext4_system_zone_cachep, entry); - if (!parent) - EXT4_SB(sb)->system_blks = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } - EXT4_SB(sb)->system_blks = RB_ROOT; -} - -/* - * Returns 1 if the passed-in block region (start_blk, - * start_blk+count) is valid; 0 if some part of the block region - * overlaps with filesystem metadata blocks. 
- */ -int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, - unsigned int count) -{ - struct ext4_system_zone *entry; - struct rb_node *n = sbi->system_blks.rb_node; - - if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || - (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - while (n) { - entry = rb_entry(n, struct ext4_system_zone, node); - if (start_blk + count - 1 < entry->start_blk) - n = n->rb_left; - else if (start_blk >= (entry->start_blk + entry->count)) - n = n->rb_right; - else { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - } - return 1; -} - -int ext4_check_blockref(const char *function, unsigned int line, - struct inode *inode, __le32 *p, unsigned int max) -{ - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - __le32 *bref = p; - unsigned int blk; - - while (bref < p+max) { - blk = le32_to_cpu(*bref++); - if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { - es->s_last_error_block = cpu_to_le64(blk); - ext4_error_inode(inode, function, line, blk, - "invalid block"); - return -EIO; - } - } - return 0; -} - diff --git a/ANDROID_3.4.5/fs/ext4/dir.c b/ANDROID_3.4.5/fs/ext4/dir.c deleted file mode 100644 index b8678620..00000000 --- a/ANDROID_3.4.5/fs/ext4/dir.c +++ /dev/null @@ -1,667 +0,0 @@ -/* - * linux/fs/ext4/dir.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/dir.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext4 directory handling functions - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * - * Hash Tree Directory indexing (c) 2001 Daniel Phillips - * - */ - -#include <linux/fs.h> -#include <linux/jbd2.h> -#include <linux/buffer_head.h> -#include <linux/slab.h> -#include <linux/rbtree.h> -#include "ext4.h" - -static unsigned char ext4_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - -static int ext4_dx_readdir(struct file *filp, - void *dirent, filldir_t filldir); - -static unsigned char get_dtype(struct super_block *sb, int filetype) -{ - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT4_FT_MAX)) - return DT_UNKNOWN; - - return (ext4_filetype_table[filetype]); -} - -/** - * Check if the given dir-inode refers to an htree-indexed directory - * (or a directory which could potentially get converted to use htree - * indexing). - * - * Return 1 if it is a dx dir, 0 if not - */ -static int is_dx_dir(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - - if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX) && - ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) - return 1; - - return 0; -} - -/* - * Return 0 if the directory entry is OK, and 1 if there is a problem - * - * Note: this is the opposite of what ext2 and ext3 historically returned... 
- */ -int __ext4_check_dir_entry(const char *function, unsigned int line, - struct inode *dir, struct file *filp, - struct ext4_dir_entry_2 *de, - struct buffer_head *bh, - unsigned int offset) -{ - const char *error_msg = NULL; - const int rlen = ext4_rec_len_from_disk(de->rec_len, - dir->i_sb->s_blocksize); - - if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) - error_msg = "rec_len is smaller than minimal"; - else if (unlikely(rlen % 4 != 0)) - error_msg = "rec_len % 4 != 0"; - else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) - error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - bh->b_data) + rlen > - dir->i_sb->s_blocksize)) - error_msg = "directory entry across blocks"; - else if (unlikely(le32_to_cpu(de->inode) > - le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) - error_msg = "inode out of bounds"; - else - return 0; - - if (filp) - ext4_error_file(filp, function, line, bh->b_blocknr, - "bad entry in directory: %s - offset=%u(%u), " - "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % bh->b_size), - offset, le32_to_cpu(de->inode), - rlen, de->name_len); - else - ext4_error_inode(dir, function, line, bh->b_blocknr, - "bad entry in directory: %s - offset=%u(%u), " - "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % bh->b_size), - offset, le32_to_cpu(de->inode), - rlen, de->name_len); - - return 1; -} - -static int ext4_readdir(struct file *filp, - void *dirent, filldir_t filldir) -{ - int error = 0; - unsigned int offset; - int i, stored; - struct ext4_dir_entry_2 *de; - int err; - struct inode *inode = filp->f_path.dentry->d_inode; - struct super_block *sb = inode->i_sb; - int ret = 0; - int dir_has_error = 0; - - if (is_dx_dir(inode)) { - err = ext4_dx_readdir(filp, dirent, filldir); - if (err != ERR_BAD_DX_DIR) { - ret = err; - goto out; - } - /* - * We don't set the inode dirty flag since it's not - * critical that it get flushed back to the disk. - */ - ext4_clear_inode_flag(filp->f_path.dentry->d_inode, - EXT4_INODE_INDEX); - } - stored = 0; - offset = filp->f_pos & (sb->s_blocksize - 1); - - while (!error && !stored && filp->f_pos < inode->i_size) { - struct ext4_map_blocks map; - struct buffer_head *bh = NULL; - - map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); - map.m_len = 1; - err = ext4_map_blocks(NULL, inode, &map, 0); - if (err > 0) { - pgoff_t index = map.m_pblk >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&filp->f_ra, index)) - page_cache_sync_readahead( - sb->s_bdev->bd_inode->i_mapping, - &filp->f_ra, filp, - index, 1); - filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; - bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); - } - - /* - * We ignore I/O errors on directories so users have a chance - * of recovering data when there's a bad sector - */ - if (!bh) { - if (!dir_has_error) { - EXT4_ERROR_FILE(filp, 0, - "directory contains a " - "hole at offset %llu", - (unsigned long long) filp->f_pos); - dir_has_error = 1; - } - /* corrupt size? Maybe no more blocks to read */ - if (filp->f_pos > inode->i_blocks << 9) - break; - filp->f_pos += sb->s_blocksize - offset; - continue; - } - -revalidate: - /* If the dir block has changed since the last call to - * readdir(2), then we might be pointing to an invalid - * dirent right now. Scan from the start of the block - * to make sure. 
*/ - if (filp->f_version != inode->i_version) { - for (i = 0; i < sb->s_blocksize && i < offset; ) { - de = (struct ext4_dir_entry_2 *) - (bh->b_data + i); - /* It's too expensive to do a full - * dirent test each time round this - * loop, but we do have to test at - * least that it is non-zero. A - * failure will be detected in the - * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) - break; - i += ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize); - } - offset = i; - filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) - | offset; - filp->f_version = inode->i_version; - } - - while (!error && filp->f_pos < inode->i_size - && offset < sb->s_blocksize) { - de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (ext4_check_dir_entry(inode, filp, de, - bh, offset)) { - /* - * On error, skip the f_pos to the next block - */ - filp->f_pos = (filp->f_pos | - (sb->s_blocksize - 1)) + 1; - brelse(bh); - ret = stored; - goto out; - } - offset += ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize); - if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, - de->name_len, - filp->f_pos, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored++; - } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize); - } - offset = 0; - brelse(bh); - } -out: - return ret; -} - -static inline int is_32bit_api(void) -{ -#ifdef CONFIG_COMPAT - return is_compat_task(); -#else - return (BITS_PER_LONG == 32); -#endif -} - -/* - * These functions convert from the major/minor hash to an f_pos - * value for dx directories - * - * Upper layer (for example NFS) should specify FMODE_32BITHASH or - * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted - * directly on both 32-bit and 64-bit nodes, under such case, neither - * FMODE_32BITHASH nor FMODE_64BITHASH is specified. - */ -static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return major >> 1; - else - return ((__u64)(major >> 1) << 32) | (__u64)minor; -} - -static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return (pos << 1) & 0xffffffff; - else - return ((pos >> 32) << 1) & 0xffffffff; -} - -static inline __u32 pos2min_hash(struct file *filp, loff_t pos) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return 0; - else - return pos & 0xffffffff; -} - -/* - * Return 32- or 64-bit end-of-file for dx directories - */ -static inline loff_t ext4_get_htree_eof(struct file *filp) -{ - if ((filp->f_mode & FMODE_32BITHASH) || - (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) - return EXT4_HTREE_EOF_32BIT; - else - return EXT4_HTREE_EOF_64BIT; -} - - -/* - * ext4_dir_llseek() based on generic_file_llseek() to handle both - * non-htree and htree directories, where the "offset" is in terms - * of the filename hash value instead of the byte offset. 
- * - * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX) - * will be invalid once the directory has been converted into a dx directory - */ -loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin) -{ - struct inode *inode = file->f_mapping->host; - loff_t ret = -EINVAL; - int dx_dir = is_dx_dir(inode); - - mutex_lock(&inode->i_mutex); - - /* NOTE: relative offsets with dx directories might not work - * as expected, as it is difficult to figure out the - * correct offset between dx hashes */ - - switch (origin) { - case SEEK_END: - if (unlikely(offset > 0)) - goto out_err; /* not supported for directories */ - - /* so only negative offsets are left, does that have a - * meaning for directories at all? */ - if (dx_dir) - offset += ext4_get_htree_eof(file); - else - offset += inode->i_size; - break; - case SEEK_CUR: - /* - * Here we special-case the lseek(fd, 0, SEEK_CUR) - * position-querying operation. Avoid rewriting the "same" - * f_pos value back to the file because a concurrent read(), - * write() or lseek() might have altered it - */ - if (offset == 0) { - offset = file->f_pos; - goto out_ok; - } - - offset += file->f_pos; - break; - } - - if (unlikely(offset < 0)) - goto out_err; - - if (!dx_dir) { - if (offset > inode->i_sb->s_maxbytes) - goto out_err; - } else if (offset > ext4_get_htree_eof(file)) - goto out_err; - - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - -out_ok: - ret = offset; -out_err: - mutex_unlock(&inode->i_mutex); - - return ret; -} - -/* - * This structure holds the nodes of the red-black tree used to store - * the directory entry in hash order. - */ -struct fname { - __u32 hash; - __u32 minor_hash; - struct rb_node rb_hash; - struct fname *next; - __u32 inode; - __u8 name_len; - __u8 file_type; - char name[0]; -}; - -/* - * This function implements a non-recursive way of freeing all of the - * nodes in the red-black tree. - */ -static void free_rb_tree_fname(struct rb_root *root) -{ - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); - while (fname) { - struct fname *old = fname; - fname = fname->next; - kfree(old); - } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } -} - - -static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, - loff_t pos) -{ - struct dir_private_info *p; - - p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); - if (!p) - return NULL; - p->curr_hash = pos2maj_hash(filp, pos); - p->curr_minor_hash = pos2min_hash(filp, pos); - return p; -} - -void ext4_htree_free_dir_info(struct dir_private_info *p) -{ - free_rb_tree_fname(&p->root); - kfree(p); -} - -/* - * Given a directory entry, enter it into the fname rb tree. 
- */ -int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, - __u32 minor_hash, - struct ext4_dir_entry_2 *dirent) -{ - struct rb_node **p, *parent = NULL; - struct fname *fname, *new_fn; - struct dir_private_info *info; - int len; - - info = dir_file->private_data; - p = &info->root.rb_node; - - /* Create and allocate the fname structure */ - len = sizeof(struct fname) + dirent->name_len + 1; - new_fn = kzalloc(len, GFP_KERNEL); - if (!new_fn) - return -ENOMEM; - new_fn->hash = hash; - new_fn->minor_hash = minor_hash; - new_fn->inode = le32_to_cpu(dirent->inode); - new_fn->name_len = dirent->name_len; - new_fn->file_type = dirent->file_type; - memcpy(new_fn->name, dirent->name, dirent->name_len); - new_fn->name[dirent->name_len] = 0; - - while (*p) { - parent = *p; - fname = rb_entry(parent, struct fname, rb_hash); - - /* - * If the hash and minor hash match up, then we put - * them on a linked list. This rarely happens... - */ - if ((new_fn->hash == fname->hash) && - (new_fn->minor_hash == fname->minor_hash)) { - new_fn->next = fname->next; - fname->next = new_fn; - return 0; - } - - if (new_fn->hash < fname->hash) - p = &(*p)->rb_left; - else if (new_fn->hash > fname->hash) - p = &(*p)->rb_right; - else if (new_fn->minor_hash < fname->minor_hash) - p = &(*p)->rb_left; - else /* if (new_fn->minor_hash > fname->minor_hash) */ - p = &(*p)->rb_right; - } - - rb_link_node(&new_fn->rb_hash, parent, p); - rb_insert_color(&new_fn->rb_hash, &info->root); - return 0; -} - - - -/* - * This is a helper function for ext4_dx_readdir. It calls filldir - * for all entres on the fname linked list. (Normally there is only - * one entry on the linked list, unless there are 62 bit hash collisions.) - */ -static int call_filldir(struct file *filp, void *dirent, - filldir_t filldir, struct fname *fname) -{ - struct dir_private_info *info = filp->private_data; - loff_t curr_pos; - struct inode *inode = filp->f_path.dentry->d_inode; - struct super_block *sb; - int error; - - sb = inode->i_sb; - - if (!fname) { - ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " - "called with null fname?!?", __func__, __LINE__, - inode->i_ino, current->comm); - return 0; - } - curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); - while (fname) { - error = filldir(dirent, fname->name, - fname->name_len, curr_pos, - fname->inode, - get_dtype(sb, fname->file_type)); - if (error) { - filp->f_pos = curr_pos; - info->extra_fname = fname; - return error; - } - fname = fname->next; - } - return 0; -} - -static int ext4_dx_readdir(struct file *filp, - void *dirent, filldir_t filldir) -{ - struct dir_private_info *info = filp->private_data; - struct inode *inode = filp->f_path.dentry->d_inode; - struct fname *fname; - int ret; - - if (!info) { - info = ext4_htree_create_dir_info(filp, filp->f_pos); - if (!info) - return -ENOMEM; - filp->private_data = info; - } - - if (filp->f_pos == ext4_get_htree_eof(filp)) - return 0; /* EOF */ - - /* Some one has messed with f_pos; reset the world */ - if (info->last_pos != filp->f_pos) { - free_rb_tree_fname(&info->root); - info->curr_node = NULL; - info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp, filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); - } - - /* - * If there are any leftover names on the hash collision - * chain, return them first. 
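As a reader's aid, a compact userspace sketch of the ordering logic in ext4_htree_store_dirent() above. A plain unbalanced binary tree stands in for the kernel rbtree (no rebalancing), but the key comparison — major hash first, then minor hash, with full collisions pushed onto a per-node chain — is the same:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct fname {
        uint32_t hash, minor_hash;
        struct fname *left, *right;  /* tree links (rbtree in the kernel) */
        struct fname *next;          /* chain of full (hash, minor) collisions */
        char name[16];
    };

    static int store(struct fname **p, uint32_t hash, uint32_t minor,
                     const char *name)
    {
        struct fname *fn = calloc(1, sizeof(*fn));

        if (!fn)
            return -1;
        fn->hash = hash;
        fn->minor_hash = minor;
        snprintf(fn->name, sizeof(fn->name), "%s", name);

        while (*p) {
            struct fname *cur = *p;

            if (hash == cur->hash && minor == cur->minor_hash) {
                /* full collision: chain it, don't grow the tree */
                fn->next = cur->next;
                cur->next = fn;
                return 0;
            }
            if (hash < cur->hash ||
                (hash == cur->hash && minor < cur->minor_hash))
                p = &cur->left;
            else
                p = &cur->right;
        }
        *p = fn;   /* new key: becomes a tree node */
        return 0;
    }

    int main(void)
    {
        struct fname *root = NULL;

        store(&root, 10, 0, "a");
        store(&root, 10, 0, "b");   /* collides with "a" */
        store(&root, 10, 1, "c");   /* same major hash, bigger minor */
        printf("chained behind (10,0): %s\n", root->next->name);   /* b */
        return 0;
    }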
- */ - if (info->extra_fname) { - if (call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; - info->extra_fname = NULL; - goto next_node; - } else if (!info->curr_node) - info->curr_node = rb_first(&info->root); - - while (1) { - /* - * Fill the rbtree if we have no more entries, - * or the inode has changed since we last read in the - * cached entries. - */ - if ((!info->curr_node) || - (filp->f_version != inode->i_version)) { - info->curr_node = NULL; - free_rb_tree_fname(&info->root); - filp->f_version = inode->i_version; - ret = ext4_htree_fill_tree(filp, info->curr_hash, - info->curr_minor_hash, - &info->next_hash); - if (ret < 0) - return ret; - if (ret == 0) { - filp->f_pos = ext4_get_htree_eof(filp); - break; - } - info->curr_node = rb_first(&info->root); - } - - fname = rb_entry(info->curr_node, struct fname, rb_hash); - info->curr_hash = fname->hash; - info->curr_minor_hash = fname->minor_hash; - if (call_filldir(filp, dirent, filldir, fname)) - break; - next_node: - info->curr_node = rb_next(info->curr_node); - if (info->curr_node) { - fname = rb_entry(info->curr_node, struct fname, - rb_hash); - info->curr_hash = fname->hash; - info->curr_minor_hash = fname->minor_hash; - } else { - if (info->next_hash == ~0) { - filp->f_pos = ext4_get_htree_eof(filp); - break; - } - info->curr_hash = info->next_hash; - info->curr_minor_hash = 0; - } - } -finished: - info->last_pos = filp->f_pos; - return 0; -} - -static int ext4_release_dir(struct inode *inode, struct file *filp) -{ - if (filp->private_data) - ext4_htree_free_dir_info(filp->private_data); - - return 0; -} - -const struct file_operations ext4_dir_operations = { - .llseek = ext4_dir_llseek, - .read = generic_read_dir, - .readdir = ext4_readdir, - .unlocked_ioctl = ext4_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext4_compat_ioctl, -#endif - .fsync = ext4_sync_file, - .release = ext4_release_dir, -}; diff --git a/ANDROID_3.4.5/fs/ext4/ext4.h b/ANDROID_3.4.5/fs/ext4/ext4.h deleted file mode 100644 index 0e01e90a..00000000 --- a/ANDROID_3.4.5/fs/ext4/ext4.h +++ /dev/null @@ -1,2372 +0,0 @@ -/* - * ext4.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _EXT4_H -#define _EXT4_H - -#include <linux/types.h> -#include <linux/blkdev.h> -#include <linux/magic.h> -#include <linux/jbd2.h> -#include <linux/quota.h> -#include <linux/rwsem.h> -#include <linux/rbtree.h> -#include <linux/seqlock.h> -#include <linux/mutex.h> -#include <linux/timer.h> -#include <linux/wait.h> -#include <linux/blockgroup_lock.h> -#include <linux/percpu_counter.h> -#ifdef __KERNEL__ -#include <linux/compat.h> -#endif - -/* - * The fourth extended filesystem constants/structures - */ - -/* - * Define EXT4FS_DEBUG to produce debug messages - */ -#undef EXT4FS_DEBUG - -/* - * Debug code - */ -#ifdef EXT4FS_DEBUG -#define ext4_debug(f, a...) \ - do { \ - printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ - __FILE__, __LINE__, __func__); \ - printk(KERN_DEBUG f, ## a); \ - } while (0) -#else -#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif - -#define EXT4_ERROR_INODE(inode, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) - -#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) 
\ - ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) - -#define EXT4_ERROR_FILE(file, block, fmt, a...) \ - ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) - -/* data type for block offset of block group */ -typedef int ext4_grpblk_t; - -/* data type for filesystem-wide blocks number */ -typedef unsigned long long ext4_fsblk_t; - -/* data type for file logical block number */ -typedef __u32 ext4_lblk_t; - -/* data type for block group number */ -typedef unsigned int ext4_group_t; - -/* - * Flags used in mballoc's allocation_context flags field. - * - * Also used to show what's going on for debugging purposes when the - * flag field is exported via the traceport interface - */ - -/* prefer goal again. length */ -#define EXT4_MB_HINT_MERGE 0x0001 -/* blocks already reserved */ -#define EXT4_MB_HINT_RESERVED 0x0002 -/* metadata is being allocated */ -#define EXT4_MB_HINT_METADATA 0x0004 -/* first blocks in the file */ -#define EXT4_MB_HINT_FIRST 0x0008 -/* search for the best chunk */ -#define EXT4_MB_HINT_BEST 0x0010 -/* data is being allocated */ -#define EXT4_MB_HINT_DATA 0x0020 -/* don't preallocate (for tails) */ -#define EXT4_MB_HINT_NOPREALLOC 0x0040 -/* allocate for locality group */ -#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 -/* allocate goal blocks or none */ -#define EXT4_MB_HINT_GOAL_ONLY 0x0100 -/* goal is meaningful */ -#define EXT4_MB_HINT_TRY_GOAL 0x0200 -/* blocks already pre-reserved by delayed allocation */ -#define EXT4_MB_DELALLOC_RESERVED 0x0400 -/* We are doing stream allocation */ -#define EXT4_MB_STREAM_ALLOC 0x0800 -/* Use reserved root blocks if needed */ -#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 - -struct ext4_allocation_request { - /* target inode for block we're allocating */ - struct inode *inode; - /* how many blocks we want to allocate */ - unsigned int len; - /* logical block in target inode */ - ext4_lblk_t logical; - /* the closest logical allocated block to the left */ - ext4_lblk_t lleft; - /* the closest logical allocated block to the right */ - ext4_lblk_t lright; - /* phys. target (a hint) */ - ext4_fsblk_t goal; - /* phys. block for the closest logical allocated block to the left */ - ext4_fsblk_t pleft; - /* phys. block for the closest logical allocated block to the right */ - ext4_fsblk_t pright; - /* flags. see above EXT4_MB_HINT_* */ - unsigned int flags; -}; - -/* - * Logical to physical block mapping, used by ext4_map_blocks() - * - * This structure is used to pass requests into ext4_map_blocks() as - * well as to store the information returned by ext4_map_blocks(). It - * takes less room on the stack than a struct buffer_head. - */ -#define EXT4_MAP_NEW (1 << BH_New) -#define EXT4_MAP_MAPPED (1 << BH_Mapped) -#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) -#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) -#define EXT4_MAP_UNINIT (1 << BH_Uninit) -/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of - * ext4_map_blocks wants to know whether or not the underlying cluster has - * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that - * the requested mapping was from previously mapped (or delayed allocated) - * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster - * should never appear on buffer_head's state flags. 
- */ -#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) -#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ - EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER) - -struct ext4_map_blocks { - ext4_fsblk_t m_pblk; - ext4_lblk_t m_lblk; - unsigned int m_len; - unsigned int m_flags; -}; - -/* - * For delayed allocation tracking - */ -struct mpage_da_data { - struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ - unsigned long first_page, next_page; /* extent of pages */ - struct writeback_control *wbc; - int io_done; - int pages_written; - int retval; -}; - -/* - * Flags for ext4_io_end->flags - */ -#define EXT4_IO_END_UNWRITTEN 0x0001 -#define EXT4_IO_END_ERROR 0x0002 -#define EXT4_IO_END_QUEUED 0x0004 -#define EXT4_IO_END_DIRECT 0x0008 -#define EXT4_IO_END_IN_FSYNC 0x0010 - -struct ext4_io_page { - struct page *p_page; - atomic_t p_count; -}; - -#define MAX_IO_PAGES 128 - -/* - * For converting uninitialized extents on a work queue. - * - * 'page' is only used from the writepage() path; 'pages' is only used for - * buffered writes; they are used to keep page references until conversion - * takes place. For AIO/DIO, neither field is filled in. - */ -typedef struct ext4_io_end { - struct list_head list; /* per-file finished IO list */ - struct inode *inode; /* file being written to */ - unsigned int flag; /* unwritten or not */ - struct page *page; /* for writepage() path */ - loff_t offset; /* offset in the file */ - ssize_t size; /* size of the extent */ - struct work_struct work; /* data work queue */ - struct kiocb *iocb; /* iocb struct for AIO */ - int result; /* error value for AIO */ - int num_io_pages; /* for writepages() */ - struct ext4_io_page *pages[MAX_IO_PAGES]; /* for writepages() */ -} ext4_io_end_t; - -struct ext4_io_submit { - int io_op; - struct bio *io_bio; - ext4_io_end_t *io_end; - struct ext4_io_page *io_page; - sector_t io_next_block; -}; - -/* - * Special inodes numbers - */ -#define EXT4_BAD_INO 1 /* Bad blocks inode */ -#define EXT4_ROOT_INO 2 /* Root inode */ -#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ -#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ -#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ -#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ -#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ -#define EXT4_JOURNAL_INO 8 /* Journal inode */ - -/* First non-reserved inode for old ext4 filesystems */ -#define EXT4_GOOD_OLD_FIRST_INO 11 - -/* - * Maximal count of links to a file - */ -#define EXT4_LINK_MAX 65000 - -/* - * Macro-instructions used to manage several block sizes - */ -#define EXT4_MIN_BLOCK_SIZE 1024 -#define EXT4_MAX_BLOCK_SIZE 65536 -#define EXT4_MIN_BLOCK_LOG_SIZE 10 -#define EXT4_MAX_BLOCK_LOG_SIZE 16 -#ifdef __KERNEL__ -# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) -#else -# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) -#endif -#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) -#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ - EXT4_SB(s)->s_cluster_bits) -#ifdef __KERNEL__ -# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) -# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) -#else -# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) -#endif -#ifdef __KERNEL__ -#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) -#define EXT4_INODE_SIZE(s) 
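As a reader's aid, a userspace rendering (names shortened so as not to imply kernel API) of the non-__KERNEL__ EXT4_BLOCK_SIZE()/EXT4_ADDR_PER_BLOCK() arithmetic above: the superblock stores log2(blocksize) - 10, and each block holds blocksize/4 block pointers:

    #include <stdint.h>
    #include <stdio.h>

    #define MIN_BLOCK_SIZE 1024u   /* EXT4_MIN_BLOCK_SIZE */

    static unsigned block_size(uint32_t s_log_block_size)
    {
        return MIN_BLOCK_SIZE << s_log_block_size;   /* 1 KiB << log */
    }

    int main(void)
    {
        for (uint32_t log = 0; log <= 2; log++)
            printf("s_log_block_size=%u -> %5u-byte blocks, %4u pointers/block\n",
                   log, block_size(log),
                   (unsigned)(block_size(log) / sizeof(uint32_t)));
        return 0;
    }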
(EXT4_SB(s)->s_inode_size) -#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) -#else -#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ - EXT4_GOOD_OLD_INODE_SIZE : \ - (s)->s_inode_size) -#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ - EXT4_GOOD_OLD_FIRST_INO : \ - (s)->s_first_ino) -#endif -#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) - -/* Translate a block number to a cluster number */ -#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) -/* Translate a cluster number to a block number */ -#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) -/* Translate # of blks to # of clusters */ -#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ - (sbi)->s_cluster_bits) - -/* - * Structure of a blocks group descriptor - */ -struct ext4_group_desc -{ - __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ - __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ - __le32 bg_inode_table_lo; /* Inodes table block */ - __le16 bg_free_blocks_count_lo;/* Free blocks count */ - __le16 bg_free_inodes_count_lo;/* Free inodes count */ - __le16 bg_used_dirs_count_lo; /* Directories count */ - __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ - __le16 bg_itable_unused_lo; /* Unused inodes count */ - __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ - __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ - __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ - __le32 bg_inode_table_hi; /* Inodes table block MSB */ - __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ - __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ - __le16 bg_used_dirs_count_hi; /* Directories count MSB */ - __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ - __u32 bg_reserved2[3]; -}; - -/* - * Structure of a flex block group info - */ - -struct flex_groups { - atomic_t free_inodes; - atomic_t free_clusters; - atomic_t used_dirs; -}; - -#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ -#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ -#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ - -/* - * Macro-instructions used to manage group descriptors - */ -#define EXT4_MIN_DESC_SIZE 32 -#define EXT4_MIN_DESC_SIZE_64BIT 64 -#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE -#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) -#ifdef __KERNEL__ -# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) -# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) -# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) -# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) -# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) -#else -# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) -# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) -# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) -#endif - -/* - * Constants relative to the data blocks - */ -#define EXT4_NDIR_BLOCKS 12 -#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS -#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) -#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) -#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) - -/* - * Inode flags - */ -#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ -#define EXT4_UNRM_FL 0x00000002 /* Undelete */ -#define EXT4_COMPR_FL 0x00000004 /* Compress file */ -#define EXT4_SYNC_FL 0x00000008 
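As a reader's aid, the EXT4_B2C()/EXT4_C2B()/EXT4_NUM_B2C() translations above as a runnable userspace sketch, assuming a bigalloc filesystem with 16 blocks per cluster (s_cluster_bits == 4); the constants are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    #define CLUSTER_BITS  4
    #define CLUSTER_RATIO (1u << CLUSTER_BITS)

    static uint64_t b2c(uint64_t blk)      { return blk >> CLUSTER_BITS; }
    static uint64_t c2b(uint64_t cluster)  { return cluster << CLUSTER_BITS; }
    /* round up: partial clusters still cost a whole cluster */
    static uint64_t num_b2c(uint64_t blks) { return (blks + CLUSTER_RATIO - 1) >> CLUSTER_BITS; }

    int main(void)
    {
        printf("block 40 lives in cluster %llu\n", (unsigned long long)b2c(40));     /* 2 */
        printf("cluster 2 starts at block %llu\n", (unsigned long long)c2b(2));      /* 32 */
        printf("17 blocks occupy %llu clusters\n", (unsigned long long)num_b2c(17)); /* 2 */
        return 0;
    }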
/* Synchronous updates */ -#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ -#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ -#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ -#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ -/* Reserved for compression usage... */ -#define EXT4_DIRTY_FL 0x00000100 -#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ -#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ -#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ -/* End compression flags --- maybe not all used */ -#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ -#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ -#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ -#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ -#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ -#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ -#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ -#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ -#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ -#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ - -#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ - -/* Flags that should be inherited by new inodes from their parent. */ -#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ - EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ - EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ - EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) - -/* Flags that are appropriate for regular files (all but dir-specific ones). */ -#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) - -/* Flags that are appropriate for non-directories/regular files. */ -#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) - -/* Mask out flags that are inappropriate for the given type of inode. */ -static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & EXT4_REG_FLMASK; - else - return flags & EXT4_OTHER_FLMASK; -} - -/* - * Inode flags used for atomic set/get - */ -enum { - EXT4_INODE_SECRM = 0, /* Secure deletion */ - EXT4_INODE_UNRM = 1, /* Undelete */ - EXT4_INODE_COMPR = 2, /* Compress file */ - EXT4_INODE_SYNC = 3, /* Synchronous updates */ - EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ - EXT4_INODE_APPEND = 5, /* writes to file may only append */ - EXT4_INODE_NODUMP = 6, /* do not dump file */ - EXT4_INODE_NOATIME = 7, /* do not update atime */ -/* Reserved for compression usage... 
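As a reader's aid, a userspace sketch of ext4_mask_flags() above using a handful of the on-disk flag values from this header: directories keep every inherited flag, regular files lose the directory-only flags, and other inode types keep almost nothing:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/stat.h>

    #define FL_NODUMP   0x00000040u
    #define FL_NOATIME  0x00000080u
    #define FL_DIRSYNC  0x00010000u
    #define FL_TOPDIR   0x00020000u

    #define REG_FLMASK   (~(FL_DIRSYNC | FL_TOPDIR))   /* EXT4_REG_FLMASK */
    #define OTHER_FLMASK (FL_NODUMP | FL_NOATIME)      /* EXT4_OTHER_FLMASK */

    static uint32_t mask_flags(mode_t mode, uint32_t flags)
    {
        if (S_ISDIR(mode))
            return flags;               /* directories keep everything */
        if (S_ISREG(mode))
            return flags & REG_FLMASK;  /* drop dir-only flags */
        return flags & OTHER_FLMASK;    /* devices/symlinks: almost nothing */
    }

    int main(void)
    {
        uint32_t f = FL_NOATIME | FL_DIRSYNC | FL_TOPDIR;

        printf("regular file keeps 0x%08x\n", mask_flags(S_IFREG, f)); /* 0x80 */
        printf("char device  keeps 0x%08x\n", mask_flags(S_IFCHR, f)); /* 0x80 */
        return 0;
    }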
*/ - EXT4_INODE_DIRTY = 8, - EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ - EXT4_INODE_NOCOMPR = 10, /* Don't compress */ - EXT4_INODE_ECOMPR = 11, /* Compression error */ -/* End compression flags --- maybe not all used */ - EXT4_INODE_INDEX = 12, /* hash-indexed directory */ - EXT4_INODE_IMAGIC = 13, /* AFS directory */ - EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ - EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ - EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ - EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ - EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ - EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ - EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ - EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ - EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ -}; - -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) -#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ - printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ - EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } - -/* - * Since it's pretty easy to mix up bit numbers and hex values, and we - * can't do a compile-time test for ENUM values, we use a run-time - * test to make sure that EXT4_XXX_FL is consistent with respect to - * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop - * out so it won't cost any extra space in the compiled kernel image. - * But it's important that these values are the same, since we are - * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL - * must be consistent with the values of FS_XXX_FL defined in - * include/linux/fs.h and the on-disk values found in ext2, ext3, and - * ext4 filesystems, and of course the values defined in e2fsprogs. - * - * It's not paranoia if the Murphy's Law really *is* out to get you. 
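As a reader's aid: the run-time CHECK_FLAG_VALUE() dance exists because, as the comment says, a compile-time test for enum values was not available here. In C11 the same invariant can be stated statically; a standalone sketch with two flag/enum pairs copied from the definitions above:

    #include <stdio.h>

    #define EXT4_SYNC_FL    0x00000008
    #define EXT4_EXTENTS_FL 0x00080000

    enum { EXT4_INODE_SYNC = 3, EXT4_INODE_EXTENTS = 19 };

    /* fails the build, rather than BUG()ing at boot, on a mismatch */
    _Static_assert(EXT4_SYNC_FL    == (1 << EXT4_INODE_SYNC),    "SYNC mismatch");
    _Static_assert(EXT4_EXTENTS_FL == (1 << EXT4_INODE_EXTENTS), "EXTENTS mismatch");

    int main(void)
    {
        puts("flag values consistent");
        return 0;
    }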
:-) - */ -static inline void ext4_check_flag_values(void) -{ - CHECK_FLAG_VALUE(SECRM); - CHECK_FLAG_VALUE(UNRM); - CHECK_FLAG_VALUE(COMPR); - CHECK_FLAG_VALUE(SYNC); - CHECK_FLAG_VALUE(IMMUTABLE); - CHECK_FLAG_VALUE(APPEND); - CHECK_FLAG_VALUE(NODUMP); - CHECK_FLAG_VALUE(NOATIME); - CHECK_FLAG_VALUE(DIRTY); - CHECK_FLAG_VALUE(COMPRBLK); - CHECK_FLAG_VALUE(NOCOMPR); - CHECK_FLAG_VALUE(ECOMPR); - CHECK_FLAG_VALUE(INDEX); - CHECK_FLAG_VALUE(IMAGIC); - CHECK_FLAG_VALUE(JOURNAL_DATA); - CHECK_FLAG_VALUE(NOTAIL); - CHECK_FLAG_VALUE(DIRSYNC); - CHECK_FLAG_VALUE(TOPDIR); - CHECK_FLAG_VALUE(HUGE_FILE); - CHECK_FLAG_VALUE(EXTENTS); - CHECK_FLAG_VALUE(EA_INODE); - CHECK_FLAG_VALUE(EOFBLOCKS); - CHECK_FLAG_VALUE(RESERVED); -} - -/* Used to pass group descriptor data when online resize is done */ -struct ext4_new_group_input { - __u32 group; /* Group number for this data */ - __u64 block_bitmap; /* Absolute block number of block bitmap */ - __u64 inode_bitmap; /* Absolute block number of inode bitmap */ - __u64 inode_table; /* Absolute block number of inode table start */ - __u32 blocks_count; /* Total number of blocks in this group */ - __u16 reserved_blocks; /* Number of reserved blocks in this group */ - __u16 unused; -}; - -#if defined(__KERNEL__) && defined(CONFIG_COMPAT) -struct compat_ext4_new_group_input { - u32 group; - compat_u64 block_bitmap; - compat_u64 inode_bitmap; - compat_u64 inode_table; - u32 blocks_count; - u16 reserved_blocks; - u16 unused; -}; -#endif - -/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ -struct ext4_new_group_data { - __u32 group; - __u64 block_bitmap; - __u64 inode_bitmap; - __u64 inode_table; - __u32 blocks_count; - __u16 reserved_blocks; - __u16 unused; - __u32 free_blocks_count; -}; - -/* Indexes used to index group tables in ext4_new_group_data */ -enum { - BLOCK_BITMAP = 0, /* block bitmap */ - INODE_BITMAP, /* inode bitmap */ - INODE_TABLE, /* inode tables */ - GROUP_TABLE_COUNT, -}; - -/* - * Flags used by ext4_map_blocks() - */ - /* Allocate any needed blocks and/or convert an unitialized - extent to be an initialized ext4 */ -#define EXT4_GET_BLOCKS_CREATE 0x0001 - /* Request the creation of an unitialized extent */ -#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 -#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ - EXT4_GET_BLOCKS_CREATE) - /* Caller is from the delayed allocation writeout path, - so set the magic i_delalloc_reserve_flag after taking the - inode allocation semaphore for */ -#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 - /* caller is from the direct IO path, request to creation of an - unitialized extents if not allocated, split the uninitialized - extent if blocks has been preallocated already*/ -#define EXT4_GET_BLOCKS_PRE_IO 0x0008 -#define EXT4_GET_BLOCKS_CONVERT 0x0010 -#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Convert extent to initialized after IO complete */ -#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) - /* Punch out blocks of an extent */ -#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020 - /* Don't normalize allocation size (used for fallocate) */ -#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 - /* Request will not result in inode size update (user for fallocate) */ -#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 - -/* - * Flags used by ext4_free_blocks - */ -#define EXT4_FREE_BLOCKS_METADATA 0x0001 -#define EXT4_FREE_BLOCKS_FORGET 0x0002 -#define 
EXT4_FREE_BLOCKS_VALIDATED 0x0004 -#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 -#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 -#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 - -/* - * Flags used by ext4_discard_partial_page_buffers - */ -#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED 0x0001 - -/* - * ioctl commands - */ -#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS -#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS -#define EXT4_IOC_GETVERSION _IOR('f', 3, long) -#define EXT4_IOC_SETVERSION _IOW('f', 4, long) -#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION -#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION -#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) -#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) -#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) -#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) -#define EXT4_IOC_MIGRATE _IO('f', 9) - /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ - /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ -#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) -#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) -#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) - -#if defined(__KERNEL__) && defined(CONFIG_COMPAT) -/* - * ioctl commands in 32 bit emulation - */ -#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS -#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) -#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) -#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) -#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) -#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) -#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) -#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION -#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION -#endif - -/* Max physical block we can address w/o extents */ -#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF - -/* - * Structure of an inode on the disk - */ -struct ext4_inode { - __le16 i_mode; /* File mode */ - __le16 i_uid; /* Low 16 bits of Owner Uid */ - __le32 i_size_lo; /* Size in bytes */ - __le32 i_atime; /* Access time */ - __le32 i_ctime; /* Inode Change time */ - __le32 i_mtime; /* Modification time */ - __le32 i_dtime; /* Deletion Time */ - __le16 i_gid; /* Low 16 bits of Group Id */ - __le16 i_links_count; /* Links count */ - __le32 i_blocks_lo; /* Blocks count */ - __le32 i_flags; /* File flags */ - union { - struct { - __le32 l_i_version; - } linux1; - struct { - __u32 h_i_translator; - } hurd1; - struct { - __u32 m_i_reserved1; - } masix1; - } osd1; /* OS dependent 1 */ - __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ - __le32 i_generation; /* File version (for NFS) */ - __le32 i_file_acl_lo; /* File ACL */ - __le32 i_size_high; - __le32 i_obso_faddr; /* Obsoleted fragment address */ - union { - struct { - __le16 l_i_blocks_high; /* were l_i_reserved1 */ - __le16 l_i_file_acl_high; - __le16 l_i_uid_high; /* these 2 fields */ - __le16 l_i_gid_high; /* were reserved2[0] */ - __u32 l_i_reserved2; - } linux2; - struct { - __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ - __u16 h_i_mode_high; - __u16 h_i_uid_high; - __u16 h_i_gid_high; - __u32 h_i_author; - } hurd2; - struct { - __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ - __le16 m_i_file_acl_high; - __u32 m_i_reserved2[2]; - } masix2; - } osd2; /* OS dependent 2 */ - __le16 i_extra_isize; - __le16 i_pad1; - __le32 i_ctime_extra; /* extra Change time (nsec << 2 | 
epoch) */ - __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ - __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ - __le32 i_crtime; /* File Creation time */ - __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ - __le32 i_version_hi; /* high 32 bits for 64-bit version */ -}; - -struct move_extent { - __u32 reserved; /* should be zero */ - __u32 donor_fd; /* donor file descriptor */ - __u64 orig_start; /* logical start offset in block for orig */ - __u64 donor_start; /* logical start offset in block for donor */ - __u64 len; /* block length to be moved */ - __u64 moved_len; /* moved block length */ -}; - -#define EXT4_EPOCH_BITS 2 -#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) -#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) - -/* - * Extended fields will fit into an inode if the filesystem was formatted - * with large inodes (-I 256 or larger) and there are not currently any EAs - * consuming all of the available space. For new inodes we always reserve - * enough space for the kernel's known extended fields, but for inodes - * created with an old kernel this might not have been the case. None of - * the extended inode fields is critical for correct filesystem operation. - * This macro checks if a certain field fits in the inode. Note that - * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize - */ -#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ - ((offsetof(typeof(*ext4_inode), field) + \ - sizeof((ext4_inode)->field)) \ - <= (EXT4_GOOD_OLD_INODE_SIZE + \ - (einode)->i_extra_isize)) \ - -static inline __le32 ext4_encode_extra_time(struct timespec *time) -{ - return cpu_to_le32((sizeof(time->tv_sec) > 4 ? - (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | - ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); -} - -static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) -{ - if (sizeof(time->tv_sec) > 4) - time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) - << 32; - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; -} - -#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ -do { \ - (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(inode)->xtime); \ -} while (0) - -#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ -do { \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ - (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(einode)->xtime); \ -} while (0) - -#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ -do { \ - (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ - ext4_decode_extra_time(&(inode)->xtime, \ - raw_inode->xtime ## _extra); \ - else \ - (inode)->xtime.tv_nsec = 0; \ -} while (0) - -#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ -do { \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ - (einode)->xtime.tv_sec = \ - (signed)le32_to_cpu((raw_inode)->xtime); \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ - ext4_decode_extra_time(&(einode)->xtime, \ - raw_inode->xtime ## _extra); \ - else \ - (einode)->xtime.tv_nsec = 0; \ -} while (0) - -#define i_disk_version osd1.linux1.l_i_version - -#if defined(__KERNEL__) || defined(__linux__) 
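As a reader's aid, a userspace round trip through the epoch/nanosecond packing of ext4_encode_extra_time()/ext4_decode_extra_time() above (endianness conversion omitted): the low two bits of the extra word extend tv_sec past 2038, the remaining thirty carry tv_nsec:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EPOCH_BITS 2
    #define EPOCH_MASK ((1u << EPOCH_BITS) - 1)
    #define NSEC_MASK  (~0u << EPOCH_BITS)

    static uint32_t encode_extra(int64_t sec, long nsec)
    {
        return ((uint32_t)(sec >> 32) & EPOCH_MASK) |
               (((uint32_t)nsec << EPOCH_BITS) & NSEC_MASK);
    }

    static void decode_extra(uint32_t extra, int64_t *sec, long *nsec)
    {
        *sec |= (int64_t)(extra & EPOCH_MASK) << 32;  /* epoch bits extend tv_sec */
        *nsec = (long)((extra & NSEC_MASK) >> EPOCH_BITS);
    }

    int main(void)
    {
        int64_t sec = (1ll << 32) + 1234567;   /* a post-2038 timestamp */
        long nsec = 999999999;
        uint32_t extra = encode_extra(sec, nsec);

        int64_t sec2 = (uint32_t)sec;          /* low 32 bits live in i_mtime */
        long nsec2;

        decode_extra(extra, &sec2, &nsec2);
        assert(sec2 == sec && nsec2 == nsec);
        printf("extra word = 0x%08x\n", extra);
        return 0;
    }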
-#define i_reserved1 osd1.linux1.l_i_reserved1 -#define i_file_acl_high osd2.linux2.l_i_file_acl_high -#define i_blocks_high osd2.linux2.l_i_blocks_high -#define i_uid_low i_uid -#define i_gid_low i_gid -#define i_uid_high osd2.linux2.l_i_uid_high -#define i_gid_high osd2.linux2.l_i_gid_high -#define i_reserved2 osd2.linux2.l_i_reserved2 - -#elif defined(__GNU__) - -#define i_translator osd1.hurd1.h_i_translator -#define i_uid_high osd2.hurd2.h_i_uid_high -#define i_gid_high osd2.hurd2.h_i_gid_high -#define i_author osd2.hurd2.h_i_author - -#elif defined(__masix__) - -#define i_reserved1 osd1.masix1.m_i_reserved1 -#define i_file_acl_high osd2.masix2.m_i_file_acl_high -#define i_reserved2 osd2.masix2.m_i_reserved2 - -#endif /* defined(__KERNEL__) || defined(__linux__) */ - -/* - * storage for cached extent - * If ec_len == 0, then the cache is invalid. - * If ec_start == 0, then the cache represents a gap (null mapping) - */ -struct ext4_ext_cache { - ext4_fsblk_t ec_start; - ext4_lblk_t ec_block; - __u32 ec_len; /* must be 32bit to return holes */ -}; - -/* - * fourth extended file system inode data in memory - */ -struct ext4_inode_info { - __le32 i_data[15]; /* unconverted */ - __u32 i_dtime; - ext4_fsblk_t i_file_acl; - - /* - * i_block_group is the number of the block group which contains - * this file's inode. Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to - * place a file's data blocks near its inode block, and new inodes - * near to their parent directory's inode. - */ - ext4_group_t i_block_group; - ext4_lblk_t i_dir_start_lookup; -#if (BITS_PER_LONG < 64) - unsigned long i_state_flags; /* Dynamic state flags */ -#endif - unsigned long i_flags; - -#ifdef CONFIG_EXT4_FS_XATTR - /* - * Extended attributes can be read independently of the main file - * data. Taking i_mutex even when reading would cause contention - * between readers of EAs and writers of regular file data, so - * instead we synchronize on xattr_sem when reading or changing - * EAs. - */ - struct rw_semaphore xattr_sem; -#endif - - struct list_head i_orphan; /* unlinked but open inodes */ - - /* - * i_disksize keeps track of what the inode size is ON DISK, not - * in memory. During truncate, i_size is set to the new size by - * the VFS prior to calling ext4_truncate(), but the filesystem won't - * set i_disksize to 0 until the truncate is actually under way. - * - * The intent is that i_disksize always represents the blocks which - * are used by this file. This allows recovery to restart truncate - * on orphans if we crash during truncate. We actually write i_disksize - * into the on-disk inode when writing inodes out, instead of i_size. - * - * The only time when i_disksize and i_size may be different is when - * a truncate is in progress. The only things which change i_disksize - * are ext4_get_block (growth) and ext4_truncate (shrinkth). - */ - loff_t i_disksize; - - /* - * i_data_sem is for serialising ext4_truncate() against - * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's - * data tree are chopped off during truncate. We can't do that in - * ext4 because whenever we perform intermediate commits during - * truncate, the inode and all the metadata blocks *must* be in a - * consistent state which allows truncation of the orphans to restart - * during recovery. Hence we must fix the get_block-vs-truncate race - * by other means, so we have i_data_sem. 
- */ - struct rw_semaphore i_data_sem; - struct inode vfs_inode; - struct jbd2_inode *jinode; - - struct ext4_ext_cache i_cached_extent; - /* - * File creation time. Its function is same as that of - * struct timespec i_{a,c,m}time in the generic inode. - */ - struct timespec i_crtime; - - /* mballoc */ - struct list_head i_prealloc_list; - spinlock_t i_prealloc_lock; - - /* ialloc */ - ext4_group_t i_last_alloc_group; - - /* allocation reservation info for delalloc */ - /* In case of bigalloc, these refer to clusters rather than blocks */ - unsigned int i_reserved_data_blocks; - unsigned int i_reserved_meta_blocks; - unsigned int i_allocated_meta_blocks; - ext4_lblk_t i_da_metadata_calc_last_lblock; - int i_da_metadata_calc_len; - - /* on-disk additional length */ - __u16 i_extra_isize; - -#ifdef CONFIG_QUOTA - /* quota space reservation, managed internally by quota code */ - qsize_t i_reserved_quota; -#endif - - /* completed IOs that might need unwritten extents handling */ - struct list_head i_completed_io_list; - spinlock_t i_completed_io_lock; - atomic_t i_ioend_count; /* Number of outstanding io_end structs */ - /* current io_end structure for async DIO write*/ - ext4_io_end_t *cur_aio_dio; - atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */ - - spinlock_t i_block_reservation_lock; - - /* - * Transactions that contain inode's metadata needed to complete - * fsync and fdatasync, respectively. - */ - tid_t i_sync_tid; - tid_t i_datasync_tid; -}; - -/* - * File system states - */ -#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ -#define EXT4_ERROR_FS 0x0002 /* Errors detected */ -#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ - -/* - * Misc. filesystem flags - */ -#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ -#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ -#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ - -/* - * Mount flags - */ -#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ -#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ -#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ -#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ -#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ -#define EXT4_MOUNT_ERRORS_MASK 0x00070 -#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ -#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ -#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ -#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ -#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ -#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ -#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ -#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ -#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ -#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ -#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ -#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ -#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ -#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ -#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ -#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 
0x1000000 /* Journal Async Commit */ -#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ -#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ -#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ -#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ -#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ -#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ - -#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly - specified delalloc */ - -#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ - ~EXT4_MOUNT_##opt -#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ - EXT4_MOUNT_##opt -#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ - EXT4_MOUNT_##opt) - -#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ - ~EXT4_MOUNT2_##opt -#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ - EXT4_MOUNT2_##opt -#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ - EXT4_MOUNT2_##opt) - -#define ext4_test_and_set_bit __test_and_set_bit_le -#define ext4_set_bit __set_bit_le -#define ext4_set_bit_atomic ext2_set_bit_atomic -#define ext4_test_and_clear_bit __test_and_clear_bit_le -#define ext4_clear_bit __clear_bit_le -#define ext4_clear_bit_atomic ext2_clear_bit_atomic -#define ext4_test_bit test_bit_le -#define ext4_find_next_zero_bit find_next_zero_bit_le -#define ext4_find_next_bit find_next_bit_le - -extern void ext4_set_bits(void *bm, int cur, int len); - -/* - * Maximal mount counts between two filesystem checks - */ -#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ -#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ - -/* - * Behaviour when detecting errors - */ -#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ -#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ -#define EXT4_ERRORS_PANIC 3 /* Panic */ -#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE - -/* - * Structure of the super block - */ -struct ext4_super_block { -/*00*/ __le32 s_inodes_count; /* Inodes count */ - __le32 s_blocks_count_lo; /* Blocks count */ - __le32 s_r_blocks_count_lo; /* Reserved blocks count */ - __le32 s_free_blocks_count_lo; /* Free blocks count */ -/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ - __le32 s_first_data_block; /* First Data Block */ - __le32 s_log_block_size; /* Block size */ - __le32 s_log_cluster_size; /* Allocation cluster size */ -/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ - __le32 s_clusters_per_group; /* # Clusters per group */ - __le32 s_inodes_per_group; /* # Inodes per group */ - __le32 s_mtime; /* Mount time */ -/*30*/ __le32 s_wtime; /* Write time */ - __le16 s_mnt_count; /* Mount count */ - __le16 s_max_mnt_count; /* Maximal mount count */ - __le16 s_magic; /* Magic signature */ - __le16 s_state; /* File system state */ - __le16 s_errors; /* Behaviour when detecting errors */ - __le16 s_minor_rev_level; /* minor revision level */ -/*40*/ __le32 s_lastcheck; /* time of last check */ - __le32 s_checkinterval; /* max. time between checks */ - __le32 s_creator_os; /* OS */ - __le32 s_rev_level; /* Revision level */ -/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ - __le16 s_def_resgid; /* Default gid for reserved blocks */ - /* - * These fields are for EXT4_DYNAMIC_REV superblocks only. 
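As a reader's aid, the token-pasting trick behind set_opt()/clear_opt()/test_opt() above, reduced to a standalone sketch (a bare struct stands in for EXT4_SB(sb), and the two flag values are copied from the mount-flag list):

    #include <stdio.h>

    #define MNT_DELALLOC 0x8000000u
    #define MNT_DISCARD  0x40000000u

    struct sb_info { unsigned int s_mount_opt; };

    /* ## pastes the short option name onto the MNT_ prefix at compile time */
    #define set_opt(sbi, opt)   ((sbi)->s_mount_opt |=  MNT_##opt)
    #define clear_opt(sbi, opt) ((sbi)->s_mount_opt &= ~MNT_##opt)
    #define test_opt(sbi, opt)  ((sbi)->s_mount_opt &   MNT_##opt)

    int main(void)
    {
        struct sb_info sbi = { 0 };

        set_opt(&sbi, DELALLOC);   /* expands to sbi.s_mount_opt |= MNT_DELALLOC */
        printf("delalloc? %s\n", test_opt(&sbi, DELALLOC) ? "yes" : "no");
        clear_opt(&sbi, DELALLOC);
        printf("delalloc? %s\n", test_opt(&sbi, DELALLOC) ? "yes" : "no");
        return 0;
    }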
- * - * Note: the difference between the compatible feature set and - * the incompatible feature set is that if there is a bit set - * in the incompatible feature set that the kernel doesn't - * know about, it should refuse to mount the filesystem. - * - * e2fsck's requirements are more strict; if it doesn't know - * about a feature in either the compatible or incompatible - * feature set, it must abort and not try to meddle with - * things it doesn't understand... - */ - __le32 s_first_ino; /* First non-reserved inode */ - __le16 s_inode_size; /* size of inode structure */ - __le16 s_block_group_nr; /* block group # of this superblock */ - __le32 s_feature_compat; /* compatible feature set */ -/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ - __le32 s_feature_ro_compat; /* readonly-compatible feature set */ -/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ -/*78*/ char s_volume_name[16]; /* volume name */ -/*88*/ char s_last_mounted[64]; /* directory where last mounted */ -/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ - /* - * Performance hints. Directory preallocation should only - * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. - */ - __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ - __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ - __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ - /* - * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. - */ -/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ -/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ - __le32 s_journal_dev; /* device number of journal file */ - __le32 s_last_orphan; /* start of list of inodes to delete */ - __le32 s_hash_seed[4]; /* HTREE hash seed */ - __u8 s_def_hash_version; /* Default hash version to use */ - __u8 s_jnl_backup_type; - __le16 s_desc_size; /* size of group descriptor */ -/*100*/ __le32 s_default_mount_opts; - __le32 s_first_meta_bg; /* First metablock block group */ - __le32 s_mkfs_time; /* When the filesystem was created */ - __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ - /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ -/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ - __le32 s_r_blocks_count_hi; /* Reserved blocks count */ - __le32 s_free_blocks_count_hi; /* Free blocks count */ - __le16 s_min_extra_isize; /* All inodes have at least # bytes */ - __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ - __le32 s_flags; /* Miscellaneous flags */ - __le16 s_raid_stride; /* RAID stride */ - __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ - __le64 s_mmp_block; /* Block for multi-mount protection */ - __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ - __u8 s_log_groups_per_flex; /* FLEX_BG group size */ - __u8 s_reserved_char_pad; - __le16 s_reserved_pad; - __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ - __le32 s_snapshot_inum; /* Inode number of active snapshot */ - __le32 s_snapshot_id; /* sequential ID of active snapshot */ - __le64 s_snapshot_r_blocks_count; /* reserved blocks for active - snapshot's future use */ - __le32 s_snapshot_list; /* inode number of the head of the - on-disk snapshot list */ -#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) - __le32 s_error_count; /* number of fs errors */ - __le32 s_first_error_time; /* first time an error happened */ - __le32 s_first_error_ino; /* inode involved in first error */ - __le64 
s_first_error_block; /* block involved of first error */ - __u8 s_first_error_func[32]; /* function where the error happened */ - __le32 s_first_error_line; /* line number where error happened */ - __le32 s_last_error_time; /* most recent time of an error */ - __le32 s_last_error_ino; /* inode involved in last error */ - __le32 s_last_error_line; /* line number where error happened */ - __le64 s_last_error_block; /* block involved of last error */ - __u8 s_last_error_func[32]; /* function where the error happened */ -#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) - __u8 s_mount_opts[64]; - __le32 s_usr_quota_inum; /* inode for tracking user quota */ - __le32 s_grp_quota_inum; /* inode for tracking group quota */ - __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ - __le32 s_reserved[109]; /* Padding to the end of the block */ -}; - -#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) - -#ifdef __KERNEL__ - -/* - * run-time mount flags - */ -#define EXT4_MF_MNTDIR_SAMPLED 0x0001 -#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ - -/* - * fourth extended-fs super-block data in memory - */ -struct ext4_sb_info { - unsigned long s_desc_size; /* Size of a group descriptor in bytes */ - unsigned long s_inodes_per_block;/* Number of inodes per block */ - unsigned long s_blocks_per_group;/* Number of blocks in a group */ - unsigned long s_clusters_per_group; /* Number of clusters in a group */ - unsigned long s_inodes_per_group;/* Number of inodes in a group */ - unsigned long s_itb_per_group; /* Number of inode table blocks per group */ - unsigned long s_gdb_count; /* Number of group descriptor blocks */ - unsigned long s_desc_per_block; /* Number of group descriptors per block */ - ext4_group_t s_groups_count; /* Number of groups in the fs */ - ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ - unsigned long s_overhead_last; /* Last calculated overhead */ - unsigned long s_blocks_last; /* Last seen block count */ - unsigned int s_cluster_ratio; /* Number of blocks per cluster */ - unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ - loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ - struct buffer_head * s_sbh; /* Buffer containing the super block */ - struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ - struct buffer_head **s_group_desc; - unsigned int s_mount_opt; - unsigned int s_mount_opt2; - unsigned int s_mount_flags; - unsigned int s_def_mount_opt; - ext4_fsblk_t s_sb_block; - uid_t s_resuid; - gid_t s_resgid; - unsigned short s_mount_state; - unsigned short s_pad; - int s_addr_per_block_bits; - int s_desc_per_block_bits; - int s_inode_size; - int s_first_ino; - unsigned int s_inode_readahead_blks; - unsigned int s_inode_goal; - spinlock_t s_next_gen_lock; - u32 s_next_generation; - u32 s_hash_seed[4]; - int s_def_hash_version; - int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeclusters_counter; - struct percpu_counter s_freeinodes_counter; - struct percpu_counter s_dirs_counter; - struct percpu_counter s_dirtyclusters_counter; - struct blockgroup_lock *s_blockgroup_lock; - struct proc_dir_entry *s_proc; - struct kobject s_kobj; - struct completion s_kobj_unregister; - - /* Journaling */ - struct journal_s *s_journal; - struct list_head s_orphan; - struct mutex s_orphan_lock; - unsigned long s_resize_flags; /* Flags indicating if there - is a resizer */ - unsigned long s_commit_interval; - u32 s_max_batch_time; - u32 
s_min_batch_time; - struct block_device *journal_bdev; -#ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ -#endif - unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - struct rb_root system_blks; - -#ifdef EXTENTS_STATS - /* ext4 extents stats */ - unsigned long s_ext_min; - unsigned long s_ext_max; - unsigned long s_depth_max; - spinlock_t s_ext_stats_lock; - unsigned long s_ext_blocks; - unsigned long s_ext_extents; -#endif - - /* for buddy allocator */ - struct ext4_group_info ***s_group_info; - struct inode *s_buddy_cache; - spinlock_t s_md_lock; - unsigned short *s_mb_offsets; - unsigned int *s_mb_maxs; - - /* tunables */ - unsigned long s_stripe; - unsigned int s_mb_stream_request; - unsigned int s_mb_max_to_scan; - unsigned int s_mb_min_to_scan; - unsigned int s_mb_stats; - unsigned int s_mb_order2_reqs; - unsigned int s_mb_group_prealloc; - unsigned int s_max_writeback_mb_bump; - /* where last allocation was done - for stream allocation */ - unsigned long s_mb_last_group; - unsigned long s_mb_last_start; - - /* stats for buddy allocator */ - atomic_t s_bal_reqs; /* number of reqs with len > 1 */ - atomic_t s_bal_success; /* we found long enough chunks */ - atomic_t s_bal_allocated; /* in blocks */ - atomic_t s_bal_ex_scanned; /* total extents scanned */ - atomic_t s_bal_goals; /* goal hits */ - atomic_t s_bal_breaks; /* too long searches */ - atomic_t s_bal_2orders; /* 2^order hits */ - spinlock_t s_bal_lock; - unsigned long s_mb_buddies_generated; - unsigned long long s_mb_generation_time; - atomic_t s_mb_lost_chunks; - atomic_t s_mb_preallocated; - atomic_t s_mb_discarded; - atomic_t s_lock_busy; - - /* locality groups */ - struct ext4_locality_group __percpu *s_locality_groups; - - /* for write statistics */ - unsigned long s_sectors_written_start; - u64 s_kbytes_written; - - unsigned int s_log_groups_per_flex; - struct flex_groups *s_flex_groups; - - /* workqueue for dio unwritten */ - struct workqueue_struct *dio_unwritten_wq; - - /* timer for periodic error stats printing */ - struct timer_list s_err_report; - - /* Lazy inode table initialization info */ - struct ext4_li_request *s_li_request; - /* Wait multiplier for lazy initialization thread */ - unsigned int s_li_wait_mult; - - /* Kernel thread for multiple mount protection */ - struct task_struct *s_mmp_tsk; - - /* record the last minlen when FITRIM is called. */ - atomic_t s_last_trim_minblks; -}; - -static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} -static inline struct ext4_inode_info *EXT4_I(struct inode *inode) -{ - return container_of(inode, struct ext4_inode_info, vfs_inode); -} - -static inline struct timespec ext4_current_time(struct inode *inode) -{ - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? 
- current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -} - -static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) -{ - return ino == EXT4_ROOT_INO || - ino == EXT4_JOURNAL_INO || - ino == EXT4_RESIZE_INO || - (ino >= EXT4_FIRST_INO(sb) && - ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); -} - -static inline void ext4_set_io_unwritten_flag(struct inode *inode, - struct ext4_io_end *io_end) -{ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); - } -} - -/* - * Inode dynamic state flags - */ -enum { - EXT4_STATE_JDATA, /* journaled data exists */ - EXT4_STATE_NEW, /* inode is newly created */ - EXT4_STATE_XATTR, /* has in-inode xattrs */ - EXT4_STATE_NO_EXPAND, /* No space for expansion */ - EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ - EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ - EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ - EXT4_STATE_NEWENTRY, /* File just added to dir */ - EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ -}; - -#define EXT4_INODE_BIT_FNS(name, field, offset) \ -static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ -{ \ - return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ -} \ -static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ -{ \ - set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ -} \ -static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ -{ \ - clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ -} - -EXT4_INODE_BIT_FNS(flag, flags, 0) -#if (BITS_PER_LONG < 64) -EXT4_INODE_BIT_FNS(state, state_flags, 0) - -static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) -{ - (ei)->i_state_flags = 0; -} -#else -EXT4_INODE_BIT_FNS(state, flags, 32) - -static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) -{ - /* We depend on the fact that callers will set i_flags */ -} -#endif -#else -/* Assume that user mode programs are passing in an ext4fs superblock, not - * a kernel struct super_block. This will allow us to call the feature-test - * macros from user land. 
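As a reader's aid, a simplified, non-atomic userspace sketch of the EXT4_INODE_BIT_FNS() generator above (the kernel versions use atomic test_bit()/set_bit()/clear_bit()): one macro stamps out a test/set/clear trio, and the offset parameter lets on-disk flags and dynamic state share a single word, as they do here when BITS_PER_LONG >= 64. Assumes an LP64 target:

    #include <stdio.h>

    struct my_inode { unsigned long flags; };

    #define BIT_FNS(name, offset)                                  \
    static int  test_##name(struct my_inode *i, int bit)           \
    { return (i->flags >> ((bit) + (offset))) & 1; }               \
    static void set_##name(struct my_inode *i, int bit)            \
    { i->flags |= 1ul << ((bit) + (offset)); }                     \
    static void clear_##name(struct my_inode *i, int bit)          \
    { i->flags &= ~(1ul << ((bit) + (offset))); }

    BIT_FNS(flag, 0)    /* on-disk flags in bits 0..31 */
    BIT_FNS(state, 32)  /* dynamic state in bits 32..63 */

    int main(void)
    {
        struct my_inode inode = { 0 };

        set_flag(&inode, 19);   /* e.g. EXT4_INODE_EXTENTS */
        set_state(&inode, 1);   /* e.g. EXT4_STATE_NEW */
        printf("word = 0x%lx, extents? %d, new? %d\n",
               inode.flags, test_flag(&inode, 19), test_state(&inode, 1));
        clear_state(&inode, 1);
        printf("new after clear? %d\n", test_state(&inode, 1));
        return 0;
    }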
*/ -#define EXT4_SB(sb) (sb) -#endif - -#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime - -/* - * Codes for operating systems - */ -#define EXT4_OS_LINUX 0 -#define EXT4_OS_HURD 1 -#define EXT4_OS_MASIX 2 -#define EXT4_OS_FREEBSD 3 -#define EXT4_OS_LITES 4 - -/* - * Revision levels - */ -#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ -#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ - -#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV -#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV - -#define EXT4_GOOD_OLD_INODE_SIZE 128 - -/* - * Feature set definitions - */ - -#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) -#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) -#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ - ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) -#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) -#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) -#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) -#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) -#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) -#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ - EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) - -#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 -#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 -#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 -#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 -#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 -#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 - -#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 -#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 -#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 -#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 -#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 -#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 -#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 -#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 -#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 -#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 - -#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 -#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 -#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ -#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ -#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 -#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ -#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 -#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 -#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 -#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ -#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ -#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ -#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ -#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ - -#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR -#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ - EXT4_FEATURE_INCOMPAT_META_BG) -#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_BTREE_DIR) - -#define 
EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR -#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ - EXT4_FEATURE_INCOMPAT_RECOVER| \ - EXT4_FEATURE_INCOMPAT_META_BG) -#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_BTREE_DIR) - -#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR -#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ - EXT4_FEATURE_INCOMPAT_RECOVER| \ - EXT4_FEATURE_INCOMPAT_META_BG| \ - EXT4_FEATURE_INCOMPAT_EXTENTS| \ - EXT4_FEATURE_INCOMPAT_64BIT| \ - EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP) -#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ - EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ - EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ - EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ - EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ - EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ - EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ - EXT4_FEATURE_RO_COMPAT_BIGALLOC) - -/* - * Default values for user and/or group using reserved blocks - */ -#define EXT4_DEF_RESUID 0 -#define EXT4_DEF_RESGID 0 - -#define EXT4_DEF_INODE_READAHEAD_BLKS 32 - -/* - * Default mount options - */ -#define EXT4_DEFM_DEBUG 0x0001 -#define EXT4_DEFM_BSDGROUPS 0x0002 -#define EXT4_DEFM_XATTR_USER 0x0004 -#define EXT4_DEFM_ACL 0x0008 -#define EXT4_DEFM_UID16 0x0010 -#define EXT4_DEFM_JMODE 0x0060 -#define EXT4_DEFM_JMODE_DATA 0x0020 -#define EXT4_DEFM_JMODE_ORDERED 0x0040 -#define EXT4_DEFM_JMODE_WBACK 0x0060 -#define EXT4_DEFM_NOBARRIER 0x0100 -#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 -#define EXT4_DEFM_DISCARD 0x0400 -#define EXT4_DEFM_NODELALLOC 0x0800 - -/* - * Default journal batch times - */ -#define EXT4_DEF_MIN_BATCH_TIME 0 -#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ - -/* - * Minimum number of groups in a flexgroup before we separate out - * directories into the first block group of a flexgroup - */ -#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 - -/* - * Structure of a directory entry - */ -#define EXT4_NAME_LEN 255 - -struct ext4_dir_entry { - __le32 inode; /* Inode number */ - __le16 rec_len; /* Directory entry length */ - __le16 name_len; /* Name length */ - char name[EXT4_NAME_LEN]; /* File name */ -}; - -/* - * The new version of the directory entry. Since EXT4 structures are - * stored in intel byte order, and the name_len field could never be - * bigger than 255 chars, it's safe to reclaim the extra byte for the - * file_type field. - */ -struct ext4_dir_entry_2 { - __le32 inode; /* Inode number */ - __le16 rec_len; /* Directory entry length */ - __u8 name_len; /* Name length */ - __u8 file_type; - char name[EXT4_NAME_LEN]; /* File name */ -}; - -/* - * Ext4 directory file types. Only the low 3 bits are used. The - * other bits are reserved for now. - */ -#define EXT4_FT_UNKNOWN 0 -#define EXT4_FT_REG_FILE 1 -#define EXT4_FT_DIR 2 -#define EXT4_FT_CHRDEV 3 -#define EXT4_FT_BLKDEV 4 -#define EXT4_FT_FIFO 5 -#define EXT4_FT_SOCK 6 -#define EXT4_FT_SYMLINK 7 - -#define EXT4_FT_MAX 8 - -/* - * EXT4_DIR_PAD defines the directory entries boundaries - * - * NOTE: It must be a multiple of 4 - */ -#define EXT4_DIR_PAD 4 -#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) -#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ - ~EXT4_DIR_ROUND) -#define EXT4_MAX_REC_LEN ((1<<16)-1) - -/* - * If we ever get support for fs block sizes > page_size, we'll need - * to remove the #if statements in the next two functions... 
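- *
- * A worked example of the encoding implemented below: on a large-block
- * filesystem (e.g. 128KiB blocks) an in-core rec_len of 0x1100c is
- * stored on disk as (0x1100c & 65532) | ((0x1100c >> 16) & 3) = 0x100d,
- * and ext4_rec_len_from_disk() recovers
- * (0x100d & 65532) | ((0x100d & 3) << 16) = 0x1100c. A rec_len equal
- * to a 65536-byte blocksize cannot be represented in __le16 at all, so
- * it is stored as the reserved value EXT4_MAX_REC_LEN.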
- */ -static inline unsigned int -ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) -{ - unsigned len = le16_to_cpu(dlen); - -#if (PAGE_CACHE_SIZE >= 65536) - if (len == EXT4_MAX_REC_LEN || len == 0) - return blocksize; - return (len & 65532) | ((len & 3) << 16); -#else - return len; -#endif -} - -static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) -{ - if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) - BUG(); -#if (PAGE_CACHE_SIZE >= 65536) - if (len < 65536) - return cpu_to_le16(len); - if (len == blocksize) { - if (blocksize == 65536) - return cpu_to_le16(EXT4_MAX_REC_LEN); - else - return cpu_to_le16(0); - } - return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); -#else - return cpu_to_le16(len); -#endif -} - -/* - * Hash Tree Directory indexing - * (c) Daniel Phillips, 2001 - */ - -#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ - EXT4_FEATURE_COMPAT_DIR_INDEX) && \ - ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) -#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) -#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) - -/* Legal values for the dx_root hash_version field: */ - -#define DX_HASH_LEGACY 0 -#define DX_HASH_HALF_MD4 1 -#define DX_HASH_TEA 2 -#define DX_HASH_LEGACY_UNSIGNED 3 -#define DX_HASH_HALF_MD4_UNSIGNED 4 -#define DX_HASH_TEA_UNSIGNED 5 - -#ifdef __KERNEL__ - -/* hash info structure used by the directory hash */ -struct dx_hash_info -{ - u32 hash; - u32 minor_hash; - int hash_version; - u32 *seed; -}; - - -/* 32 and 64 bit signed EOF for dx directories */ -#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) -#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) - - -/* - * Control parameters used by ext4_htree_next_block - */ -#define HASH_NB_ALWAYS 1 - - -/* - * Describe an inode's exact location on disk and in memory - */ -struct ext4_iloc -{ - struct buffer_head *bh; - unsigned long offset; - ext4_group_t block_group; -}; - -static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) -{ - return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); -} - -/* - * This structure is stuffed into the struct file's private_data field - * for directories. It is where we put information so that we can do - * readdir operations in hash tree order. - */ -struct dir_private_info { - struct rb_root root; - struct rb_node *curr_node; - struct fname *extra_fname; - loff_t last_pos; - __u32 curr_hash; - __u32 curr_minor_hash; - __u32 next_hash; -}; - -/* calculate the first block number of the group */ -static inline ext4_fsblk_t -ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) -{ - return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); -} - -/* - * Special error return code only used by dx_probe() and its callers. - */ -#define ERR_BAD_DX_DIR -75000 - -void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); - -/* - * Timeout and state flag for lazy initialization inode thread. 
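- *
- * (How these are used, inferred from the defaults below: the lazyinit
- * thread multiplies the time the previous inode-table zeroing took by
- * EXT4_DEF_LI_WAIT_MULT (10) to pick its next wakeup, so
- * initialization consumes very roughly a tenth of the disk bandwidth
- * it could.)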
- */ -#define EXT4_DEF_LI_WAIT_MULT 10 -#define EXT4_DEF_LI_MAX_START_DELAY 5 -#define EXT4_LAZYINIT_QUIT 0x0001 -#define EXT4_LAZYINIT_RUNNING 0x0002 - -/* - * Lazy inode table initialization info - */ -struct ext4_lazy_init { - unsigned long li_state; - struct list_head li_request_list; - struct mutex li_list_mtx; -}; - -struct ext4_li_request { - struct super_block *lr_super; - struct ext4_sb_info *lr_sbi; - ext4_group_t lr_next_group; - struct list_head lr_request; - unsigned long lr_next_sched; - unsigned long lr_timeout; -}; - -struct ext4_features { - struct kobject f_kobj; - struct completion f_kobj_unregister; -}; - -/* - * This structure will be used for multiple mount protection. It will be - * written into the block number saved in the s_mmp_block field in the - * superblock. Programs that check MMP should assume that if - * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe - * to use the filesystem, regardless of how old the timestamp is. - */ -#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ -#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ -#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ -#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ - -struct mmp_struct { - __le32 mmp_magic; /* Magic number for MMP */ - __le32 mmp_seq; /* Sequence no. updated periodically */ - - /* - * mmp_time, mmp_nodename & mmp_bdevname are only used for information - * purposes and do not affect the correctness of the algorithm - */ - __le64 mmp_time; /* Time last updated */ - char mmp_nodename[64]; /* Node which last updated MMP block */ - char mmp_bdevname[32]; /* Bdev which last updated MMP block */ - - /* - * mmp_check_interval is used to verify if the MMP block has been - * updated on the block device. The value is updated based on the - * maximum time to write the MMP block during an update cycle. - */ - __le16 mmp_check_interval; - - __le16 mmp_pad1; - __le32 mmp_pad2[227]; -}; - -/* arguments passed to the mmp thread */ -struct mmpd_data { - struct buffer_head *bh; /* bh from initial read_mmp_block() */ - struct super_block *sb; /* super block of the fs */ -}; - -/* - * Check interval multiplier - * The MMP block is written every update interval and initially checked every - * update interval x the multiplier (the value is then adapted based on the - * write latency). The reason is that writes can be delayed under load and we - * don't want readers to incorrectly assume that the filesystem is no longer - * in use. - */ -#define EXT4_MMP_CHECK_MULT 2UL - -/* - * Minimum interval for MMP checking in seconds. - */ -#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL - -/* - * Maximum interval for MMP checking in seconds. - */ -#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL - -/* - * Function prototypes - */ - -/* - * Ok, these declarations are also in <linux/kernel.h> but none of the - * ext4 source programs needs to include it so they are duplicated here. 
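- *
- * (Presumably this also keeps the header usable in standalone builds;
- * NORET_TYPE expands to nothing and ATTRIB_NORET is just
- * __attribute__((noreturn)).)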
- */ -# define NORET_TYPE /**/ -# define ATTRIB_NORET __attribute__((noreturn)) -# define NORET_AND noreturn, - -/* bitmap.c */ -extern unsigned int ext4_count_free(struct buffer_head *, unsigned); - -/* balloc.c */ -extern unsigned int ext4_block_group(struct super_block *sb, - ext4_fsblk_t blocknr); -extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, - ext4_fsblk_t blocknr); -extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); -extern unsigned long ext4_bg_num_gdb(struct super_block *sb, - ext4_group_t group); -extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, - unsigned int flags, - unsigned long *count, - int *errp); -extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, - s64 nclusters, unsigned int flags); -extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); -extern void ext4_check_blocks_bitmap(struct super_block *); -extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, - ext4_group_t block_group, - struct buffer_head ** bh); -extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); - -extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, - ext4_group_t block_group); -extern int ext4_wait_block_bitmap(struct super_block *sb, - ext4_group_t block_group, - struct buffer_head *bh); -extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); -extern void ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -extern unsigned ext4_free_clusters_after_init(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp); -extern unsigned ext4_num_overhead_clusters(struct super_block *sb, - ext4_group_t block_group, - struct ext4_group_desc *gdp); -ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); - -/* dir.c */ -extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, - struct file *, - struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ - unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (offset))) -extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, - __u32 minor_hash, - struct ext4_dir_entry_2 *dirent); -extern void ext4_htree_free_dir_info(struct dir_private_info *p); - -/* fsync.c */ -extern int ext4_sync_file(struct file *, loff_t, loff_t, int); -extern int ext4_flush_completed_IO(struct inode *); - -/* hash.c */ -extern int ext4fs_dirhash(const char *name, int len, struct - dx_hash_info *hinfo); - -/* ialloc.c */ -extern struct inode *ext4_new_inode(handle_t *, struct inode *, umode_t, - const struct qstr *qstr, __u32 goal, - uid_t *owner); -extern void ext4_free_inode(handle_t *, struct inode *); -extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); -extern unsigned long ext4_count_free_inodes(struct super_block *); -extern unsigned long ext4_count_dirs(struct super_block *); -extern void ext4_check_inodes_bitmap(struct super_block *); -extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); -extern int ext4_init_inode_table(struct super_block *sb, - ext4_group_t group, int barrier); -extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); - -/* mballoc.c */ -extern long ext4_mb_stats; -extern long ext4_mb_max_to_scan; -extern int ext4_mb_init(struct super_block 
*, int); -extern int ext4_mb_release(struct super_block *); -extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, - struct ext4_allocation_request *, int *); -extern int ext4_mb_reserve_blocks(struct super_block *, int); -extern void ext4_discard_preallocations(struct inode *); -extern int __init ext4_init_mballoc(void); -extern void ext4_exit_mballoc(void); -extern void ext4_free_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t block, - unsigned long count, int flags); -extern int ext4_mb_add_groupinfo(struct super_block *sb, - ext4_group_t i, struct ext4_group_desc *desc); -extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count); -extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); - -/* inode.c */ -struct buffer_head *ext4_getblk(handle_t *, struct inode *, - ext4_lblk_t, int, int *); -struct buffer_head *ext4_bread(handle_t *, struct inode *, - ext4_lblk_t, int, int *); -int ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); - -extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern int ext4_write_inode(struct inode *, struct writeback_control *); -extern int ext4_setattr(struct dentry *, struct iattr *); -extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); -extern void ext4_evict_inode(struct inode *); -extern void ext4_clear_inode(struct inode *); -extern int ext4_sync_inode(handle_t *, struct inode *); -extern void ext4_dirty_inode(struct inode *, int); -extern int ext4_change_inode_journal_flag(struct inode *, int); -extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); -extern int ext4_can_truncate(struct inode *inode); -extern void ext4_truncate(struct inode *); -extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); -extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); -extern void ext4_set_inode_flags(struct inode *); -extern void ext4_get_inode_flags(struct ext4_inode_info *); -extern int ext4_alloc_da_blocks(struct inode *inode); -extern void ext4_set_aops(struct inode *inode); -extern int ext4_writepage_trans_blocks(struct inode *); -extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); -extern int ext4_discard_partial_page_buffers(handle_t *handle, - struct address_space *mapping, loff_t from, - loff_t length, int flags); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); -extern qsize_t *ext4_get_reserved_space(struct inode *inode); -extern void ext4_da_update_reserve_space(struct inode *inode, - int used, int quota_claim); - -/* indirect.c */ -extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, int flags); -extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs); -extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); -extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); -extern void ext4_ind_truncate(struct inode *inode); - -/* ioctl.c */ -extern long ext4_ioctl(struct file *, unsigned int, unsigned long); -extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); - -/* migrate.c */ -extern int ext4_ext_migrate(struct inode *); - -/* namei.c */ -extern int ext4_orphan_add(handle_t *, struct inode *); -extern int ext4_orphan_del(handle_t *, struct inode *); 
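-
-/*
- * Typical orphan-list usage, sketched from the callers (illustrative
- * pseudo-flow, not a declaration in this header):
- *
- *	handle = ext4_journal_start(inode, credits);
- *	ext4_orphan_add(handle, inode);    <- before shrinking i_size
- *	... truncate blocks ...
- *	ext4_orphan_del(handle, inode);    <- fully truncated, unhook
- *	ext4_journal_stop(handle);
- *
- * so that journal recovery after a crash can finish the truncate.
- */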
-extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, - __u32 start_minor_hash, __u32 *next_hash); - -/* resize.c */ -extern int ext4_group_add(struct super_block *sb, - struct ext4_new_group_data *input); -extern int ext4_group_extend(struct super_block *sb, - struct ext4_super_block *es, - ext4_fsblk_t n_blocks_count); -extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); - -/* super.c */ -extern void *ext4_kvmalloc(size_t size, gfp_t flags); -extern void *ext4_kvzalloc(size_t size, gfp_t flags); -extern void ext4_kvfree(void *ptr); -extern __printf(4, 5) -void __ext4_error(struct super_block *, const char *, unsigned int, - const char *, ...); -#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ - __LINE__, ## message) -extern __printf(5, 6) -void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, - const char *, ...); -extern __printf(5, 6) -void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, - const char *, ...); -extern void __ext4_std_error(struct super_block *, const char *, - unsigned int, int); -extern __printf(4, 5) -void __ext4_abort(struct super_block *, const char *, unsigned int, - const char *, ...); -#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ - __LINE__, ## message) -extern __printf(4, 5) -void __ext4_warning(struct super_block *, const char *, unsigned int, - const char *, ...); -#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ - __LINE__, ## message) -extern __printf(3, 4) -void ext4_msg(struct super_block *, const char *, const char *, ...); -extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, - const char *, unsigned int, const char *); -#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \ - __LINE__, msg) -extern __printf(7, 8) -void __ext4_grp_locked_error(const char *, unsigned int, - struct super_block *, ext4_group_t, - unsigned long, ext4_fsblk_t, - const char *, ...); -#define ext4_grp_locked_error(sb, grp, message...) 
\ - __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) -extern void ext4_update_dynamic_rev(struct super_block *sb); -extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, - __u32 compat); -extern int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat); -extern int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat); -extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, - struct ext4_group_desc *bg); -extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, - struct ext4_group_desc *bg); -extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, - struct ext4_group_desc *bg); -extern __u32 ext4_free_group_clusters(struct super_block *sb, - struct ext4_group_desc *bg); -extern __u32 ext4_free_inodes_count(struct super_block *sb, - struct ext4_group_desc *bg); -extern __u32 ext4_used_dirs_count(struct super_block *sb, - struct ext4_group_desc *bg); -extern __u32 ext4_itable_unused_count(struct super_block *sb, - struct ext4_group_desc *bg); -extern void ext4_block_bitmap_set(struct super_block *sb, - struct ext4_group_desc *bg, ext4_fsblk_t blk); -extern void ext4_inode_bitmap_set(struct super_block *sb, - struct ext4_group_desc *bg, ext4_fsblk_t blk); -extern void ext4_inode_table_set(struct super_block *sb, - struct ext4_group_desc *bg, ext4_fsblk_t blk); -extern void ext4_free_group_clusters_set(struct super_block *sb, - struct ext4_group_desc *bg, - __u32 count); -extern void ext4_free_inodes_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count); -extern void ext4_used_dirs_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count); -extern void ext4_itable_unused_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count); -extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); - -static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) -{ - return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | - le32_to_cpu(es->s_blocks_count_lo); -} - -static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) -{ - return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | - le32_to_cpu(es->s_r_blocks_count_lo); -} - -static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) -{ - return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | - le32_to_cpu(es->s_free_blocks_count_lo); -} - -static inline void ext4_blocks_count_set(struct ext4_super_block *es, - ext4_fsblk_t blk) -{ - es->s_blocks_count_lo = cpu_to_le32((u32)blk); - es->s_blocks_count_hi = cpu_to_le32(blk >> 32); -} - -static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, - ext4_fsblk_t blk) -{ - es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); - es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); -} - -static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, - ext4_fsblk_t blk) -{ - es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); - es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); -} - -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) -{ - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | - le32_to_cpu(raw_inode->i_size_lo); - else - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); -} - -static inline void 
ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) -{ - raw_inode->i_size_lo = cpu_to_le32(i_size); - raw_inode->i_size_high = cpu_to_le32(i_size >> 32); -} - -static inline -struct ext4_group_info *ext4_get_group_info(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info ***grp_info; - long indexv, indexh; - grp_info = EXT4_SB(sb)->s_group_info; - indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); - indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); - return grp_info[indexv][indexh]; -} - -/* - * Reading s_groups_count requires using smp_rmb() afterwards. See - * the locking protocol documented in the comments of ext4_group_add() - * in resize.c - */ -static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) -{ - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; - - smp_rmb(); - return ngroups; -} - -static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, - ext4_group_t block_group) -{ - return block_group >> sbi->s_log_groups_per_flex; -} - -static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) -{ - return 1 << sbi->s_log_groups_per_flex; -} - -#define ext4_std_error(sb, errno) \ -do { \ - if ((errno)) \ - __ext4_std_error((sb), __func__, __LINE__, (errno)); \ -} while (0) - -#ifdef CONFIG_SMP -/* Each CPU can accumulate percpu_counter_batch clusters in their local - * counters. So we need to make sure we have free clusters more - * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. - */ -#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) -#else -#define EXT4_FREECLUSTERS_WATERMARK 0 -#endif - -static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) -{ - /* - * XXX: replace with spinlock if seen contended -bzzz - */ - down_write(&EXT4_I(inode)->i_data_sem); - if (newsize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = newsize; - up_write(&EXT4_I(inode)->i_data_sem); - return ; -} - -struct ext4_group_info { - unsigned long bb_state; - struct rb_root bb_free_root; - ext4_grpblk_t bb_first_free; /* first free block */ - ext4_grpblk_t bb_free; /* total free blocks */ - ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ - ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ - struct list_head bb_prealloc_list; -#ifdef DOUBLE_CHECK - void *bb_bitmap; -#endif - struct rw_semaphore alloc_sem; - ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block - * regions, index is order. - * bb_counters[3] = 5 means - * 5 free 8-block regions. */ -}; - -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 - -#define EXT4_MB_GRP_NEED_INIT(grp) \ - (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) - -#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ - (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) -#define EXT4_MB_GRP_SET_TRIMMED(grp) \ - (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) -#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ - (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) - -#define EXT4_MAX_CONTENTION 8 -#define EXT4_CONTENTION_THRESHOLD 2 - -static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, - ext4_group_t group) -{ - return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); -} - -/* - * Returns true if the filesystem is busy enough that attempts to - * access the block group locks has run into contention. 
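- *
- * (A sketch of the mechanism, from ext4_lock_group() below: each
- * contended lock acquisition bumps s_lock_busy toward
- * EXT4_MAX_CONTENTION and each uncontended one decays it toward zero,
- * so s_lock_busy is a saturating estimate of recent contention, and
- * "busy" simply means it exceeds EXT4_CONTENTION_THRESHOLD.)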
- */ -static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) -{ - return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); -} - -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) -{ - spinlock_t *lock = ext4_group_lock_ptr(sb, group); - if (spin_trylock(lock)) - /* - * We're able to grab the lock right away, so drop the - * lock contention counter. - */ - atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); - else { - /* - * The lock is busy, so bump the contention counter, - * and then wait on the spin lock. - */ - atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, - EXT4_MAX_CONTENTION); - spin_lock(lock); - } -} - -static inline void ext4_unlock_group(struct super_block *sb, - ext4_group_t group) -{ - spin_unlock(ext4_group_lock_ptr(sb, group)); -} - -static inline void ext4_mark_super_dirty(struct super_block *sb) -{ - if (EXT4_SB(sb)->s_journal == NULL) - sb->s_dirt =1; -} - -/* - * Block validity checking - */ -#define ext4_check_indirect_blockref(inode, bh) \ - ext4_check_blockref(__func__, __LINE__, inode, \ - (__le32 *)(bh)->b_data, \ - EXT4_ADDR_PER_BLOCK((inode)->i_sb)) - -#define ext4_ind_check_inode(inode) \ - ext4_check_blockref(__func__, __LINE__, inode, \ - EXT4_I(inode)->i_data, \ - EXT4_NDIR_BLOCKS) - -/* - * Inodes and files operations - */ - -/* dir.c */ -extern const struct file_operations ext4_dir_operations; - -/* file.c */ -extern const struct inode_operations ext4_file_inode_operations; -extern const struct file_operations ext4_file_operations; -extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); - -/* namei.c */ -extern const struct inode_operations ext4_dir_inode_operations; -extern const struct inode_operations ext4_special_inode_operations; -extern struct dentry *ext4_get_parent(struct dentry *child); - -/* symlink.c */ -extern const struct inode_operations ext4_symlink_inode_operations; -extern const struct inode_operations ext4_fast_symlink_inode_operations; - -/* block_validity */ -extern void ext4_release_system_zone(struct super_block *sb); -extern int ext4_setup_system_zone(struct super_block *sb); -extern int __init ext4_init_system_zone(void); -extern void ext4_exit_system_zone(void); -extern int ext4_data_block_valid(struct ext4_sb_info *sbi, - ext4_fsblk_t start_blk, - unsigned int count); -extern int ext4_check_blockref(const char *, unsigned int, - struct inode *, __le32 *, unsigned int); - -/* extents.c */ -extern int ext4_ext_tree_init(handle_t *handle, struct inode *); -extern int ext4_ext_writepage_trans_blocks(struct inode *, int); -extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, - int chunk); -extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(struct inode *); -extern int ext4_ext_punch_hole(struct file *file, loff_t offset, - loff_t length); -extern void ext4_ext_init(struct super_block *); -extern void ext4_ext_release(struct super_block *); -extern long ext4_fallocate(struct file *file, int mode, loff_t offset, - loff_t len); -extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - ssize_t len); -extern int ext4_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, int flags); -extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len); -/* move_extent.c */ -extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, - __u64 start_orig, __u64 
start_donor, - __u64 len, __u64 *moved_len); - -/* page-io.c */ -extern int __init ext4_init_pageio(void); -extern void ext4_exit_pageio(void); -extern void ext4_ioend_wait(struct inode *); -extern void ext4_free_io_end(ext4_io_end_t *io); -extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); -extern int ext4_end_io_nolock(ext4_io_end_t *io); -extern void ext4_io_submit(struct ext4_io_submit *io); -extern int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len, - struct writeback_control *wbc); - -/* mmp.c */ -extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); - -/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ -enum ext4_state_bits { - BH_Uninit /* blocks are allocated but uninitialized on disk */ - = BH_JBDPrivateStart, - BH_AllocFromCluster, /* allocated blocks were part of already - * allocated cluster. Note that this flag will - * never, ever appear in a buffer_head's state - * flag. See EXT4_MAP_FROM_CLUSTER to see where - * this is used. */ - BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This - * flag is set when ext4_map_blocks is called on a - * delayed allocated block to get its real mapping. */ -}; - -BUFFER_FNS(Uninit, uninit) -TAS_BUFFER_FNS(Uninit, uninit) -BUFFER_FNS(Da_Mapped, da_mapped) - -/* - * Add new method to test wether block and inode bitmaps are properly - * initialized. With uninit_bg reading the block from disk is not enough - * to mark the bitmap uptodate. We need to also zero-out the bitmap - */ -#define BH_BITMAP_UPTODATE BH_JBDPrivateStart - -static inline int bitmap_uptodate(struct buffer_head *bh) -{ - return (buffer_uptodate(bh) && - test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); -} -static inline void set_bitmap_uptodate(struct buffer_head *bh) -{ - set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); -} - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - -/* For ioend & aio unwritten conversion wait queues */ -#define EXT4_WQ_HASH_SZ 37 -#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ - EXT4_WQ_HASH_SZ]) -#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\ - EXT4_WQ_HASH_SZ]) -extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; -extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; - -#define EXT4_RESIZING 0 -extern int ext4_resize_begin(struct super_block *sb); -extern void ext4_resize_end(struct super_block *sb); - -#endif /* __KERNEL__ */ - -#include "ext4_extents.h" - -#endif /* _EXT4_H */ diff --git a/ANDROID_3.4.5/fs/ext4/ext4_extents.h b/ANDROID_3.4.5/fs/ext4/ext4_extents.h deleted file mode 100644 index 0f58b86e..00000000 --- a/ANDROID_3.4.5/fs/ext4/ext4_extents.h +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com - * Written by Alex Tomas <alex@clusterfs.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public Licens - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- - */ - -#ifndef _EXT4_EXTENTS -#define _EXT4_EXTENTS - -#include "ext4.h" - -/* - * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks - * becomes very small, so index split, in-depth growing and - * other hard changes happen much more often. - * This is for debug purposes only. - */ -#define AGGRESSIVE_TEST_ - -/* - * With EXTENTS_STATS defined, the number of blocks and extents - * are collected in the truncate path. They'll be shown at - * umount time. - */ -#define EXTENTS_STATS__ - -/* - * If CHECK_BINSEARCH is defined, then the results of the binary search - * will also be checked by linear search. - */ -#define CHECK_BINSEARCH__ - -/* - * Turn on EXT_DEBUG to get lots of info about extents operations. - */ -#define EXT_DEBUG__ -#ifdef EXT_DEBUG -#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) -#else -#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif - -/* - * If EXT_STATS is defined then stats numbers are collected. - * These number will be displayed at umount time. - */ -#define EXT_STATS_ - - -/* - * ext4_inode has i_block array (60 bytes total). - * The first 12 bytes store ext4_extent_header; - * the remainder stores an array of ext4_extent. - */ - -/* - * This is the extent on-disk structure. - * It's used at the bottom of the tree. - */ -struct ext4_extent { - __le32 ee_block; /* first logical block extent covers */ - __le16 ee_len; /* number of blocks covered by extent */ - __le16 ee_start_hi; /* high 16 bits of physical block */ - __le32 ee_start_lo; /* low 32 bits of physical block */ -}; - -/* - * This is index on-disk structure. - * It's used at all the levels except the bottom. - */ -struct ext4_extent_idx { - __le32 ei_block; /* index covers logical blocks from 'block' */ - __le32 ei_leaf_lo; /* pointer to the physical block of the next * - * level. leaf or next index could be there */ - __le16 ei_leaf_hi; /* high 16 bits of physical block */ - __u16 ei_unused; -}; - -/* - * Each block (leaves and indexes), even inode-stored has header. - */ -struct ext4_extent_header { - __le16 eh_magic; /* probably will support different formats */ - __le16 eh_entries; /* number of valid entries */ - __le16 eh_max; /* capacity of store in entries */ - __le16 eh_depth; /* has tree real underlying blocks? */ - __le32 eh_generation; /* generation of the tree */ -}; - -#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) - -/* - * Array of ext4_ext_path contains path to some extent. - * Creation/lookup routines use it for traversal/splitting/etc. - * Truncate uses it to simulate recursive walking. - */ -struct ext4_ext_path { - ext4_fsblk_t p_block; - __u16 p_depth; - struct ext4_extent *p_ext; - struct ext4_extent_idx *p_idx; - struct ext4_extent_header *p_hdr; - struct buffer_head *p_bh; -}; - -/* - * structure for external API - */ - -/* - * to be called by ext4_ext_walk_space() - * negative retcode - error - * positive retcode - signal for ext4_ext_walk_space(), see below - * callback must return valid extent (passed or newly created) - */ -typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, - struct ext4_ext_cache *, - struct ext4_extent *, void *); - -#define EXT_CONTINUE 0 -#define EXT_BREAK 1 -#define EXT_REPEAT 2 - -/* - * Maximum number of logical blocks in a file; ext4_extent's ee_block is - * __le32. 
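- *
- * For example, with 4KiB blocks this caps a file's logical address
- * space at 2^32 blocks * 4KiB = 16TiB.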
- */ -#define EXT_MAX_BLOCKS 0xffffffff - -/* - * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an - * initialized extent. This is 2^15 and not (2^16 - 1), since we use the - * MSB of ee_len field in the extent datastructure to signify if this - * particular extent is an initialized extent or an uninitialized (i.e. - * preallocated). - * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an - * uninitialized extent. - * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an - * uninitialized one. In other words, if MSB of ee_len is set, it is an - * uninitialized extent with only one special scenario when ee_len = 0x8000. - * In this case we can not have an uninitialized extent of zero length and - * thus we make it as a special case of initialized extent with 0x8000 length. - * This way we get better extent-to-group alignment for initialized extents. - * Hence, the maximum number of blocks we can have in an *initialized* - * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). - */ -#define EXT_INIT_MAX_LEN (1UL << 15) -#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) - - -#define EXT_FIRST_EXTENT(__hdr__) \ - ((struct ext4_extent *) (((char *) (__hdr__)) + \ - sizeof(struct ext4_extent_header))) -#define EXT_FIRST_INDEX(__hdr__) \ - ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ - sizeof(struct ext4_extent_header))) -#define EXT_HAS_FREE_INDEX(__path__) \ - (le16_to_cpu((__path__)->p_hdr->eh_entries) \ - < le16_to_cpu((__path__)->p_hdr->eh_max)) -#define EXT_LAST_EXTENT(__hdr__) \ - (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) -#define EXT_LAST_INDEX(__hdr__) \ - (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) -#define EXT_MAX_EXTENT(__hdr__) \ - (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) -#define EXT_MAX_INDEX(__hdr__) \ - (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) - -static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) -{ - return (struct ext4_extent_header *) EXT4_I(inode)->i_data; -} - -static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) -{ - return (struct ext4_extent_header *) bh->b_data; -} - -static inline unsigned short ext_depth(struct inode *inode) -{ - return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); -} - -static inline void -ext4_ext_invalidate_cache(struct inode *inode) -{ - EXT4_I(inode)->i_cached_extent.ec_len = 0; -} - -static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) -{ - /* We can not have an uninitialized extent of zero length! */ - BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); - ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); -} - -static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) -{ - /* Extent with ee_len of 0x8000 is treated as an initialized extent */ - return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); -} - -static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) -{ - return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? 
- le16_to_cpu(ext->ee_len) : - (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); -} - -static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) -{ - ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); -} - -/* - * ext4_ext_pblock: - * combine low and high parts of physical block number into ext4_fsblk_t - */ -static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) -{ - ext4_fsblk_t block; - - block = le32_to_cpu(ex->ee_start_lo); - block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; - return block; -} - -/* - * ext4_idx_pblock: - * combine low and high parts of a leaf physical block number into ext4_fsblk_t - */ -static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) -{ - ext4_fsblk_t block; - - block = le32_to_cpu(ix->ei_leaf_lo); - block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; - return block; -} - -/* - * ext4_ext_store_pblock: - * stores a large physical block number into an extent struct, - * breaking it into parts - */ -static inline void ext4_ext_store_pblock(struct ext4_extent *ex, - ext4_fsblk_t pb) -{ - ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); - ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & - 0xffff); -} - -/* - * ext4_idx_store_pblock: - * stores a large physical block number into an index struct, - * breaking it into parts - */ -static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, - ext4_fsblk_t pb) -{ - ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); - ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & - 0xffff); -} - -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); -extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, - int num, - struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); -extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *); -extern void ext4_ext_drop_refs(struct ext4_ext_path *); -extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, - int search_hint_reverse); -#endif /* _EXT4_EXTENTS */ - diff --git a/ANDROID_3.4.5/fs/ext4/ext4_jbd2.c b/ANDROID_3.4.5/fs/ext4/ext4_jbd2.c deleted file mode 100644 index aca17901..00000000 --- a/ANDROID_3.4.5/fs/ext4/ext4_jbd2.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Interface between ext4 and JBD - */ - -#include "ext4_jbd2.h" - -#include <trace/events/ext4.h> - -int __ext4_journal_get_write_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh) -{ - int err = 0; - - if (ext4_handle_valid(handle)) { - err = jbd2_journal_get_write_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, bh, - handle, err); - } - return err; -} - -/* - * The ext4 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. - * - * "bh" may be NULL: a metadata block may have been freed from memory - * but there may still be a record of it in the journal, and that record - * still needs to be revoked. 
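- *
- * (A concrete failure the revoke prevents, as a sketch: a journaled
- * data block is freed and reallocated as metadata; if we crash before
- * the checkpoint, journal replay would copy the stale journaled data
- * over the new metadata unless a revoke record suppresses that
- * replay.)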
- * - * If the handle isn't valid we're not journaling, but we still need to - * call into ext4_journal_revoke() to put the buffer head. - */ -int __ext4_forget(const char *where, unsigned int line, handle_t *handle, - int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr) -{ - int err; - - might_sleep(); - - trace_ext4_forget(inode, is_metadata, blocknr); - BUFFER_TRACE(bh, "enter"); - - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %x\n", - bh, is_metadata, inode->i_mode, - test_opt(inode->i_sb, DATA_FLAGS)); - - /* In the no journal case, we can just do a bforget and return */ - if (!ext4_handle_valid(handle)) { - bforget(bh); - return 0; - } - - /* Never use the revoke function if we are doing full data - * journaling: there is no need to, and a V1 superblock won't - * support it. Otherwise, only skip the revoke on un-journaled - * data blocks. */ - - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (!is_metadata && !ext4_should_journal_data(inode))) { - if (bh) { - BUFFER_TRACE(bh, "call jbd2_journal_forget"); - err = jbd2_journal_forget(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); - return err; - } - return 0; - } - - /* - * data!=journal && (is_metadata || should_journal_data(inode)) - */ - BUFFER_TRACE(bh, "call jbd2_journal_revoke"); - err = jbd2_journal_revoke(handle, blocknr, bh); - if (err) { - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); - __ext4_abort(inode->i_sb, where, line, - "error %d when attempting revoke", err); - } - BUFFER_TRACE(bh, "exit"); - return err; -} - -int __ext4_journal_get_create_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh) -{ - int err = 0; - - if (ext4_handle_valid(handle)) { - err = jbd2_journal_get_create_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); - } - return err; -} - -int __ext4_handle_dirty_metadata(const char *where, unsigned int line, - handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - int err = 0; - - if (ext4_handle_valid(handle)) { - err = jbd2_journal_dirty_metadata(handle, bh); - if (err) { - /* Errors can only happen if there is a bug */ - handle->h_err = err; - __ext4_journal_stop(where, line, handle); - } - } else { - if (inode) - mark_buffer_dirty_inode(bh, inode); - else - mark_buffer_dirty(bh); - if (inode && inode_needs_sync(inode)) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - struct ext4_super_block *es; - - es = EXT4_SB(inode->i_sb)->s_es; - es->s_last_error_block = - cpu_to_le64(bh->b_blocknr); - ext4_error_inode(inode, where, line, - bh->b_blocknr, - "IO error syncing itable block"); - err = -EIO; - } - } - } - return err; -} - -int __ext4_handle_dirty_super(const char *where, unsigned int line, - handle_t *handle, struct super_block *sb) -{ - struct buffer_head *bh = EXT4_SB(sb)->s_sbh; - int err = 0; - - if (ext4_handle_valid(handle)) { - err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); - } else - sb->s_dirt = 1; - return err; -} diff --git a/ANDROID_3.4.5/fs/ext4/ext4_jbd2.h b/ANDROID_3.4.5/fs/ext4/ext4_jbd2.h deleted file mode 100644 index 83b20fcf..00000000 --- a/ANDROID_3.4.5/fs/ext4/ext4_jbd2.h +++ /dev/null @@ -1,399 +0,0 @@ -/* - * ext4_jbd2.h - * - * Written by Stephen C. 
Tweedie <sct@redhat.com>, 1999 - * - * Copyright 1998--1999 Red Hat corp --- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - * - * Ext4-specific journaling extensions. - */ - -#ifndef _EXT4_JBD2_H -#define _EXT4_JBD2_H - -#include <linux/fs.h> -#include <linux/jbd2.h> -#include "ext4.h" - -#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) - -/* Define the number of blocks we need to account to a transaction to - * modify one block of data. - * - * We may have to touch one inode, one bitmap buffer, up to three - * indirection blocks, the group and superblock summaries, and the data - * block to complete the transaction. - * - * For extents-enabled fs we may have to allocate and modify up to - * 5 levels of tree + root which are stored in the inode. */ - -#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ - (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - ? 27U : 8U) - -/* Extended attribute operations touch at most two data buffers, - * two bitmap buffers, and two group summaries, in addition to the inode - * and the superblock, which are already accounted for. */ - -#define EXT4_XATTR_TRANS_BLOCKS 6U - -/* Define the minimum size for a transaction which modifies data. This - * needs to take into account the fact that we may end up modifying two - * quota files too (one for the group, one for the user quota). The - * superblock only gets updated once, of course, so don't bother - * counting that again for the quota updates. */ - -#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ - EXT4_XATTR_TRANS_BLOCKS - 2 + \ - EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) - -/* - * Define the number of metadata blocks we need to account to modify data. - * - * This include super block, inode block, quota blocks and xattr blocks - */ -#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ - EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) - -/* Delete operations potentially hit one directory's namespace plus an - * entire inode, plus arbitrary amounts of bitmap/indirection data. Be - * generous. We can grow the delete transaction later if necessary. */ - -#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) - -/* Define an arbitrary limit for the amount of data we will anticipate - * writing to any given transaction. For unbounded transactions such as - * write(2) and truncate(2) we can write more than this, but we always - * start off at the maximum transaction size and grow the transaction - * optimistically as we go. */ - -#define EXT4_MAX_TRANS_DATA 64U - -/* We break up a large truncate or write transaction once the handle's - * buffer credits gets this low, we need either to extend the - * transaction or to start a new one. Reserve enough space here for - * inode, bitmap, superblock, group and indirection updates for at least - * one block, plus two quota updates. Quota allocations are not - * needed. */ - -#define EXT4_RESERVE_TRANS_BLOCKS 12U - -#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 - -#ifdef CONFIG_QUOTA -/* Amount of blocks needed for quota update - we know that the structure was - * allocated so we need to update only data block */ -#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 
1 : 0)
-/* Amount of blocks needed for quota insert/delete - we do some block writes
- * but inode, sb and group updates are done only once */
-#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
-	(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
-
-#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
-	(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
-#else
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
-#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
-#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
-#endif
-#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
-#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
-#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
-
-/**
- * struct ext4_journal_cb_entry - Base structure for callback information.
- *
- * This struct is a 'seed' structure for use with your own callback
- * structs. If you are using callbacks you must allocate one of these
- * or another struct of your own definition which has this struct
- * as its first element and pass it to ext4_journal_callback_add().
- */
-struct ext4_journal_cb_entry {
-	/* list information for other callbacks attached to the same handle */
-	struct list_head jce_list;
-
-	/* Function to call with this callback structure */
-	void (*jce_func)(struct super_block *sb,
-			 struct ext4_journal_cb_entry *jce, int error);
-
-	/* user data goes here */
-};
-
-/**
- * ext4_journal_callback_add: add a function to call after transaction commit
- * @handle: active journal transaction handle to register callback on
- * @func: callback function to call after the transaction has committed
- * @sb: superblock of current filesystem for transaction
- * @rc: journal state at commit (0 = transaction committed properly)
- * @jce: journal callback data (internal and function private data struct)
- *
- * The registered function will be called in the context of the journal thread
- * after the transaction for which the handle was created has completed.
- *
- * No locks are held when the callback function is called, so it is safe to
- * call blocking functions from within the callback, but the callback should
- * not block or run for too long, or the filesystem will be blocked waiting for
- * the next transaction to commit. No journaling functions can be used, or
- * there is a risk of deadlock.
- *
- * There is no guaranteed calling order of multiple registered callbacks on
- * the same transaction.
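- *
- * A minimal usage sketch (the struct and function names here are
- * illustrative, not part of this header):
- *
- *	struct my_commit_cb {
- *		struct ext4_journal_cb_entry jce;   <- must be first
- *		ext4_fsblk_t block;
- *	};
- *
- *	static void my_done(struct super_block *sb,
- *			    struct ext4_journal_cb_entry *jce, int rc)
- *	{
- *		struct my_commit_cb *cb =
- *			container_of(jce, struct my_commit_cb, jce);
- *		... runs in the journal thread after commit ...
- *	}
- *
- *	ext4_journal_callback_add(handle, my_done, &cb->jce);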
- */ -static inline void ext4_journal_callback_add(handle_t *handle, - void (*func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc), - struct ext4_journal_cb_entry *jce) -{ - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - /* Add the jce to transaction's private list */ - jce->jce_func = func; - spin_lock(&sbi->s_md_lock); - list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); - spin_unlock(&sbi->s_md_lock); -} - -/** - * ext4_journal_callback_del: delete a registered callback - * @handle: active journal transaction handle on which callback was registered - * @jce: registered journal callback entry to unregister - */ -static inline void ext4_journal_callback_del(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - spin_lock(&sbi->s_md_lock); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); -} - -int -ext4_mark_iloc_dirty(handle_t *handle, - struct inode *inode, - struct ext4_iloc *iloc); - -/* - * On success, We end up with an outstanding reference count against - * iloc->bh. This _must_ be cleaned up later. - */ - -int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, - struct ext4_iloc *iloc); - -int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); - -/* - * Wrapper functions with which ext4 calls into JBD. - */ -void ext4_journal_abort_handle(const char *caller, unsigned int line, - const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err); - -int __ext4_journal_get_write_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh); - -int __ext4_forget(const char *where, unsigned int line, handle_t *handle, - int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr); - -int __ext4_journal_get_create_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh); - -int __ext4_handle_dirty_metadata(const char *where, unsigned int line, - handle_t *handle, struct inode *inode, - struct buffer_head *bh); - -int __ext4_handle_dirty_super(const char *where, unsigned int line, - handle_t *handle, struct super_block *sb); - -#define ext4_journal_get_write_access(handle, bh) \ - __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) -#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ - __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ - (bh), (block_nr)) -#define ext4_journal_get_create_access(handle, bh) \ - __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) -#define ext4_handle_dirty_metadata(handle, inode, bh) \ - __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ - (bh)) -#define ext4_handle_dirty_super(handle, sb) \ - __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) - -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); -int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); - -#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) - -/* Note: Do not use this for NULL handles. This is only to determine if - * a properly allocated handle is using a journal or not. 
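- *
- * (Background, inferred from EXT4_NOJOURNAL_MAX_REF_COUNT above: in
- * no-journal mode ext4 hands out small integer reference counts cast
- * to handle_t *, so any "handle" whose value is below 4096 cannot be
- * a real journal handle.)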
*/ -static inline int ext4_handle_valid(handle_t *handle) -{ - if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) - return 0; - return 1; -} - -static inline void ext4_handle_sync(handle_t *handle) -{ - if (ext4_handle_valid(handle)) - handle->h_sync = 1; -} - -static inline void ext4_handle_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - if (ext4_handle_valid(handle)) - jbd2_journal_release_buffer(handle, bh); -} - -static inline int ext4_handle_is_aborted(handle_t *handle) -{ - if (ext4_handle_valid(handle)) - return is_handle_aborted(handle); - return 0; -} - -static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) -{ - if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) - return 0; - return 1; -} - -static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) -{ - return ext4_journal_start_sb(inode->i_sb, nblocks); -} - -#define ext4_journal_stop(handle) \ - __ext4_journal_stop(__func__, __LINE__, (handle)) - -static inline handle_t *ext4_journal_current_handle(void) -{ - return journal_current_handle(); -} - -static inline int ext4_journal_extend(handle_t *handle, int nblocks) -{ - if (ext4_handle_valid(handle)) - return jbd2_journal_extend(handle, nblocks); - return 0; -} - -static inline int ext4_journal_restart(handle_t *handle, int nblocks) -{ - if (ext4_handle_valid(handle)) - return jbd2_journal_restart(handle, nblocks); - return 0; -} - -static inline int ext4_journal_blocks_per_page(struct inode *inode) -{ - if (EXT4_JOURNAL(inode) != NULL) - return jbd2_journal_blocks_per_page(inode); - return 0; -} - -static inline int ext4_journal_force_commit(journal_t *journal) -{ - if (journal) - return jbd2_journal_force_commit(journal); - return 0; -} - -static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) -{ - if (ext4_handle_valid(handle)) - return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); - return 0; -} - -static inline void ext4_update_inode_fsync_trans(handle_t *handle, - struct inode *inode, - int datasync) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_handle_valid(handle)) { - ei->i_sync_tid = handle->h_transaction->t_tid; - if (datasync) - ei->i_datasync_tid = handle->h_transaction->t_tid; - } -} - -/* super.c */ -int ext4_force_commit(struct super_block *sb); - -/* - * Ext4 inode journal modes - */ -#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ -#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ -#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ - -static inline int ext4_inode_journal_mode(struct inode *inode) -{ - if (EXT4_JOURNAL(inode) == NULL) - return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - /* We do not support data journalling with delayed allocation */ - if (!S_ISREG(inode->i_mode) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC)) - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - else - BUG(); -} - -static inline int ext4_should_journal_data(struct inode *inode) -{ - return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; -} - -static 
inline int ext4_should_order_data(struct inode *inode) -{ - return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; -} - -static inline int ext4_should_writeback_data(struct inode *inode) -{ - return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; -} - -/* - * This function controls whether or not we should try to go down the - * dioread_nolock code paths, which makes it safe to avoid taking - * i_mutex for direct I/O reads. This only works for extent-based - * files, and it doesn't work if data journaling is enabled, since the - * dioread_nolock code uses b_private to pass information back to the - * I/O completion handler, and this conflicts with the jbd's use of - * b_private. - */ -static inline int ext4_should_dioread_nolock(struct inode *inode) -{ - if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) - return 0; - if (!S_ISREG(inode->i_mode)) - return 0; - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return 0; - if (ext4_should_journal_data(inode)) - return 0; - return 1; -} - -#endif /* _EXT4_JBD2_H */ diff --git a/ANDROID_3.4.5/fs/ext4/extents.c b/ANDROID_3.4.5/fs/ext4/extents.c deleted file mode 100644 index abcdeab6..00000000 --- a/ANDROID_3.4.5/fs/ext4/extents.c +++ /dev/null @@ -1,4866 +0,0 @@ -/* - * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com - * Written by Alex Tomas <alex@clusterfs.com> - * - * Architecture independence: - * Copyright (c) 2005, Bull S.A. - * Written by Pierre Peiffer <pierre.peiffer@bull.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public Licens - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- - */ - -/* - * Extents support for EXT4 - * - * TODO: - * - ext4*_error() should be used in some situations - * - analyze all BUG()/BUG_ON(), use -EIO where appropriate - * - smart tree reduction - */ - -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/jbd2.h> -#include <linux/highuid.h> -#include <linux/pagemap.h> -#include <linux/quotaops.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/falloc.h> -#include <asm/uaccess.h> -#include <linux/fiemap.h> -#include "ext4_jbd2.h" - -#include <trace/events/ext4.h> - -/* - * used by extent splitting. 
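The three flags defined just below let a single split primitive serve several callers: the caller says whether each half should remain unwritten, and whether zeroing out is an acceptable fallback if the split itself cannot get a block. A rough sketch of that contract on an in-memory extent (toy types and names, no on-disk or journal state):

#include <stdint.h>
#include <stdio.h>

#define SPLIT_MAY_ZEROOUT       0x1     /* fall back to zeroing on ENOSPC */
#define SPLIT_MARK_UNINIT1      0x2     /* first half stays unwritten */
#define SPLIT_MARK_UNINIT2      0x4     /* second half stays unwritten */

struct toy_ext { uint32_t start, len; int uninit; };

/* split *e at logical block at; flags pick each half's written state */
static void toy_split(struct toy_ext *e, struct toy_ext *right,
                      uint32_t at, int flags)
{
        right->start = at;
        right->len = e->start + e->len - at;
        right->uninit = !!(flags & SPLIT_MARK_UNINIT2);
        e->len = at - e->start;
        e->uninit = !!(flags & SPLIT_MARK_UNINIT1);
}

int main(void)
{
        struct toy_ext left = { .start = 0, .len = 100, .uninit = 1 };
        struct toy_ext right;

        toy_split(&left, &right, 40, SPLIT_MARK_UNINIT2);
        printf("left %u+%u right %u+%u\n",
               left.start, left.len, right.start, right.len);
        return 0;
}
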
- */ -#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ - due to ENOSPC */ -#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ -#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ - -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - struct ext4_map_blocks *map, - int split_flag, - int flags); - -static int ext4_split_extent_at(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t split, - int split_flag, - int flags); - -static int ext4_ext_truncate_extend_restart(handle_t *handle, - struct inode *inode, - int needed) -{ - int err; - - if (!ext4_handle_valid(handle)) - return 0; - if (handle->h_buffer_credits > needed) - return 0; - err = ext4_journal_extend(handle, needed); - if (err <= 0) - return err; - err = ext4_truncate_restart_trans(handle, inode, needed); - if (err == 0) - err = -EAGAIN; - - return err; -} - -/* - * could return: - * - EROFS - * - ENOMEM - */ -static int ext4_ext_get_access(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) -{ - if (path->p_bh) { - /* path points to block */ - return ext4_journal_get_write_access(handle, path->p_bh); - } - /* path points to leaf/index in inode body */ - /* we use in-core data, no need to protect them */ - return 0; -} - -/* - * could return: - * - EROFS - * - ENOMEM - * - EIO - */ -#define ext4_ext_dirty(handle, inode, path) \ - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) -static int __ext4_ext_dirty(const char *where, unsigned int line, - handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) -{ - int err; - if (path->p_bh) { - /* path points to block */ - err = __ext4_handle_dirty_metadata(where, line, handle, - inode, path->p_bh); - } else { - /* path points to leaf/index in inode body */ - err = ext4_mark_inode_dirty(handle, inode); - } - return err; -} - -static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t block) -{ - if (path) { - int depth = path->p_depth; - struct ext4_extent *ex; - - /* - * Try to predict block placement assuming that we are - * filling in a file which will eventually be - * non-sparse --- i.e., in the case of libbfd writing - * an ELF object sections out-of-order but in a way - * the eventually results in a contiguous object or - * executable file, or some database extending a table - * space file. However, this is actually somewhat - * non-ideal if we are writing a sparse file such as - * qemu or KVM writing a raw image file that is going - * to stay fairly sparse, since it will end up - * fragmenting the file system's free space. Maybe we - * should have some hueristics or some way to allow - * userspace to pass a hint to file system, - * especially if the latter case turns out to be - * common. - */ - ex = path[depth].p_ext; - if (ex) { - ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); - ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); - - if (block > ext_block) - return ext_pblk + (block - ext_block); - else - return ext_pblk - (ext_block - block); - } - - /* it looks like index is empty; - * try to find starting block from index itself */ - if (path[depth].p_bh) - return path[depth].p_bh->b_blocknr; - } - - /* OK. 
use inode's group */ - return ext4_inode_to_goal_block(inode); -} - -/* - * Allocation for a meta data block - */ -static ext4_fsblk_t -ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *ex, int *err, unsigned int flags) -{ - ext4_fsblk_t goal, newblock; - - goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_meta_blocks(handle, inode, goal, flags, - NULL, err); - return newblock; -} - -static inline int ext4_ext_space_block(struct inode *inode, int check) -{ - int size; - - size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) - / sizeof(struct ext4_extent); -#ifdef AGGRESSIVE_TEST - if (!check && size > 6) - size = 6; -#endif - return size; -} - -static inline int ext4_ext_space_block_idx(struct inode *inode, int check) -{ - int size; - - size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) - / sizeof(struct ext4_extent_idx); -#ifdef AGGRESSIVE_TEST - if (!check && size > 5) - size = 5; -#endif - return size; -} - -static inline int ext4_ext_space_root(struct inode *inode, int check) -{ - int size; - - size = sizeof(EXT4_I(inode)->i_data); - size -= sizeof(struct ext4_extent_header); - size /= sizeof(struct ext4_extent); -#ifdef AGGRESSIVE_TEST - if (!check && size > 3) - size = 3; -#endif - return size; -} - -static inline int ext4_ext_space_root_idx(struct inode *inode, int check) -{ - int size; - - size = sizeof(EXT4_I(inode)->i_data); - size -= sizeof(struct ext4_extent_header); - size /= sizeof(struct ext4_extent_idx); -#ifdef AGGRESSIVE_TEST - if (!check && size > 4) - size = 4; -#endif - return size; -} - -/* - * Calculate the number of metadata blocks needed - * to allocate @blocks - * Worse case is one block per extent - */ -int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - int idxs; - - idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) - / sizeof(struct ext4_extent_idx)); - - /* - * If the new delayed allocation block is contiguous with the - * previous da block, it can share index blocks with the - * previous block, so we only need to allocate a new index - * block every idxs leaf blocks. At ldxs**2 blocks, we need - * an additional index block, and at ldxs**3 blocks, yet - * another index blocks. - */ - if (ei->i_da_metadata_calc_len && - ei->i_da_metadata_calc_last_lblock+1 == lblock) { - int num = 0; - - if ((ei->i_da_metadata_calc_len % idxs) == 0) - num++; - if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) - num++; - if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { - num++; - ei->i_da_metadata_calc_len = 0; - } else - ei->i_da_metadata_calc_len++; - ei->i_da_metadata_calc_last_lblock++; - return num; - } - - /* - * In the worst case we need a new set of index blocks at - * every level of the inode's extent tree. 
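The sharing rule above becomes concrete with real sizes. Both struct ext4_extent_header and struct ext4_extent_idx are 12 bytes on disk, so assuming a 4 KiB block size the sketch below reproduces the thresholds: one extra index block roughly every 340 delayed-allocation leaf blocks, and a second-level index block every 340^2:

#include <stdio.h>

int main(void)
{
        unsigned int blocksize = 4096;          /* assumed fs block size */
        unsigned int hdr = 12, idx = 12;        /* on-disk struct sizes */
        unsigned int idxs = (blocksize - hdr) / idx;

        printf("index entries per block: %u\n", idxs);          /* 340 */
        printf("2nd-level index every:   %u blocks\n", idxs * idxs);
        return 0;
}
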
- */ - ei->i_da_metadata_calc_len = 1; - ei->i_da_metadata_calc_last_lblock = lblock; - return ext_depth(inode) + 1; -} - -static int -ext4_ext_max_entries(struct inode *inode, int depth) -{ - int max; - - if (depth == ext_depth(inode)) { - if (depth == 0) - max = ext4_ext_space_root(inode, 1); - else - max = ext4_ext_space_root_idx(inode, 1); - } else { - if (depth == 0) - max = ext4_ext_space_block(inode, 1); - else - max = ext4_ext_space_block_idx(inode, 1); - } - - return max; -} - -static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) -{ - ext4_fsblk_t block = ext4_ext_pblock(ext); - int len = ext4_ext_get_actual_len(ext); - - if (len == 0) - return 0; - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); -} - -static int ext4_valid_extent_idx(struct inode *inode, - struct ext4_extent_idx *ext_idx) -{ - ext4_fsblk_t block = ext4_idx_pblock(ext_idx); - - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); -} - -static int ext4_valid_extent_entries(struct inode *inode, - struct ext4_extent_header *eh, - int depth) -{ - unsigned short entries; - if (eh->eh_entries == 0) - return 1; - - entries = le16_to_cpu(eh->eh_entries); - - if (depth == 0) { - /* leaf entries */ - struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); - while (entries) { - if (!ext4_valid_extent(inode, ext)) - return 0; - ext++; - entries--; - } - } else { - struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); - while (entries) { - if (!ext4_valid_extent_idx(inode, ext_idx)) - return 0; - ext_idx++; - entries--; - } - } - return 1; -} - -static int __ext4_ext_check(const char *function, unsigned int line, - struct inode *inode, struct ext4_extent_header *eh, - int depth) -{ - const char *error_msg; - int max = 0; - - if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { - error_msg = "invalid magic"; - goto corrupted; - } - if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { - error_msg = "unexpected eh_depth"; - goto corrupted; - } - if (unlikely(eh->eh_max == 0)) { - error_msg = "invalid eh_max"; - goto corrupted; - } - max = ext4_ext_max_entries(inode, depth); - if (unlikely(le16_to_cpu(eh->eh_max) > max)) { - error_msg = "too large eh_max"; - goto corrupted; - } - if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { - error_msg = "invalid eh_entries"; - goto corrupted; - } - if (!ext4_valid_extent_entries(inode, eh, depth)) { - error_msg = "invalid extent entries"; - goto corrupted; - } - return 0; - -corrupted: - ext4_error_inode(inode, function, line, 0, - "bad header/extent: %s - magic %x, " - "entries %u, max %u(%u), depth %u(%u)", - error_msg, le16_to_cpu(eh->eh_magic), - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), - max, le16_to_cpu(eh->eh_depth), depth); - - return -EIO; -} - -#define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, __LINE__, inode, eh, depth) - -int ext4_ext_check_inode(struct inode *inode) -{ - return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); -} - -#ifdef EXT_DEBUG -static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) -{ - int k, l = path->p_depth; - - ext_debug("path:"); - for (k = 0; k <= l; k++, path++) { - if (path->p_idx) { - ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), - ext4_idx_pblock(path->p_idx)); - } else if (path->p_ext) { - ext_debug(" %d:[%d]%d:%llu ", - le32_to_cpu(path->p_ext->ee_block), - ext4_ext_is_uninitialized(path->p_ext), - ext4_ext_get_actual_len(path->p_ext), - ext4_ext_pblock(path->p_ext)); - } else - ext_debug(" []"); - } - 
ext_debug("\n"); -} - -static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) -{ - int depth = ext_depth(inode); - struct ext4_extent_header *eh; - struct ext4_extent *ex; - int i; - - if (!path) - return; - - eh = path[depth].p_hdr; - ex = EXT_FIRST_EXTENT(eh); - - ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); - - for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { - ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); - } - ext_debug("\n"); -} - -static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, - ext4_fsblk_t newblock, int level) -{ - int depth = ext_depth(inode); - struct ext4_extent *ex; - - if (depth != level) { - struct ext4_extent_idx *idx; - idx = path[level].p_idx; - while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { - ext_debug("%d: move %d:%llu in new index %llu\n", level, - le32_to_cpu(idx->ei_block), - ext4_idx_pblock(idx), - newblock); - idx++; - } - - return; - } - - ex = path[depth].p_ext; - while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", - le32_to_cpu(ex->ee_block), - ext4_ext_pblock(ex), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), - newblock); - ex++; - } -} - -#else -#define ext4_ext_show_path(inode, path) -#define ext4_ext_show_leaf(inode, path) -#define ext4_ext_show_move(inode, path, newblock, level) -#endif - -void ext4_ext_drop_refs(struct ext4_ext_path *path) -{ - int depth = path->p_depth; - int i; - - for (i = 0; i <= depth; i++, path++) - if (path->p_bh) { - brelse(path->p_bh); - path->p_bh = NULL; - } -} - -/* - * ext4_ext_binsearch_idx: - * binary search for the closest index of the given block - * the header must be checked before calling this - */ -static void -ext4_ext_binsearch_idx(struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t block) -{ - struct ext4_extent_header *eh = path->p_hdr; - struct ext4_extent_idx *r, *l, *m; - - - ext_debug("binsearch for %u(idx): ", block); - - l = EXT_FIRST_INDEX(eh) + 1; - r = EXT_LAST_INDEX(eh); - while (l <= r) { - m = l + (r - l) / 2; - if (block < le32_to_cpu(m->ei_block)) - r = m - 1; - else - l = m + 1; - ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), - m, le32_to_cpu(m->ei_block), - r, le32_to_cpu(r->ei_block)); - } - - path->p_idx = l - 1; - ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), - ext4_idx_pblock(path->p_idx)); - -#ifdef CHECK_BINSEARCH - { - struct ext4_extent_idx *chix, *ix; - int k; - - chix = ix = EXT_FIRST_INDEX(eh); - for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { - if (k != 0 && - le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { - printk(KERN_DEBUG "k=%d, ix=0x%p, " - "first=0x%p\n", k, - ix, EXT_FIRST_INDEX(eh)); - printk(KERN_DEBUG "%u <= %u\n", - le32_to_cpu(ix->ei_block), - le32_to_cpu(ix[-1].ei_block)); - } - BUG_ON(k && le32_to_cpu(ix->ei_block) - <= le32_to_cpu(ix[-1].ei_block)); - if (block < le32_to_cpu(ix->ei_block)) - break; - chix = ix; - } - BUG_ON(chix != path->p_idx); - } -#endif - -} - -/* - * ext4_ext_binsearch: - * binary search for closest extent of the given block - * the header must be checked before calling this - */ -static void -ext4_ext_binsearch(struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t block) -{ - struct ext4_extent_header *eh = path->p_hdr; - struct ext4_extent *r, *l, *m; - - if (eh->eh_entries == 0) { - /* - * this leaf is empty: - * we 
get such a leaf in split/add case - */ - return; - } - - ext_debug("binsearch for %u: ", block); - - l = EXT_FIRST_EXTENT(eh) + 1; - r = EXT_LAST_EXTENT(eh); - - while (l <= r) { - m = l + (r - l) / 2; - if (block < le32_to_cpu(m->ee_block)) - r = m - 1; - else - l = m + 1; - ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), - m, le32_to_cpu(m->ee_block), - r, le32_to_cpu(r->ee_block)); - } - - path->p_ext = l - 1; - ext_debug(" -> %d:%llu:[%d]%d ", - le32_to_cpu(path->p_ext->ee_block), - ext4_ext_pblock(path->p_ext), - ext4_ext_is_uninitialized(path->p_ext), - ext4_ext_get_actual_len(path->p_ext)); - -#ifdef CHECK_BINSEARCH - { - struct ext4_extent *chex, *ex; - int k; - - chex = ex = EXT_FIRST_EXTENT(eh); - for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { - BUG_ON(k && le32_to_cpu(ex->ee_block) - <= le32_to_cpu(ex[-1].ee_block)); - if (block < le32_to_cpu(ex->ee_block)) - break; - chex = ex; - } - BUG_ON(chex != path->p_ext); - } -#endif - -} - -int ext4_ext_tree_init(handle_t *handle, struct inode *inode) -{ - struct ext4_extent_header *eh; - - eh = ext_inode_hdr(inode); - eh->eh_depth = 0; - eh->eh_entries = 0; - eh->eh_magic = EXT4_EXT_MAGIC; - eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); - ext4_mark_inode_dirty(handle, inode); - ext4_ext_invalidate_cache(inode); - return 0; -} - -struct ext4_ext_path * -ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path *path) -{ - struct ext4_extent_header *eh; - struct buffer_head *bh; - short int depth, i, ppos = 0, alloc = 0; - - eh = ext_inode_hdr(inode); - depth = ext_depth(inode); - - /* account possible depth increase */ - if (!path) { - path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), - GFP_NOFS); - if (!path) - return ERR_PTR(-ENOMEM); - alloc = 1; - } - path[0].p_hdr = eh; - path[0].p_bh = NULL; - - i = depth; - /* walk through the tree */ - while (i) { - int need_to_validate = 0; - - ext_debug("depth %d: num %d, max %d\n", - ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); - - ext4_ext_binsearch_idx(inode, path + ppos, block); - path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); - path[ppos].p_depth = i; - path[ppos].p_ext = NULL; - - bh = sb_getblk(inode->i_sb, path[ppos].p_block); - if (unlikely(!bh)) - goto err; - if (!bh_uptodate_or_lock(bh)) { - trace_ext4_ext_load_extent(inode, block, - path[ppos].p_block); - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto err; - } - /* validate the extent entries */ - need_to_validate = 1; - } - eh = ext_block_hdr(bh); - ppos++; - if (unlikely(ppos > depth)) { - put_bh(bh); - EXT4_ERROR_INODE(inode, - "ppos %d > depth %d", ppos, depth); - goto err; - } - path[ppos].p_bh = bh; - path[ppos].p_hdr = eh; - i--; - - if (need_to_validate && ext4_ext_check(inode, eh, i)) - goto err; - } - - path[ppos].p_depth = i; - path[ppos].p_ext = NULL; - path[ppos].p_idx = NULL; - - /* find extent */ - ext4_ext_binsearch(inode, path + ppos, block); - /* if not an empty leaf */ - if (path[ppos].p_ext) - path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); - - ext4_ext_show_path(inode, path); - - return path; - -err: - ext4_ext_drop_refs(path); - if (alloc) - kfree(path); - return ERR_PTR(-EIO); -} - -/* - * ext4_ext_insert_index: - * insert new index [@logical;@ptr] into the block at @curp; - * check where to insert: before @curp or after @curp - */ -static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, - struct ext4_ext_path *curp, - int logical, ext4_fsblk_t ptr) -{ - struct ext4_extent_idx *ix; - 
int len, err; - - err = ext4_ext_get_access(handle, inode, curp); - if (err) - return err; - - if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { - EXT4_ERROR_INODE(inode, - "logical %d == ei_block %d!", - logical, le32_to_cpu(curp->p_idx->ei_block)); - return -EIO; - } - - if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) - >= le16_to_cpu(curp->p_hdr->eh_max))) { - EXT4_ERROR_INODE(inode, - "eh_entries %d >= eh_max %d!", - le16_to_cpu(curp->p_hdr->eh_entries), - le16_to_cpu(curp->p_hdr->eh_max)); - return -EIO; - } - - if (logical > le32_to_cpu(curp->p_idx->ei_block)) { - /* insert after */ - ext_debug("insert new index %d after: %llu\n", logical, ptr); - ix = curp->p_idx + 1; - } else { - /* insert before */ - ext_debug("insert new index %d before: %llu\n", logical, ptr); - ix = curp->p_idx; - } - - len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; - BUG_ON(len < 0); - if (len > 0) { - ext_debug("insert new index %d: " - "move %d indices from 0x%p to 0x%p\n", - logical, len, ix, ix + 1); - memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); - } - - if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { - EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); - return -EIO; - } - - ix->ei_block = cpu_to_le32(logical); - ext4_idx_store_pblock(ix, ptr); - le16_add_cpu(&curp->p_hdr->eh_entries, 1); - - if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { - EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); - return -EIO; - } - - err = ext4_ext_dirty(handle, inode, curp); - ext4_std_error(inode->i_sb, err); - - return err; -} - -/* - * ext4_ext_split: - * inserts new subtree into the path, using free index entry - * at depth @at: - * - allocates all needed blocks (new leaf and all intermediate index blocks) - * - makes decision where to split - * - moves remaining extents and index entries (right to the split point) - * into the newly allocated blocks - * - initializes subtree - */ -static int ext4_ext_split(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_ext_path *path, - struct ext4_extent *newext, int at) -{ - struct buffer_head *bh = NULL; - int depth = ext_depth(inode); - struct ext4_extent_header *neh; - struct ext4_extent_idx *fidx; - int i = at, k, m, a; - ext4_fsblk_t newblock, oldblock; - __le32 border; - ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ - int err = 0; - - /* make decision: where to split? */ - /* FIXME: now decision is simplest: at current extent */ - - /* if current leaf will be split, then we should use - * border from split point */ - if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { - EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); - return -EIO; - } - if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { - border = path[depth].p_ext[1].ee_block; - ext_debug("leaf will be split." - " next leaf starts at %d\n", - le32_to_cpu(border)); - } else { - border = newext->ee_block; - ext_debug("leaf will be added." - " next leaf starts at %d\n", - le32_to_cpu(border)); - } - - /* - * If error occurs, then we break processing - * and mark filesystem read-only. index won't - * be inserted and tree will be in consistent - * state. Next mount will repair buffers too. - */ - - /* - * Get array to track all allocated blocks. - * We need this to handle errors and free blocks - * upon them. 
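That ablocks array is the whole error story of the split: record every block the moment it is allocated, and on any failure walk the record and give everything back, so a half-built subtree never leaks blocks. The pattern in isolation (plain C, with a toy allocator standing in for ext4_ext_new_meta_block()):

#include <stdlib.h>

static long next_blk = 1000;

/* toy stand-ins for the real block allocate/free calls */
static long alloc_block(void) { return next_blk < 1004 ? next_blk++ : 0; }
static void free_block(long blk) { (void)blk; }

static int build_chain(int depth)
{
        long *blocks;
        int i, n, err = -1;

        blocks = calloc(depth, sizeof(*blocks));
        if (!blocks)
                return -1;

        for (n = 0; n < depth; n++) {
                blocks[n] = alloc_block();
                if (blocks[n] == 0)
                        goto cleanup;           /* unwind partial progress */
        }
        err = 0;                                /* every level allocated */

cleanup:
        if (err)
                for (i = 0; i < n; i++)         /* free only what we got */
                        free_block(blocks[i]);
        free(blocks);
        return err;
}

int main(void)
{
        return build_chain(3) == 0 ? 0 : 1;     /* fits the toy quota */
}
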
- */ - ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); - if (!ablocks) - return -ENOMEM; - - /* allocate all needed blocks */ - ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); - for (a = 0; a < depth - at; a++) { - newblock = ext4_ext_new_meta_block(handle, inode, path, - newext, &err, flags); - if (newblock == 0) - goto cleanup; - ablocks[a] = newblock; - } - - /* initialize new leaf */ - newblock = ablocks[--a]; - if (unlikely(newblock == 0)) { - EXT4_ERROR_INODE(inode, "newblock == 0!"); - err = -EIO; - goto cleanup; - } - bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; - goto cleanup; - } - lock_buffer(bh); - - err = ext4_journal_get_create_access(handle, bh); - if (err) - goto cleanup; - - neh = ext_block_hdr(bh); - neh->eh_entries = 0; - neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); - neh->eh_magic = EXT4_EXT_MAGIC; - neh->eh_depth = 0; - - /* move remainder of path[depth] to the new leaf */ - if (unlikely(path[depth].p_hdr->eh_entries != - path[depth].p_hdr->eh_max)) { - EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", - path[depth].p_hdr->eh_entries, - path[depth].p_hdr->eh_max); - err = -EIO; - goto cleanup; - } - /* start copy from next extent */ - m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; - ext4_ext_show_move(inode, path, newblock, depth); - if (m) { - struct ext4_extent *ex; - ex = EXT_FIRST_EXTENT(neh); - memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); - le16_add_cpu(&neh->eh_entries, m); - } - - set_buffer_uptodate(bh); - unlock_buffer(bh); - - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (err) - goto cleanup; - brelse(bh); - bh = NULL; - - /* correct old leaf */ - if (m) { - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto cleanup; - le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); - err = ext4_ext_dirty(handle, inode, path + depth); - if (err) - goto cleanup; - - } - - /* create intermediate indexes */ - k = depth - at - 1; - if (unlikely(k < 0)) { - EXT4_ERROR_INODE(inode, "k %d < 0!", k); - err = -EIO; - goto cleanup; - } - if (k) - ext_debug("create %d intermediate indices\n", k); - /* insert new index into current index block */ - /* current depth stored in i var */ - i = depth - 1; - while (k--) { - oldblock = newblock; - newblock = ablocks[--a]; - bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; - goto cleanup; - } - lock_buffer(bh); - - err = ext4_journal_get_create_access(handle, bh); - if (err) - goto cleanup; - - neh = ext_block_hdr(bh); - neh->eh_entries = cpu_to_le16(1); - neh->eh_magic = EXT4_EXT_MAGIC; - neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); - neh->eh_depth = cpu_to_le16(depth - i); - fidx = EXT_FIRST_INDEX(neh); - fidx->ei_block = border; - ext4_idx_store_pblock(fidx, oldblock); - - ext_debug("int.index at %d (block %llu): %u -> %llu\n", - i, newblock, le32_to_cpu(border), oldblock); - - /* move remainder of path[i] to the new index block */ - if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != - EXT_LAST_INDEX(path[i].p_hdr))) { - EXT4_ERROR_INODE(inode, - "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", - le32_to_cpu(path[i].p_ext->ee_block)); - err = -EIO; - goto cleanup; - } - /* start copy indexes */ - m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; - ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, - EXT_MAX_INDEX(path[i].p_hdr)); - ext4_ext_show_move(inode, path, newblock, i); - if (m) { - memmove(++fidx, path[i].p_idx, - sizeof(struct ext4_extent_idx) * m); - 
le16_add_cpu(&neh->eh_entries, m); - } - set_buffer_uptodate(bh); - unlock_buffer(bh); - - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (err) - goto cleanup; - brelse(bh); - bh = NULL; - - /* correct old index */ - if (m) { - err = ext4_ext_get_access(handle, inode, path + i); - if (err) - goto cleanup; - le16_add_cpu(&path[i].p_hdr->eh_entries, -m); - err = ext4_ext_dirty(handle, inode, path + i); - if (err) - goto cleanup; - } - - i--; - } - - /* insert new index */ - err = ext4_ext_insert_index(handle, inode, path + at, - le32_to_cpu(border), newblock); - -cleanup: - if (bh) { - if (buffer_locked(bh)) - unlock_buffer(bh); - brelse(bh); - } - - if (err) { - /* free all allocated blocks in error case */ - for (i = 0; i < depth; i++) { - if (!ablocks[i]) - continue; - ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, - EXT4_FREE_BLOCKS_METADATA); - } - } - kfree(ablocks); - - return err; -} - -/* - * ext4_ext_grow_indepth: - * implements tree growing procedure: - * - allocates new block - * - moves top-level data (index block or leaf) into the new block - * - initializes new top-level, creating index that points to the - * just created block - */ -static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_extent *newext) -{ - struct ext4_extent_header *neh; - struct buffer_head *bh; - ext4_fsblk_t newblock; - int err = 0; - - newblock = ext4_ext_new_meta_block(handle, inode, NULL, - newext, &err, flags); - if (newblock == 0) - return err; - - bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; - ext4_std_error(inode->i_sb, err); - return err; - } - lock_buffer(bh); - - err = ext4_journal_get_create_access(handle, bh); - if (err) { - unlock_buffer(bh); - goto out; - } - - /* move top-level index/leaf into new block */ - memmove(bh->b_data, EXT4_I(inode)->i_data, - sizeof(EXT4_I(inode)->i_data)); - - /* set size of new block */ - neh = ext_block_hdr(bh); - /* old root could have indexes or leaves - * so calculate e_max right way */ - if (ext_depth(inode)) - neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); - else - neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); - neh->eh_magic = EXT4_EXT_MAGIC; - set_buffer_uptodate(bh); - unlock_buffer(bh); - - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (err) - goto out; - - /* Update top-level index: num,max,pointer */ - neh = ext_inode_hdr(inode); - neh->eh_entries = cpu_to_le16(1); - ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); - if (neh->eh_depth == 0) { - /* Root extent block becomes index block */ - neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); - EXT_FIRST_INDEX(neh)->ei_block = - EXT_FIRST_EXTENT(neh)->ee_block; - } - ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", - le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), - le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), - ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - - neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); - ext4_mark_inode_dirty(handle, inode); -out: - brelse(bh); - - return err; -} - -/* - * ext4_ext_create_new_leaf: - * finds empty index and adds new leaf. - * if no free index is found, then it requests in-depth growing. 
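Growing at the root, as ext4_ext_grow_indepth() does above, is what makes a depth change cheap: the old top level is copied out into a freshly allocated block and the in-inode root shrinks to a single index entry pointing at that copy, so nothing else in the tree has to move. The same maneuver on a toy in-memory node (illustrative only):

#include <stdlib.h>
#include <string.h>

struct toy_node {
        int depth;                      /* 0 means leaf */
        int nkeys;
        int keys[4];
        struct toy_node *child[4];
};

/* push the current root down one level; the root becomes an index */
static int grow_root(struct toy_node *root)
{
        struct toy_node *below = malloc(sizeof(*below));

        if (!below)
                return -1;
        memcpy(below, root, sizeof(*below));    /* old top level moves out */
        root->depth = below->depth + 1;
        root->nkeys = 1;
        root->keys[0] = below->keys[0];         /* first key covers all */
        root->child[0] = below;
        return 0;
}

int main(void)
{
        struct toy_node root = { .depth = 0, .nkeys = 1, .keys = { 42 } };
        int err = grow_root(&root);

        if (!err)
                free(root.child[0]);
        return err ? 1 : root.depth == 1 ? 0 : 1;
}
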
- */ -static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_ext_path *path, - struct ext4_extent *newext) -{ - struct ext4_ext_path *curp; - int depth, i, err = 0; - -repeat: - i = depth = ext_depth(inode); - - /* walk up to the tree and look for free index entry */ - curp = path + depth; - while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { - i--; - curp--; - } - - /* we use already allocated block for index block, - * so subsequent data blocks should be contiguous */ - if (EXT_HAS_FREE_INDEX(curp)) { - /* if we found index with free entry, then use that - * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, flags, path, newext, i); - if (err) - goto out; - - /* refill path */ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, - (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); - if (IS_ERR(path)) - err = PTR_ERR(path); - } else { - /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, flags, newext); - if (err) - goto out; - - /* refill path */ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, - (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } - - /* - * only first (depth 0 -> 1) produces free space; - * in all other cases we have to split the grown tree - */ - depth = ext_depth(inode); - if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { - /* now we need to split */ - goto repeat; - } - } - -out: - return err; -} - -/* - * search the closest allocated block to the left for *logical - * and returns it at @logical + it's physical address at @phys - * if *logical is the smallest allocated block, the function - * returns 0 at @phys - * return value contains 0 (success) or error code - */ -static int ext4_ext_search_left(struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) -{ - struct ext4_extent_idx *ix; - struct ext4_extent *ex; - int depth, ee_len; - - if (unlikely(path == NULL)) { - EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); - return -EIO; - } - depth = path->p_depth; - *phys = 0; - - if (depth == 0 && path->p_ext == NULL) - return 0; - - /* usually extent in the path covers blocks smaller - * then *logical, but it can be that extent is the - * first one in the file */ - - ex = path[depth].p_ext; - ee_len = ext4_ext_get_actual_len(ex); - if (*logical < le32_to_cpu(ex->ee_block)) { - if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { - EXT4_ERROR_INODE(inode, - "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", - *logical, le32_to_cpu(ex->ee_block)); - return -EIO; - } - while (--depth >= 0) { - ix = path[depth].p_idx; - if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { - EXT4_ERROR_INODE(inode, - "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", - ix != NULL ? le32_to_cpu(ix->ei_block) : 0, - EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 
- le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, - depth); - return -EIO; - } - } - return 0; - } - - if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { - EXT4_ERROR_INODE(inode, - "logical %d < ee_block %d + ee_len %d!", - *logical, le32_to_cpu(ex->ee_block), ee_len); - return -EIO; - } - - *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; - *phys = ext4_ext_pblock(ex) + ee_len - 1; - return 0; -} - -/* - * search the closest allocated block to the right for *logical - * and returns it at @logical + it's physical address at @phys - * if *logical is the largest allocated block, the function - * returns 0 at @phys - * return value contains 0 (success) or error code - */ -static int ext4_ext_search_right(struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys, - struct ext4_extent **ret_ex) -{ - struct buffer_head *bh = NULL; - struct ext4_extent_header *eh; - struct ext4_extent_idx *ix; - struct ext4_extent *ex; - ext4_fsblk_t block; - int depth; /* Note, NOT eh_depth; depth from top of tree */ - int ee_len; - - if (unlikely(path == NULL)) { - EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); - return -EIO; - } - depth = path->p_depth; - *phys = 0; - - if (depth == 0 && path->p_ext == NULL) - return 0; - - /* usually extent in the path covers blocks smaller - * then *logical, but it can be that extent is the - * first one in the file */ - - ex = path[depth].p_ext; - ee_len = ext4_ext_get_actual_len(ex); - if (*logical < le32_to_cpu(ex->ee_block)) { - if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { - EXT4_ERROR_INODE(inode, - "first_extent(path[%d].p_hdr) != ex", - depth); - return -EIO; - } - while (--depth >= 0) { - ix = path[depth].p_idx; - if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { - EXT4_ERROR_INODE(inode, - "ix != EXT_FIRST_INDEX *logical %d!", - *logical); - return -EIO; - } - } - goto found_extent; - } - - if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { - EXT4_ERROR_INODE(inode, - "logical %d < ee_block %d + ee_len %d!", - *logical, le32_to_cpu(ex->ee_block), ee_len); - return -EIO; - } - - if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { - /* next allocated block in this leaf */ - ex++; - goto found_extent; - } - - /* go up and search for index to the right */ - while (--depth >= 0) { - ix = path[depth].p_idx; - if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) - goto got_index; - } - - /* we've gone up to the root and found no index to the right */ - return 0; - -got_index: - /* we've found index to the right, let's - * follow it and find the closest allocated - * block to the right */ - ix++; - block = ext4_idx_pblock(ix); - while (++depth < path->p_depth) { - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; - eh = ext_block_hdr(bh); - /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { - put_bh(bh); - return -EIO; - } - ix = EXT_FIRST_INDEX(eh); - block = ext4_idx_pblock(ix); - put_bh(bh); - } - - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; - eh = ext_block_hdr(bh); - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { - put_bh(bh); - return -EIO; - } - ex = EXT_FIRST_EXTENT(eh); -found_extent: - *logical = le32_to_cpu(ex->ee_block); - *phys = ext4_ext_pblock(ex); - *ret_ex = ex; - if (bh) - put_bh(bh); - return 0; -} - -/* - * ext4_ext_next_allocated_block: - * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. 
- * NOTE: it considers block number from index entry as - * allocated block. Thus, index entries have to be consistent - * with leaves. - */ -static ext4_lblk_t -ext4_ext_next_allocated_block(struct ext4_ext_path *path) -{ - int depth; - - BUG_ON(path == NULL); - depth = path->p_depth; - - if (depth == 0 && path->p_ext == NULL) - return EXT_MAX_BLOCKS; - - while (depth >= 0) { - if (depth == path->p_depth) { - /* leaf */ - if (path[depth].p_ext && - path[depth].p_ext != - EXT_LAST_EXTENT(path[depth].p_hdr)) - return le32_to_cpu(path[depth].p_ext[1].ee_block); - } else { - /* index */ - if (path[depth].p_idx != - EXT_LAST_INDEX(path[depth].p_hdr)) - return le32_to_cpu(path[depth].p_idx[1].ei_block); - } - depth--; - } - - return EXT_MAX_BLOCKS; -} - -/* - * ext4_ext_next_leaf_block: - * returns first allocated block from next leaf or EXT_MAX_BLOCKS - */ -static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) -{ - int depth; - - BUG_ON(path == NULL); - depth = path->p_depth; - - /* zero-tree has no leaf blocks at all */ - if (depth == 0) - return EXT_MAX_BLOCKS; - - /* go to index block */ - depth--; - - while (depth >= 0) { - if (path[depth].p_idx != - EXT_LAST_INDEX(path[depth].p_hdr)) - return (ext4_lblk_t) - le32_to_cpu(path[depth].p_idx[1].ei_block); - depth--; - } - - return EXT_MAX_BLOCKS; -} - -/* - * ext4_ext_correct_indexes: - * if leaf gets modified and modified extent is first in the leaf, - * then we have to correct all indexes above. - * TODO: do we need to correct tree in all cases? - */ -static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) -{ - struct ext4_extent_header *eh; - int depth = ext_depth(inode); - struct ext4_extent *ex; - __le32 border; - int k, err = 0; - - eh = path[depth].p_hdr; - ex = path[depth].p_ext; - - if (unlikely(ex == NULL || eh == NULL)) { - EXT4_ERROR_INODE(inode, - "ex %p == NULL or eh %p == NULL", ex, eh); - return -EIO; - } - - if (depth == 0) { - /* there is no tree at all */ - return 0; - } - - if (ex != EXT_FIRST_EXTENT(eh)) { - /* we correct tree if first leaf got modified only */ - return 0; - } - - /* - * TODO: we need correction if border is smaller than current one - */ - k = depth - 1; - border = path[depth].p_ext->ee_block; - err = ext4_ext_get_access(handle, inode, path + k); - if (err) - return err; - path[k].p_idx->ei_block = border; - err = ext4_ext_dirty(handle, inode, path + k); - if (err) - return err; - - while (k--) { - /* change all left-side indexes */ - if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) - break; - err = ext4_ext_get_access(handle, inode, path + k); - if (err) - break; - path[k].p_idx->ei_block = border; - err = ext4_ext_dirty(handle, inode, path + k); - if (err) - break; - } - - return err; -} - -int -ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, - struct ext4_extent *ex2) -{ - unsigned short ext1_ee_len, ext2_ee_len, max_len; - - /* - * Make sure that either both extents are uninitialized, or - * both are _not_. 
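The body that follows boils down to four conditions: same written/unwritten state (the XOR test), logical contiguity, physical contiguity, and a combined length under the cap (32768 blocks for written extents, one less for unwritten, so the top bit of ee_len stays free). Condensed onto flat fields (a sketch, not the little-endian on-disk layout):

#include <stdbool.h>
#include <stdint.h>

struct flat_ext {
        uint32_t lblk;          /* first logical block */
        uint64_t pblk;          /* first physical block */
        uint16_t len;           /* length in blocks */
        bool uninit;            /* unwritten (preallocated)? */
};

static bool can_merge(const struct flat_ext *a, const struct flat_ext *b,
                      uint32_t max_len)
{
        if (a->uninit != b->uninit)             /* both or neither */
                return false;
        if (a->lblk + a->len != b->lblk)        /* logically adjacent */
                return false;
        if (a->pblk + a->len != b->pblk)        /* physically adjacent */
                return false;
        return (uint32_t)a->len + b->len <= max_len;
}

int main(void)
{
        struct flat_ext a = { .lblk = 0, .pblk = 100, .len = 8 };
        struct flat_ext b = { .lblk = 8, .pblk = 108, .len = 8 };

        return can_merge(&a, &b, 32768) ? 0 : 1;
}
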
- */ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) - return 0; - - if (ext4_ext_is_uninitialized(ex1)) - max_len = EXT_UNINIT_MAX_LEN; - else - max_len = EXT_INIT_MAX_LEN; - - ext1_ee_len = ext4_ext_get_actual_len(ex1); - ext2_ee_len = ext4_ext_get_actual_len(ex2); - - if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != - le32_to_cpu(ex2->ee_block)) - return 0; - - /* - * To allow future support for preallocated extents to be added - * as an RO_COMPAT feature, refuse to merge to extents if - * this can result in the top bit of ee_len being set. - */ - if (ext1_ee_len + ext2_ee_len > max_len) - return 0; -#ifdef AGGRESSIVE_TEST - if (ext1_ee_len >= 4) - return 0; -#endif - - if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) - return 1; - return 0; -} - -/* - * This function tries to merge the "ex" extent to the next extent in the tree. - * It always tries to merge towards right. If you want to merge towards - * left, pass "ex - 1" as argument instead of "ex". - * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns - * 1 if they got merged. - */ -static int ext4_ext_try_to_merge_right(struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *ex) -{ - struct ext4_extent_header *eh; - unsigned int depth, len; - int merge_done = 0; - int uninitialized = 0; - - depth = ext_depth(inode); - BUG_ON(path[depth].p_hdr == NULL); - eh = path[depth].p_hdr; - - while (ex < EXT_LAST_EXTENT(eh)) { - if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) - break; - /* merge with next extent! */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; - ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) - + ext4_ext_get_actual_len(ex + 1)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); - - if (ex + 1 < EXT_LAST_EXTENT(eh)) { - len = (EXT_LAST_EXTENT(eh) - ex - 1) - * sizeof(struct ext4_extent); - memmove(ex + 1, ex + 2, len); - } - le16_add_cpu(&eh->eh_entries, -1); - merge_done = 1; - WARN_ON(eh->eh_entries == 0); - if (!eh->eh_entries) - EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); - } - - return merge_done; -} - -/* - * This function tries to merge the @ex extent to neighbours in the tree. - * return 1 if merge left else 0. - */ -static int ext4_ext_try_to_merge(struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *ex) { - struct ext4_extent_header *eh; - unsigned int depth; - int merge_done = 0; - int ret = 0; - - depth = ext_depth(inode); - BUG_ON(path[depth].p_hdr == NULL); - eh = path[depth].p_hdr; - - if (ex > EXT_FIRST_EXTENT(eh)) - merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); - - if (!merge_done) - ret = ext4_ext_try_to_merge_right(inode, path, ex); - - return ret; -} - -/* - * check if a portion of the "newext" extent overlaps with an - * existing extent. - * - * If there is an overlap discovered, it updates the length of the newext - * such that there will be no overlap, and then returns 1. - * If there is no overlap found, it returns 0. 
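Two clamps do all the work in the body below: one catches the logical range wrapping past the end of the address space, the other shortens the new extent so it stops where the next allocated region begins. The same logic on bare integers (assuming a 32-bit logical block space):

#include <stdint.h>

#define MAX_BLOCKS UINT32_MAX   /* assumed logical address space limit */

/* trim [b1, b1+len1) so it neither wraps nor reaches b2 (b2 >= b1);
 * returns 1 if the length was reduced */
static int trim_range(uint32_t b1, uint32_t *len1, uint32_t b2)
{
        int trimmed = 0;

        if (b1 + *len1 < b1) {                  /* unsigned wraparound */
                *len1 = MAX_BLOCKS - b1;
                trimmed = 1;
        }
        if (b1 + *len1 > b2) {                  /* runs into next extent */
                *len1 = b2 - b1;
                trimmed = 1;
        }
        return trimmed;
}

int main(void)
{
        uint32_t len = 100;

        trim_range(50, &len, 120);      /* next region at 120: len -> 70 */
        return len == 70 ? 0 : 1;
}
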
- */ -static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, - struct inode *inode, - struct ext4_extent *newext, - struct ext4_ext_path *path) -{ - ext4_lblk_t b1, b2; - unsigned int depth, len1; - unsigned int ret = 0; - - b1 = le32_to_cpu(newext->ee_block); - len1 = ext4_ext_get_actual_len(newext); - depth = ext_depth(inode); - if (!path[depth].p_ext) - goto out; - b2 = le32_to_cpu(path[depth].p_ext->ee_block); - b2 &= ~(sbi->s_cluster_ratio - 1); - - /* - * get the next allocated block if the extent in the path - * is before the requested block(s) - */ - if (b2 < b1) { - b2 = ext4_ext_next_allocated_block(path); - if (b2 == EXT_MAX_BLOCKS) - goto out; - b2 &= ~(sbi->s_cluster_ratio - 1); - } - - /* check for wrap through zero on extent logical start block*/ - if (b1 + len1 < b1) { - len1 = EXT_MAX_BLOCKS - b1; - newext->ee_len = cpu_to_le16(len1); - ret = 1; - } - - /* check for overlap */ - if (b1 + len1 > b2) { - newext->ee_len = cpu_to_le16(b2 - b1); - ret = 1; - } -out: - return ret; -} - -/* - * ext4_ext_insert_extent: - * tries to merge requsted extent into the existing extent or - * inserts requested extent as new one into the tree, - * creating new leaf in the no-space case. - */ -int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext, int flag) -{ - struct ext4_extent_header *eh; - struct ext4_extent *ex, *fex; - struct ext4_extent *nearex; /* nearest extent */ - struct ext4_ext_path *npath = NULL; - int depth, len, err; - ext4_lblk_t next; - unsigned uninitialized = 0; - int flags = 0; - - if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { - EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); - return -EIO; - } - depth = ext_depth(inode); - ex = path[depth].p_ext; - if (unlikely(path[depth].p_hdr == NULL)) { - EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EIO; - } - - /* try to insert block into found extent and return */ - if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) - && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %u:[%d]%d (from %llu)\n", - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - le32_to_cpu(ex->ee_block), - ext4_ext_is_uninitialized(ex), - ext4_ext_get_actual_len(ex), - ext4_ext_pblock(ex)); - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - return err; - - /* - * ext4_can_extents_be_merged should have checked that either - * both extents are uninitialized, or both aren't. Thus we - * need to check only one of them here. - */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; - ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) - + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); - eh = path[depth].p_hdr; - nearex = ex; - goto merge; - } - - depth = ext_depth(inode); - eh = path[depth].p_hdr; - if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) - goto has_space; - - /* probably next leaf has space for us? 
*/ - fex = EXT_LAST_EXTENT(eh); - next = EXT_MAX_BLOCKS; - if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) - next = ext4_ext_next_leaf_block(path); - if (next != EXT_MAX_BLOCKS) { - ext_debug("next leaf block - %u\n", next); - BUG_ON(npath != NULL); - npath = ext4_ext_find_extent(inode, next, NULL); - if (IS_ERR(npath)) - return PTR_ERR(npath); - BUG_ON(npath->p_depth != path->p_depth); - eh = npath[depth].p_hdr; - if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { - ext_debug("next leaf isn't full(%d)\n", - le16_to_cpu(eh->eh_entries)); - path = npath; - goto has_space; - } - ext_debug("next leaf has no free space(%d,%d)\n", - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); - } - - /* - * There is no free space in the found leaf. - * We're gonna add a new leaf in the tree. - */ - if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) - flags = EXT4_MB_USE_ROOT_BLOCKS; - err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); - if (err) - goto cleanup; - depth = ext_depth(inode); - eh = path[depth].p_hdr; - -has_space: - nearex = path[depth].p_ext; - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto cleanup; - - if (!nearex) { - /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", - le32_to_cpu(newext->ee_block), - ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext)); - nearex = EXT_FIRST_EXTENT(eh); - } else { - if (le32_to_cpu(newext->ee_block) - > le32_to_cpu(nearex->ee_block)) { - /* Insert after */ - ext_debug("insert %u:%llu:[%d]%d before: " - "nearest %p\n", - le32_to_cpu(newext->ee_block), - ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - nearex); - nearex++; - } else { - /* Insert before */ - BUG_ON(newext->ee_block == nearex->ee_block); - ext_debug("insert %u:%llu:[%d]%d after: " - "nearest %p\n", - le32_to_cpu(newext->ee_block), - ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - nearex); - } - len = EXT_LAST_EXTENT(eh) - nearex + 1; - if (len > 0) { - ext_debug("insert %u:%llu:[%d]%d: " - "move %d extents from 0x%p to 0x%p\n", - le32_to_cpu(newext->ee_block), - ext4_ext_pblock(newext), - ext4_ext_is_uninitialized(newext), - ext4_ext_get_actual_len(newext), - len, nearex, nearex + 1); - memmove(nearex + 1, nearex, - len * sizeof(struct ext4_extent)); - } - } - - le16_add_cpu(&eh->eh_entries, 1); - path[depth].p_ext = nearex; - nearex->ee_block = newext->ee_block; - ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); - nearex->ee_len = newext->ee_len; - -merge: - /* try to merge extents to the right */ - if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, nearex); - - /* try to merge extents to the left */ - - /* time to correct all indexes above */ - err = ext4_ext_correct_indexes(handle, inode, path); - if (err) - goto cleanup; - - err = ext4_ext_dirty(handle, inode, path + depth); - -cleanup: - if (npath) { - ext4_ext_drop_refs(npath); - kfree(npath); - } - ext4_ext_invalidate_cache(inode); - return err; -} - -static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, - ext4_lblk_t num, ext_prepare_callback func, - void *cbdata) -{ - struct ext4_ext_path *path = NULL; - struct ext4_ext_cache cbex; - struct ext4_extent *ex; - ext4_lblk_t next, start = 0, end = 0; - ext4_lblk_t last = block + num; - int depth, exists, err = 0; - - BUG_ON(func == NULL); - BUG_ON(inode == NULL); - - 
while (block < last && block != EXT_MAX_BLOCKS) { - num = last - block; - /* find extent for this block */ - down_read(&EXT4_I(inode)->i_data_sem); - path = ext4_ext_find_extent(inode, block, path); - up_read(&EXT4_I(inode)->i_data_sem); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - break; - } - - depth = ext_depth(inode); - if (unlikely(path[depth].p_hdr == NULL)) { - EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - err = -EIO; - break; - } - ex = path[depth].p_ext; - next = ext4_ext_next_allocated_block(path); - - exists = 0; - if (!ex) { - /* there is no extent yet, so try to allocate - * all requested space */ - start = block; - end = block + num; - } else if (le32_to_cpu(ex->ee_block) > block) { - /* need to allocate space before found extent */ - start = block; - end = le32_to_cpu(ex->ee_block); - if (block + num < end) - end = block + num; - } else if (block >= le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex)) { - /* need to allocate space after found extent */ - start = block; - end = block + num; - if (end >= next) - end = next; - } else if (block >= le32_to_cpu(ex->ee_block)) { - /* - * some part of requested space is covered - * by found extent - */ - start = block; - end = le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex); - if (block + num < end) - end = block + num; - exists = 1; - } else { - BUG(); - } - BUG_ON(end <= start); - - if (!exists) { - cbex.ec_block = start; - cbex.ec_len = end - start; - cbex.ec_start = 0; - } else { - cbex.ec_block = le32_to_cpu(ex->ee_block); - cbex.ec_len = ext4_ext_get_actual_len(ex); - cbex.ec_start = ext4_ext_pblock(ex); - } - - if (unlikely(cbex.ec_len == 0)) { - EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); - err = -EIO; - break; - } - err = func(inode, next, &cbex, ex, cbdata); - ext4_ext_drop_refs(path); - - if (err < 0) - break; - - if (err == EXT_REPEAT) - continue; - else if (err == EXT_BREAK) { - err = 0; - break; - } - - if (ext_depth(inode) != depth) { - /* depth was changed. 
we have to realloc path */ - kfree(path); - path = NULL; - } - - block = cbex.ec_block + cbex.ec_len; - } - - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } - - return err; -} - -static void -ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, - __u32 len, ext4_fsblk_t start) -{ - struct ext4_ext_cache *cex; - BUG_ON(len == 0); - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - trace_ext4_ext_put_in_cache(inode, block, len, start); - cex = &EXT4_I(inode)->i_cached_extent; - cex->ec_block = block; - cex->ec_len = len; - cex->ec_start = start; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); -} - -/* - * ext4_ext_put_gap_in_cache: - * calculate boundaries of the gap that the requested block fits into - * and cache this gap - */ -static void -ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t block) -{ - int depth = ext_depth(inode); - unsigned long len; - ext4_lblk_t lblock; - struct ext4_extent *ex; - - ex = path[depth].p_ext; - if (ex == NULL) { - /* there is no extent yet, so gap is [0;-] */ - lblock = 0; - len = EXT_MAX_BLOCKS; - ext_debug("cache gap(whole file):"); - } else if (block < le32_to_cpu(ex->ee_block)) { - lblock = block; - len = le32_to_cpu(ex->ee_block) - block; - ext_debug("cache gap(before): %u [%u:%u]", - block, - le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex)); - } else if (block >= le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex)) { - ext4_lblk_t next; - lblock = le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex); - - next = ext4_ext_next_allocated_block(path); - ext_debug("cache gap(after): [%u:%u] %u", - le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex), - block); - BUG_ON(next == lblock); - len = next - lblock; - } else { - lblock = len = 0; - BUG(); - } - - ext_debug(" -> %u:%lu\n", lblock, len); - ext4_ext_put_in_cache(inode, lblock, len, 0); -} - -/* - * ext4_ext_check_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * cache extent pointer. If the cached extent is a hole, - * this routine should be used instead of - * ext4_ext_in_cache if the calling function needs to - * know the size of the hole. - * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_cache *ex){ - struct ext4_ext_cache *cex; - struct ext4_sb_info *sbi; - int ret = 0; - - /* - * We borrow i_block_reservation_lock to protect i_cached_extent - */ - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - cex = &EXT4_I(inode)->i_cached_extent; - sbi = EXT4_SB(inode->i_sb); - - /* has cache valid data? */ - if (cex->ec_len == 0) - goto errout; - - if (in_range(block, cex->ec_block, cex->ec_len)) { - memcpy(ex, cex, sizeof(struct ext4_ext_cache)); - ext_debug("%u cached by %u:%u:%llu\n", - block, - cex->ec_block, cex->ec_len, cex->ec_start); - ret = 1; - } -errout: - trace_ext4_ext_in_cache(inode, block, ret); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return ret; -} - -/* - * ext4_ext_in_cache() - * Checks to see if the given block is in the cache. - * If it is, the cached extent is stored in the given - * extent pointer. 
- * - * @inode: The files inode - * @block: The block to look for in the cache - * @ex: Pointer where the cached extent will be stored - * if it contains block - * - * Return 0 if cache is invalid; 1 if the cache is valid - */ -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ - struct ext4_ext_cache cex; - int ret = 0; - - if (ext4_ext_check_cache(inode, block, &cex)) { - ex->ee_block = cpu_to_le32(cex.ec_block); - ext4_ext_store_pblock(ex, cex.ec_start); - ex->ee_len = cpu_to_le16(cex.ec_len); - ret = 1; - } - - return ret; -} - - -/* - * ext4_ext_rm_idx: - * removes index from the index block. - */ -static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) -{ - int err; - ext4_fsblk_t leaf; - - /* free index block */ - path--; - leaf = ext4_idx_pblock(path->p_idx); - if (unlikely(path->p_hdr->eh_entries == 0)) { - EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); - return -EIO; - } - err = ext4_ext_get_access(handle, inode, path); - if (err) - return err; - - if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { - int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; - len *= sizeof(struct ext4_extent_idx); - memmove(path->p_idx, path->p_idx + 1, len); - } - - le16_add_cpu(&path->p_hdr->eh_entries, -1); - err = ext4_ext_dirty(handle, inode, path); - if (err) - return err; - ext_debug("index is empty, remove it, free block %llu\n", leaf); - trace_ext4_ext_rm_idx(inode, leaf); - - ext4_free_blocks(handle, inode, NULL, leaf, 1, - EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - return err; -} - -/* - * ext4_ext_calc_credits_for_single_extent: - * This routine returns max. credits that needed to insert an extent - * to the extent tree. - * When pass the actual path, the caller should calculate credits - * under i_data_sem. - */ -int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, - struct ext4_ext_path *path) -{ - if (path) { - int depth = ext_depth(inode); - int ret = 0; - - /* probably there is space in leaf? */ - if (le16_to_cpu(path[depth].p_hdr->eh_entries) - < le16_to_cpu(path[depth].p_hdr->eh_max)) { - - /* - * There are some space in the leaf tree, no - * need to account for leaf block credit - * - * bitmaps and block group descriptor blocks - * and other metadata blocks still need to be - * accounted. - */ - /* 1 bitmap, 1 block group descriptor */ - ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); - return ret; - } - } - - return ext4_chunk_trans_blocks(inode, nrblocks); -} - -/* - * How many index/leaf blocks need to change/allocate to modify nrblocks? - * - * if nrblocks are fit in a single extent (chunk flag is 1), then - * in the worse case, each tree level index/leaf need to be changed - * if the tree split due to insert a new extent, then the old tree - * index/leaf need to be updated too - * - * If the nrblocks are discontiguous, they could cause - * the whole tree split more than once, but this is really rare. 
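Concretely, the function below charges roughly two metadata blocks per tree level for a single contiguous chunk (a split's new index/leaf plus the old one it updates) and three per level for scattered blocks, covering the rare case of the tree splitting more than once. Plugging in a depth of 2:

#include <assert.h>

/* mirrors the worst-case charge computed below */
static int index_blocks(int depth, int contiguous_chunk)
{
        return contiguous_chunk ? depth * 2 : depth * 3;
}

int main(void)
{
        assert(index_blocks(2, 1) == 4);        /* one chunk, depth 2 */
        assert(index_blocks(2, 0) == 6);        /* scattered, depth 2 */
        return 0;
}
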
- */ -int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) -{ - int index; - int depth = ext_depth(inode); - - if (chunk) - index = depth * 2; - else - index = depth * 3; - - return index; -} - -static int ext4_remove_blocks(handle_t *handle, struct inode *inode, - struct ext4_extent *ex, - ext4_fsblk_t *partial_cluster, - ext4_lblk_t from, ext4_lblk_t to) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned short ee_len = ext4_ext_get_actual_len(ex); - ext4_fsblk_t pblk; - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - /* - * For bigalloc file systems, we never free a partial cluster - * at the beginning of the extent. Instead, we make a note - * that we tried freeing the cluster, and check to see if we - * need to free it on a subsequent call to ext4_remove_blocks, - * or at the end of the ext4_truncate() operation. - */ - flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; - - trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); - /* - * If we have a partial cluster, and it's different from the - * cluster of the last block, we need to explicitly free the - * partial cluster here. - */ - pblk = ext4_ext_pblock(ex) + ee_len - 1; - if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) { - ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, flags); - *partial_cluster = 0; - } - -#ifdef EXTENTS_STATS - { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - spin_lock(&sbi->s_ext_stats_lock); - sbi->s_ext_blocks += ee_len; - sbi->s_ext_extents++; - if (ee_len < sbi->s_ext_min) - sbi->s_ext_min = ee_len; - if (ee_len > sbi->s_ext_max) - sbi->s_ext_max = ee_len; - if (ext_depth(inode) > sbi->s_depth_max) - sbi->s_depth_max = ext_depth(inode); - spin_unlock(&sbi->s_ext_stats_lock); - } -#endif - if (from >= le32_to_cpu(ex->ee_block) - && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* tail removal */ - ext4_lblk_t num; - - num = le32_to_cpu(ex->ee_block) + ee_len - from; - pblk = ext4_ext_pblock(ex) + ee_len - num; - ext_debug("free last %u blocks starting %llu\n", num, pblk); - ext4_free_blocks(handle, inode, NULL, pblk, num, flags); - /* - * If the block range to be freed didn't start at the - * beginning of a cluster, and we removed the entire - * extent, save the partial cluster here, since we - * might need to delete if we determine that the - * truncate operation has removed all of the blocks in - * the cluster. 
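The deferred partial-cluster rule above is easiest to see with concrete numbers. Below is a small userspace sketch of the same arithmetic, assuming a hypothetical bigalloc ratio of 4 blocks per cluster; B2C and C2B mirror what EXT4_B2C() and EXT4_C2B() compute:

#include <stdio.h>

#define CLUSTER_RATIO 4ULL			/* assumed, power of two */
#define B2C(b) ((b) / CLUSTER_RATIO)		/* like EXT4_B2C() */
#define C2B(c) ((c) * CLUSTER_RATIO)		/* like EXT4_C2B() */

int main(void)
{
	unsigned long long pblk = 10;	/* first block of a tail removal */

	/* same test as "pblk & (sbi->s_cluster_ratio - 1)" in the code */
	if (pblk & (CLUSTER_RATIO - 1))
		printf("block %llu starts mid-cluster %llu; defer freeing "
		       "blocks %llu..%llu until the whole cluster is known "
		       "to be unused\n", pblk, B2C(pblk), C2B(B2C(pblk)),
		       C2B(B2C(pblk)) + CLUSTER_RATIO - 1);
	return 0;
}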
- */ - if (pblk & (sbi->s_cluster_ratio - 1) && - (ee_len == num)) - *partial_cluster = EXT4_B2C(sbi, pblk); - else - *partial_cluster = 0; - } else if (from == le32_to_cpu(ex->ee_block) - && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { - /* head removal */ - ext4_lblk_t num; - ext4_fsblk_t start; - - num = to - from; - start = ext4_ext_pblock(ex); - - ext_debug("free first %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, NULL, start, num, flags); - - } else { - printk(KERN_INFO "strange request: removal(2) " - "%u-%u from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), ee_len); - } - return 0; -} - - -/* - * ext4_ext_rm_leaf() Removes the extents associated with the - * blocks appearing between "start" and "end", and splits the extents - * if "start" and "end" appear in the same extent - * - * @handle: The journal handle - * @inode: The files inode - * @path: The path to the leaf - * @start: The first block to remove - * @end: The last block to remove - */ -static int -ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, - ext4_lblk_t start, ext4_lblk_t end) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int err = 0, correct_index = 0; - int depth = ext_depth(inode), credits; - struct ext4_extent_header *eh; - ext4_lblk_t a, b; - unsigned num; - ext4_lblk_t ex_ee_block; - unsigned short ex_ee_len; - unsigned uninitialized = 0; - struct ext4_extent *ex; - - /* the header must be checked already in ext4_ext_remove_space() */ - ext_debug("truncate since %u in leaf to %u\n", start, end); - if (!path[depth].p_hdr) - path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); - eh = path[depth].p_hdr; - if (unlikely(path[depth].p_hdr == NULL)) { - EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EIO; - } - /* find where to start removing */ - ex = EXT_LAST_EXTENT(eh); - - ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = ext4_ext_get_actual_len(ex); - - trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); - - while (ex >= EXT_FIRST_EXTENT(eh) && - ex_ee_block + ex_ee_len > start) { - - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; - else - uninitialized = 0; - - ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, - uninitialized, ex_ee_len); - path[depth].p_ext = ex; - - a = ex_ee_block > start ? ex_ee_block : start; - b = ex_ee_block+ex_ee_len - 1 < end ? - ex_ee_block+ex_ee_len - 1 : end; - - ext_debug(" border %u:%u\n", a, b); - - /* If this extent is beyond the end of the hole, skip it */ - if (end < ex_ee_block) { - ex--; - ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = ext4_ext_get_actual_len(ex); - continue; - } else if (b != ex_ee_block + ex_ee_len - 1) { - EXT4_ERROR_INODE(inode, - "can not handle truncate %u:%u " - "on extent %u:%u", - start, end, ex_ee_block, - ex_ee_block + ex_ee_len - 1); - err = -EIO; - goto out; - } else if (a != ex_ee_block) { - /* remove tail of the extent */ - num = a - ex_ee_block; - } else { - /* remove whole extent: excellent! 
*/ - num = 0; - } - /* - * 3 for leaf, sb, and inode plus 2 (bmap and group - * descriptor) for each block group; assume two block - * groups plus ex_ee_len/blocks_per_block_group for - * the worst case - */ - credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); - if (ex == EXT_FIRST_EXTENT(eh)) { - correct_index = 1; - credits += (ext_depth(inode)) + 1; - } - credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - - err = ext4_ext_truncate_extend_restart(handle, inode, credits); - if (err) - goto out; - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - - err = ext4_remove_blocks(handle, inode, ex, partial_cluster, - a, b); - if (err) - goto out; - - if (num == 0) - /* this extent is removed; mark slot entirely unused */ - ext4_ext_store_pblock(ex, 0); - - ex->ee_len = cpu_to_le16(num); - /* - * Do not mark uninitialized if all the blocks in the - * extent have been removed. - */ - if (uninitialized && num) - ext4_ext_mark_uninitialized(ex); - /* - * If the extent was completely released, - * we need to remove it from the leaf - */ - if (num == 0) { - if (end != EXT_MAX_BLOCKS - 1) { - /* - * For hole punching, we need to scoot all the - * extents up when an extent is removed so that - * we dont have blank extents in the middle - */ - memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * - sizeof(struct ext4_extent)); - - /* Now get rid of the one at the end */ - memset(EXT_LAST_EXTENT(eh), 0, - sizeof(struct ext4_extent)); - } - le16_add_cpu(&eh->eh_entries, -1); - } else - *partial_cluster = 0; - - err = ext4_ext_dirty(handle, inode, path + depth); - if (err) - goto out; - - ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, - ext4_ext_pblock(ex)); - ex--; - ex_ee_block = le32_to_cpu(ex->ee_block); - ex_ee_len = ext4_ext_get_actual_len(ex); - } - - if (correct_index && eh->eh_entries) - err = ext4_ext_correct_indexes(handle, inode, path); - - /* - * If there is still a entry in the leaf node, check to see if - * it references the partial cluster. This is the only place - * where it could; if it doesn't, we can free the cluster. 
- */ - if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && - (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != - *partial_cluster)) { - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, flags); - *partial_cluster = 0; - } - - /* if this leaf is free, then we should - * remove it from index block above */ - if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) - err = ext4_ext_rm_idx(handle, inode, path + depth); - -out: - return err; -} - -/* - * ext4_ext_more_to_rm: - * returns 1 if current index has to be freed (even partial) - */ -static int -ext4_ext_more_to_rm(struct ext4_ext_path *path) -{ - BUG_ON(path->p_idx == NULL); - - if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) - return 0; - - /* - * if truncate on deeper level happened, it wasn't partial, - * so we have to consider current index for truncation - */ - if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) - return 0; - return 1; -} - -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) -{ - struct super_block *sb = inode->i_sb; - int depth = ext_depth(inode); - struct ext4_ext_path *path; - ext4_fsblk_t partial_cluster = 0; - handle_t *handle; - int i, err; - - ext_debug("truncate since %u to %u\n", start, end); - - /* probably first extent we're gonna free will be last in block */ - handle = ext4_journal_start(inode, depth + 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - -again: - ext4_ext_invalidate_cache(inode); - - trace_ext4_ext_remove_space(inode, start, depth); - - /* - * Check if we are removing extents inside the extent tree. If that - * is the case, we are going to punch a hole inside the extent tree - * so we have to check whether we need to split the extent covering - * the last block to remove so we can easily remove the part of it - * in ext4_ext_rm_leaf(). - */ - if (end < EXT_MAX_BLOCKS - 1) { - struct ext4_extent *ex; - ext4_lblk_t ee_block; - - /* find extent for this block */ - path = ext4_ext_find_extent(inode, end, NULL); - if (IS_ERR(path)) { - ext4_journal_stop(handle); - return PTR_ERR(path); - } - depth = ext_depth(inode); - ex = path[depth].p_ext; - if (!ex) - goto cont; - - ee_block = le32_to_cpu(ex->ee_block); - - /* - * See if the last block is inside the extent, if so split - * the extent at 'end' block so we can easily remove the - * tail of the first part of the split extent in - * ext4_ext_rm_leaf(). - */ - if (end >= ee_block && - end < ee_block + ext4_ext_get_actual_len(ex) - 1) { - int split_flag = 0; - - if (ext4_ext_is_uninitialized(ex)) - split_flag = EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; - - /* - * Split the extent in two so that 'end' is the last - * block in the first new extent - */ - err = ext4_split_extent_at(handle, inode, path, - end + 1, split_flag, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_PUNCH_OUT_EXT); - - if (err < 0) - goto out; - } - ext4_ext_drop_refs(path); - kfree(path); - } -cont: - - /* - * We start scanning from right side, freeing all the blocks - * after i_size and walking into the tree depth-wise. 
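The removal code that follows implements that right-to-left, depth-first walk iteratively, with an explicit path array instead of recursion. The control flow is easier to see stripped of the filesystem details; here is a toy userspace model (all names made up; a cursor of -1 means "level not visited yet"):

#include <stdio.h>

#define DEPTH 2				/* index levels above the leaves */

struct level {
	int nentries;			/* entries at this index level */
	int cur;			/* cursor, -1 = not visited yet */
};

int main(void)
{
	struct level path[DEPTH] = { { 2, -1 }, { 3, -1 } };
	int i = 0;

	while (i >= 0) {
		if (i == DEPTH) {	/* "leaf block": do the real work */
			printf("process leaf under %d,%d\n",
			       path[0].cur, path[1].cur);
			i--;		/* pop back to the index level */
			continue;
		}
		if (path[i].cur == -1)
			path[i].cur = path[i].nentries - 1; /* rightmost */
		else
			path[i].cur--;	/* step one entry to the left */
		if (path[i].cur < 0) {	/* level exhausted: go up */
			path[i].cur = -1;
			i--;
		} else {
			i++;		/* descend toward the leaves */
		}
	}
	return 0;
}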
- */ - depth = ext_depth(inode); - path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); - if (path == NULL) { - ext4_journal_stop(handle); - return -ENOMEM; - } - path[0].p_depth = depth; - path[0].p_hdr = ext_inode_hdr(inode); - - if (ext4_ext_check(inode, path[0].p_hdr, depth)) { - err = -EIO; - goto out; - } - i = err = 0; - - while (i >= 0 && err == 0) { - if (i == depth) { - /* this is leaf block */ - err = ext4_ext_rm_leaf(handle, inode, path, - &partial_cluster, start, - end); - /* root level has p_bh == NULL, brelse() eats this */ - brelse(path[i].p_bh); - path[i].p_bh = NULL; - i--; - continue; - } - - /* this is index block */ - if (!path[i].p_hdr) { - ext_debug("initialize header\n"); - path[i].p_hdr = ext_block_hdr(path[i].p_bh); - } - - if (!path[i].p_idx) { - /* this level hasn't been touched yet */ - path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); - path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; - ext_debug("init index ptr: hdr 0x%p, num %d\n", - path[i].p_hdr, - le16_to_cpu(path[i].p_hdr->eh_entries)); - } else { - /* we were already here, see at next index */ - path[i].p_idx--; - } - - ext_debug("level %d - index, first 0x%p, cur 0x%p\n", - i, EXT_FIRST_INDEX(path[i].p_hdr), - path[i].p_idx); - if (ext4_ext_more_to_rm(path + i)) { - struct buffer_head *bh; - /* go to the next level */ - ext_debug("move to level %d (block %llu)\n", - i + 1, ext4_idx_pblock(path[i].p_idx)); - memset(path + i + 1, 0, sizeof(*path)); - bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); - if (!bh) { - /* should we reset i_size? */ - err = -EIO; - break; - } - if (WARN_ON(i + 1 > depth)) { - err = -EIO; - break; - } - if (ext4_ext_check(inode, ext_block_hdr(bh), - depth - i - 1)) { - err = -EIO; - break; - } - path[i + 1].p_bh = bh; - - /* save actual number of indexes since this - * number is changed at the next iteration */ - path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); - i++; - } else { - /* we finished processing this index, go up */ - if (path[i].p_hdr->eh_entries == 0 && i > 0) { - /* index is empty, remove it; - * handle must be already prepared by the - * truncatei_leaf() */ - err = ext4_ext_rm_idx(handle, inode, path + i); - } - /* root level has p_bh == NULL, brelse() eats this */ - brelse(path[i].p_bh); - path[i].p_bh = NULL; - i--; - ext_debug("return to level %d\n", i); - } - } - - trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, - path->p_hdr->eh_entries); - - /* If we still have something in the partial cluster and we have removed - * even the first extent, then we should free the blocks in the partial - * cluster as well. 
*/ - if (partial_cluster && path->p_hdr->eh_entries == 0) { - int flags = EXT4_FREE_BLOCKS_FORGET; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(EXT4_SB(sb), partial_cluster), - EXT4_SB(sb)->s_cluster_ratio, flags); - partial_cluster = 0; - } - - /* TODO: flexible tree reduction should be here */ - if (path->p_hdr->eh_entries == 0) { - /* - * truncate to zero freed the whole tree, - * so we need to correct eh_depth - */ - err = ext4_ext_get_access(handle, inode, path); - if (err == 0) { - ext_inode_hdr(inode)->eh_depth = 0; - ext_inode_hdr(inode)->eh_max = - cpu_to_le16(ext4_ext_space_root(inode, 0)); - err = ext4_ext_dirty(handle, inode, path); - } - } -out: - ext4_ext_drop_refs(path); - kfree(path); - if (err == -EAGAIN) - goto again; - ext4_journal_stop(handle); - - return err; -} - -/* - * called at mount time - */ -void ext4_ext_init(struct super_block *sb) -{ - /* - * possible initialization would be here - */ - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { -#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) - printk(KERN_INFO "EXT4-fs: file extents enabled" -#ifdef AGGRESSIVE_TEST - ", aggressive tests" -#endif -#ifdef CHECK_BINSEARCH - ", check binsearch" -#endif -#ifdef EXTENTS_STATS - ", stats" -#endif - "\n"); -#endif -#ifdef EXTENTS_STATS - spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); - EXT4_SB(sb)->s_ext_min = 1 << 30; - EXT4_SB(sb)->s_ext_max = 0; -#endif - } -} - -/* - * called at umount time - */ -void ext4_ext_release(struct super_block *sb) -{ - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) - return; - -#ifdef EXTENTS_STATS - if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", - sbi->s_ext_blocks, sbi->s_ext_extents, - sbi->s_ext_blocks / sbi->s_ext_extents); - printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", - sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); - } -#endif -} - -/* FIXME!! we need to try to merge to left or right after zero-out */ -static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) -{ - ext4_fsblk_t ee_pblock; - unsigned int ee_len; - int ret; - - ee_len = ext4_ext_get_actual_len(ex); - ee_pblock = ext4_ext_pblock(ex); - - ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); - if (ret > 0) - ret = 0; - - return ret; -} - -/* - * ext4_split_extent_at() splits an extent at a given block. - * - * @handle: the journal handle - * @inode: the file inode - * @path: the path to the extent - * @split: the logical block where the extent is split. - * @split_flags: indicates if the extent could be zeroed out if the split - * fails, and the states (init or uninit) of the new extents. - * @flags: flags used to insert the new extent into the extent tree. - * - * - * Splits extent [a, b] into two extents [a, @split) and [@split, b], the - * states of which are determined by split_flag. - * - * There are two cases: - * a> the extent is split into two extents. - * b> no split is needed; just mark the extent. - * - * Returns 0 on success.
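Before the implementation, the arithmetic of such a split is worth spelling out: the right half starts at @split, and its physical start is offset by the same distance as its logical start. A standalone sketch with plain integers in place of the on-disk little-endian fields (struct toy_extent is made up for illustration):

#include <stdio.h>

struct toy_extent {
	unsigned int lblk;		/* first logical block */
	unsigned int len;		/* number of blocks */
	unsigned long long pblk;	/* first physical block */
};

int main(void)
{
	struct toy_extent ex = { 100, 50, 8000 };
	unsigned int split = 120;	/* lblk <= split < lblk + len */
	unsigned int head = split - ex.lblk;

	struct toy_extent left  = { ex.lblk, head, ex.pblk };
	/* mirrors "newblock = split - ee_block + ext4_ext_pblock(ex)" */
	struct toy_extent right = { split, ex.len - head, ex.pblk + head };

	printf("left:  %u+%u @ %llu\n", left.lblk, left.len, left.pblk);
	printf("right: %u+%u @ %llu\n", right.lblk, right.len, right.pblk);
	return 0;
}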
- */ -static int ext4_split_extent_at(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t split, - int split_flag, - int flags) -{ - ext4_fsblk_t newblock; - ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; - struct ext4_extent *ex2 = NULL; - unsigned int ee_len, depth; - int err = 0; - - ext_debug("ext4_split_extent_at: inode %lu, logical " - "block %llu\n", inode->i_ino, (unsigned long long)split); - - ext4_ext_show_leaf(inode, path); - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - newblock = split - ee_block + ext4_ext_pblock(ex); - - BUG_ON(split < ee_block || split >= (ee_block + ee_len)); - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - - if (split == ee_block) { - /* - * case b: block @split is the block that the extent begins - * with, so we just change the state of the extent; splitting - * is not needed. - */ - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex); - else - ext4_ext_mark_initialized(ex); - - if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(inode, path, ex); - - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; - } - - /* case a */ - memcpy(&orig_ex, ex, sizeof(orig_ex)); - ex->ee_len = cpu_to_le16(split - ee_block); - if (split_flag & EXT4_EXT_MARK_UNINIT1) - ext4_ext_mark_uninitialized(ex); - - /* - * the path may lead to a new leaf, not to the original leaf - * any more, after ext4_ext_insert_extent() returns. - */ - err = ext4_ext_dirty(handle, inode, path + depth); - if (err) - goto fix_extent_len; - - ex2 = &newex; - ex2->ee_block = cpu_to_le32(split); - ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); - ext4_ext_store_pblock(ex2, newblock); - if (split_flag & EXT4_EXT_MARK_UNINIT2) - ext4_ext_mark_uninitialized(ex2); - - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - err = ext4_ext_zeroout(inode, &orig_ex); - if (err) - goto fix_extent_len; - /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le16(ee_len); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; - } else if (err) - goto fix_extent_len; - -out: - ext4_ext_show_leaf(inode, path); - return err; - -fix_extent_len: - ex->ee_len = orig_ex.ee_len; - ext4_ext_dirty(handle, inode, path + depth); - return err; -} - -/* - * ext4_split_extent() splits an extent and marks the part covered - * by @map as split_flag indicates. - * - * It may result in splitting the extent into multiple extents (up to three). - * There are three possibilities: - * a> There is no split required - * b> Splits into two extents: the split happens at either end of the extent - * c> Splits into three extents: someone is splitting in the middle of the extent - * - */ -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - struct ext4_map_blocks *map, - int split_flag, - int flags) -{ - ext4_lblk_t ee_block; - struct ext4_extent *ex; - unsigned int ee_len, depth; - int err = 0; - int uninitialized; - int split_flag1, flags1; - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - uninitialized = ext4_ext_is_uninitialized(ex); - - if (map->m_lblk + map->m_len < ee_block + ee_len) { - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
- EXT4_EXT_MAY_ZEROOUT : 0; - flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1 | - EXT4_EXT_MARK_UNINIT2; - err = ext4_split_extent_at(handle, inode, path, - map->m_lblk + map->m_len, split_flag1, flags1); - if (err) - goto out; - } - - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path); - if (IS_ERR(path)) - return PTR_ERR(path); - - if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? - EXT4_EXT_MAY_ZEROOUT : 0; - if (uninitialized) - split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; - err = ext4_split_extent_at(handle, inode, path, - map->m_lblk, split_flag1, flags); - if (err) - goto out; - } - - ext4_ext_show_leaf(inode, path); -out: - return err ? err : map->m_len; -} - -#define EXT4_EXT_ZERO_LEN 7 -/* - * This function is called by ext4_ext_map_blocks() if someone tries to write - * to an uninitialized extent. It may result in splitting the uninitialized - * extent into multiple extents (up to three - one initialized and two - * uninitialized). - * There are three possibilities: - * a> There is no split required: the entire extent should be initialized - * b> Splits into two extents: the write happens at either end of the extent - * c> Splits into three extents: someone is writing in the middle of the extent - * - * Pre-conditions: - * - The extent pointed to by 'path' is uninitialized. - * - The extent pointed to by 'path' contains a superset - * of the logical span [map->m_lblk, map->m_lblk + map->m_len). - * - * Post-conditions on success: - * - the returned value is the number of blocks beyond map->m_lblk - * that are allocated and initialized. - * It is guaranteed to be >= map->m_len. - */ -static int ext4_ext_convert_to_initialized(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path *path) -{ - struct ext4_extent_header *eh; - struct ext4_map_blocks split_map; - struct ext4_extent zero_ex; - struct ext4_extent *ex; - ext4_lblk_t ee_block, eof_block; - unsigned int ee_len, depth; - int allocated; - int err = 0; - int split_flag = 0; - - ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical " - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); - - eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> - inode->i_sb->s_blocksize_bits; - if (eof_block < map->m_lblk + map->m_len) - eof_block = map->m_lblk + map->m_len; - - depth = ext_depth(inode); - eh = path[depth].p_hdr; - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (map->m_lblk - ee_block); - - trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); - - /* Pre-conditions */ - BUG_ON(!ext4_ext_is_uninitialized(ex)); - BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); - - /* - * Attempt to transfer newly initialized blocks from the currently - * uninitialized extent to its left neighbor. This is much cheaper - * than an insertion followed by a merge as those involve costly - * memmove() calls. This is the common case in steady state for - * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append - * writes. - * - * Limitations of the current logic: - * - L1: we only deal with writes at the start of the extent. - * The approach could be extended to writes at the end - * of the extent but this scenario was deemed less common.
- * - L2: we do not deal with writes covering the whole extent. - * This would require removing the extent if the transfer - * is possible. - * - L3: we only attempt to merge with an extent stored in the - * same extent tree node. - */ - if ((map->m_lblk == ee_block) && /*L1*/ - (map->m_len < ee_len) && /*L2*/ - (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/ - struct ext4_extent *prev_ex; - ext4_lblk_t prev_lblk; - ext4_fsblk_t prev_pblk, ee_pblk; - unsigned int prev_len, write_len; - - prev_ex = ex - 1; - prev_lblk = le32_to_cpu(prev_ex->ee_block); - prev_len = ext4_ext_get_actual_len(prev_ex); - prev_pblk = ext4_ext_pblock(prev_ex); - ee_pblk = ext4_ext_pblock(ex); - write_len = map->m_len; - - /* - * A transfer of blocks from 'ex' to 'prev_ex' is allowed - * under these conditions: - * - C1: prev_ex is initialized, - * - C2: prev_ex is logically abutting ex, - * - C3: prev_ex is physically abutting ex, - * - C4: prev_ex can receive the additional blocks without - * overflowing the (initialized) length limit. - */ - if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/ - ((prev_lblk + prev_len) == ee_block) && /*C2*/ - ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ - (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/ - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - - trace_ext4_ext_convert_to_initialized_fastpath(inode, - map, ex, prev_ex); - - /* Shift the start of ex by 'write_len' blocks */ - ex->ee_block = cpu_to_le32(ee_block + write_len); - ext4_ext_store_pblock(ex, ee_pblk + write_len); - ex->ee_len = cpu_to_le16(ee_len - write_len); - ext4_ext_mark_uninitialized(ex); /* Restore the flag */ - - /* Extend prev_ex by 'write_len' blocks */ - prev_ex->ee_len = cpu_to_le16(prev_len + write_len); - - /* Mark the block containing both extents as dirty */ - ext4_ext_dirty(handle, inode, path + depth); - - /* Update path to point to the right extent */ - path[depth].p_ext = prev_ex; - - /* Result: number of initialized blocks past m_lblk */ - allocated = write_len; - goto out; - } - } - - WARN_ON(map->m_lblk < ee_block); - /* - * It is safe to convert the extent to initialized via explicit - * zeroout only if the extent is fully inside i_size or new_size. - */ - split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; - - /* If the extent has at most 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */ - if (ee_len <= 2*EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - err = ext4_ext_zeroout(inode, ex); - if (err) - goto out; - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - ext4_ext_mark_initialized(ex); - ext4_ext_try_to_merge(inode, path, ex); - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; - } - - /* - * four cases: - * 1. split the extent into three extents. - * 2. split the extent into two extents, zeroout the first half. - * 3. split the extent into two extents, zeroout the second half. - * 4. split the extent into two extents without zeroout.
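The four cases just listed are selected purely from how far the write reaches into the extent. A toy decision function capturing that case analysis (userspace sketch; pick_case is made up, ZERO_LEN stands in for EXT4_EXT_ZERO_LEN, and the EXT4_EXT_MAY_ZEROOUT flag is assumed to be set):

#include <stdio.h>

#define ZERO_LEN 7	/* stands in for EXT4_EXT_ZERO_LEN above */

/* allocated = blocks from m_lblk to the end of the extent,
 * off = m_lblk - ee_block (how far into the extent the write starts) */
static const char *pick_case(unsigned int allocated, unsigned int m_len,
			     unsigned int off)
{
	if (allocated > m_len) {
		if (allocated <= ZERO_LEN)
			return "case 3: zero out the tail past the write";
		if (off + m_len < ZERO_LEN)
			return "case 2: zero out the head before the write";
	}
	return "case 1/4: real split, no zeroout";
}

int main(void)
{
	printf("%s\n", pick_case(6, 2, 1));	/* short tail */
	printf("%s\n", pick_case(40, 2, 1));	/* short head */
	printf("%s\n", pick_case(40, 20, 10));	/* large extent */
	return 0;
}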
- */ - split_map.m_lblk = map->m_lblk; - split_map.m_len = map->m_len; - - if (allocated > map->m_len) { - if (allocated <= EXT4_EXT_ZERO_LEN && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - /* case 3 */ - zero_ex.ee_block = - cpu_to_le32(map->m_lblk); - zero_ex.ee_len = cpu_to_le16(allocated); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex) + map->m_lblk - ee_block); - err = ext4_ext_zeroout(inode, &zero_ex); - if (err) - goto out; - split_map.m_lblk = map->m_lblk; - split_map.m_len = allocated; - } else if ((map->m_lblk - ee_block + map->m_len < - EXT4_EXT_ZERO_LEN) && - (EXT4_EXT_MAY_ZEROOUT & split_flag)) { - /* case 2 */ - if (map->m_lblk != ee_block) { - zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = cpu_to_le16(map->m_lblk - - ee_block); - ext4_ext_store_pblock(&zero_ex, - ext4_ext_pblock(ex)); - err = ext4_ext_zeroout(inode, &zero_ex); - if (err) - goto out; - } - - split_map.m_lblk = ee_block; - split_map.m_len = map->m_lblk - ee_block + map->m_len; - allocated = map->m_len; - } - } - - allocated = ext4_split_extent(handle, inode, path, - &split_map, split_flag, 0); - if (allocated < 0) - err = allocated; - -out: - return err ? err : allocated; -} - -/* - * This function is called by ext4_ext_map_blocks() from - * ext4_get_blocks_dio_write() when DIO is used to write - * to an uninitialized extent. - * - * Writing to an uninitialized extent may result in splitting the uninitialized - * extent into multiple initialized/uninitialized extents (up to three). - * There are three possibilities: - * a> There is no split required: the entire extent should be uninitialized - * b> Splits into two extents: the write happens at either end of the extent - * c> Splits into three extents: someone is writing in the middle of the extent - * - * One or more index blocks may be needed if the extent tree grows after - * the uninitialized extent is split. To prevent ENOSPC at IO completion - * time, we need to split the uninitialized extent before the DIO is - * submitted. The uninitialized extent handled at this time will be split - * into (at most) three uninitialized extents. After the IO completes, the - * part that was filled will be converted to initialized by the end_io - * callback function via ext4_convert_unwritten_extents(). - * - * Returns the size of the uninitialized extent to be written on success. - */ -static int ext4_split_unwritten_extents(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path *path, - int flags) -{ - ext4_lblk_t eof_block; - ext4_lblk_t ee_block; - struct ext4_extent *ex; - unsigned int ee_len; - int split_flag = 0, depth; - - ext_debug("ext4_split_unwritten_extents: inode %lu, logical " - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map->m_len); - - eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> - inode->i_sb->s_blocksize_bits; - if (eof_block < map->m_lblk + map->m_len) - eof_block = map->m_lblk + map->m_len; - /* - * It is safe to convert the extent to initialized via explicit - * zeroout only if the extent is fully inside i_size or new_size. - */ - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - - split_flag |= ee_block + ee_len <= eof_block ?
EXT4_EXT_MAY_ZEROOUT : 0; - split_flag |= EXT4_EXT_MARK_UNINIT2; - - flags |= EXT4_GET_BLOCKS_PRE_IO; - return ext4_split_extent(handle, inode, path, map, split_flag, flags); -} - -static int ext4_convert_unwritten_extents_endio(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path) -{ - struct ext4_extent *ex; - int depth; - int err = 0; - - depth = ext_depth(inode); - ex = path[depth].p_ext; - - ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex)); - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - /* first mark the extent as initialized */ - ext4_ext_mark_initialized(ex); - - /* note: ext4_ext_correct_indexes() isn't needed here because - * borders are not changed - */ - ext4_ext_try_to_merge(inode, path, ex); - - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + depth); -out: - ext4_ext_show_leaf(inode, path); - return err; -} - -static void unmap_underlying_metadata_blocks(struct block_device *bdev, - sector_t block, int count) -{ - int i; - for (i = 0; i < count; i++) - unmap_underlying_metadata(bdev, block + i); -} - -/* - * Handle EOFBLOCKS_FL flag, clearing it if necessary - */ -static int check_eofblocks_fl(handle_t *handle, struct inode *inode, - ext4_lblk_t lblk, - struct ext4_ext_path *path, - unsigned int len) -{ - int i, depth; - struct ext4_extent_header *eh; - struct ext4_extent *last_ex; - - if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) - return 0; - - depth = ext_depth(inode); - eh = path[depth].p_hdr; - - /* - * We're going to remove EOFBLOCKS_FL entirely in future so we - * do not care for this case anymore. Simply remove the flag - * if there are no extents. - */ - if (unlikely(!eh->eh_entries)) - goto out; - last_ex = EXT_LAST_EXTENT(eh); - /* - * We should clear the EOFBLOCKS_FL flag if we are writing the - * last block in the last extent in the file. We test this by - * first checking to see if the caller to - * ext4_ext_get_blocks() was interested in the last block (or - * a block beyond the last block) in the current extent. If - * this turns out to be false, we can bail out from this - * function immediately. - */ - if (lblk + len < le32_to_cpu(last_ex->ee_block) + - ext4_ext_get_actual_len(last_ex)) - return 0; - /* - * If the caller does appear to be planning to write at or - * beyond the end of the current extent, we then test to see - * if the current extent is the last extent in the file, by - * checking to make sure it was reached via the rightmost node - * at each level of the tree. - */ - for (i = depth-1; i >= 0; i--) - if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) - return 0; -out: - ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - return ext4_mark_inode_dirty(handle, inode); -} - -/** - * ext4_find_delalloc_range: find delayed allocated block in the given range. - * - * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns - * whether there are any buffers marked for delayed allocation. It returns '1' - * on the first delalloc'ed buffer head found. If no buffer head in the given - * range is marked for delalloc, it returns 0. - * lblk_start should always be <= lblk_end. - * search_hint_reverse is to indicate that searching in reverse from lblk_end to - * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed - * block sooner). 
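The search below is driven by conversions between logical block numbers and page-cache indexes. With an assumed 4 KiB page (shift 12) and a 1 KiB filesystem block (blkbits 10), the shifts used in the function work out as in this standalone sketch:

#include <stdio.h>

#define PAGE_SHIFT_BITS 12	/* assumed page size: 4 KiB */
#define BLKBITS 10		/* assumed fs block size: 1 KiB */

int main(void)
{
	unsigned int lblk = 37;
	/* same as "index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits)" */
	unsigned long index = lblk >> (PAGE_SHIFT_BITS - BLKBITS);
	/* same as "pg_lblk = index << (PAGE_CACHE_SHIFT - i_blkbits)" */
	unsigned int first = index << (PAGE_SHIFT_BITS - BLKBITS);
	unsigned int per_page = 1u << (PAGE_SHIFT_BITS - BLKBITS);

	printf("block %u lives in page %lu (blocks %u..%u)\n",
	       lblk, index, first, first + per_page - 1);
	return 0;
}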
A reverse search is useful when blocks are truncated sequentially from - * lblk_start towards lblk_end. - */ -static int ext4_find_delalloc_range(struct inode *inode, - ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end, - int search_hint_reverse) -{ - struct address_space *mapping = inode->i_mapping; - struct buffer_head *head, *bh = NULL; - struct page *page; - ext4_lblk_t i, pg_lblk; - pgoff_t index; - - if (!test_opt(inode->i_sb, DELALLOC)) - return 0; - - /* reverse search won't work if the fs block size is less than the page size */ - if (inode->i_blkbits < PAGE_CACHE_SHIFT) - search_hint_reverse = 0; - - if (search_hint_reverse) - i = lblk_end; - else - i = lblk_start; - - index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - - while ((i >= lblk_start) && (i <= lblk_end)) { - page = find_get_page(mapping, index); - if (!page) - goto nextpage; - - if (!page_has_buffers(page)) - goto nextpage; - - head = page_buffers(page); - if (!head) - goto nextpage; - - bh = head; - pg_lblk = index << (PAGE_CACHE_SHIFT - - inode->i_blkbits); - do { - if (unlikely(pg_lblk < lblk_start)) { - /* - * This is possible when the fs block size is less - * than the page size and our cluster starts/ends in - * the middle of the page. So we need to skip the - * initial few blocks until we reach 'lblk'. - */ - pg_lblk++; - continue; - } - - /* Check that the buffer is delayed-allocated and that it - * is not yet mapped. (when da-buffers are mapped during - * their writeout, their da_mapped bit is set.) - */ - if (buffer_delay(bh) && !buffer_da_mapped(bh)) { - page_cache_release(page); - trace_ext4_find_delalloc_range(inode, - lblk_start, lblk_end, - search_hint_reverse, - 1, i); - return 1; - } - if (search_hint_reverse) - i--; - else - i++; - } while ((i >= lblk_start) && (i <= lblk_end) && - ((bh = bh->b_this_page) != head)); -nextpage: - if (page) - page_cache_release(page); - /* - * Move to the next page. 'i' will be the first lblk in the next - * page. - */ - if (search_hint_reverse) - index--; - else - index++; - i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - } - - trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, - search_hint_reverse, 0, 0); - return 0; -} - -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, - int search_hint_reverse) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_lblk_t lblk_start, lblk_end; - lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); - lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - - return ext4_find_delalloc_range(inode, lblk_start, lblk_end, - search_hint_reverse); -} - -/** - * Determines how many complete clusters (out of those specified by the 'map') - * are under delalloc and had quota reserved for them. - * This function is called when we are writing out the blocks that were - * originally written with their allocation delayed, but then the space was - * allocated using fallocate() before the delayed allocation could be resolved. - * The cases to look for are: - * ('=' indicates delayed-allocated blocks, - * '-' indicates non-delayed-allocated blocks) - * (a) partial clusters towards beginning and/or end outside of allocated range - * are not delalloc'ed. - * Ex: - * |----c---=|====c====|====c====|===-c----| - * |++++++ allocated ++++++| - * ==> 4 complete clusters in the above example - * - * (b) partial cluster (outside of allocated range) towards either end is - * marked for delayed allocation. In this case, we will exclude that - * cluster.
- * Ex: - * |----====c========|========c========| - * |++++++ allocated ++++++| - * ==> 1 complete cluster in the above example - * - * Ex: - * |================c================| - * |++++++ allocated ++++++| - * ==> 0 complete clusters in the above example - * - * ext4_da_update_reserve_space() will be called only if we - * determine here that there were some "entire" clusters that span - * this 'allocated' range. - * In the non-bigalloc case, this function will just end up returning num_blks - * without ever calling ext4_find_delalloc_range. - */ -static unsigned int -get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, - unsigned int num_blks) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_lblk_t alloc_cluster_start, alloc_cluster_end; - ext4_lblk_t lblk_from, lblk_to, c_offset; - unsigned int allocated_clusters = 0; - - alloc_cluster_start = EXT4_B2C(sbi, lblk_start); - alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); - - /* max possible clusters for this allocation */ - allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; - - trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); - - /* Check towards left side */ - c_offset = lblk_start & (sbi->s_cluster_ratio - 1); - if (c_offset) { - lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); - lblk_to = lblk_from + c_offset - 1; - - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) - allocated_clusters--; - } - - /* Now check towards right. */ - c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); - if (allocated_clusters && c_offset) { - lblk_from = lblk_start + num_blks; - lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) - allocated_clusters--; - } - - return allocated_clusters; -} - -static int -ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path *path, int flags, - unsigned int allocated, ext4_fsblk_t newblock) -{ - int ret = 0; - int err = 0; - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - - ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " - "block %llu, max_blocks %u, flags %x, allocated %u\n", - inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, - flags, allocated); - ext4_ext_show_leaf(inode, path); - - trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, - newblock); - - /* get_block() called before submitting the IO: split the extent */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - ret = ext4_split_unwritten_extents(handle, inode, map, - path, flags); - /* - * Flag the inode (non-AIO case) or the end_io struct (AIO case) - * so that this IO gets converted to written when it is - * completed - */ - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; - goto out; - } - /* IO end_io completed: convert the filled extent to written */ - if ((flags & EXT4_GET_BLOCKS_CONVERT)) { - ret = ext4_convert_unwritten_extents_endio(handle, inode, - path); - if (ret >= 0) { - ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, - path, map->m_len); - } else - err = ret; - goto out2; - } - /* buffered IO case */ - /* - * repeat fallocate creation request - * we already have an unwritten extent - */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) - goto map_out; - - /* buffered READ or buffered
write_begin() lookup */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - /* - * We have blocks reserved already. We - * return allocated blocks so that delalloc - * won't do block reservation for us. But - * the buffer head will be unmapped so that - * a read from the block returns 0s. - */ - map->m_flags |= EXT4_MAP_UNWRITTEN; - goto out1; - } - - /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, path); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out: - if (ret <= 0) { - err = ret; - goto out2; - } else - allocated = ret; - map->m_flags |= EXT4_MAP_NEW; - /* - * if we allocated more blocks than requested - * we need to make sure we unmap the extra block - * allocated. The actual needed block will get - * unmapped later when we find the buffer_head marked - * new. - */ - if (allocated > map->m_len) { - unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, - newblock + map->m_len, - allocated - map->m_len); - allocated = map->m_len; - } - - /* - * If we have done fallocate with the offset that is already - * delayed allocated, we would have block reservation - * and quota reservation done in the delayed write path. - * But fallocate would have already updated quota and block - * count for this offset. So cancel these reservation - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - unsigned int reserved_clusters; - reserved_clusters = get_reserved_cluster_alloc(inode, - map->m_lblk, map->m_len); - if (reserved_clusters) - ext4_da_update_reserve_space(inode, - reserved_clusters, - 0); - } - -map_out: - map->m_flags |= EXT4_MAP_MAPPED; - if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, - map->m_len); - if (err < 0) - goto out2; - } -out1: - if (allocated > map->m_len) - allocated = map->m_len; - ext4_ext_show_leaf(inode, path); - map->m_pblk = newblock; - map->m_len = allocated; -out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } - return err ? err : allocated; -} - -/* - * get_implied_cluster_alloc - check to see if the requested - * allocation (in the map structure) overlaps with a cluster already - * allocated in an extent. - * @sb The filesystem superblock structure - * @map The requested lblk->pblk mapping - * @ex The extent structure which might contain an implied - * cluster allocation - * - * This function is called by ext4_ext_map_blocks() after we failed to - * find blocks that were already in the inode's extent tree. Hence, - * we know that the beginning of the requested region cannot overlap - * the extent from the inode's extent tree. There are three cases we - * want to catch. The first is this case: - * - * |--- cluster # N--| - * |--- extent ---| |---- requested region ---| - * |==========| - * - * The second case that we need to test for is this one: - * - * |--------- cluster # N ----------------| - * |--- requested region --| |------- extent ----| - * |=======================| - * - * The third case is when the requested region lies between two extents - * within the same cluster: - * |------------- cluster # N-------------| - * |----- ex -----| |---- ex_right ----| - * |------ requested region ------| - * |================| - * - * In each of the above cases, we need to set the map->m_pblk and - * map->m_len so it corresponds to the return the extent labelled as - * "|====|" from cluster #N, since it is already in use for data in - * cluster EXT4_B2C(sbi, map->m_lblk). 
We will then return 1 to - * signal to ext4_ext_map_blocks() that map->m_pblk should be treated - * as a new "allocated" block region. Otherwise, we will return 0 and - * ext4_ext_map_blocks() will then allocate one or more new clusters - * by calling ext4_mb_new_blocks(). - */ -static int get_implied_cluster_alloc(struct super_block *sb, - struct ext4_map_blocks *map, - struct ext4_extent *ex, - struct ext4_ext_path *path) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); - ext4_lblk_t ex_cluster_start, ex_cluster_end; - ext4_lblk_t rr_cluster_start; - ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); - ext4_fsblk_t ee_start = ext4_ext_pblock(ex); - unsigned short ee_len = ext4_ext_get_actual_len(ex); - - /* The extent passed in that we are trying to match */ - ex_cluster_start = EXT4_B2C(sbi, ee_block); - ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); - - /* The requested region passed into ext4_map_blocks() */ - rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); - - if ((rr_cluster_start == ex_cluster_end) || - (rr_cluster_start == ex_cluster_start)) { - if (rr_cluster_start == ex_cluster_end) - ee_start += ee_len - 1; - map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + - c_offset; - map->m_len = min(map->m_len, - (unsigned) sbi->s_cluster_ratio - c_offset); - /* - * Check for and handle this case: - * - * |--------- cluster # N-------------| - * |------- extent ----| - * |--- requested region ---| - * |===========| - */ - - if (map->m_lblk < ee_block) - map->m_len = min(map->m_len, ee_block - map->m_lblk); - - /* - * Check for the case where there is already another allocated - * block to the right of 'ex' but before the end of the cluster. - * - * |------------- cluster # N-------------| - * |----- ex -----| |---- ex_right ----| - * |------ requested region ------| - * |================| - */ - if (map->m_lblk > ee_block) { - ext4_lblk_t next = ext4_ext_next_allocated_block(path); - map->m_len = min(map->m_len, next - map->m_lblk); - } - - trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); - return 1; - } - - trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); - return 0; -} - - -/* - * Block allocation/map/preallocation routine for extent-based files - * - * - * Needs to be called with - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating a file system block - * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) - * - * return > 0, number of blocks already mapped/allocated - * if create == 0 and these are pre-allocated blocks - * buffer head is unmapped - * otherwise blocks are mapped - * - * return = 0, if the plain lookup failed (blocks have not been allocated) - * buffer head is unmapped - * - * return < 0, error case.
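That return contract (< 0 error, 0 hole, > 0 blocks mapped) is what every caller has to branch on. Here is a hypothetical caller sketch, with a made-up stub standing in for the real routine:

#include <stdio.h>

struct toy_map {
	unsigned int lblk, len;
	unsigned long long pblk;
};

/* Stub following the same convention as the routine below:
 * < 0 error, 0 lookup found a hole, > 0 blocks mapped. */
static int map_blocks_stub(struct toy_map *m)
{
	if (m->lblk >= 1000)
		return -5;		/* pretend an I/O error */
	if (m->lblk >= 500)
		return 0;		/* hole, nothing allocated */
	m->pblk = 8000 + m->lblk;	/* pretend a mapping exists */
	return (int)m->len;
}

int main(void)
{
	struct toy_map m = { 42, 8, 0 };
	int ret = map_blocks_stub(&m);

	if (ret < 0)
		printf("error %d\n", ret);
	else if (ret == 0)
		printf("hole at block %u\n", m.lblk);
	else
		printf("%d block(s) mapped at %llu\n", ret, m.pblk);
	return 0;
}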
- */ -int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, int flags) -{ - struct ext4_ext_path *path = NULL; - struct ext4_extent newex, *ex, *ex2; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth, ret; - unsigned int allocated = 0, offset = 0; - unsigned int allocated_clusters = 0; - struct ext4_allocation_request ar; - ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; - ext4_lblk_t cluster_offset; - - ext_debug("blocks %u/%u requested for inode %lu\n", - map->m_lblk, map->m_len, inode->i_ino); - trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - - /* check in cache */ - if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { - if (!newex.ee_start_lo && !newex.ee_start_hi) { - if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - /* - * block isn't allocated yet and - * user doesn't want to allocate it - */ - goto out2; - } - /* we should allocate requested block */ - } else { - /* block is already allocated */ - if (sbi->s_cluster_ratio > 1) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - newblock = map->m_lblk - - le32_to_cpu(newex.ee_block) - + ext4_ext_pblock(&newex); - /* number of remaining blocks in the extent */ - allocated = ext4_ext_get_actual_len(&newex) - - (map->m_lblk - le32_to_cpu(newex.ee_block)); - goto out; - } - } - - /* find extent for this block */ - path = ext4_ext_find_extent(inode, map->m_lblk, NULL); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - goto out2; - } - - depth = ext_depth(inode); - - /* - * consistent leaf must not be empty; - * this situation is possible, though, _during_ tree modification; - * this is why assert can't be put in ext4_ext_find_extent() - */ - if (unlikely(path[depth].p_ext == NULL && depth != 0)) { - EXT4_ERROR_INODE(inode, "bad extent address " - "lblock: %lu, depth: %d pblock %lld", - (unsigned long) map->m_lblk, depth, - path[depth].p_block); - err = -EIO; - goto out2; - } - - ex = path[depth].p_ext; - if (ex) { - ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); - ext4_fsblk_t ee_start = ext4_ext_pblock(ex); - unsigned short ee_len; - - /* - * Uninitialized extents are treated as holes, except that - * we split out initialized portions during a write. 
- */ - ee_len = ext4_ext_get_actual_len(ex); - - trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); - - /* if found extent covers block, simply return it */ - if (in_range(map->m_lblk, ee_block, ee_len)) { - newblock = map->m_lblk - ee_block + ee_start; - /* number of remaining blocks in the extent */ - allocated = ee_len - (map->m_lblk - ee_block); - ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, - ee_block, ee_len, newblock); - - /* - * Do not put uninitialized extent - * in the cache - */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start); - goto out; - } - ret = ext4_ext_handle_uninitialized_extents( - handle, inode, map, path, flags, - allocated, newblock); - return ret; - } - } - - if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - - /* - * requested block isn't allocated yet; - * we couldn't try to create block if create flag is zero - */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { - /* - * put just found gap into cache to speed up - * subsequent requests - */ - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); - goto out2; - } - - /* - * Okay, we need to do block allocation. - */ - map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; - newex.ee_block = cpu_to_le32(map->m_lblk); - cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); - - /* - * If we are doing bigalloc, check to see if the extent returned - * by ext4_ext_find_extent() implies a cluster we can use. - */ - if (cluster_offset && ex && - get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { - ar.len = allocated = map->m_len; - newblock = map->m_pblk; - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - goto got_allocated_blocks; - } - - /* find neighbour allocated blocks */ - ar.lleft = map->m_lblk; - err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); - if (err) - goto out2; - ar.lright = map->m_lblk; - ex2 = NULL; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); - if (err) - goto out2; - - /* Check if the extent after searching to the right implies a - * cluster we can use. */ - if ((sbi->s_cluster_ratio > 1) && ex2 && - get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { - ar.len = allocated = map->m_len; - newblock = map->m_pblk; - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - goto got_allocated_blocks; - } - - /* - * See if request is beyond maximum number of blocks we can have in - * a single extent. For an initialized extent this limit is - * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is - * EXT_UNINIT_MAX_LEN. - */ - if (map->m_len > EXT_INIT_MAX_LEN && - !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - map->m_len = EXT_INIT_MAX_LEN; - else if (map->m_len > EXT_UNINIT_MAX_LEN && - (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) - map->m_len = EXT_UNINIT_MAX_LEN; - - /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ - newex.ee_len = cpu_to_le16(map->m_len); - err = ext4_ext_check_overlap(sbi, inode, &newex, path); - if (err) - allocated = ext4_ext_get_actual_len(&newex); - else - allocated = map->m_len; - - /* allocate new block */ - ar.inode = inode; - ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); - ar.logical = map->m_lblk; - /* - * We calculate the offset from the beginning of the cluster - * for the logical block number, since when we allocate a - * physical cluster, the physical block should start at the - * same offset from the beginning of the cluster. 
This is - * needed so that future calls to get_implied_cluster_alloc() - * work correctly. - */ - offset = map->m_lblk & (sbi->s_cluster_ratio - 1); - ar.len = EXT4_NUM_B2C(sbi, offset+allocated); - ar.goal -= offset; - ar.logical -= offset; - if (S_ISREG(inode->i_mode)) - ar.flags = EXT4_MB_HINT_DATA; - else - /* disable in-core preallocation for non-regular files */ - ar.flags = 0; - if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) - ar.flags |= EXT4_MB_HINT_NOPREALLOC; - newblock = ext4_mb_new_blocks(handle, &ar, &err); - if (!newblock) - goto out2; - ext_debug("allocate new block: goal %llu, found %llu/%u\n", - ar.goal, newblock, allocated); - free_on_err = 1; - allocated_clusters = ar.len; - ar.len = EXT4_C2B(sbi, ar.len) - offset; - if (ar.len > allocated) - ar.len = allocated; - -got_allocated_blocks: - /* try to insert new extent into found leaf and return */ - ext4_ext_store_pblock(&newex, newblock + offset); - newex.ee_len = cpu_to_le16(ar.len); - /* Mark uninitialized */ - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { - ext4_ext_mark_uninitialized(&newex); - /* - * An io_end structure is created for every IO write to an - * uninitialized extent. To avoid unnecessary conversion, - * here we flag the IO that really needs the conversion. - * For the non-async direct IO case, flag the inode state - * so that we perform the conversion when the IO is done. - */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { - if (io) - ext4_set_io_unwritten_flag(inode, io); - else - ext4_set_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN); - } - if (ext4_should_dioread_nolock(inode)) - map->m_flags |= EXT4_MAP_UNINIT; - } - - err = 0; - if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) - err = check_eofblocks_fl(handle, inode, map->m_lblk, - path, ar.len); - if (!err) - err = ext4_ext_insert_extent(handle, inode, path, - &newex, flags); - if (err && free_on_err) { - int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? - EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; - /* free data blocks we just allocated */ - /* not a good idea to call discard here directly, - * but otherwise we'd need to call it every free() */ - ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), fb_flags); - goto out2; - } - - /* previous routine could use block we allocated */ - newblock = ext4_ext_pblock(&newex); - allocated = ext4_ext_get_actual_len(&newex); - if (allocated > map->m_len) - allocated = map->m_len; - map->m_flags |= EXT4_MAP_NEW; - - /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - unsigned int reserved_clusters; - /* - * Check how many clusters we had reserved for this allocated range - */ - reserved_clusters = get_reserved_cluster_alloc(inode, - map->m_lblk, allocated); - if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { - if (reserved_clusters) { - /* - * We have clusters reserved for this range. - * But since we are not doing actual allocation - * and are simply using blocks from a previously - * allocated cluster, we should release the - * reservation and not claim quota.
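The cluster alignment just described deserves a worked example. With an assumed bigalloc ratio of 16 blocks per cluster, a request at logical block 35 for 10 blocks is pulled back to the cluster boundary and rounded up to whole clusters, exactly as the offset/ar.len arithmetic above does:

#include <stdio.h>

#define RATIO 16	/* assumed s_cluster_ratio (power of two) */
/* round a block count up to whole clusters, like EXT4_NUM_B2C() */
#define NUM_B2C(b) (((b) + RATIO - 1) / RATIO)

int main(void)
{
	unsigned int m_lblk = 35, allocated = 10;
	unsigned int offset = m_lblk & (RATIO - 1);	     /* 35 % 16 = 3 */
	unsigned int clusters = NUM_B2C(offset + allocated); /* 13 -> 1 */

	printf("start the request %u block(s) early and ask for %u "
	       "cluster(s) (%u blocks)\n",
	       offset, clusters, clusters * RATIO);
	return 0;
}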
- */ - ext4_da_update_reserve_space(inode, - reserved_clusters, 0); - } - } else { - BUG_ON(allocated_clusters < reserved_clusters); - /* We will claim quota for all newly allocated blocks.*/ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); - if (reserved_clusters < allocated_clusters) { - struct ext4_inode_info *ei = EXT4_I(inode); - int reservation = allocated_clusters - - reserved_clusters; - /* - * It seems we claimed a few clusters outside of - * the range of this allocation. We should give - * them back to the reservation pool. This can - * happen in the following case: - * - * * Suppose s_cluster_ratio is 4 (i.e., each - * cluster has 4 blocks). Thus, the clusters - * are [0-3],[4-7],[8-11]... - * * First comes a delayed allocation write for - * logical blocks 10 & 11. Since there were no - * previous delayed allocated blocks in the - * range [8-11], we would reserve 1 cluster - * for this write. - * * Next comes a write for logical blocks 3 to 8. - * In this case, we will reserve 2 clusters - * (for [0-3] and [4-7]; and not for [8-11], as - * that range already has delayed allocated - * blocks). Thus the total reserved cluster - * count now becomes 3. - * * Now, during the delayed allocation writeout - * time, we will first write blocks [3-8] and - * allocate 3 clusters for writing these - * blocks. Also, we would claim all these - * three clusters above. - * * Now when we come here to write out the - * blocks [10-11], we would expect to claim - * the reservation of 1 cluster we had made - * (and we would claim it since there are no - * more delayed allocated blocks in the range - * [8-11]). But our reserved cluster count had - * already gone to 0. - * - * Thus, at step 4 above when we determine - * that there are still some unwritten delayed - * allocated blocks outside of our current - * block range, we should increment the - * reserved clusters count so that when the - * remaining blocks finally get written, we - * can claim them. - */ - dquot_reserve_block(inode, - EXT4_C2B(sbi, reservation)); - spin_lock(&ei->i_block_reservation_lock); - ei->i_reserved_data_blocks += reservation; - spin_unlock(&ei->i_block_reservation_lock); - } - } - } - - /* - * Cache the extent and update transaction to commit on fdatasync only - * when it is _not_ an uninitialized extent. - */ - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { - ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); - ext4_update_inode_fsync_trans(handle, inode, 1); - } else - ext4_update_inode_fsync_trans(handle, inode, 0); -out: - if (allocated > map->m_len) - allocated = map->m_len; - ext4_ext_show_leaf(inode, path); - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = newblock; - map->m_len = allocated; -out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } - - trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : allocated); - - return err ?
err : allocated; -} - -void ext4_ext_truncate(struct inode *inode) -{ - struct address_space *mapping = inode->i_mapping; - struct super_block *sb = inode->i_sb; - ext4_lblk_t last_block; - handle_t *handle; - loff_t page_len; - int err = 0; - - /* - * finish any pending end_io work so we won't run the risk of - * converting any truncated blocks to initialized later - */ - ext4_flush_completed_IO(inode); - - /* - * probably first extent we're gonna free will be last in block - */ - err = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) - return; - - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); - - ext4_discard_preallocations(inode); - - /* - * TODO: optimization is possible here. - * Probably we need not scan at all, - * because page truncation is enough. - */ - - /* we have to know where to truncate from in crash case */ - EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); - - last_block = (inode->i_size + sb->s_blocksize - 1) - >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); - - /* In a multi-transaction truncate, we only make the final - * transaction synchronous. - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out_stop: - /* - * If this was a simple ftruncate() and the file will remain alive, - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. - */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); -} - -static void ext4_falloc_update_inode(struct inode *inode, - int mode, loff_t new_size, int update_ctime) -{ - struct timespec now; - - if (update_ctime) { - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; - } - /* - * Update only when preallocation was requested beyond - * the file size. - */ - if (!(mode & FALLOC_FL_KEEP_SIZE)) { - if (new_size > i_size_read(inode)) - i_size_write(inode, new_size); - if (new_size > EXT4_I(inode)->i_disksize) - ext4_update_i_disksize(inode, new_size); - } else { - /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if (new_size > i_size_read(inode)) - ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - } - -} - -/* - * preallocate space for a file. This implements ext4's fallocate file - * operation, which gets called from sys_fallocate system call. - * For block-mapped files, posix_fallocate should fall back to the method - * of writing zeroes to the required new blocks (the same behavior which is - * expected for file systems which do not support fallocate() system call). 
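Since ext4_fallocate() below backs the fallocate(2) system call, a short user-space example of the preallocation path it implements; with FALLOC_FL_KEEP_SIZE the code leaves i_size untouched and sets the EOFBLOCKS flag instead (error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("prealloc.dat", O_RDWR | O_CREAT, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* Preallocate 1 MiB as unwritten extents; i_size is unchanged
	 * because of FALLOC_FL_KEEP_SIZE. */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) != 0)
		perror("fallocate");

	close(fd);
	return 0;
}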
- */ -long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) -{ - struct inode *inode = file->f_path.dentry->d_inode; - handle_t *handle; - loff_t new_size; - unsigned int max_blocks; - int ret = 0; - int ret2 = 0; - int retries = 0; - int flags; - struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; - - /* - * currently supporting (pre)allocate mode for extent-based - * files _only_ - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return -EOPNOTSUPP; - - /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) - return -EOPNOTSUPP; - - if (mode & FALLOC_FL_PUNCH_HOLE) - return ext4_punch_hole(file, offset, len); - - trace_ext4_fallocate_enter(inode, offset, len, mode); - map.m_lblk = offset >> blkbits; - /* - * We can't just convert len to max_blocks because - * If blocksize = 4096 offset = 3072 and len = 2048 - */ - max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk; - /* - * credits to insert 1 extent into extent tree - */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) { - mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); - return ret; - } - flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; - if (mode & FALLOC_FL_KEEP_SIZE) - flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - /* - * Don't normalize the request if it can fit in one extent so - * that it doesn't get unnecessarily split into multiple - * extents. - */ - if (len <= EXT_UNINIT_MAX_LEN << blkbits) - flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -retry: - while (ret >= 0 && ret < max_blocks) { - map.m_lblk = map.m_lblk + ret; - map.m_len = max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; - } - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret <= 0) { -#ifdef EXT4FS_DEBUG - WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_map_blocks " - "returned error inode#%lu, block=%u, " - "max_blocks=%u", __func__, - inode->i_ino, map.m_lblk, max_blocks); -#endif - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; - } - if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, - blkbits) >> blkbits)) - new_size = offset + len; - else - new_size = ((loff_t) map.m_lblk + ret) << blkbits; - - ext4_falloc_update_inode(inode, mode, new_size, - (map.m_flags & EXT4_MAP_NEW)); - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - if (ret2) - break; - } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; - goto retry; - } - mutex_unlock(&inode->i_mutex); - trace_ext4_fallocate_exit(inode, offset, max_blocks, - ret > 0 ? ret2 : ret); - return ret > 0 ? ret2 : ret; -} - -/* - * This function convert a range of blocks to written extents - * The caller of this function will pass the start offset and the size. - * all unwritten extents within this range will be converted to - * written extents. - * - * This function is called from the direct IO end io call back - * function, to convert the fallocated extents after IO is completed. - * Returns 0 on success. 
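The comment in ext4_fallocate() about not converting len directly to max_blocks deserves a concrete illustration; with 4096-byte blocks, offset 3072 and len 2048 span two blocks even though len >> blkbits is zero:

#include <stdio.h>

int main(void)
{
	unsigned blkbits = 12;			/* 4096-byte blocks */
	long long offset = 3072, len = 2048;

	long long first_blk = offset >> blkbits;			/* 0 */
	long long end_blk = (offset + len + (1LL << blkbits) - 1)
			    >> blkbits;	/* align up, like EXT4_BLOCK_ALIGN: 2 */
	long long naive = len >> blkbits;				/* 0 */

	printf("need %lld block(s), naive math says %lld\n",
	       end_blk - first_blk, naive);
	return 0;
}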
- */ -int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, - ssize_t len) -{ - handle_t *handle; - unsigned int max_blocks; - int ret = 0; - int ret2 = 0; - struct ext4_map_blocks map; - unsigned int credits, blkbits = inode->i_blkbits; - - map.m_lblk = offset >> blkbits; - /* - * We can't just convert len to max_blocks because - * If blocksize = 4096 offset = 3072 and len = 2048 - */ - max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - map.m_lblk); - /* - * credits to insert 1 extent into extent tree - */ - credits = ext4_chunk_trans_blocks(inode, max_blocks); - while (ret >= 0 && ret < max_blocks) { - map.m_lblk += ret; - map.m_len = (max_blocks -= ret); - handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; - } - ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_IO_CONVERT_EXT); - if (ret <= 0) { - WARN_ON(ret <= 0); - ext4_msg(inode->i_sb, KERN_ERR, - "%s:%d: inode #%lu: block %u: len %u: " - "ext4_ext_map_blocks returned %d", - __func__, __LINE__, inode->i_ino, map.m_lblk, - map.m_len, ret); - } - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - if (ret <= 0 || ret2 ) - break; - } - return ret > 0 ? ret2 : ret; -} - -/* - * Callback function called for each extent to gather FIEMAP information. - */ -static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, - struct ext4_ext_cache *newex, struct ext4_extent *ex, - void *data) -{ - __u64 logical; - __u64 physical; - __u64 length; - __u32 flags = 0; - int ret = 0; - struct fiemap_extent_info *fieinfo = data; - unsigned char blksize_bits; - - blksize_bits = inode->i_sb->s_blocksize_bits; - logical = (__u64)newex->ec_block << blksize_bits; - - if (newex->ec_start == 0) { - /* - * No extent in extent-tree contains block @newex->ec_start, - * then the block may stay in 1)a hole or 2)delayed-extent. - * - * Holes or delayed-extents are processed as follows. - * 1. lookup dirty pages with specified range in pagecache. - * If no page is got, then there is no delayed-extent and - * return with EXT_CONTINUE. - * 2. find the 1st mapped buffer, - * 3. check if the mapped buffer is both in the request range - * and a delayed buffer. If not, there is no delayed-extent, - * then return. - * 4. a delayed-extent is found, the extent will be collected. - */ - ext4_lblk_t end = 0; - pgoff_t last_offset; - pgoff_t offset; - pgoff_t index; - pgoff_t start_index = 0; - struct page **pages = NULL; - struct buffer_head *bh = NULL; - struct buffer_head *head = NULL; - unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); - - pages = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (pages == NULL) - return -ENOMEM; - - offset = logical >> PAGE_SHIFT; -repeat: - last_offset = offset; - head = NULL; - ret = find_get_pages_tag(inode->i_mapping, &offset, - PAGECACHE_TAG_DIRTY, nr_pages, pages); - - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { - /* First time, try to find a mapped buffer. */ - if (ret == 0) { -out: - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - /* just a hole. */ - kfree(pages); - return EXT_CONTINUE; - } - index = 0; - -next_page: - /* Try to find the 1st mapped buffer. */ - end = ((__u64)pages[index]->index << PAGE_SHIFT) >> - blksize_bits; - if (!page_has_buffers(pages[index])) - goto out; - head = page_buffers(pages[index]); - if (!head) - goto out; - - index++; - bh = head; - do { - if (end >= newex->ec_block + - newex->ec_len) - /* The buffer is out of - * the request range. 
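ext4_convert_unwritten_extents() above shows a loop shape that recurs throughout this file: call ext4_map_blocks() repeatedly, advancing by however many blocks each call reports handling. A reduced sketch with a hypothetical map_chunk() stand-in:

#include <stdio.h>

static int map_chunk(unsigned lblk, unsigned len)
{
	return len > 8 ? 8 : (int)len;	/* pretend 8 blocks max per call */
}

int main(void)
{
	unsigned lblk = 0, max_blocks = 20;
	int ret = 0;

	while (ret >= 0 && ret < (int)max_blocks) {
		lblk += ret;		/* skip what the last call handled */
		max_blocks -= ret;	/* shrink the remaining window */
		ret = map_chunk(lblk, max_blocks);
		printf("converted %d block(s) at logical block %u\n",
		       ret, lblk);
		if (ret <= 0)
			break;
	}
	return 0;
}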
- */ - goto out; - - if (buffer_mapped(bh) && - end >= newex->ec_block) { - start_index = index - 1; - /* get the 1st mapped buffer. */ - goto found_mapped_buffer; - } - - bh = bh->b_this_page; - end++; - } while (bh != head); - - /* No mapped buffer in the range found in this page, - * We need to look up next page. - */ - if (index >= ret) { - /* There is no page left, but we need to limit - * newex->ec_len. - */ - newex->ec_len = end - newex->ec_block; - goto out; - } - goto next_page; - } else { - /*Find contiguous delayed buffers. */ - if (ret > 0 && pages[0]->index == last_offset) - head = page_buffers(pages[0]); - bh = head; - index = 1; - start_index = 0; - } - -found_mapped_buffer: - if (bh != NULL && buffer_delay(bh)) { - /* 1st or contiguous delayed buffer found. */ - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { - /* - * 1st delayed buffer found, record - * the start of extent. - */ - flags |= FIEMAP_EXTENT_DELALLOC; - newex->ec_block = end; - logical = (__u64)end << blksize_bits; - } - /* Find contiguous delayed buffers. */ - do { - if (!buffer_delay(bh)) - goto found_delayed_extent; - bh = bh->b_this_page; - end++; - } while (bh != head); - - for (; index < ret; index++) { - if (!page_has_buffers(pages[index])) { - bh = NULL; - break; - } - head = page_buffers(pages[index]); - if (!head) { - bh = NULL; - break; - } - - if (pages[index]->index != - pages[start_index]->index + index - - start_index) { - /* Blocks are not contiguous. */ - bh = NULL; - break; - } - bh = head; - do { - if (!buffer_delay(bh)) - /* Delayed-extent ends. */ - goto found_delayed_extent; - bh = bh->b_this_page; - end++; - } while (bh != head); - } - } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) - /* a hole found. */ - goto out; - -found_delayed_extent: - newex->ec_len = min(end - newex->ec_block, - (ext4_lblk_t)EXT_INIT_MAX_LEN); - if (ret == nr_pages && bh != NULL && - newex->ec_len < EXT_INIT_MAX_LEN && - buffer_delay(bh)) { - /* Have not collected an extent and continue. */ - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - goto repeat; - } - - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - kfree(pages); - } - - physical = (__u64)newex->ec_start << blksize_bits; - length = (__u64)newex->ec_len << blksize_bits; - - if (ex && ext4_ext_is_uninitialized(ex)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - - if (next == EXT_MAX_BLOCKS) - flags |= FIEMAP_EXTENT_LAST; - - ret = fiemap_fill_next_extent(fieinfo, logical, physical, - length, flags); - if (ret < 0) - return ret; - if (ret == 1) - return EXT_BREAK; - return EXT_CONTINUE; -} -/* fiemap flags we can handle specified here */ -#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) - -static int ext4_xattr_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo) -{ - __u64 physical = 0; - __u64 length; - __u32 flags = FIEMAP_EXTENT_LAST; - int blockbits = inode->i_sb->s_blocksize_bits; - int error = 0; - - /* in-inode? 
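ext4_ext_fiemap_cb() feeds fiemap_fill_next_extent(), which ultimately serves the FS_IOC_FIEMAP ioctl. A user-space consumer looks like this; FIEMAP_FLAG_SYNC is one of the flags accepted in EXT4_FIEMAP_FLAGS above:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "prealloc.dat", O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	unsigned count = 32;
	struct fiemap *fm = calloc(1, sizeof(*fm) +
				   count * sizeof(struct fiemap_extent));
	if (!fm) return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;
	fm->fm_extent_count = count;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0) {
		for (unsigned i = 0; i < fm->fm_mapped_extents; i++)
			printf("logical %llu physical %llu len %llu flags %#x\n",
			       (unsigned long long)fm->fm_extents[i].fe_logical,
			       (unsigned long long)fm->fm_extents[i].fe_physical,
			       (unsigned long long)fm->fm_extents[i].fe_length,
			       fm->fm_extents[i].fe_flags);
	} else {
		perror("FS_IOC_FIEMAP");
	}
	free(fm);
	close(fd);
	return 0;
}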
*/ - if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - struct ext4_iloc iloc; - int offset; /* offset of xattr in inode */ - - error = ext4_get_inode_loc(inode, &iloc); - if (error) - return error; - physical = iloc.bh->b_blocknr << blockbits; - offset = EXT4_GOOD_OLD_INODE_SIZE + - EXT4_I(inode)->i_extra_isize; - physical += offset; - length = EXT4_SB(inode->i_sb)->s_inode_size - offset; - flags |= FIEMAP_EXTENT_DATA_INLINE; - brelse(iloc.bh); - } else { /* external block */ - physical = EXT4_I(inode)->i_file_acl << blockbits; - length = inode->i_sb->s_blocksize; - } - - if (physical) - error = fiemap_fill_next_extent(fieinfo, 0, physical, - length, flags); - return (error < 0 ? error : 0); -} - -/* - * ext4_ext_punch_hole - * - * Punches a hole of "length" bytes in a file starting - * at byte "offset" - * - * @inode: The inode of the file to punch a hole in - * @offset: The starting byte offset of the hole - * @length: The length of the hole - * - * Returns the number of blocks removed or negative on err - */ -int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - handle_t *handle; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; - int credits, err = 0; - - /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - return 0; - - /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size - */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - - offset; - } - - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; - - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - err = filemap_write_and_wait_range(mapping, - offset, offset + length - 1); - - if (err) - return err; - } - - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_inode_pages_range(mapping, first_page_offset, - last_page_offset-1); - } - - /* finish any pending end_io work */ - ext4_flush_completed_IO(inode); - - credits = ext4_writepage_trans_blocks(inode); - handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - err = ext4_orphan_add(handle, inode); - if (err) - goto out; - - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the buffer - * heads for the block aligned regions of the page that were - * completely zeroed. 
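The page-boundary math at the top of ext4_ext_punch_hole() splits the hole into fully covered pages plus partial head and tail ranges; the partial pieces are what ext4_discard_partial_page_buffers() zeroes below. A demo assuming 4 KiB pages (PAGE_CACHE_SIZE equals PAGE_SIZE here):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1L << PAGE_SHIFT)

int main(void)
{
	long long offset = 1000, length = 9000;	/* hole: bytes 1000..9999 */

	long long first_page = (offset + PAGE_SIZE - 1) >> PAGE_SHIFT; /* 1 */
	long long last_page = (offset + length) >> PAGE_SHIFT;	       /* 2 */

	printf("fully-covered pages: [%lld, %lld) -> bytes [%lld, %lld)\n",
	       first_page, last_page,
	       first_page << PAGE_SHIFT, last_page << PAGE_SHIFT);
	printf("partial head: %lld bytes, partial tail: %lld bytes\n",
	       (first_page << PAGE_SHIFT) - offset,
	       offset + length - (last_page << PAGE_SHIFT));
	return 0;
}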
- */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained within a page - * just zero out and unmap the middle of that page - */ - err = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - - if (err) - goto out; - } else { - /* - * zero out and unmap the partial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (err) - goto out; - } - - /* - * zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (err) - goto out; - } - } - - /* - * If i_size is contained in the last page, we need to - * unmap and zero the partial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - if (page_len > 0) { - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out; - } - } - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - - /* If there are no blocks to remove, return now */ - if (first_block >= stop_block) - goto out; - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); - - err = ext4_ext_remove_space(inode, first_block, stop_block - 1); - - ext4_ext_invalidate_cache(inode); - ext4_discard_preallocations(inode); - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - - up_write(&EXT4_I(inode)->i_data_sem); - -out: - ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - return err; -} -int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) -{ - ext4_lblk_t start_blk; - int error = 0; - - /* fallback to generic here if not in extents fmt */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return generic_block_fiemap(inode, fieinfo, start, len, - ext4_get_block); - - if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) - return -EBADR; - - if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { - error = ext4_xattr_fiemap(inode, fieinfo); - } else { - ext4_lblk_t len_blks; - __u64 last_blk; - - start_blk = start >> inode->i_sb->s_blocksize_bits; - last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; - if (last_blk >= EXT_MAX_BLOCKS) - last_blk = EXT_MAX_BLOCKS-1; - len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; - - /* - * Walk the extent tree gathering extent information. - * ext4_ext_fiemap_cb will push extents back to user. 
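From user space the hole-punch path above is reached through fallocate(2) again; the VFS insists that FALLOC_FL_KEEP_SIZE accompany FALLOC_FL_PUNCH_HOLE, so i_size never changes:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("prealloc.dat", O_RDWR);
	if (fd < 0) { perror("open"); return 1; }

	/* Deallocate 64 KiB starting at the 1 MiB mark. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      1 << 20, 64 << 10) != 0)
		perror("fallocate(PUNCH_HOLE)");

	close(fd);
	return 0;
}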
- */ - error = ext4_ext_walk_space(inode, start_blk, len_blks, - ext4_ext_fiemap_cb, fieinfo); - } - - return error; -} diff --git a/ANDROID_3.4.5/fs/ext4/file.c b/ANDROID_3.4.5/fs/ext4/file.c deleted file mode 100644 index cb70f181..00000000 --- a/ANDROID_3.4.5/fs/ext4/file.c +++ /dev/null @@ -1,262 +0,0 @@ -/* - * linux/fs/ext4/file.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/file.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext4 fs regular file handling primitives - * - * 64-bit file support on 64-bit platforms by Jakub Jelinek - * (jj@sunsite.ms.mff.cuni.cz) - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd2.h> -#include <linux/mount.h> -#include <linux/path.h> -#include <linux/quotaops.h> -#include "ext4.h" -#include "ext4_jbd2.h" -#include "xattr.h" -#include "acl.h" - -/* - * Called when an inode is released. Note that this is different - * from ext4_file_open: open gets called at every open, but release - * gets called only when /all/ the files are closed. - */ -static int ext4_release_file(struct inode *inode, struct file *filp) -{ - if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { - ext4_alloc_da_blocks(inode); - ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - } - /* if we are the last writer on the inode, drop the block reservation */ - if ((filp->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1) && - !EXT4_I(inode)->i_reserved_data_blocks) - { - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - up_write(&EXT4_I(inode)->i_data_sem); - } - if (is_dx(inode) && filp->private_data) - ext4_htree_free_dir_info(filp->private_data); - - return 0; -} - -static void ext4_aiodio_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ext4_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0)); -} - -/* - * This tests whether the IO in question is block-aligned or not. - * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they - * are converted to written only after the IO is complete. Until they are - * mapped, these blocks appear as holes, so dio_zero_block() will assume that - * it needs to zero out portions of the start and/or end block. If 2 AIO - * threads are at work on the same unwritten block, they must be synchronized - * or one thread will zero the other's data, causing corruption. - */ -static int -ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct super_block *sb = inode->i_sb; - int blockmask = sb->s_blocksize - 1; - size_t count = iov_length(iov, nr_segs); - loff_t final_size = pos + count; - - if (pos >= inode->i_size) - return 0; - - if ((pos & blockmask) || (final_size & blockmask)) - return 1; - - return 0; -} - -static ssize_t -ext4_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; - int unaligned_aio = 0; - int ret; - - /* - * If we have encountered a bitmap-format file, the size limit - * is smaller than s_maxbytes, which is for extent-mapped files. 
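ext4_unaligned_aio() above reduces to a mask test on both ends of the i/o; the sketch below omits the original's early return for writes at or beyond i_size, which need no serialization:

#include <stdio.h>

static int is_unaligned(long long pos, long long count, long blocksize)
{
	long blockmask = blocksize - 1;
	long long final_size = pos + count;

	return (pos & blockmask) || (final_size & blockmask);
}

int main(void)
{
	printf("%d\n", is_unaligned(4096, 4096, 4096));	/* 0: aligned */
	printf("%d\n", is_unaligned(3072, 2048, 4096));	/* 1: both ends */
	return 0;
}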
- */ - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - size_t length = iov_length(iov, nr_segs); - - if ((pos > sbi->s_bitmap_maxbytes || - (pos == sbi->s_bitmap_maxbytes && length > 0))) - return -EFBIG; - - if (pos + length > sbi->s_bitmap_maxbytes) { - nr_segs = iov_shorten((struct iovec *)iov, nr_segs, - sbi->s_bitmap_maxbytes - pos); - } - } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) && - !is_sync_kiocb(iocb))) { - unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos); - } - - /* Unaligned direct AIO must be serialized; see comment above */ - if (unaligned_aio) { - static unsigned long unaligned_warn_time; - - /* Warn about this once per day */ - if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) - ext4_msg(inode->i_sb, KERN_WARNING, - "Unaligned AIO/DIO on inode %ld by %s; " - "performance will be poor.", - inode->i_ino, current->comm); - mutex_lock(ext4_aio_mutex(inode)); - ext4_aiodio_wait(inode); - } - - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - - if (unaligned_aio) - mutex_unlock(ext4_aio_mutex(inode)); - - return ret; -} - -static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = ext4_page_mkwrite, -}; - -static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct address_space *mapping = file->f_mapping; - - if (!mapping->a_ops->readpage) - return -ENOEXEC; - file_accessed(file); - vma->vm_ops = &ext4_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; - return 0; -} - -static int ext4_file_open(struct inode * inode, struct file * filp) -{ - struct super_block *sb = inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); - struct vfsmount *mnt = filp->f_path.mnt; - struct path path; - char buf[64], *cp; - - if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && - !(sb->s_flags & MS_RDONLY))) { - sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; - /* - * Sample where the filesystem has been mounted and - * store it in the superblock for sysadmin convenience - * when trying to sort through large numbers of block - * devices or filesystem images. - */ - memset(buf, 0, sizeof(buf)); - path.mnt = mnt; - path.dentry = mnt->mnt_root; - cp = d_path(&path, buf, sizeof(buf)); - if (!IS_ERR(cp)) { - strlcpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); - ext4_mark_super_dirty(sb); - } - } - /* - * Set up the jbd2_inode if we are opening the inode for - * writing and the journal is present - */ - if (sbi->s_journal && !ei->jinode && (filp->f_mode & FMODE_WRITE)) { - struct jbd2_inode *jinode = jbd2_alloc_inode(GFP_KERNEL); - - spin_lock(&inode->i_lock); - if (!ei->jinode) { - if (!jinode) { - spin_unlock(&inode->i_lock); - return -ENOMEM; - } - ei->jinode = jinode; - jbd2_journal_init_jbd_inode(ei->jinode, inode); - jinode = NULL; - } - spin_unlock(&inode->i_lock); - if (unlikely(jinode != NULL)) - jbd2_free_inode(jinode); - } - return dquot_file_open(inode, filp); -} - -/* - * ext4_llseek() copied from generic_file_llseek() to handle both - * block-mapped and extent-mapped maxbytes values. This should - * otherwise be identical with generic_file_llseek(). 
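The s_bitmap_maxbytes handling in ext4_file_write() above has two cases: a write starting at or past the limit fails outright, and one merely crossing it is shortened. A reduced demo with an illustrative limit:

#include <stdio.h>

int main(void)
{
	long long maxbytes = 1LL << 32;		/* pretend bitmap limit */
	long long pos = maxbytes - 512, len = 4096;

	if (pos > maxbytes || (pos == maxbytes && len > 0)) {
		puts("-EFBIG");
	} else if (pos + len > maxbytes) {
		len = maxbytes - pos;		/* the iov_shorten() effect */
		printf("shortened to %lld bytes\n", len);
	}
	return 0;
}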
- */ -loff_t ext4_llseek(struct file *file, loff_t offset, int origin) -{ - struct inode *inode = file->f_mapping->host; - loff_t maxbytes; - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; - else - maxbytes = inode->i_sb->s_maxbytes; - - return generic_file_llseek_size(file, offset, origin, maxbytes); -} - -const struct file_operations ext4_file_operations = { - .llseek = ext4_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = ext4_file_write, - .unlocked_ioctl = ext4_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext4_compat_ioctl, -#endif - .mmap = ext4_file_mmap, - .open = ext4_file_open, - .release = ext4_release_file, - .fsync = ext4_sync_file, - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, - .fallocate = ext4_fallocate, -}; - -const struct inode_operations ext4_file_inode_operations = { - .setattr = ext4_setattr, - .getattr = ext4_getattr, -#ifdef CONFIG_EXT4_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, -#endif - .get_acl = ext4_get_acl, - .fiemap = ext4_fiemap, -}; - diff --git a/ANDROID_3.4.5/fs/ext4/fsync.c b/ANDROID_3.4.5/fs/ext4/fsync.c deleted file mode 100644 index bb6c7d81..00000000 --- a/ANDROID_3.4.5/fs/ext4/fsync.c +++ /dev/null @@ -1,271 +0,0 @@ -/* - * linux/fs/ext4/fsync.c - * - * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) - * from - * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds - * - * ext4fs fsync primitive - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * - * Removed unnecessary code duplication for little endian machines - * and excessive __inline__s. - * Andi Kleen, 1997 - * - * Major simplications and cleanup - we only need to do the metadata, because - * we can depend on generic_block_fdatasync() to sync the data blocks. - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/writeback.h> -#include <linux/jbd2.h> -#include <linux/blkdev.h> - -#include "ext4.h" -#include "ext4_jbd2.h" - -#include <trace/events/ext4.h> - -static void dump_completed_IO(struct inode * inode) -{ -#ifdef EXT4FS_DEBUG - struct list_head *cur, *before, *after; - ext4_io_end_t *io, *io0, *io1; - unsigned long flags; - - if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ - ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); - return; - } - - ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ - cur = &io->list; - before = cur->prev; - io0 = container_of(before, ext4_io_end_t, list); - after = cur->next; - io1 = container_of(after, ext4_io_end_t, list); - - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", - io, inode->i_ino, io0, io1); - } - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); -#endif -} - -/* - * This function is called from ext4_sync_file(). - * - * When IO is completed, the work to convert unwritten extents to - * written is queued on workqueue but may not get immediately - * scheduled. 
When fsync is called, we need to ensure the - * conversion is complete before fsync returns. - * The inode keeps track of a list of pending/completed IO that - * might needs to do the conversion. This function walks through - * the list and convert the related unwritten extents for completed IO - * to written. - * The function return the number of pending IOs on success. - */ -int ext4_flush_completed_IO(struct inode *inode) -{ - ext4_io_end_t *io; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - int ret = 0; - int ret2 = 0; - - dump_completed_IO(inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - while (!list_empty(&ei->i_completed_io_list)){ - io = list_entry(ei->i_completed_io_list.next, - ext4_io_end_t, list); - list_del_init(&io->list); - io->flag |= EXT4_IO_END_IN_FSYNC; - /* - * Calling ext4_end_io_nolock() to convert completed - * IO to written. - * - * When ext4_sync_file() is called, run_queue() may already - * about to flush the work corresponding to this io structure. - * It will be upset if it founds the io structure related - * to the work-to-be schedule is freed. - * - * Thus we need to keep the io structure still valid here after - * conversion finished. The io structure has a flag to - * avoid double converting from both fsync and background work - * queue work. - */ - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - ret = ext4_end_io_nolock(io); - if (ret < 0) - ret2 = ret; - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - io->flag &= ~EXT4_IO_END_IN_FSYNC; - } - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - return (ret2 < 0) ? ret2 : 0; -} - -/* - * If we're not journaling and this is a just-created file, we have to - * sync our parent directory (if it was freshly created) since - * otherwise it will only be written by writeback, leaving a huge - * window during which a crash may lose the file. This may apply for - * the parent directory's parent as well, and so on recursively, if - * they are also freshly created. - */ -static int ext4_sync_parent(struct inode *inode) -{ - struct writeback_control wbc; - struct dentry *dentry = NULL; - struct inode *next; - int ret = 0; - - if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) - return 0; - inode = igrab(inode); - while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { - ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); - dentry = NULL; - spin_lock(&inode->i_lock); - if (!list_empty(&inode->i_dentry)) { - dentry = list_first_entry(&inode->i_dentry, - struct dentry, d_alias); - dget(dentry); - } - spin_unlock(&inode->i_lock); - if (!dentry) - break; - next = igrab(dentry->d_parent->d_inode); - dput(dentry); - if (!next) - break; - iput(inode); - inode = next; - ret = sync_mapping_buffers(inode->i_mapping); - if (ret) - break; - memset(&wbc, 0, sizeof(wbc)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.nr_to_write = 0; /* only write out the inode */ - ret = sync_inode(inode, &wbc); - if (ret) - break; - } - iput(inode); - return ret; -} - -/** - * __sync_file - generic_file_fsync without the locking and filemap_write - * @inode: inode to sync - * @datasync: only sync essential metadata if true - * - * This is just generic_file_fsync without the locking. This is needed for - * nojournal mode to make sure this inodes data/metadata makes it to disk - * properly. The i_mutex should be held already. 
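ext4_sync_parent() above automates, for the no-journal case, what careful user-space code does by hand: after creating a file, fsync the directory as well so the new entry survives a crash:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int dfd = open(".", O_RDONLY | O_DIRECTORY);
	int fd = openat(dfd, "newfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (dfd < 0 || fd < 0) { perror("open"); return 1; }

	if (write(fd, "data\n", 5) != 5)
		perror("write");
	fsync(fd);	/* file contents + inode */
	fsync(dfd);	/* directory entry, for the no-journal case above */

	close(fd);
	close(dfd);
	return 0;
}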
- */ -static int __sync_inode(struct inode *inode, int datasync) -{ - int err; - int ret; - - ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) - return ret; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return ret; - - err = sync_inode_metadata(inode, 1); - if (ret == 0) - ret = err; - return ret; -} - -/* - * akpm: A new design for ext4_sync_file(). - * - * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). - * There cannot be a transaction open by this task. - * Another task could have dirtied this inode. Its data can be in any - * state in the journalling system. - * - * What we do is just kick off a commit and wait on it. This will snapshot the - * inode to disk. - * - * i_mutex lock is held when entering and exiting this function - */ - -int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int ret; - tid_t commit_tid; - bool needs_barrier = false; - - J_ASSERT(ext4_journal_current_handle() == NULL); - - trace_ext4_sync_file_enter(file, datasync); - - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); - - if (inode->i_sb->s_flags & MS_RDONLY) - goto out; - - ret = ext4_flush_completed_IO(inode); - if (ret < 0) - goto out; - - if (!journal) { - ret = __sync_inode(inode, datasync); - if (!ret && !list_empty(&inode->i_dentry)) - ret = ext4_sync_parent(inode); - goto out; - } - - /* - * data=writeback,ordered: - * The caller's filemap_fdatawrite()/wait will sync the data. - * Metadata is in the journal, we wait for proper transaction to - * commit here. - * - * data=journal: - * filemap_fdatawrite won't do anything (the buffers are clean). - * ext4_force_commit will write the file data into the journal and - * will wait on that. - * filemap_fdatawait() will encounter a ton of newly-dirtied pages - * (they were dirtied by commit). But that's OK - the blocks are - * safe in-journal, which is all fsync() needs to ensure. - */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - goto out; - } - - commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (journal->j_flags & JBD2_BARRIER && - !jbd2_trans_will_send_data_barrier(journal, commit_tid)) - needs_barrier = true; - jbd2_log_start_commit(journal, commit_tid); - ret = jbd2_log_wait_commit(journal, commit_tid); - if (needs_barrier) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - out: - mutex_unlock(&inode->i_mutex); - trace_ext4_sync_file_exit(inode, ret); - return ret; -} diff --git a/ANDROID_3.4.5/fs/ext4/hash.c b/ANDROID_3.4.5/fs/ext4/hash.c deleted file mode 100644 index fa8e4911..00000000 --- a/ANDROID_3.4.5/fs/ext4/hash.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * linux/fs/ext4/hash.c - * - * Copyright (C) 2002 by Theodore Ts'o - * - * This file is released under the GPL v2. - * - * This file may be redistributed under the terms of the GNU Public - * License. 
- */ - -#include <linux/fs.h> -#include <linux/jbd2.h> -#include <linux/cryptohash.h> -#include "ext4.h" - -#define DELTA 0x9E3779B9 - -static void TEA_transform(__u32 buf[4], __u32 const in[]) -{ - __u32 sum = 0; - __u32 b0 = buf[0], b1 = buf[1]; - __u32 a = in[0], b = in[1], c = in[2], d = in[3]; - int n = 16; - - do { - sum += DELTA; - b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); - b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); - } while (--n); - - buf[0] += b0; - buf[1] += b1; -} - - -/* The old legacy hash */ -static __u32 dx_hack_hash_unsigned(const char *name, int len) -{ - __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; - const unsigned char *ucp = (const unsigned char *) name; - - while (len--) { - hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); - - if (hash & 0x80000000) - hash -= 0x7fffffff; - hash1 = hash0; - hash0 = hash; - } - return hash0 << 1; -} - -static __u32 dx_hack_hash_signed(const char *name, int len) -{ - __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; - const signed char *scp = (const signed char *) name; - - while (len--) { - hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); - - if (hash & 0x80000000) - hash -= 0x7fffffff; - hash1 = hash0; - hash0 = hash; - } - return hash0 << 1; -} - -static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) -{ - __u32 pad, val; - int i; - const signed char *scp = (const signed char *) msg; - - pad = (__u32)len | ((__u32)len << 8); - pad |= pad << 16; - - val = pad; - if (len > num*4) - len = num * 4; - for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; - val = ((int) scp[i]) + (val << 8); - if ((i % 4) == 3) { - *buf++ = val; - val = pad; - num--; - } - } - if (--num >= 0) - *buf++ = val; - while (--num >= 0) - *buf++ = pad; -} - -static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) -{ - __u32 pad, val; - int i; - const unsigned char *ucp = (const unsigned char *) msg; - - pad = (__u32)len | ((__u32)len << 8); - pad |= pad << 16; - - val = pad; - if (len > num*4) - len = num * 4; - for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; - val = ((int) ucp[i]) + (val << 8); - if ((i % 4) == 3) { - *buf++ = val; - val = pad; - num--; - } - } - if (--num >= 0) - *buf++ = val; - while (--num >= 0) - *buf++ = pad; -} - -/* - * Returns the hash of a filename. If len is 0 and name is NULL, then - * this function can be used to test whether or not a hash version is - * supported. - * - * The seed is an 4 longword (32 bits) "secret" which can be used to - * uniquify a hash. If the seed is all zero's, then some default seed - * may be used. - * - * A particular hash version specifies whether or not the seed is - * represented, and whether or not the returned hash is 32 bits or 64 - * bits. 32 bit hashes will return 0 for the minor hash. 
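TEA_transform() above is self-contained, so it can be exercised directly; this harness copies it verbatim (assuming 32-bit unsigned int), seeds buf[] with the default initialization vector used by ext4fs_dirhash() below, and feeds an all-zero input block:

#include <stdio.h>

#define DELTA 0x9E3779B9

static void TEA_transform(unsigned int buf[4], const unsigned int in[])
{
	unsigned int sum = 0;
	unsigned int b0 = buf[0], b1 = buf[1];
	unsigned int a = in[0], b = in[1], c = in[2], d = in[3];
	int n = 16;

	do {
		sum += DELTA;
		b0 += ((b1 << 4) + a) ^ (b1 + sum) ^ ((b1 >> 5) + b);
		b1 += ((b0 << 4) + c) ^ (b0 + sum) ^ ((b0 >> 5) + d);
	} while (--n);

	buf[0] += b0;
	buf[1] += b1;
}

int main(void)
{
	/* Default dirhash seed, as in ext4fs_dirhash() below. */
	unsigned int buf[4] = { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 };
	unsigned int in[4] = { 0 };	/* str2hashbuf() output would go here */

	TEA_transform(buf, in);
	printf("hash %08x minor %08x\n", buf[0], buf[1]);
	return 0;
}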
- */ -int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) -{ - __u32 hash; - __u32 minor_hash = 0; - const char *p; - int i; - __u32 in[8], buf[4]; - void (*str2hashbuf)(const char *, int, __u32 *, int) = - str2hashbuf_signed; - - /* Initialize the default seed for the hash checksum functions */ - buf[0] = 0x67452301; - buf[1] = 0xefcdab89; - buf[2] = 0x98badcfe; - buf[3] = 0x10325476; - - /* Check to see if the seed is all zero's */ - if (hinfo->seed) { - for (i = 0; i < 4; i++) { - if (hinfo->seed[i]) - break; - } - if (i < 4) - memcpy(buf, hinfo->seed, sizeof(buf)); - } - - switch (hinfo->hash_version) { - case DX_HASH_LEGACY_UNSIGNED: - hash = dx_hack_hash_unsigned(name, len); - break; - case DX_HASH_LEGACY: - hash = dx_hack_hash_signed(name, len); - break; - case DX_HASH_HALF_MD4_UNSIGNED: - str2hashbuf = str2hashbuf_unsigned; - case DX_HASH_HALF_MD4: - p = name; - while (len > 0) { - (*str2hashbuf)(p, len, in, 8); - half_md4_transform(buf, in); - len -= 32; - p += 32; - } - minor_hash = buf[2]; - hash = buf[1]; - break; - case DX_HASH_TEA_UNSIGNED: - str2hashbuf = str2hashbuf_unsigned; - case DX_HASH_TEA: - p = name; - while (len > 0) { - (*str2hashbuf)(p, len, in, 4); - TEA_transform(buf, in); - len -= 16; - p += 16; - } - hash = buf[0]; - minor_hash = buf[1]; - break; - default: - hinfo->hash = 0; - return -1; - } - hash = hash & ~1; - if (hash == (EXT4_HTREE_EOF_32BIT << 1)) - hash = (EXT4_HTREE_EOF_32BIT - 1) << 1; - hinfo->hash = hash; - hinfo->minor_hash = minor_hash; - return 0; -} diff --git a/ANDROID_3.4.5/fs/ext4/ialloc.c b/ANDROID_3.4.5/fs/ext4/ialloc.c deleted file mode 100644 index b4a7dd56..00000000 --- a/ANDROID_3.4.5/fs/ext4/ialloc.c +++ /dev/null @@ -1,1161 +0,0 @@ -/* - * linux/fs/ext4/ialloc.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * BSD ufs-inspired inode and directory allocation by - * Stephen Tweedie (sct@redhat.com), 1993 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - */ - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/jbd2.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/quotaops.h> -#include <linux/buffer_head.h> -#include <linux/random.h> -#include <linux/bitops.h> -#include <linux/blkdev.h> -#include <asm/byteorder.h> - -#include "ext4.h" -#include "ext4_jbd2.h" -#include "xattr.h" -#include "acl.h" - -#include <trace/events/ext4.h> - -/* - * ialloc.c contains the inodes allocation and deallocation routines - */ - -/* - * The free inodes are managed by bitmaps. A file system contains several - * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap - * block for inodes, N blocks for the inode table and data blocks. - * - * The file system contains group descriptors which are located after the - * super block. Each descriptor contains the number of the bitmap block and - * the free blocks count in the block. - */ - -/* - * To avoid calling the atomic setbit hundreds or thousands of times, we only - * need to use it within a single byte (to ensure we get endianness right). - * We can use memset for the rest of the bitmap as there are no other users. 
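The _UNSIGNED hash variants above exist because the legacy hashes inherited the platform's char signedness, so indexes built on signed-char machines diverge for names containing bytes >= 0x80. A standalone demo of the divergence, with both functions copied from above:

#include <stdio.h>

static unsigned int dx_hack_hash_unsigned(const char *name, int len)
{
	unsigned int hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
	const unsigned char *ucp = (const unsigned char *)name;

	while (len--) {
		hash = hash1 + (hash0 ^ (((int)*ucp++) * 7152373));
		if (hash & 0x80000000)
			hash -= 0x7fffffff;
		hash1 = hash0;
		hash0 = hash;
	}
	return hash0 << 1;
}

static unsigned int dx_hack_hash_signed(const char *name, int len)
{
	unsigned int hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
	const signed char *scp = (const signed char *)name;

	while (len--) {
		hash = hash1 + (hash0 ^ (((int)*scp++) * 7152373));
		if (hash & 0x80000000)
			hash -= 0x7fffffff;
		hash1 = hash0;
		hash0 = hash;
	}
	return hash0 << 1;
}

int main(void)
{
	const char name[] = "caf\xe9";	/* trailing byte 0xe9 */

	printf("signed   %08x\n", dx_hack_hash_signed(name, 4));
	printf("unsigned %08x\n", dx_hack_hash_unsigned(name, 4));
	return 0;
}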
- */ -void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) -{ - int i; - - if (start_bit >= end_bit) - return; - - ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); - for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) - ext4_set_bit(i, bitmap); - if (i < end_bit) - memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); -} - -/* Initializes an uninitialized inode bitmap */ -static unsigned ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks and inodes use to prevent - * allocation, essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - return 0; - } - - memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, - bh->b_data); - - return EXT4_INODES_PER_GROUP(sb); -} - -void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) -{ - if (uptodate) { - set_buffer_uptodate(bh); - set_bitmap_uptodate(bh); - } - unlock_buffer(bh); - put_bh(bh); -} - -/* - * Read the inode allocation bitmap for a given block_group, reading - * into the specified slot in the superblock's bitmap cache. - * - * Return buffer_head of bitmap on success or NULL. - */ -static struct buffer_head * -ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) -{ - struct ext4_group_desc *desc; - struct buffer_head *bh = NULL; - ext4_fsblk_t bitmap_blk; - - desc = ext4_get_group_desc(sb, block_group, NULL); - if (!desc) - return NULL; - - bitmap_blk = ext4_inode_bitmap(sb, desc); - bh = sb_getblk(sb, bitmap_blk); - if (unlikely(!bh)) { - ext4_error(sb, "Cannot read inode bitmap - " - "block_group = %u, inode_bitmap = %llu", - block_group, bitmap_blk); - return NULL; - } - if (bitmap_uptodate(bh)) - return bh; - - lock_buffer(bh); - if (bitmap_uptodate(bh)) { - unlock_buffer(bh); - return bh; - } - - ext4_lock_group(sb, block_group); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - ext4_init_inode_bitmap(sb, bh, block_group, desc); - set_bitmap_uptodate(bh); - set_buffer_uptodate(bh); - ext4_unlock_group(sb, block_group); - unlock_buffer(bh); - return bh; - } - ext4_unlock_group(sb, block_group); - - if (buffer_uptodate(bh)) { - /* - * if not uninit if bh is uptodate, - * bitmap is also uptodate - */ - set_bitmap_uptodate(bh); - unlock_buffer(bh); - return bh; - } - /* - * submit the buffer_head for reading - */ - trace_ext4_load_inode_bitmap(sb, block_group); - bh->b_end_io = ext4_end_bitmap_read; - get_bh(bh); - submit_bh(READ, bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - put_bh(bh); - ext4_error(sb, "Cannot read inode bitmap - " - "block_group = %u, inode_bitmap = %llu", - block_group, bitmap_blk); - return NULL; - } - return bh; -} - -/* - * NOTE! When we get the inode, we're the only people - * that have access to it, and as such there are no - * race conditions we have to worry about. The inode - * is not on the hash-lists, and it cannot be reached - * through the filesystem because the directory entry - * has been deleted earlier. 
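ext4_mark_bitmap_end() above only touches individual bits up to the next byte boundary and memsets whole bytes after that. A user-space copy, assuming end_bit is byte-aligned (it is blocksize * 8 at the call sites) and little-endian bit numbering as in ext4_set_bit():

#include <stdio.h>
#include <string.h>

static void mark_bitmap_end(int start_bit, int end_bit, unsigned char *bitmap)
{
	int i;

	if (start_bit >= end_bit)
		return;

	for (i = start_bit; i < ((start_bit + 7) & ~7); i++)
		bitmap[i >> 3] |= 1 << (i & 7);	/* per-bit up to byte edge */
	if (i < end_bit)
		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}

int main(void)
{
	unsigned char bitmap[8] = { 0 };	/* a 64-bit bitmap */

	mark_bitmap_end(10, 64, bitmap);	/* slots 10..63 unusable */
	for (int i = 0; i < 8; i++)
		printf("%02x ", bitmap[i]);	/* 00 fc ff ff ff ff ff ff */
	printf("\n");
	return 0;
}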
- * - * HOWEVER: we must make sure that we get no aliases, - * which means that we have to call "clear_inode()" - * _before_ we mark the inode not in use in the inode - * bitmaps. Otherwise a newly created file might use - * the same inode number (not actually the same pointer - * though), and then we'd have two inodes sharing the - * same inode number and space on the harddisk. - */ -void ext4_free_inode(handle_t *handle, struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - int is_directory; - unsigned long ino; - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; - ext4_group_t block_group; - unsigned long bit; - struct ext4_group_desc *gdp; - struct ext4_super_block *es; - struct ext4_sb_info *sbi; - int fatal = 0, err, count, cleared; - - if (!sb) { - printk(KERN_ERR "EXT4-fs: %s:%d: inode on " - "nonexistent device\n", __func__, __LINE__); - return; - } - if (atomic_read(&inode->i_count) > 1) { - ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", - __func__, __LINE__, inode->i_ino, - atomic_read(&inode->i_count)); - return; - } - if (inode->i_nlink) { - ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", - __func__, __LINE__, inode->i_ino, inode->i_nlink); - return; - } - sbi = EXT4_SB(sb); - - ino = inode->i_ino; - ext4_debug("freeing inode %lu\n", ino); - trace_ext4_free_inode(inode); - - /* - * Note: we must free any quota before locking the superblock, - * as writing the quota to disk may need the lock as well. - */ - dquot_initialize(inode); - ext4_xattr_delete_inode(handle, inode); - dquot_free_inode(inode); - dquot_drop(inode); - - is_directory = S_ISDIR(inode->i_mode); - - /* Do this BEFORE marking the inode not in use or returning an error */ - ext4_clear_inode(inode); - - es = EXT4_SB(sb)->s_es; - if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext4_error(sb, "reserved or nonexistent inode %lu", ino); - goto error_return; - } - block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); - bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); - bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) - goto error_return; - - BUFFER_TRACE(bitmap_bh, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, bitmap_bh); - if (fatal) - goto error_return; - - fatal = -ESRCH; - gdp = ext4_get_group_desc(sb, block_group, &bh2); - if (gdp) { - BUFFER_TRACE(bh2, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, bh2); - } - ext4_lock_group(sb, block_group); - cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); - if (fatal || !cleared) { - ext4_unlock_group(sb, block_group); - goto out; - } - - count = ext4_free_inodes_count(sb, gdp) + 1; - ext4_free_inodes_set(sb, gdp, count); - if (is_directory) { - count = ext4_used_dirs_count(sb, gdp) - 1; - ext4_used_dirs_set(sb, gdp, count); - percpu_counter_dec(&sbi->s_dirs_counter); - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - ext4_unlock_group(sb, block_group); - - percpu_counter_inc(&sbi->s_freeinodes_counter); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f = ext4_flex_group(sbi, block_group); - - atomic_inc(&sbi->s_flex_groups[f].free_inodes); - if (is_directory) - atomic_dec(&sbi->s_flex_groups[f].used_dirs); - } - BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); - fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); -out: - if (cleared) { - BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - if (!fatal) - fatal = err; - 
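ext4_free_inode() below maps a 1-based inode number onto a (block group, bitmap bit) pair; the decomposition in isolation, with an illustrative group size:

#include <stdio.h>

int main(void)
{
	unsigned long inodes_per_group = 8192;	/* illustrative */
	unsigned long ino = 12345;

	unsigned long group = (ino - 1) / inodes_per_group;
	unsigned long bit = (ino - 1) % inodes_per_group;

	printf("inode %lu -> block group %lu, bitmap bit %lu\n",
	       ino, group, bit);
	return 0;
}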
ext4_mark_super_dirty(sb); - } else - ext4_error(sb, "bit already cleared for inode %lu", ino); - -error_return: - brelse(bitmap_bh); - ext4_std_error(sb, fatal); -} - -struct orlov_stats { - __u32 free_inodes; - __u32 free_clusters; - __u32 used_dirs; -}; - -/* - * Helper function for Orlov's allocator; returns critical information - * for a particular block group or flex_bg. If flex_size is 1, then g - * is a block group number; otherwise it is flex_bg number. - */ -static void get_orlov_stats(struct super_block *sb, ext4_group_t g, - int flex_size, struct orlov_stats *stats) -{ - struct ext4_group_desc *desc; - struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; - - if (flex_size > 1) { - stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); - stats->used_dirs = atomic_read(&flex_group[g].used_dirs); - return; - } - - desc = ext4_get_group_desc(sb, g, NULL); - if (desc) { - stats->free_inodes = ext4_free_inodes_count(sb, desc); - stats->free_clusters = ext4_free_group_clusters(sb, desc); - stats->used_dirs = ext4_used_dirs_count(sb, desc); - } else { - stats->free_inodes = 0; - stats->free_clusters = 0; - stats->used_dirs = 0; - } -} - -/* - * Orlov's allocator for directories. - * - * We always try to spread first-level directories. - * - * If there are blockgroups with both free inodes and free blocks counts - * not worse than average we return one with smallest directory count. - * Otherwise we simply return a random group. - * - * For the rest rules look so: - * - * It's OK to put directory into a group unless - * it has too many directories already (max_dirs) or - * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks) or - * Parent's group is preferred, if it doesn't satisfy these - * conditions we search cyclically through the rest. If none - * of the groups look good we just look for a group with more - * free inodes than average (starting at parent's group). 
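The Orlov policy described above, for top-level directories, scans from a hashed or random starting group and keeps the candidate with the fewest directories among groups beating both averages. A sketch with made-up per-group stats; struct stats mirrors struct orlov_stats:

#include <stdio.h>

struct stats { unsigned free_inodes, free_clusters, used_dirs; };

int main(void)
{
	struct stats g[4] = {
		{ 100, 400, 12 }, { 500, 900, 3 },
		{ 0, 0, 0 }, { 450, 800, 7 },
	};
	unsigned ngroups = 4, avefreei = 200, avefreec = 500;
	unsigned start = 1;	/* from dirhash or get_random_bytes() */
	int best = -1;
	unsigned best_ndir = ~0u;

	for (unsigned i = 0; i < ngroups; i++) {
		unsigned grp = (start + i) % ngroups;

		if (!g[grp].free_inodes)
			continue;	/* nothing to allocate */
		if (g[grp].used_dirs >= best_ndir)
			continue;	/* not an improvement */
		if (g[grp].free_inodes < avefreei)
			continue;	/* below-average inodes */
		if (g[grp].free_clusters < avefreec)
			continue;	/* below-average blocks */
		best = (int)grp;
		best_ndir = g[grp].used_dirs;
	}
	printf("chose group %d\n", best);
	return 0;
}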
- */ - -static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group, umode_t mode, - const struct qstr *qstr) -{ - ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_group_t real_ngroups = ext4_get_groups_count(sb); - int inodes_per_group = EXT4_INODES_PER_GROUP(sb); - unsigned int freei, avefreei, grp_free; - ext4_fsblk_t freeb, avefreec; - unsigned int ndirs; - int max_dirs, min_inodes; - ext4_grpblk_t min_clusters; - ext4_group_t i, grp, g, ngroups; - struct ext4_group_desc *desc; - struct orlov_stats stats; - int flex_size = ext4_flex_bg_size(sbi); - struct dx_hash_info hinfo; - - ngroups = real_ngroups; - if (flex_size > 1) { - ngroups = (real_ngroups + flex_size - 1) >> - sbi->s_log_groups_per_flex; - parent_group >>= sbi->s_log_groups_per_flex; - } - - freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); - avefreei = freei / ngroups; - freeb = EXT4_C2B(sbi, - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - avefreec = freeb; - do_div(avefreec, ngroups); - ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - - if (S_ISDIR(mode) && - ((parent == sb->s_root->d_inode) || - (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { - int best_ndir = inodes_per_group; - int ret = -1; - - if (qstr) { - hinfo.hash_version = DX_HASH_HALF_MD4; - hinfo.seed = sbi->s_hash_seed; - ext4fs_dirhash(qstr->name, qstr->len, &hinfo); - grp = hinfo.hash; - } else - get_random_bytes(&grp, sizeof(grp)); - parent_group = (unsigned)grp % ngroups; - for (i = 0; i < ngroups; i++) { - g = (parent_group + i) % ngroups; - get_orlov_stats(sb, g, flex_size, &stats); - if (!stats.free_inodes) - continue; - if (stats.used_dirs >= best_ndir) - continue; - if (stats.free_inodes < avefreei) - continue; - if (stats.free_clusters < avefreec) - continue; - grp = g; - ret = 0; - best_ndir = stats.used_dirs; - } - if (ret) - goto fallback; - found_flex_bg: - if (flex_size == 1) { - *group = grp; - return 0; - } - - /* - * We pack inodes at the beginning of the flexgroup's - * inode tables. Block allocation decisions will do - * something similar, although regular files will - * start at 2nd block group of the flexgroup. See - * ext4_ext_find_goal() and ext4_find_near(). 
- */ - grp *= flex_size; - for (i = 0; i < flex_size; i++) { - if (grp+i >= real_ngroups) - break; - desc = ext4_get_group_desc(sb, grp+i, NULL); - if (desc && ext4_free_inodes_count(sb, desc)) { - *group = grp+i; - return 0; - } - } - goto fallback; - } - - max_dirs = ndirs / ngroups + inodes_per_group / 16; - min_inodes = avefreei - inodes_per_group*flex_size / 4; - if (min_inodes < 1) - min_inodes = 1; - min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; - - /* - * Start looking in the flex group where we last allocated an - * inode for this parent directory - */ - if (EXT4_I(parent)->i_last_alloc_group != ~0) { - parent_group = EXT4_I(parent)->i_last_alloc_group; - if (flex_size > 1) - parent_group >>= sbi->s_log_groups_per_flex; - } - - for (i = 0; i < ngroups; i++) { - grp = (parent_group + i) % ngroups; - get_orlov_stats(sb, grp, flex_size, &stats); - if (stats.used_dirs >= max_dirs) - continue; - if (stats.free_inodes < min_inodes) - continue; - if (stats.free_clusters < min_clusters) - continue; - goto found_flex_bg; - } - -fallback: - ngroups = real_ngroups; - avefreei = freei / ngroups; -fallback_retry: - parent_group = EXT4_I(parent)->i_block_group; - for (i = 0; i < ngroups; i++) { - grp = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, grp, NULL); - if (desc) { - grp_free = ext4_free_inodes_count(sb, desc); - if (grp_free && grp_free >= avefreei) { - *group = grp; - return 0; - } - } - } - - if (avefreei) { - /* - * The free-inodes counter is approximate, and for really small - * filesystems the above test can fail to find any blockgroups - */ - avefreei = 0; - goto fallback_retry; - } - - return -1; -} - -static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group, umode_t mode) -{ - ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); - struct ext4_group_desc *desc; - int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); - - /* - * Try to place the inode is the same flex group as its - * parent. If we can't find space, use the Orlov algorithm to - * find another flex group, and store that information in the - * parent directory's inode information so that use that flex - * group for future allocations. - */ - if (flex_size > 1) { - int retry = 0; - - try_again: - parent_group &= ~(flex_size-1); - last = parent_group + flex_size; - if (last > ngroups) - last = ngroups; - for (i = parent_group; i < last; i++) { - desc = ext4_get_group_desc(sb, i, NULL); - if (desc && ext4_free_inodes_count(sb, desc)) { - *group = i; - return 0; - } - } - if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { - retry = 1; - parent_group = EXT4_I(parent)->i_last_alloc_group; - goto try_again; - } - /* - * If this didn't work, use the Orlov search algorithm - * to find a new flex group; we pass in the mode to - * avoid the topdir algorithms. - */ - *group = parent_group + flex_size; - if (*group > ngroups) - *group = 0; - return find_group_orlov(sb, parent, group, mode, NULL); - } - - /* - * Try to place the inode in its parent directory - */ - *group = parent_group; - desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_group_clusters(sb, desc)) - return 0; - - /* - * We're going to place this inode in a different blockgroup from its - * parent. We want to cause files in a common directory to all land in - * the same blockgroup. 
But we want files which are in a different - * directory which shares a blockgroup with our parent to land in a - * different blockgroup. - * - * So add our directory's i_ino into the starting point for the hash. - */ - *group = (*group + parent->i_ino) % ngroups; - - /* - * Use a quadratic hash to find a group with a free inode and some free - * blocks. - */ - for (i = 1; i < ngroups; i <<= 1) { - *group += i; - if (*group >= ngroups) - *group -= ngroups; - desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_group_clusters(sb, desc)) - return 0; - } - - /* - * That failed: try linear search for a free inode, even if that group - * has no free blocks. - */ - *group = parent_group; - for (i = 0; i < ngroups; i++) { - if (++*group >= ngroups) - *group = 0; - desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && ext4_free_inodes_count(sb, desc)) - return 0; - } - - return -1; -} - -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory's block - * group to find a free inode. - */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, - const struct qstr *qstr, __u32 goal, uid_t *owner) -{ - struct super_block *sb; - struct buffer_head *inode_bitmap_bh = NULL; - struct buffer_head *group_desc_bh; - ext4_group_t ngroups, group = 0; - unsigned long ino = 0; - struct inode *inode; - struct ext4_group_desc *gdp = NULL; - struct ext4_inode_info *ei; - struct ext4_sb_info *sbi; - int ret2, err = 0; - struct inode *ret; - ext4_group_t i; - ext4_group_t flex_group; - - /* Cannot create files in a deleted directory */ - if (!dir || !dir->i_nlink) - return ERR_PTR(-EPERM); - - sb = dir->i_sb; - ngroups = ext4_get_groups_count(sb); - trace_ext4_request_inode(dir, mode); - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - ei = EXT4_I(inode); - sbi = EXT4_SB(sb); - - if (!goal) - goal = sbi->s_inode_goal; - - if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { - group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); - ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); - ret2 = 0; - goto got_group; - } - - if (S_ISDIR(mode)) - ret2 = find_group_orlov(sb, dir, &group, mode, qstr); - else - ret2 = find_group_other(sb, dir, &group, mode); - -got_group: - EXT4_I(dir)->i_last_alloc_group = group; - err = -ENOSPC; - if (ret2 == -1) - goto out; - - /* - * Normally we will only go through one pass of this loop, - * unless we get unlucky and it turns out the group we selected - * had its last inode grabbed by someone else. 
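The quadratic hash mentioned below probes at power-of-two offsets from the hashed starting group, wrapping modulo ngroups; in isolation:

#include <stdio.h>

int main(void)
{
	unsigned ngroups = 24;
	unsigned group = 7;	/* (parent_group + parent->i_ino) % ngroups */

	for (unsigned i = 1; i < ngroups; i <<= 1) {
		group += i;
		if (group >= ngroups)
			group -= ngroups;
		printf("probe group %u\n", group);	/* 8, 10, 14, 22, 14 */
	}
	return 0;
}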
- */ - for (i = 0; i < ngroups; i++, ino = 0) { - err = -EIO; - - gdp = ext4_get_group_desc(sb, group, &group_desc_bh); - if (!gdp) - goto fail; - - brelse(inode_bitmap_bh); - inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!inode_bitmap_bh) - goto fail; - -repeat_in_this_group: - ino = ext4_find_next_zero_bit((unsigned long *) - inode_bitmap_bh->b_data, - EXT4_INODES_PER_GROUP(sb), ino); - if (ino >= EXT4_INODES_PER_GROUP(sb)) { - if (++group == ngroups) - group = 0; - continue; - } - if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { - ext4_error(sb, "reserved inode found cleared - " - "inode=%lu", ino + 1); - continue; - } - ext4_lock_group(sb, group); - ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); - ext4_unlock_group(sb, group); - ino++; /* the inode bitmap is zero-based */ - if (!ret2) - goto got; /* we grabbed the inode! */ - if (ino < EXT4_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; - } - err = -ENOSPC; - goto out; - -got: - /* We may have to initialize the block bitmap if it isn't already */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && - gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bitmap_bh; - - block_bitmap_bh = ext4_read_block_bitmap(sb, group); - BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); - err = ext4_journal_get_write_access(handle, block_bitmap_bh); - if (err) { - brelse(block_bitmap_bh); - goto fail; - } - - BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); - err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); - brelse(block_bitmap_bh); - - /* recheck and clear flag under lock if we still need to */ - ext4_lock_group(sb, group); - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, group, gdp)); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, - gdp); - } - ext4_unlock_group(sb, group); - - if (err) - goto fail; - } - - BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, inode_bitmap_bh); - if (err) - goto fail; - - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) - goto fail; - - /* Update the relevant bg descriptor fields */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - int free; - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - - down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ - ext4_lock_group(sb, group); /* while we modify the bg desc */ - free = EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp); - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - free = 0; - } - /* - * Check the relative inode number against the last used - * relative inode number in this group. 
if it is greater - * we need to update the bg_itable_unused count - */ - if (ino > free) - ext4_itable_unused_set(sb, gdp, - (EXT4_INODES_PER_GROUP(sb) - ino)); - up_read(&grp->alloc_sem); - } else { - ext4_lock_group(sb, group); - } - - ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); - if (S_ISDIR(mode)) { - ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f = ext4_flex_group(sbi, group); - - atomic_inc(&sbi->s_flex_groups[f].used_dirs); - } - } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); - } - ext4_unlock_group(sb, group); - - BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); - if (err) - goto fail; - - BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); - if (err) - goto fail; - - percpu_counter_dec(&sbi->s_freeinodes_counter); - if (S_ISDIR(mode)) - percpu_counter_inc(&sbi->s_dirs_counter); - ext4_mark_super_dirty(sb); - - if (sbi->s_log_groups_per_flex) { - flex_group = ext4_flex_group(sbi, group); - atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); - } - if (owner) { - inode->i_mode = mode; - inode->i_uid = owner[0]; - inode->i_gid = owner[1]; - } else if (test_opt(sb, GRPID)) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = dir->i_gid; - } else - inode_init_owner(inode, dir, mode); - - inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); - /* This is the optimal IO size (for stat), not the fs block size */ - inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - ext4_current_time(inode); - - memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_dir_start_lookup = 0; - ei->i_disksize = 0; - - /* Don't inherit extent flag from directory, amongst others. */ - ei->i_flags = - ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); - ei->i_file_acl = 0; - ei->i_dtime = 0; - ei->i_block_group = group; - ei->i_last_alloc_group = ~0; - - ext4_set_inode_flags(inode); - if (IS_DIRSYNC(inode)) - ext4_handle_sync(handle); - if (insert_inode_locked(inode) < 0) { - /* - * Likely a bitmap corruption causing inode to be allocated - * twice. 
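
/*
 * Illustrative helpers (not kernel code) for the inode-number
 * arithmetic used above when setting i_ino, and inverted later by
 * ext4_orphan_get(): global inode numbers are 1-based and packed
 * group by group. The constant is made up.
 */
#include <assert.h>
#include <stdio.h>

#define INODES_PER_GROUP 8192UL

static unsigned long ino_from_group(unsigned long group, unsigned long bit)
{
    /* 'bit' is the 0-based offset in the group's inode bitmap; the
     * allocation path above has already done ino++ to make it 1-based */
    return group * INODES_PER_GROUP + bit + 1;
}

static void group_from_ino(unsigned long ino,
                           unsigned long *group, unsigned long *bit)
{
    *group = (ino - 1) / INODES_PER_GROUP;
    *bit   = (ino - 1) % INODES_PER_GROUP;
}

int main(void)
{
    unsigned long g, b;

    group_from_ino(ino_from_group(3, 17), &g, &b);
    assert(g == 3 && b == 17);
    printf("round-trip ok: group %lu bit %lu\n", g, b);
    return 0;
}
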
- */ - err = -EIO; - goto fail; - } - spin_lock(&sbi->s_next_gen_lock); - inode->i_generation = sbi->s_next_generation++; - spin_unlock(&sbi->s_next_gen_lock); - - ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ - ext4_set_inode_state(inode, EXT4_STATE_NEW); - - ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; - - ret = inode; - dquot_initialize(inode); - err = dquot_alloc_inode(inode); - if (err) - goto fail_drop; - - err = ext4_init_acl(handle, inode, dir); - if (err) - goto fail_free_drop; - - err = ext4_init_security(handle, inode, dir, qstr); - if (err) - goto fail_free_drop; - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { - /* set extent flag only for directory, file and normal symlink*/ - if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { - ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); - ext4_ext_tree_init(handle, inode); - } - } - - if (ext4_handle_valid(handle)) { - ei->i_sync_tid = handle->h_transaction->t_tid; - ei->i_datasync_tid = handle->h_transaction->t_tid; - } - - err = ext4_mark_inode_dirty(handle, inode); - if (err) { - ext4_std_error(sb, err); - goto fail_free_drop; - } - - ext4_debug("allocating inode %lu\n", inode->i_ino); - trace_ext4_allocate_inode(inode, dir, mode); - goto really_out; -fail: - ext4_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: - brelse(inode_bitmap_bh); - return ret; - -fail_free_drop: - dquot_free_inode(inode); - -fail_drop: - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; - clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); - brelse(inode_bitmap_bh); - return ERR_PTR(err); -} - -/* Verify that we are loading a valid orphan from disk */ -struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) -{ - unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); - ext4_group_t block_group; - int bit; - struct buffer_head *bitmap_bh; - struct inode *inode = NULL; - long err = -EIO; - - /* Error cases - e2fsck has already cleaned up for us */ - if (ino > max_ino) { - ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); - goto error; - } - - block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); - bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); - bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) { - ext4_warning(sb, "inode bitmap error for orphan %lu", ino); - goto error; - } - - /* Having the inode bit set should be a 100% indicator that this - * is a valid orphan (no e2fsck run on fs). Orphans also include - * inodes that were being truncated, so we can't check i_nlink==0. - */ - if (!ext4_test_bit(bit, bitmap_bh->b_data)) - goto bad_orphan; - - inode = ext4_iget(sb, ino); - if (IS_ERR(inode)) - goto iget_failed; - - /* - * If the orphans has i_nlinks > 0 then it should be able to be - * truncated, otherwise it won't be removed from the orphan list - * during processing and an infinite loop will result. - */ - if (inode->i_nlink && !ext4_can_truncate(inode)) - goto bad_orphan; - - if (NEXT_ORPHAN(inode) > max_ino) - goto bad_orphan; - brelse(bitmap_bh); - return inode; - -iget_failed: - err = PTR_ERR(inode); - inode = NULL; -bad_orphan: - ext4_warning(sb, "bad orphan inode %lu! 
e2fsck was run?", ino); - printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", - bit, (unsigned long long)bitmap_bh->b_blocknr, - ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_NOTICE "inode=%p\n", inode); - if (inode) { - printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", - is_bad_inode(inode)); - printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", - NEXT_ORPHAN(inode)); - printk(KERN_NOTICE "max_ino=%lu\n", max_ino); - printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); - /* Avoid freeing blocks if we got a bad deleted inode */ - if (inode->i_nlink == 0) - inode->i_blocks = 0; - iput(inode); - } - brelse(bitmap_bh); -error: - return ERR_PTR(err); -} - -unsigned long ext4_count_free_inodes(struct super_block *sb) -{ - unsigned long desc_count; - struct ext4_group_desc *gdp; - ext4_group_t i, ngroups = ext4_get_groups_count(sb); -#ifdef EXT4FS_DEBUG - struct ext4_super_block *es; - unsigned long bitmap_count, x; - struct buffer_head *bitmap_bh = NULL; - - es = EXT4_SB(sb)->s_es; - desc_count = 0; - bitmap_count = 0; - gdp = NULL; - for (i = 0; i < ngroups; i++) { - gdp = ext4_get_group_desc(sb, i, NULL); - if (!gdp) - continue; - desc_count += ext4_free_inodes_count(sb, gdp); - brelse(bitmap_bh); - bitmap_bh = ext4_read_inode_bitmap(sb, i); - if (!bitmap_bh) - continue; - - x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); - printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", - (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); - bitmap_count += x; - } - brelse(bitmap_bh); - printk(KERN_DEBUG "ext4_count_free_inodes: " - "stored = %u, computed = %lu, %lu\n", - le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); - return desc_count; -#else - desc_count = 0; - for (i = 0; i < ngroups; i++) { - gdp = ext4_get_group_desc(sb, i, NULL); - if (!gdp) - continue; - desc_count += ext4_free_inodes_count(sb, gdp); - cond_resched(); - } - return desc_count; -#endif -} - -/* Called at mount-time, super-block is locked */ -unsigned long ext4_count_dirs(struct super_block * sb) -{ - unsigned long count = 0; - ext4_group_t i, ngroups = ext4_get_groups_count(sb); - - for (i = 0; i < ngroups; i++) { - struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); - if (!gdp) - continue; - count += ext4_used_dirs_count(sb, gdp); - } - return count; -} - -/* - * Zeroes not yet zeroed inode table - just write zeroes through the whole - * inode table. Must be called without any spinlock held. The only place - * where it is called from on active part of filesystem is ext4lazyinit - * thread, so we do not need any special locks, however we have to prevent - * inode allocation from the current group, so we take alloc_sem lock, to - * block ext4_new_inode() until we are finished. - */ -int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, - int barrier) -{ - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_desc *gdp = NULL; - struct buffer_head *group_desc_bh; - handle_t *handle; - ext4_fsblk_t blk; - int num, ret = 0, used_blks = 0; - - /* This should not happen, but just to be sure check this */ - if (sb->s_flags & MS_RDONLY) { - ret = 1; - goto out; - } - - gdp = ext4_get_group_desc(sb, group, &group_desc_bh); - if (!gdp) - goto out; - - /* - * We do not need to lock this, because we are the only one - * handling this flag. 
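
/*
 * Stand-alone check of the zeroing-range arithmetic used just below
 * in ext4_init_inode_table(): if some inodes in the group are already
 * in use, skip the inode-table blocks that hold them and zero only
 * the remainder. The geometry constants are made up for illustration.
 */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned inodes_per_group = 8192;
    unsigned inodes_per_block = 16;   /* 4k block / 256-byte inode    */
    unsigned itb_per_group = inodes_per_group / inodes_per_block;
    unsigned itable_unused = 8000;    /* from the group descriptor    */

    unsigned used_blks = DIV_ROUND_UP(inodes_per_group - itable_unused,
                                      inodes_per_block);

    printf("zero %u of %u inode-table blocks, starting at block %u\n",
           itb_per_group - used_blks, itb_per_group, used_blks);
    return 0;
}
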
- */ - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) - goto out; - - handle = ext4_journal_start_sb(sb, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - down_write(&grp->alloc_sem); - /* - * If inode bitmap was already initialized there may be some - * used inodes so we need to skip blocks with used inodes in - * inode table. - */ - if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) - used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp)), - sbi->s_inodes_per_block); - - if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { - ext4_error(sb, "Something is wrong with group %u: " - "used itable blocks: %d; " - "itable unused count: %u", - group, used_blks, - ext4_itable_unused_count(sb, gdp)); - ret = 1; - goto err_out; - } - - blk = ext4_inode_table(sb, gdp) + used_blks; - num = sbi->s_itb_per_group - used_blks; - - BUFFER_TRACE(group_desc_bh, "get_write_access"); - ret = ext4_journal_get_write_access(handle, - group_desc_bh); - if (ret) - goto err_out; - - /* - * Skip zeroout if the inode table is full. But we set the ZEROED - * flag anyway, because obviously, when it is full it does not need - * further zeroing. - */ - if (unlikely(num == 0)) - goto skip_zeroout; - - ext4_debug("going to zero out inode table in group %d\n", - group); - ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); - if (ret < 0) - goto err_out; - if (barrier) - blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); - -skip_zeroout: - ext4_lock_group(sb, group); - gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); - ext4_unlock_group(sb, group); - - BUFFER_TRACE(group_desc_bh, - "call ext4_handle_dirty_metadata"); - ret = ext4_handle_dirty_metadata(handle, NULL, - group_desc_bh); - -err_out: - up_write(&grp->alloc_sem); - ext4_journal_stop(handle); -out: - return ret; -} diff --git a/ANDROID_3.4.5/fs/ext4/indirect.c b/ANDROID_3.4.5/fs/ext4/indirect.c deleted file mode 100644 index 830e1b2b..00000000 --- a/ANDROID_3.4.5/fs/ext4/indirect.c +++ /dev/null @@ -1,1502 +0,0 @@ -/* - * linux/fs/ext4/indirect.c - * - * from - * - * linux/fs/ext4/inode.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/inode.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Goal-directed block allocation by Stephen Tweedie - * (sct@redhat.com), 1993, 1998 - */ - -#include "ext4_jbd2.h" -#include "truncate.h" - -#include <trace/events/ext4.h> - -typedef struct { - __le32 *p; - __le32 key; - struct buffer_head *bh; -} Indirect; - -static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -/** - * ext4_block_to_path - parse the block number into array of offsets - * @inode: inode in question (we are only interested in its superblock) - * @i_block: block number to be parsed - * @offsets: array to store the offsets in - * @boundary: set this non-zero if the referred-to block is likely to be - * followed (on disk) by an indirect block. - * - * To store the locations of file's data ext4 uses a data structure common - * for UNIX filesystems - tree of pointers anchored in the inode, with - * data blocks at leaves and indirect blocks in intermediate nodes. 
- * This function translates the block number into path in that tree - - * return value is the path length and @offsets[n] is the offset of - * pointer to (n+1)th node in the nth one. If @block is out of range - * (negative or too large) warning is printed and zero returned. - * - * Note: function doesn't find node addresses, so no IO is needed. All - * we need to know is the capacity of indirect blocks (taken from the - * inode->i_sb). - */ - -/* - * Portability note: the last comparison (check that we fit into triple - * indirect block) is spelled differently, because otherwise on an - * architecture with 32-bit longs and 8Kb pages we might get into trouble - * if our filesystem had 8Kb blocks. We might use long long, but that would - * kill us on x86. Oh, well, at least the sign propagation does not matter - - * i_block would have to be negative in the very beginning, so we would not - * get there at all. - */ - -static int ext4_block_to_path(struct inode *inode, - ext4_lblk_t i_block, - ext4_lblk_t offsets[4], int *boundary) -{ - int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); - const long direct_blocks = EXT4_NDIR_BLOCKS, - indirect_blocks = ptrs, - double_blocks = (1 << (ptrs_bits * 2)); - int n = 0; - int final = 0; - - if (i_block < direct_blocks) { - offsets[n++] = i_block; - final = direct_blocks; - } else if ((i_block -= direct_blocks) < indirect_blocks) { - offsets[n++] = EXT4_IND_BLOCK; - offsets[n++] = i_block; - final = ptrs; - } else if ((i_block -= indirect_blocks) < double_blocks) { - offsets[n++] = EXT4_DIND_BLOCK; - offsets[n++] = i_block >> ptrs_bits; - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { - offsets[n++] = EXT4_TIND_BLOCK; - offsets[n++] = i_block >> (ptrs_bits * 2); - offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else { - ext4_warning(inode->i_sb, "block %lu > max in inode %lu", - i_block + direct_blocks + - indirect_blocks + double_blocks, inode->i_ino); - } - if (boundary) - *boundary = final - 1 - (i_block & (ptrs - 1)); - return n; -} - -/** - * ext4_get_branch - read the chain of indirect blocks leading to data - * @inode: inode in question - * @depth: depth of the chain (1 - direct pointer, etc.) - * @offsets: offsets of pointers in inode/indirect blocks - * @chain: place to store the result - * @err: here we store the error value - * - * Function fills the array of triples <key, p, bh> and returns %NULL - * if everything went OK or the pointer to the last filled triple - * (incomplete one) otherwise. Upon the return chain[i].key contains - * the number of (i+1)-th block in the chain (as it is stored in memory, - * i.e. little-endian 32-bit), chain[i].p contains the address of that - * number (it points into struct inode for i==0 and into the bh->b_data - * for i>0) and chain[i].bh points to the buffer_head of i-th indirect - * block for i>0 and NULL for i==0. In other words, it holds the block - * numbers of the chain, addresses they were taken from (and where we can - * verify that chain did not change) and buffer_heads hosting these - * numbers. 
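
/*
 * User-space rework of the ext4_block_to_path() decomposition shown
 * above, assuming a 1k block size (256 pointers per indirect block);
 * the EXT4_*_BLOCK slot numbers are the real ones, everything else is
 * simplified. Given a logical block number, it yields the chain of
 * per-level offsets: a direct slot, or an indirect/double/triple path.
 */
#include <stdio.h>

#define NDIR      12      /* direct pointers in the inode          */
#define PTRS      256     /* block pointers per 1k indirect block  */
#define PTRS_BITS 8

static int block_to_path(long i_block, long offsets[4])
{
    const long dbl = 1L << (PTRS_BITS * 2);
    int n = 0;

    if (i_block < NDIR) {
        offsets[n++] = i_block;
    } else if ((i_block -= NDIR) < PTRS) {
        offsets[n++] = 12;                 /* EXT4_IND_BLOCK  */
        offsets[n++] = i_block;
    } else if ((i_block -= PTRS) < dbl) {
        offsets[n++] = 13;                 /* EXT4_DIND_BLOCK */
        offsets[n++] = i_block >> PTRS_BITS;
        offsets[n++] = i_block & (PTRS - 1);
    } else if (((i_block -= dbl) >> (PTRS_BITS * 2)) < PTRS) {
        offsets[n++] = 14;                 /* EXT4_TIND_BLOCK */
        offsets[n++] = i_block >> (PTRS_BITS * 2);
        offsets[n++] = (i_block >> PTRS_BITS) & (PTRS - 1);
        offsets[n++] = i_block & (PTRS - 1);
    }
    return n;              /* 0 means block out of range */
}

int main(void)
{
    long off[4];
    int depth = block_to_path(70000, off);

    for (int i = 0; i < depth; i++)
        printf("level %d -> offset %ld\n", i, off[i]);
    return 0;
}
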
- * - * Function stops when it stumbles upon zero pointer (absent block) - * (pointer to last triple returned, *@err == 0) - * or when it gets an IO error reading an indirect block - * (ditto, *@err == -EIO) - * or when it reads all @depth-1 indirect blocks successfully and finds - * the whole chain, all way to the data (returns %NULL, *err == 0). - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) - */ -static Indirect *ext4_get_branch(struct inode *inode, int depth, - ext4_lblk_t *offsets, - Indirect chain[4], int *err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - /* i_data is not going away, no lock needed */ - add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - bh = sb_getblk(sb, le32_to_cpu(p->key)); - if (unlikely(!bh)) - goto failure; - - if (!bh_uptodate_or_lock(bh)) { - if (bh_submit_read(bh) < 0) { - put_bh(bh); - goto failure; - } - /* validate block references */ - if (ext4_check_indirect_blockref(inode, bh)) { - put_bh(bh); - goto failure; - } - } - - add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); - /* Reader: end */ - if (!p->key) - goto no_block; - } - return NULL; - -failure: - *err = -EIO; -no_block: - return p; -} - -/** - * ext4_find_near - find a place for allocation with sufficient locality - * @inode: owner - * @ind: descriptor of indirect block. - * - * This function returns the preferred place for block allocation. - * It is used when heuristic for sequential allocation fails. - * Rules are: - * + if there is a block to the left of our position - allocate near it. - * + if pointer will live in indirect block - allocate near that block. - * + if pointer will live in inode - allocate in the same - * cylinder group. - * - * In the latter case we colour the starting block by the callers PID to - * prevent it from clashing with concurrent allocations for a different inode - * in the same block group. The PID is used here so that functionally related - * files will be close-by on-disk. - * - * Caller must make sure that @ind is valid and will stay that way. - */ -static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; - __le32 *p; - - /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) { - if (*p) - return le32_to_cpu(*p); - } - - /* No such thing, so let's try location of indirect block */ - if (ind->bh) - return ind->bh->b_blocknr; - - /* - * It is going to be referred to from the inode itself? OK, just put it - * into the same cylinder group then. - */ - return ext4_inode_to_goal_block(inode); -} - -/** - * ext4_find_goal - find a preferred place for allocation. - * @inode: owner - * @block: block we want - * @partial: pointer to the last triple within a chain - * - * Normally this function find the preferred place for block allocation, - * returns it. - * Because this is only used for non-extent files, we limit the block nr - * to 32 bits. 
- */ -static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, - Indirect *partial) -{ - ext4_fsblk_t goal; - - /* - * XXX need to get goal block from mballoc's data structures - */ - - goal = ext4_find_near(inode, partial); - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - return goal; -} - -/** - * ext4_blks_to_allocate - Look up the block map and count the number - * of direct blocks need to be allocated for the given branch. - * - * @branch: chain of indirect blocks - * @k: number of blocks need for indirect blocks - * @blks: number of data blocks to be mapped. - * @blocks_to_boundary: the offset in the indirect block - * - * return the total number of blocks to be allocate, including the - * direct and indirect blocks. - */ -static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, - int blocks_to_boundary) -{ - unsigned int count = 0; - - /* - * Simple case, [t,d]Indirect block(s) has not allocated yet - * then it's clear blocks on that path have not allocated - */ - if (k > 0) { - /* right now we don't handle cross boundary allocation */ - if (blks < blocks_to_boundary + 1) - count += blks; - else - count += blocks_to_boundary + 1; - return count; - } - - count++; - while (count < blks && count <= blocks_to_boundary && - le32_to_cpu(*(branch[0].p + count)) == 0) { - count++; - } - return count; -} - -/** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @handle: handle for this transaction - * @inode: inode which needs allocated blocks - * @iblock: the logical block to start allocated at - * @goal: preferred physical block of allocation - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: number of desired blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @err: on return it will store the error code - * - * This function will return the number of blocks allocated as - * requested by the passed-in parameters. - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - struct ext4_allocation_request ar; - int target, i; - unsigned long count = 0, blk_allocated = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; - - /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. 
That's the - * minimum number of blocks need to allocate(required) - */ - /* first we try to allocate the indirect blocks */ - target = indirect_blks; - while (target > 0) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_meta_blocks(handle, inode, goal, - 0, &count, err); - if (*err) - goto failed_out; - - if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + count %lu > %d!", - current_block, count, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - if (count > 0) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - printk(KERN_INFO "%s returned more blocks than " - "requested\n", __func__); - WARN_ON(1); - break; - } - } - - target = blks - count ; - blk_allocated = count; - if (!target) - goto allocated; - /* Now allocate data blocks */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.goal = goal; - ar.len = target; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - /* enable in-core preallocation only for regular files */ - ar.flags = EXT4_MB_HINT_DATA; - - current_block = ext4_mb_new_blocks(handle, &ar, err); - if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { - EXT4_ERROR_INODE(inode, - "current_block %llu + ar.len %d > %d!", - current_block, ar.len, - EXT4_MAX_BLOCK_FILE_PHYS); - *err = -EIO; - goto failed_out; - } - - if (*err && (target == blks)) { - /* - * if the allocation failed and we didn't allocate - * any blocks before - */ - goto failed_out; - } - if (!*err) { - if (target == blks) { - /* - * save the new block number - * for the first direct block - */ - new_blocks[index] = current_block; - } - blk_allocated += ar.len; - } -allocated: - /* total number of blocks allocated for direct blocks */ - ret = blk_allocated; - *err = 0; - return ret; -failed_out: - for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - return ret; -} - -/** - * ext4_alloc_branch - allocate and set up a chain of blocks. - * @handle: handle for this transaction - * @inode: owner - * @indirect_blks: number of allocated indirect blocks - * @blks: number of allocated direct blocks - * @goal: preferred place for allocation - * @offsets: offsets (in the blocks) to store the pointers to next. - * @branch: place to store the chain in. - * - * This function allocates blocks, zeroes out all but the last one, - * links them into chain and (if we are synchronous) writes them to disk. - * In other words, it prepares a branch that can be spliced onto the - * inode. It stores the information about that chain in the branch[], in - * the same format as ext4_get_branch() would do. We are calling it after - * we had read the existing part of chain and partial points to the last - * triple of that (one with zero ->key). Upon the exit we have the same - * picture as after the successful ext4_get_block(), except that in one - * place chain is disconnected - *branch->p is still zero (we did not - * set the last link), but branch->key contains the number that should - * be placed into *branch->p to fill that gap. - * - * If allocation fails we free all blocks we've allocated (and forget - * their buffer_heads) and return the error value the from failed - * ext4_alloc_block() (normally -ENOSPC). 
Otherwise we set the chain - * as described above and return 0. - */ -static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; - - num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); - /* - * metadata blocks and data blocks are allocated. - */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); - if (unlikely(!bh)) { - err = -EIO; - goto failed; - } - - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); - if (err) { - /* Don't brelse(bh) here; it's done in - * ext4_journal_forget() below */ - unlock_buffer(bh); - goto failed; - } - - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if (n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i = 1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (err) - goto failed; - } - *blks = num; - return err; -failed: - /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); - for (i = 1; i <= n ; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, - EXT4_FREE_BLOCKS_FORGET); - } - for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); - - return err; -} - -/** - * ext4_splice_branch - splice the allocated branch onto inode. - * @handle: handle for this transaction - * @inode: owner - * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext4_alloc_branch) - * @where: location of missing link - * @num: number of indirect blocks we are adding - * @blks: number of direct blocks we are adding - * - * This function fills the missing link and does all housekeeping needed in - * inode (->i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. - */ -static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, - int blks) -{ - int i; - int err = 0; - ext4_fsblk_t current_block; - - /* - * If we're splicing into a [td]indirect block (as opposed to the - * inode) then we need to get write access to the [td]indirect block - * before the splice. 
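
/*
 * The "missing link" idea behind ext4_alloc_branch() and
 * ext4_splice_branch(), in miniature and with invented names: build
 * and populate the whole new branch first, and only then, as a single
 * final store, connect it to the tree. Until that store a crash
 * leaves the tree untouched and the new blocks merely leaked, never a
 * half-linked chain.
 */
#include <stdio.h>
#include <stdint.h>

struct branch_link {
    uint32_t *p;      /* where the pointer will live in the parent */
    uint32_t  key;    /* block number that should be stored there  */
};

static uint32_t parent_slot;   /* stands in for an inode/indirect slot */

static void splice(struct branch_link *where)
{
    *where->p = where->key;    /* the one store that commits it all */
}

int main(void)
{
    struct branch_link where = { &parent_slot, 4242 };

    /* ...branch blocks would be allocated and zero-filled here... */
    splice(&where);
    printf("parent slot now points at block %u\n", parent_slot);
    return 0;
}
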
- */ - if (where->bh) { - BUFFER_TRACE(where->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, where->bh); - if (err) - goto err_out; - } - /* That's it */ - - *where->p = where->key; - - /* - * Update the host buffer_head or inode to point to more just allocated - * direct blocks blocks - */ - if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) - *(where->p + i) = cpu_to_le32(current_block++); - } - - /* We are done with atomic stuff, now do the rest of housekeeping */ - /* had we spliced it onto indirect block? */ - if (where->bh) { - /* - * If we spliced it onto an indirect block, we haven't - * altered the inode. Note however that if it is being spliced - * onto an indirect block at the very end of the file (the - * file is growing) then we *will* alter the inode to reflect - * the new i_size. But that is not done here - it is done in - * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. - */ - jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, where->bh); - if (err) - goto err_out; - } else { - /* - * OK, we spliced it into the inode itself on a direct block. - */ - ext4_mark_inode_dirty(handle, inode); - jbd_debug(5, "splicing direct\n"); - } - return err; - -err_out: - for (i = 1; i <= num; i++) { - /* - * branch[i].bh is newly allocated, so there is no - * need to revoke the block, which is why we don't - * need to set EXT4_FREE_BLOCKS_METADATA. - */ - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, - EXT4_FREE_BLOCKS_FORGET); - } - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), - blks, 0); - - return err; -} - -/* - * The ext4_ind_map_blocks() function handles non-extents inodes - * (i.e., using the traditional indirect/double-indirect i_blocks - * scheme) for ext4_map_blocks(). - * - * Allocation strategy is simple: if we have to allocate something, we will - * have to go the whole way to leaf. So let's do it before attaching anything - * to tree, set linkage between the newborn blocks, write them if sync is - * required, recheck the path, free and repeat if check fails, otherwise - * set the last missing link (that will protect us from any truncate-generated - * removals - all blocks on the path are immune now) and possibly force the - * write on the parent block. - * That has a nice additional property: no special recovery from the failed - * allocations is needed - we simply release blocks and do not touch anything - * reachable from inode. - * - * `handle' can be NULL if create == 0. - * - * return > 0, # of blocks mapped or allocated. - * return = 0, if plain lookup failed. - * return < 0, error case. - * - * The ext4_ind_get_blocks() function should be called with - * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem - * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system - * blocks. 
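
/*
 * Sketch of the caller-side locking contract described above, using a
 * pthreads rwlock in place of i_data_sem. This is a hypothetical
 * wrapper, not kernel code: plain lookups share the lock, while
 * allocating callers exclude all other mappers.
 */
#include <pthread.h>
#include <stdio.h>

#define GET_BLOCKS_CREATE 0x01

static pthread_rwlock_t i_data_sem = PTHREAD_RWLOCK_INITIALIZER;

static int map_blocks_locked(int flags)
{
    int ret;

    if (flags & GET_BLOCKS_CREATE)
        pthread_rwlock_wrlock(&i_data_sem);  /* allocating: exclusive */
    else
        pthread_rwlock_rdlock(&i_data_sem);  /* plain lookup: shared  */

    ret = 0;  /* ...the ext4_ind_map_blocks() body would run here... */

    pthread_rwlock_unlock(&i_data_sem);
    return ret;
}

int main(void)
{
    int a = map_blocks_locked(0);
    int b = map_blocks_locked(GET_BLOCKS_CREATE);

    printf("lookup -> %d, create -> %d\n", a, b);
    return 0;
}
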
- */ -int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, - int flags) -{ - int err = -EIO; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - ext4_fsblk_t goal; - int indirect_blks; - int blocks_to_boundary = 0; - int depth; - int count = 0; - ext4_fsblk_t first_block = 0; - - trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); - J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); - depth = ext4_block_to_path(inode, map->m_lblk, offsets, - &blocks_to_boundary); - - if (depth == 0) - goto out; - - partial = ext4_get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { - first_block = le32_to_cpu(chain[depth - 1].key); - count++; - /*map more blocks*/ - while (count < map->m_len && count <= blocks_to_boundary) { - ext4_fsblk_t blk; - - blk = le32_to_cpu(*(chain[depth-1].p + count)); - - if (blk == first_block + count) - count++; - else - break; - } - goto got_it; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) - goto cleanup; - - /* - * Okay, we need to do block allocation. - */ - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { - EXT4_ERROR_INODE(inode, "Can't allocate blocks for " - "non-extent mapped inodes with bigalloc"); - return -ENOSPC; - } - - goal = ext4_find_goal(inode, map->m_lblk, partial); - - /* the number of blocks need to allocate for [d,t]indirect blocks */ - indirect_blks = (chain + depth) - partial - 1; - - /* - * Next look up the indirect map to count the totoal number of - * direct blocks to allocate for this branch. - */ - count = ext4_blks_to_allocate(partial, indirect_blks, - map->m_len, blocks_to_boundary); - /* - * Block out ext4_truncate while we alter the tree - */ - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, - &count, goal, - offsets + (partial - chain), partial); - - /* - * The ext4_splice_branch call will free and forget any buffers - * on the new chain if there is a failure, but that risks using - * up transaction credits, especially for bitmaps where the - * credits cannot be returned. Can we handle this somehow? We - * may need to return -EAGAIN upwards in the worst case. --sct - */ - if (!err) - err = ext4_splice_branch(handle, inode, map->m_lblk, - partial, indirect_blks, count); - if (err) - goto cleanup; - - map->m_flags |= EXT4_MAP_NEW; - - ext4_update_inode_fsync_trans(handle, inode, 1); -got_it: - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = le32_to_cpu(chain[depth-1].key); - map->m_len = count; - if (count > blocks_to_boundary) - map->m_flags |= EXT4_MAP_BOUNDARY; - err = count; - /* Clean up and exit */ - partial = chain + depth - 1; /* the whole chain */ -cleanup: - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -out: - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, - map->m_pblk, map->m_len, err); - return err; -} - -/* - * O_DIRECT for ext3 (or indirect map) based files - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. 
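
/*
 * The "map more blocks" loop in ext4_ind_map_blocks() above extends a
 * hit while successive on-disk pointers stay physically contiguous,
 * capped by the request length and the indirect-block boundary.
 * Isolated model with invented data:
 */
#include <stdio.h>

static unsigned count_contiguous(const unsigned *ptrs, unsigned first_block,
                                 unsigned want, unsigned to_boundary)
{
    unsigned count = 1;                 /* ptrs[0] == first_block */

    while (count < want && count <= to_boundary &&
           ptrs[count] == first_block + count)
        count++;
    return count;
}

int main(void)
{
    unsigned leaf[] = { 5000, 5001, 5002, 5010, 5011 };

    /* run breaks at 5010, so only 3 blocks map in one go */
    printf("mapped %u blocks in one go\n",
           count_contiguous(leaf, 5000, 5, 4));
    return 0;
}
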
- * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. - */ -ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_length(iov, nr_segs); - int retries = 0; - - if (rw == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext4_journal_stop(handle); - } - } - -retry: - if (rw == READ && ext4_should_dioread_nolock(inode)) { - if (unlikely(!list_empty(&ei->i_completed_io_list))) { - mutex_lock(&inode->i_mutex); - ext4_flush_completed_IO(inode); - mutex_unlock(&inode->i_mutex); - } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL, NULL, 0); - } else { - ret = blockdev_direct_IO(rw, iocb, inode, iov, - offset, nr_segs, ext4_get_block); - - if (unlikely((rw & WRITE) && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + iov_length(iov, nr_segs); - - if (end > isize) - ext4_truncate_failed_write(inode); - } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. 
- */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - return ret; -} - -/* - * Calculate the number of metadata blocks need to reserve - * to allocate a new block at @lblocks for non extent file based file - */ -int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); - int blk_bits; - - if (lblock < EXT4_NDIR_BLOCKS) - return 0; - - lblock -= EXT4_NDIR_BLOCKS; - - if (ei->i_da_metadata_calc_len && - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { - ei->i_da_metadata_calc_len++; - return 0; - } - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; - ei->i_da_metadata_calc_len = 1; - blk_bits = order_base_2(lblock); - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; -} - -int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) -{ - int indirects; - - /* if nrblocks are contiguous */ - if (chunk) { - /* - * With N contiguous data blocks, we need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, - * 2 dindirect blocks, and 1 tindirect block - */ - return DIV_ROUND_UP(nrblocks, - EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; - } - /* - * if nrblocks are not contiguous, worse case, each block touch - * a indirect block, and each indirect block touch a double indirect - * block, plus a triple indirect block - */ - indirects = nrblocks * 2 + 1; - return indirects; -} - -/* - * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make - * sure we don't overflow the journal. - * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) -{ - handle_t *result; - - result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; -} - -/* - * Try to extend this transaction for the purposes of truncation. - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. - */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) -{ - if (!ext4_handle_valid(handle)) - return 0; - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) - return 0; - return 1; -} - -/* - * Probably it should be a library function... search for first non-zero word - * or memcmp with zero_page, whatever is better for particular architecture. - * Linus? - */ -static inline int all_zeroes(__le32 *p, __le32 *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -/** - * ext4_find_shared - find the indirect blocks for partial truncation. - * @inode: inode in question - * @depth: depth of the affected branch - * @offsets: offsets of pointers in that branch (see ext4_block_to_path) - * @chain: place to store the pointers to partial indirect blocks - * @top: place to the (detached) top of branch - * - * This is a helper function used by ext4_truncate(). 
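
/*
 * The journal-credit estimate of ext4_ind_trans_blocks() above,
 * restated as a standalone calculation: contiguous runs share their
 * indirect blocks, so they need far fewer credits than scattered
 * blocks, where each block may touch its own indirect chain.
 */
#include <stdio.h>

#define ADDR_PER_BLOCK 1024   /* 4k blocks: 1024 pointers per block */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static int ind_trans_blocks(int nrblocks, int contiguous)
{
    if (contiguous)
        /* N/ADDR_PER_BLOCK + 1 indirect + 2 dindirect + 1 tindirect */
        return DIV_ROUND_UP(nrblocks, ADDR_PER_BLOCK) + 4;
    /* worst case: an indirect and a dindirect per block, plus tindirect */
    return nrblocks * 2 + 1;
}

int main(void)
{
    printf("contiguous: %d credits, scattered: %d credits\n",
           ind_trans_blocks(2048, 1), ind_trans_blocks(2048, 0));
    return 0;
}
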
- * - * When we do truncate() we may have to clean the ends of several - * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is referred - * from it (and it is on the path to the first completely truncated - * data block, indeed). We have to free the top of that path along - * with everything to the right of the path. Since no allocation - * past the truncation point is possible until ext4_truncate() - * finishes, we may safely do the latter, but top of branch may - * require special attention - pageout below the truncation point - * might try to populate it. - * - * We atomically detach the top of branch from the tree, store the - * block number of its root in *@top, pointers to buffer_heads of - * partially truncated blocks - in @chain[].bh and pointers to - * their last elements that should not be removed - in - * @chain[].p. Return value is the pointer to last filled element - * of @chain. - * - * The work left to caller to do the actual freeing of subtrees: - * a) free the subtree starting from *@top - * b) free the subtrees whose roots are stored in - * (@chain[i].p+1 .. end of @chain[i].bh->b_data) - * c) free the subtrees growing from the inode past the @chain[0]. - * (no partially truncated stuff there). */ - -static Indirect *ext4_find_shared(struct inode *inode, int depth, - ext4_lblk_t offsets[4], Indirect chain[4], - __le32 *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - /* Make k index the deepest non-null offset + 1 */ - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = ext4_get_branch(inode, k, offsets, chain, &err); - /* Writer: pointers */ - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) - /* Writer: end */ - goto no_top; - for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. - */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - /* Nope, don't do this in ext4. Must leave the tree intact */ -#if 0 - *p->p = 0; -#endif - } - /* Writer: end */ - - while (partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -/* - * Zero a number of block pointers in either an inode or an indirect block. - * If we restart the transaction we must again get write access to the - * indirect block for further modification. - * - * We release `count' blocks on disk, but (last - first) may be greater - * than `count' because there can be holes in there. - * - * Return 0 on success, 1 on invalid block range - * and < 0 on fatal error. 
- */ -static int ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, - ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, - __le32 *last) -{ - __le32 *p; - int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; - int err; - - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { - EXT4_ERROR_INODE(inode, "attempt to clear invalid " - "blocks %llu len %lu", - (unsigned long long) block_to_free, count); - return 1; - } - - if (try_to_extend_transaction(handle, inode)) { - if (bh) { - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) - goto out_err; - } - err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) - goto out_err; - err = ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - if (unlikely(err)) - goto out_err; - if (bh) { - BUFFER_TRACE(bh, "retaking write access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) - goto out_err; - } - } - - for (p = first; p < last; p++) - *p = 0; - - ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); - return 0; -out_err: - ext4_std_error(inode->i_sb, err); - return err; -} - -/** - * ext4_free_data - free a list of data blocks - * @handle: handle for this transaction - * @inode: inode we are dealing with - * @this_bh: indirect buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: points immediately past the end of array - * - * We are freeing all blocks referred from that array (numbers are stored as - * little-endian 32-bit) and updating @inode->i_blocks appropriately. - * - * We accumulate contiguous runs of blocks to free. Conveniently, if these - * blocks are contiguous then releasing them at one time will only affect one - * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't - * actually use a lot of journal space. - * - * @this_bh will be %NULL if @first and @last point into the inode's direct - * block pointers. - */ -static void ext4_free_data(handle_t *handle, struct inode *inode, - struct buffer_head *this_bh, - __le32 *first, __le32 *last) -{ - ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ - __le32 *block_to_free_p = NULL; /* Pointer into inode/ind - corresponding to - block_to_free */ - ext4_fsblk_t nr; /* Current block # */ - __le32 *p; /* Pointer into inode/ind - for current block */ - int err = 0; - - if (this_bh) { /* For indirect block */ - BUFFER_TRACE(this_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, this_bh); - /* Important: if we can't update the indirect pointers - * to the blocks, we can't free them. 
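
/*
 * Shape of the "extend or restart" dance ext4_clear_blocks() just
 * performed, modelled with stub journal calls (the stubs are invented,
 * only the ordering is the point): before a restart commits the
 * handle, every buffer touched so far must be dirtied, and write
 * access must be re-taken afterwards.
 */
#include <stdio.h>

static int credits_left = 1;

static int handle_has_enough_credits(int want)
{
    return credits_left >= want;
}

static void dirty_metadata(const char *what)
{
    printf("dirty %s\n", what);
}

static void restart_handle(int want)
{
    credits_left = want;
    printf("restarted handle\n");
}

static void get_write_access(const char *what)
{
    printf("re-take write access on %s\n", what);
}

static void maybe_restart(int want)
{
    if (handle_has_enough_credits(want))
        return;
    dirty_metadata("indirect block");   /* keep the journal consistent */
    dirty_metadata("inode");
    restart_handle(want);
    get_write_access("indirect block"); /* must re-take after restart */
}

int main(void)
{
    maybe_restart(32);
    return 0;
}
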
*/ - if (err) - return; - } - - for (p = first; p < last; p++) { - nr = le32_to_cpu(*p); - if (nr) { - /* accumulate blocks to free if they're contiguous */ - if (count == 0) { - block_to_free = nr; - block_to_free_p = p; - count = 1; - } else if (nr == block_to_free + count) { - count++; - } else { - err = ext4_clear_blocks(handle, inode, this_bh, - block_to_free, count, - block_to_free_p, p); - if (err) - break; - block_to_free = nr; - block_to_free_p = p; - count = 1; - } - } - } - - if (!err && count > 0) - err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); - if (err < 0) - /* fatal error */ - return; - - if (this_bh) { - BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); - - /* - * The buffer head should have an attached journal head at this - * point. However, if the data is corrupted and an indirect - * block pointed to itself, it would have been detached when - * the block was cleared. Check for this instead of OOPSing. - */ - if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) - ext4_handle_dirty_metadata(handle, inode, this_bh); - else - EXT4_ERROR_INODE(inode, - "circular indirect block detected at " - "block %llu", - (unsigned long long) this_bh->b_blocknr); - } -} - -/** - * ext4_free_branches - free an array of branches - * @handle: JBD handle for this transaction - * @inode: inode we are dealing with - * @parent_bh: the buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: pointer immediately past the end of array - * @depth: depth of the branches to free - * - * We are freeing all blocks referred from these branches (numbers are - * stored as little-endian 32-bit) and updating @inode->i_blocks - * appropriately. - */ -static void ext4_free_branches(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, - __le32 *first, __le32 *last, int depth) -{ - ext4_fsblk_t nr; - __le32 *p; - - if (ext4_handle_is_aborted(handle)) - return; - - if (depth--) { - struct buffer_head *bh; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - p = last; - while (--p >= first) { - nr = le32_to_cpu(*p); - if (!nr) - continue; /* A hole */ - - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { - EXT4_ERROR_INODE(inode, - "invalid indirect mapped " - "block %lu (level %d)", - (unsigned long) nr, depth); - break; - } - - /* Go read the buffer for the next level down */ - bh = sb_bread(inode->i_sb, nr); - - /* - * A read failure? Report error and clear slot - * (should be rare). - */ - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, nr, - "Read failure"); - continue; - } - - /* This zaps the entire block. Bottom up. */ - BUFFER_TRACE(bh, "free child branches"); - ext4_free_branches(handle, inode, bh, - (__le32 *) bh->b_data, - (__le32 *) bh->b_data + addr_per_block, - depth); - brelse(bh); - - /* - * Everything below this this pointer has been - * released. Now let this top-of-subtree go. - * - * We want the freeing of this indirect block to be - * atomic in the journal with the updating of the - * bitmap block which owns it. So make some room in - * the journal. - * - * We zero the parent pointer *after* freeing its - * pointee in the bitmaps, so if extend_transaction() - * for some reason fails to put the bitmap changes and - * the release into the same transaction, recovery - * will merely complain about releasing a free block, - * rather than leaking blocks. 
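
/*
 * The run-accumulation loop from ext4_free_data() above, reduced to
 * its essentials: walk an array of block pointers and emit one free
 * operation per contiguous run instead of one per block. free_blocks()
 * is a made-up stand-in for ext4_free_blocks().
 */
#include <stdio.h>
#include <stdint.h>

static void free_blocks(uint32_t start, unsigned long count)
{
    printf("free %lu block(s) at %u\n", count, start);
}

static void free_data(const uint32_t *first, const uint32_t *last)
{
    uint32_t run_start = 0;
    unsigned long count = 0;

    for (const uint32_t *p = first; p < last; p++) {
        uint32_t nr = *p;

        if (!nr)
            continue;                    /* a hole */
        if (count && nr == run_start + count) {
            count++;                     /* run continues */
        } else {
            if (count)
                free_blocks(run_start, count);
            run_start = nr;              /* start a new run */
            count = 1;
        }
    }
    if (count)
        free_blocks(run_start, count);
}

int main(void)
{
    uint32_t ptrs[] = { 100, 101, 102, 0, 200, 201 };

    free_data(ptrs, ptrs + 6);   /* two runs: 100..102 and 200..201 */
    return 0;
}
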
- */ - if (ext4_handle_is_aborted(handle)) - return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_truncate_restart_trans(handle, inode, - ext4_blocks_for_truncate(inode)); - } - - /* - * The forget flag here is critical because if - * we are journaling (and not doing data - * journaling), we have to make sure a revoke - * record is written to prevent the journal - * replay from overwriting the (former) - * indirect block if it gets reallocated as a - * data block. This must happen in the same - * transaction where the data blocks are - * actually freed. - */ - ext4_free_blocks(handle, inode, NULL, nr, 1, - EXT4_FREE_BLOCKS_METADATA| - EXT4_FREE_BLOCKS_FORGET); - - if (parent_bh) { - /* - * The block which we have just freed is - * pointed to by an indirect block: journal it - */ - BUFFER_TRACE(parent_bh, "get_write_access"); - if (!ext4_journal_get_write_access(handle, - parent_bh)){ - *p = 0; - BUFFER_TRACE(parent_bh, - "call ext4_handle_dirty_metadata"); - ext4_handle_dirty_metadata(handle, - inode, - parent_bh); - } - } - } - } else { - /* We have reached the bottom of the tree. */ - BUFFER_TRACE(parent_bh, "free data blocks"); - ext4_free_data(handle, inode, parent_bh, first, last); - } -} - -void ext4_ind_truncate(struct inode *inode) -{ - handle_t *handle; - struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *i_data = ei->i_data; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - struct address_space *mapping = inode->i_mapping; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - __le32 nr = 0; - int n = 0; - ext4_lblk_t last_block, max_block; - loff_t page_len; - unsigned blocksize = inode->i_sb->s_blocksize; - int err; - - handle = start_transaction(inode); - if (IS_ERR(handle)) - return; /* AKPM: return what? */ - - last_block = (inode->i_size + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); - - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - err = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (err) - goto out_stop; - } - - if (last_block != max_block) { - n = ext4_block_to_path(inode, last_block, offsets, NULL); - if (n == 0) - goto out_stop; /* error */ - } - - /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. - * - * Implication: the file must always be in a sane, consistent - * truncatable state while each transaction commits. - */ - if (ext4_orphan_add(handle, inode)) - goto out_stop; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. - */ - down_write(&ei->i_data_sem); - - ext4_discard_preallocations(inode); - - /* - * The orphan list entry will now protect us from any crash which - * occurs before the truncate completes, so it is now safe to propagate - * the new, shorter inode size (held for now in i_size) into the - * on-disk inode. We do this via i_disksize, which is the value which - * ext4 *really* writes onto the disk inode. 
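
/*
 * The boundary arithmetic at the top of ext4_ind_truncate() above:
 * round the new size up to a whole block to find the first block that
 * must be freed, and compute the indirect-scheme maximum the same way.
 * The sizes here are invented for illustration.
 */
#include <stdio.h>

int main(void)
{
    unsigned long long i_size = 10000;     /* new file size in bytes */
    unsigned blocksize = 4096, bits = 12;  /* 4k blocks              */
    unsigned long long bitmap_maxbytes = 0x100000000ULL; /* made up  */

    unsigned long long last_block = (i_size + blocksize - 1) >> bits;
    unsigned long long max_block  = (bitmap_maxbytes + blocksize - 1) >> bits;

    printf("first block to free: %llu (of max %llu)\n",
           last_block, max_block);
    return 0;
}
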
- */ - ei->i_disksize = inode->i_size; - - if (last_block == max_block) { - /* - * It is unnecessary to free any data blocks if last_block is - * equal to the indirect block limit. - */ - goto out_unlock; - } else if (n == 1) { /* direct blocks */ - ext4_free_data(handle, inode, NULL, i_data+offsets[0], - i_data + EXT4_NDIR_BLOCKS); - goto do_indirects; - } - - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (not detached) */ - if (nr) { - if (partial == chain) { - /* Shared branch grows from the inode */ - ext4_free_branches(handle, inode, NULL, - &nr, &nr+1, (chain+n-1) - partial); - *partial->p = 0; - /* - * We mark the inode dirty prior to restart, - * and prior to stop. No need for it here. - */ - } else { - /* Shared branch grows from an indirect block */ - BUFFER_TRACE(partial->bh, "get_write_access"); - ext4_free_branches(handle, inode, partial->bh, - partial->p, - partial->p+1, (chain+n-1) - partial); - } - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - ext4_free_branches(handle, inode, partial->bh, partial->p + 1, - (__le32*)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees */ - switch (offsets[0]) { - default: - nr = i_data[EXT4_IND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); - i_data[EXT4_IND_BLOCK] = 0; - } - case EXT4_IND_BLOCK: - nr = i_data[EXT4_DIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); - i_data[EXT4_DIND_BLOCK] = 0; - } - case EXT4_DIND_BLOCK: - nr = i_data[EXT4_TIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); - i_data[EXT4_TIND_BLOCK] = 0; - } - case EXT4_TIND_BLOCK: - ; - } - -out_unlock: - up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ - if (IS_SYNC(inode)) - ext4_handle_sync(handle); -out_stop: - /* - * If this was a simple ftruncate(), and the file will remain alive - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. 
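
/*
 * The do_indirects switch above relies on case fall-through: starting
 * from the level where the partial branch lived, every deeper whole
 * subtree (single, then double, then triple indirect) is killed in
 * order. Modelled here with stubbed tree roots; the slot numbers are
 * the real EXT4_*_BLOCK values, the rest is invented.
 */
#include <stdio.h>

enum { IND_BLOCK = 12, DIND_BLOCK = 13, TIND_BLOCK = 14 };

static unsigned i_data[15] = { [IND_BLOCK] = 777, [DIND_BLOCK] = 888,
                               [TIND_BLOCK] = 999 };

static void free_branches(unsigned root, int depth)
{
    printf("free depth-%d subtree rooted at %u\n", depth, root);
}

static void kill_from(int first_offset)
{
    switch (first_offset) {
    default:              /* truncate point was in the direct blocks */
        if (i_data[IND_BLOCK]) {
            free_branches(i_data[IND_BLOCK], 1);
            i_data[IND_BLOCK] = 0;
        }
        /* fall through */
    case IND_BLOCK:
        if (i_data[DIND_BLOCK]) {
            free_branches(i_data[DIND_BLOCK], 2);
            i_data[DIND_BLOCK] = 0;
        }
        /* fall through */
    case DIND_BLOCK:
        if (i_data[TIND_BLOCK]) {
            free_branches(i_data[TIND_BLOCK], 3);
            i_data[TIND_BLOCK] = 0;
        }
        /* fall through */
    case TIND_BLOCK:
        ;
    }
}

int main(void)
{
    kill_from(IND_BLOCK);   /* frees double and triple, keeps single */
    return 0;
}
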
- */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - - ext4_journal_stop(handle); - trace_ext4_truncate_exit(inode); -} - diff --git a/ANDROID_3.4.5/fs/ext4/inode.c b/ANDROID_3.4.5/fs/ext4/inode.c deleted file mode 100644 index c77b0bd2..00000000 --- a/ANDROID_3.4.5/fs/ext4/inode.c +++ /dev/null @@ -1,4676 +0,0 @@ -/* - * linux/fs/ext4/inode.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/inode.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 64-bit file support on 64-bit platforms by Jakub Jelinek - * (jj@sunsite.ms.mff.cuni.cz) - * - * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 - */ - -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/jbd2.h> -#include <linux/highuid.h> -#include <linux/pagemap.h> -#include <linux/quotaops.h> -#include <linux/string.h> -#include <linux/buffer_head.h> -#include <linux/writeback.h> -#include <linux/pagevec.h> -#include <linux/mpage.h> -#include <linux/namei.h> -#include <linux/uio.h> -#include <linux/bio.h> -#include <linux/workqueue.h> -#include <linux/kernel.h> -#include <linux/printk.h> -#include <linux/slab.h> -#include <linux/ratelimit.h> - -#include "ext4_jbd2.h" -#include "xattr.h" -#include "acl.h" -#include "truncate.h" - -#include <trace/events/ext4.h> - -#define MPAGE_DA_EXTENT_TAIL 0x01 - -static inline int ext4_begin_ordered_truncate(struct inode *inode, - loff_t new_size) -{ - trace_ext4_begin_ordered_truncate(inode, new_size); - /* - * If jinode is zero, then we never opened the file for - * writing, so there's no need to call - * jbd2_journal_begin_ordered_truncate() since there's no - * outstanding writes we need to flush. - */ - if (!EXT4_I(inode)->jinode) - return 0; - return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), - EXT4_I(inode)->jinode, - new_size); -} - -static void ext4_invalidatepage(struct page *page, unsigned long offset); -static int noalloc_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); -static int __ext4_journalled_writepage(struct page *page, unsigned int len); -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags); - -/* - * Test whether an inode is a fast symlink. - */ -static int ext4_inode_is_fast_symlink(struct inode *inode) -{ - int ea_blocks = EXT4_I(inode)->i_file_acl ? - (inode->i_sb->s_blocksize >> 9) : 0; - - return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); -} - -/* - * Restart the transaction associated with *handle. This does a commit, - * so before we call here everything must be consistently dirtied against - * this transaction. - */ -int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, - int nblocks) -{ - int ret; - - /* - * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this - * moment, get_block can be called only for blocks inside i_size since - * page cache has been already dropped and writes are blocked by - * i_mutex. So we can safely drop the i_data_sem here. 
- */ - BUG_ON(EXT4_JOURNAL(inode) == NULL); - jbd_debug(2, "restarting handle %p\n", handle); - up_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_journal_restart(handle, nblocks); - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); - - return ret; -} - -/* - * Called at the last iput() if i_nlink is zero. - */ -void ext4_evict_inode(struct inode *inode) -{ - handle_t *handle; - int err; - - trace_ext4_evict_inode(inode); - - ext4_ioend_wait(inode); - - if (inode->i_nlink) { - /* - * When journalling data dirty buffers are tracked only in the - * journal. So although mm thinks everything is clean and - * ready for reaping the inode might still have some pages to - * write in the running transaction or waiting to be - * checkpointed. Thus calling jbd2_journal_invalidatepage() - * (via truncate_inode_pages()) to discard these buffers can - * cause data loss. Also even if we did not discard these - * buffers, we would have no way to find them after the inode - * is reaped and thus user could see stale data if he tries to - * read them before the transaction is checkpointed. So be - * careful and force everything to disk here... We use - * ei->i_datasync_tid to store the newest transaction - * containing inode's data. - * - * Note that directories do not have this problem because they - * don't use page cache. - */ - if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - - jbd2_log_start_commit(journal, commit_tid); - jbd2_log_wait_commit(journal, commit_tid); - filemap_write_and_wait(&inode->i_data); - } - truncate_inode_pages(&inode->i_data, 0); - goto no_delete; - } - - if (!is_bad_inode(inode)) - dquot_initialize(inode); - - if (ext4_should_order_data(inode)) - ext4_begin_ordered_truncate(inode, 0); - truncate_inode_pages(&inode->i_data, 0); - - if (is_bad_inode(inode)) - goto no_delete; - - handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); - if (IS_ERR(handle)) { - ext4_std_error(inode->i_sb, PTR_ERR(handle)); - /* - * If we're going to skip the normal cleanup, we still need to - * make sure that the in-core orphan linked list is properly - * cleaned up. - */ - ext4_orphan_del(NULL, inode); - goto no_delete; - } - - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - inode->i_size = 0; - err = ext4_mark_inode_dirty(handle, inode); - if (err) { - ext4_warning(inode->i_sb, - "couldn't mark inode dirty (err %d)", err); - goto stop_handle; - } - if (inode->i_blocks) - ext4_truncate(inode); - - /* - * ext4_ext_truncate() doesn't reserve any slop when it - * restarts journal transactions; therefore there may not be - * enough credits left in the handle to remove the inode from - * the orphan list and set the dtime field. - */ - if (!ext4_handle_has_enough_credits(handle, 3)) { - err = ext4_journal_extend(handle, 3); - if (err > 0) - err = ext4_journal_restart(handle, 3); - if (err != 0) { - ext4_warning(inode->i_sb, - "couldn't extend journal (err %d)", err); - stop_handle: - ext4_journal_stop(handle); - ext4_orphan_del(NULL, inode); - goto no_delete; - } - } - - /* - * Kill off the orphan record which ext4_truncate created. - * AKPM: I think this can be inside the above `if'. - * Note that ext4_orphan_del() has to be able to cope with the - * deletion of a non-existent orphan - this is because we don't - * know if ext4_truncate() actually created an orphan record. 
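[Annotation] The credit dance above (has_enough_credits, then extend, then restart) is a standard jbd2 pattern: try to stretch the running handle first, and only commit-and-reopen when the journal refuses. A toy model of that control flow, with invented journal_extend()/journal_restart() stand-ins that copy jbd2's return convention (0 = done in place, > 0 = caller must restart):

    #include <stdio.h>

    struct handle { int credits; };

    #define MAX_CREDITS 8   /* pretend the journal can grant this much */

    static int journal_extend(struct handle *h, int want)
    {
        if (h->credits + want <= MAX_CREDITS) {
            h->credits += want;
            return 0;       /* extended in place */
        }
        return 1;           /* refused: caller must restart */
    }

    static int journal_restart(struct handle *h, int want)
    {
        /* commit what we have and open a fresh handle */
        h->credits = want;
        return 0;
    }

    static int ensure_credits(struct handle *h, int need)
    {
        if (h->credits >= need)
            return 0;
        int err = journal_extend(h, need - h->credits);
        if (err > 0)                       /* extension refused */
            err = journal_restart(h, need);
        return err;
    }

    int main(void)
    {
        struct handle h = { .credits = 1 };
        if (ensure_credits(&h, 3) == 0)
            printf("handle now holds %d credits\n", h.credits);
        return 0;
    }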
- * (Well, we could do this if we need to, but heck - it works) - */ - ext4_orphan_del(handle, inode); - EXT4_I(inode)->i_dtime = get_seconds(); - - /* - * One subtle ordering requirement: if anything has gone wrong - * (transaction abort, IO errors, whatever), then we can still - * do these next steps (the fs will already have been marked as - * having errors), but we can't free the inode if the mark_dirty - * fails. - */ - if (ext4_mark_inode_dirty(handle, inode)) - /* If that failed, just do the required in-core inode clear. */ - ext4_clear_inode(inode); - else - ext4_free_inode(handle, inode); - ext4_journal_stop(handle); - return; -no_delete: - ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ -} - -#ifdef CONFIG_QUOTA -qsize_t *ext4_get_reserved_space(struct inode *inode) -{ - return &EXT4_I(inode)->i_reserved_quota; -} -#endif - -/* - * Calculate the number of metadata blocks need to reserve - * to allocate a block located at @lblock - */ -static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) -{ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ext_calc_metadata_amount(inode, lblock); - - return ext4_ind_calc_metadata_amount(inode, lblock); -} - -/* - * Called with i_data_sem down, which is important since we can call - * ext4_discard_preallocations() from here. - */ -void ext4_da_update_reserve_space(struct inode *inode, - int used, int quota_claim) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); - - spin_lock(&ei->i_block_reservation_lock); - trace_ext4_da_update_reserve_space(inode, used, quota_claim); - if (unlikely(used > ei->i_reserved_data_blocks)) { - ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " - "with only %d reserved data blocks", - __func__, inode->i_ino, used, - ei->i_reserved_data_blocks); - WARN_ON(1); - used = ei->i_reserved_data_blocks; - } - - /* Update per-inode reservations */ - ei->i_reserved_data_blocks -= used; - ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - used + ei->i_allocated_meta_blocks); - ei->i_allocated_meta_blocks = 0; - - if (ei->i_reserved_data_blocks == 0) { - /* - * We can release all of the reserved metadata blocks - * only when we have written all of the delayed - * allocation blocks. - */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - ei->i_reserved_meta_blocks); - ei->i_reserved_meta_blocks = 0; - ei->i_da_metadata_calc_len = 0; - } - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - - /* Update quota subsystem for data blocks */ - if (quota_claim) - dquot_claim_block(inode, EXT4_C2B(sbi, used)); - else { - /* - * We did fallocate with an offset that is already delayed - * allocated. So on delayed allocated writeback we should - * not re-claim the quota for fallocated blocks. - */ - dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); - } - - /* - * If we have done all the pending block allocations and if - * there aren't any writers on the inode, we can discard the - * inode's preallocations. 
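[Annotation] The reservation accounting above has one subtlety worth isolating: data reservations drain block by block as writeback allocates them, but the metadata estimate can only be released once every delayed data block is out, since any block still pending might require new index blocks. A toy model of just that rule (struct and field names invented for the demo):

    #include <stdio.h>

    struct da_account {
        unsigned reserved_data;   /* delayed data blocks still pending */
        unsigned reserved_meta;   /* worst-case metadata estimate */
    };

    static void da_update(struct da_account *a, unsigned used)
    {
        if (used > a->reserved_data)   /* accounting bug: clamp, as the
                                          WARN_ON path above does */
            used = a->reserved_data;
        a->reserved_data -= used;
        if (a->reserved_data == 0)
            a->reserved_meta = 0;      /* all data written: estimate
                                          can finally be dropped */
    }

    int main(void)
    {
        struct da_account a = { .reserved_data = 4, .reserved_meta = 2 };
        da_update(&a, 3);
        printf("data=%u meta=%u\n", a.reserved_data, a.reserved_meta); /* 1 2 */
        da_update(&a, 1);
        printf("data=%u meta=%u\n", a.reserved_data, a.reserved_meta); /* 0 0 */
        return 0;
    }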
- */ - if ((ei->i_reserved_data_blocks == 0) && - (atomic_read(&inode->i_writecount) == 0)) - ext4_discard_preallocations(inode); -} - -static int __check_block_validity(struct inode *inode, const char *func, - unsigned int line, - struct ext4_map_blocks *map) -{ - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, - map->m_len)) { - ext4_error_inode(inode, func, line, map->m_pblk, - "lblock %lu mapped to illegal pblock " - "(length %d)", (unsigned long) map->m_lblk, - map->m_len); - return -EIO; - } - return 0; -} - -#define check_block_validity(inode, map) \ - __check_block_validity((inode), __func__, __LINE__, (map)) - -/* - * Return the number of contiguous dirty pages in a given inode - * starting at page frame idx. - */ -static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, - unsigned int max_pages) -{ - struct address_space *mapping = inode->i_mapping; - pgoff_t index; - struct pagevec pvec; - pgoff_t num = 0; - int i, nr_pages, done = 0; - - if (max_pages == 0) - return 0; - pagevec_init(&pvec, 0); - while (!done) { - index = idx; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - (pgoff_t)PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - lock_page(page); - if (unlikely(page->mapping != mapping) || - !PageDirty(page) || - PageWriteback(page) || - page->index != idx) { - done = 1; - unlock_page(page); - break; - } - if (page_has_buffers(page)) { - bh = head = page_buffers(page); - do { - if (!buffer_delay(bh) && - !buffer_unwritten(bh)) - done = 1; - bh = bh->b_this_page; - } while (!done && (bh != head)); - } - unlock_page(page); - if (done) - break; - idx++; - num++; - if (num >= max_pages) { - done = 1; - break; - } - } - pagevec_release(&pvec); - } - return num; -} - -/* - * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. - */ -static void set_buffers_da_mapped(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; - int i, nr_pages; - pgoff_t index, end; - - index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (map->m_lblk + map->m_len - 1) >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); - - pagevec_init(&pvec, 0); - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, - min(end - index + 1, - (pgoff_t)PAGEVEC_SIZE)); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - if (unlikely(page->mapping != mapping) || - !PageDirty(page)) - break; - - if (page_has_buffers(page)) { - bh = head = page_buffers(page); - do { - set_buffer_da_mapped(bh); - bh = bh->b_this_page; - } while (bh != head); - } - index++; - } - pagevec_release(&pvec); - } -} - -/* - * The ext4_map_blocks() function tries to look up the requested blocks, - * and returns if the blocks are already mapped. - * - * Otherwise it takes the write lock of the i_data_sem and allocate blocks - * and store the allocated blocks in the result buffer head and mark it - * mapped. - * - * If file type is extents based, it will call ext4_ext_map_blocks(), - * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping - * based files - * - * On success, it returns the number of blocks being mapped or allocate. - * if create==0 and the blocks are pre-allocated and uninitialized block, - * the result buffer head is unmapped. 
If the create ==1, it will make sure - * the buffer head is mapped. - * - * It returns 0 if plain look up failed (blocks have not been allocated), in - * that case, buffer head is unmapped - * - * It returns the error in case of allocation failure. - */ -int ext4_map_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, int flags) -{ - int retval; - - map->m_flags = 0; - ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," - "logical block %lu\n", inode->i_ino, flags, map->m_len, - (unsigned long) map->m_lblk); - /* - * Try to see if we can get the block without requesting a new - * file system block. - */ - down_read((&EXT4_I(inode)->i_data_sem)); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); - } - up_read((&EXT4_I(inode)->i_data_sem)); - - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); - if (ret != 0) - return ret; - } - - /* If it is only a block(s) look up */ - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) - return retval; - - /* - * Returns if the blocks have already allocated - * - * Note that if blocks have been preallocated - * ext4_ext_get_block() returns the create = 0 - * with buffer head unmapped. - */ - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - return retval; - - /* - * When we call get_blocks without the create flag, the - * BH_Unwritten flag could have gotten set if the blocks - * requested were part of a uninitialized extent. We need to - * clear this flag now that we are committed to convert all or - * part of the uninitialized extent to be an initialized - * extent. This is because we need to avoid the combination - * of BH_Unwritten and BH_Mapped flags being simultaneously - * set on the buffer_head. - */ - map->m_flags &= ~EXT4_MAP_UNWRITTEN; - - /* - * New blocks allocate and/or writing to uninitialized extent - * will possibly result in updating i_data, so we take - * the write lock of i_data_sem, and call get_blocks() - * with create == 1 flag. - */ - down_write((&EXT4_I(inode)->i_data_sem)); - - /* - * if the caller is from delayed allocation writeout path - * we have already reserved fs blocks for allocation - * let the underlying get_block() function know to - * avoid double accounting - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - /* - * We need to check for EXT4 here because migrate - * could have changed the inode type in between - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags); - - if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { - /* - * We allocated new blocks which will result in - * i_data's format changing. Force the migrate - * to fail by clearing migrate flags - */ - ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); - } - - /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. We don't - * support fallocate for non extent files. So we can update - * reserve space here. 
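[Annotation] ext4_map_blocks() above is a textbook two-phase rwsem pattern: the common case is a lookup under the shared lock, and only when allocation is actually needed does it retake i_data_sem exclusively and map again with create set. A self-contained POSIX-threads sketch of the same shape over a toy in-memory map (link with -pthread); note the recheck under the write lock, which in ext4 happens implicitly because the lookup is re-run:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;
    static long map[16];        /* logical -> physical; 0 = unmapped */
    static long next_pblk = 100;

    static long map_block(int lblk, int create)
    {
        long pblk;

        pthread_rwlock_rdlock(&map_lock);   /* cheap shared lookup */
        pblk = map[lblk];
        pthread_rwlock_unlock(&map_lock);

        if (pblk || !create)
            return pblk;                    /* hit, or lookup-only */

        pthread_rwlock_wrlock(&map_lock);   /* exclusive for allocation */
        if (!map[lblk])                     /* recheck: lock was dropped */
            map[lblk] = next_pblk++;
        pblk = map[lblk];
        pthread_rwlock_unlock(&map_lock);
        return pblk;
    }

    int main(void)
    {
        printf("lookup: %ld\n", map_block(3, 0));   /* 0: unmapped */
        printf("create: %ld\n", map_block(3, 1));   /* 100 */
        printf("lookup: %ld\n", map_block(3, 0));   /* 100 */
        return 0;
    }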
- */ - if ((retval > 0) && - (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) - ext4_da_update_reserve_space(inode, retval, 1); - } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - - /* If we have successfully mapped the delayed allocated blocks, - * set the BH_Da_Mapped bit on them. Its important to do this - * under the protection of i_data_sem. - */ - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - set_buffers_da_mapped(inode, map); - } - - up_write((&EXT4_I(inode)->i_data_sem)); - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); - if (ret != 0) - return ret; - } - return retval; -} - -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 - -static int _ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int flags) -{ - handle_t *handle = ext4_journal_current_handle(); - struct ext4_map_blocks map; - int ret = 0, started = 0; - int dio_credits; - - map.m_lblk = iblock; - map.m_len = bh->b_size >> inode->i_blkbits; - - if (flags && !handle) { - /* Direct IO write... */ - if (map.m_len > DIO_MAX_BLOCKS) - map.m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); - handle = ext4_journal_start(inode, dio_credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; - } - started = 1; - } - - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret > 0) { - map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - bh->b_size = inode->i_sb->s_blocksize * map.m_len; - ret = 0; - } - if (started) - ext4_journal_stop(handle); - return ret; -} - -int ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - return _ext4_get_block(inode, iblock, bh, - create ? EXT4_GET_BLOCKS_CREATE : 0); -} - -/* - * `handle' can be NULL if create is zero - */ -struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create, int *errp) -{ - struct ext4_map_blocks map; - struct buffer_head *bh; - int fatal = 0, err; - - J_ASSERT(handle != NULL || create == 0); - - map.m_lblk = block; - map.m_len = 1; - err = ext4_map_blocks(handle, inode, &map, - create ? EXT4_GET_BLOCKS_CREATE : 0); - - if (err < 0) - *errp = err; - if (err <= 0) - return NULL; - *errp = 0; - - bh = sb_getblk(inode->i_sb, map.m_pblk); - if (!bh) { - *errp = -EIO; - return NULL; - } - if (map.m_flags & EXT4_MAP_NEW) { - J_ASSERT(create != 0); - J_ASSERT(handle != NULL); - - /* - * Now that we do not always journal data, we should - * keep in mind whether this should always journal the - * new buffer as metadata. For now, regular file - * writes use ext4_get_block instead, so it's not a - * problem. 
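[Annotation] One small detail above: a direct-IO write arriving with no running handle starts its own transaction, but first clamps the request to DIO_MAX_BLOCKS so the credit estimate stays bounded. A sketch of that clamping; chunk_trans_credits() here is a made-up stand-in for ext4_chunk_trans_blocks(), whose real estimate depends on the inode's metadata tree:

    #include <stdio.h>

    #define DIO_MAX_BLOCKS 4096   /* cap one direct-IO mapping request */

    /* invented estimate: base cost plus one index block per 1024 blocks */
    static int chunk_trans_credits(unsigned blocks)
    {
        return 3 + blocks / 1024;
    }

    int main(void)
    {
        unsigned long want = 10000;    /* blocks the caller asked for */
        unsigned len = want > DIO_MAX_BLOCKS ? DIO_MAX_BLOCKS : want;

        printf("mapping %u blocks, reserving %d credits\n",
               len, chunk_trans_credits(len));
        return 0;   /* prints: mapping 4096 blocks, reserving 7 credits */
    }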
- */ - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - fatal = ext4_journal_get_create_access(handle, bh); - if (!fatal && !buffer_uptodate(bh)) { - memset(bh->b_data, 0, inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - } - unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); - if (!fatal) - fatal = err; - } else { - BUFFER_TRACE(bh, "not a new buffer"); - } - if (fatal) { - *errp = fatal; - brelse(bh); - bh = NULL; - } - return bh; -} - -struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create, int *err) -{ - struct buffer_head *bh; - - bh = ext4_getblk(handle, inode, block, create, err); - if (!bh) - return bh; - if (buffer_uptodate(bh)) - return bh; - ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); - wait_on_buffer(bh); - if (buffer_uptodate(bh)) - return bh; - put_bh(bh); - *err = -EIO; - return NULL; -} - -static int walk_page_buffers(handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)(handle_t *handle, - struct buffer_head *bh)) -{ - struct buffer_head *bh; - unsigned block_start, block_end; - unsigned blocksize = head->b_size; - int err, ret = 0; - struct buffer_head *next; - - for (bh = head, block_start = 0; - ret == 0 && (bh != head || !block_start); - block_start = block_end, bh = next) { - next = bh->b_this_page; - block_end = block_start + blocksize; - if (block_end <= from || block_start >= to) { - if (partial && !buffer_uptodate(bh)) - *partial = 1; - continue; - } - err = (*fn)(handle, bh); - if (!ret) - ret = err; - } - return ret; -} - -/* - * To preserve ordering, it is essential that the hole instantiation and - * the data write be encapsulated in a single transaction. We cannot - * close off a transaction and start a new one between the ext4_get_block() - * and the commit_write(). So doing the jbd2_journal_start at the start of - * prepare_write() is the right place. - * - * Also, this function can nest inside ext4_writepage() -> - * block_write_full_page(). In that case, we *know* that ext4_writepage() - * has generated enough buffer credits to do the whole page. So we won't - * block on the journal in that case, which is good, because the caller may - * be PF_MEMALLOC. - * - * By accident, ext4 can be reentered when a transaction is open via - * quota file writes. If we were to commit the transaction while thus - * reentered, there can be a deadlock - we would be holding a quota - * lock, and the commit would never complete if another thread had a - * transaction open and was blocking on the quota lock - a ranking - * violation. - * - * So what we do is to rely on the fact that jbd2_journal_stop/journal_start - * will _not_ run commit under these circumstances because handle->h_ref - * is elevated. We'll still have enough credits for the tiny quotafile - * write. - */ -static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) -{ - int dirty = buffer_dirty(bh); - int ret; - - if (!buffer_mapped(bh) || buffer_freed(bh)) - return 0; - /* - * __block_write_begin() could have dirtied some buffers. Clean - * the dirty bit as jbd2_journal_get_write_access() could complain - * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_write_begin() isn't a real problem here as we clear - * the bit before releasing a page lock and thus writeback cannot - * ever write the buffer. 
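[Annotation] walk_page_buffers() above iterates the page's circular list of buffer_heads, applies fn to each buffer overlapping [from, to), and stops at the first error. A minimal standalone version of that traversal over a toy two-buffer ring (struct and names invented, same stopping rule):

    #include <stdio.h>

    struct buf {
        unsigned start, size;
        struct buf *next;     /* circular: last points back to head */
    };

    static int for_each_in_range(struct buf *head, unsigned from,
                                 unsigned to, int (*fn)(struct buf *))
    {
        struct buf *b = head;
        int ret = 0;

        do {
            unsigned end = b->start + b->size;
            if (end > from && b->start < to)   /* overlaps the range */
                ret = fn(b);
            b = b->next;
        } while (ret == 0 && b != head);       /* stop on first error */
        return ret;
    }

    static int print_buf(struct buf *b)
    {
        printf("buffer at %u..%u\n", b->start, b->start + b->size);
        return 0;
    }

    int main(void)
    {
        struct buf b2 = { 1024, 1024, 0 }, b1 = { 0, 1024, &b2 };
        b2.next = &b1;                         /* close the ring */
        return for_each_in_range(&b1, 512, 1500, print_buf);
    }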
- */ - if (dirty) - clear_buffer_dirty(bh); - ret = ext4_journal_get_write_access(handle, bh); - if (!ret && dirty) - ret = ext4_handle_dirty_metadata(handle, NULL, bh); - return ret; -} - -static int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); -static int ext4_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct inode *inode = mapping->host; - int ret, needed_blocks; - handle_t *handle; - int retries = 0; - struct page *page; - pgoff_t index; - unsigned from, to; - - trace_ext4_write_begin(inode, pos, len, flags); - /* - * Reserve one block more for addition to orphan list in case - * we allocate blocks but write fails for some reason - */ - needed_blocks = ext4_writepage_trans_blocks(inode) + 1; - index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - -retry: - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - /* We cannot recurse into the filesystem as the transaction is already - * started */ - flags |= AOP_FLAG_NOFS; - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - ext4_journal_stop(handle); - ret = -ENOMEM; - goto out; - } - *pagep = page; - - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, pos, len, ext4_get_block_write); - else - ret = __block_write_begin(page, pos, len, ext4_get_block); - - if (!ret && ext4_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, do_journal_get_write_access); - } - - if (ret) { - unlock_page(page); - page_cache_release(page); - /* - * __block_write_begin may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - * - * Add inode to orphan list in case we crash before - * truncate finishes - */ - if (pos + len > inode->i_size && ext4_can_truncate(inode)) - ext4_orphan_add(handle, inode); - - ext4_journal_stop(handle); - if (pos + len > inode->i_size) { - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might - * still be on the orphan list; we need to - * make sure the inode is removed from the - * orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - } - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: - return ret; -} - -/* For write_end() in data=journal mode */ -static int write_end_fn(handle_t *handle, struct buffer_head *bh) -{ - if (!buffer_mapped(bh) || buffer_freed(bh)) - return 0; - set_buffer_uptodate(bh); - return ext4_handle_dirty_metadata(handle, NULL, bh); -} - -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - int i_size_changed = 0; - struct inode *inode = mapping->host; - handle_t *handle = ext4_journal_current_handle(); - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - * - * But it's important to update i_size while still holding page lock: - * page writeout could otherwise come in and zero beyond i_size. 
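[Annotation] The retry: loop in ext4_write_begin() above encodes a common ENOSPC policy: the failure may be transient because committing the journal can free blocks, so retry a bounded number of times before giving up. A self-contained sketch of the same shape; flaky_alloc() and MAX_RETRIES are invented for the demo, and the real ext4_should_retry_alloc() also kicks off and waits on a journal commit:

    #include <stdio.h>
    #include <errno.h>

    #define MAX_RETRIES 3

    /* fails twice with ENOSPC, then succeeds: stands in for an
     * allocation that can make progress once a commit frees blocks */
    static int flaky_alloc(void)
    {
        static int calls;
        return ++calls < 3 ? -ENOSPC : 0;
    }

    static int should_retry(int *retries)
    {
        return (*retries)++ < MAX_RETRIES;
    }

    int main(void)
    {
        int retries = 0, ret;
    retry:
        ret = flaky_alloc();
        if (ret == -ENOSPC && should_retry(&retries))
            goto retry;
        printf("finished with %d after %d retries\n", ret, retries);
        return ret ? 1 : 0;
    }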
- */ - if (pos + copied > inode->i_size) { - i_size_write(inode, pos + copied); - i_size_changed = 1; - } - - if (pos + copied > EXT4_I(inode)->i_disksize) { - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) - */ - ext4_update_i_disksize(inode, (pos + copied)); - i_size_changed = 1; - } - unlock_page(page); - page_cache_release(page); - - /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling - * filesystems. - */ - if (i_size_changed) - ext4_mark_inode_dirty(handle, inode); - - return copied; -} - -/* - * We need to pick up the new inode size which generic_commit_write gave us - * `file' can be NULL - eg, when called from page_symlink(). - * - * ext4 never places buffers on inode->i_mapping->private_list. metadata - * buffers are managed internally. - */ -static int ext4_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_ordered_write_end(inode, pos, len, copied); - ret = ext4_jbd2_file_inode(handle, inode); - - if (ret == 0) { - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (pos + len > inode->i_size && ext4_can_truncate(inode)) - /* if we have allocated more blocks and copied - * less. We will have blocks allocated outside - * inode->i_size. So truncate them - */ - ext4_orphan_add(handle, inode); - if (ret2 < 0) - ret = ret2; - } else { - unlock_page(page); - page_cache_release(page); - } - - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; - - if (pos + len > inode->i_size) { - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might still be - * on the orphan list; we need to make sure the inode - * is removed from the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - - - return ret ? ret : copied; -} - -static int ext4_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - - trace_ext4_writeback_write_end(inode, pos, len, copied); - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (pos + len > inode->i_size && ext4_can_truncate(inode)) - /* if we have allocated more blocks and copied - * less. We will have blocks allocated outside - * inode->i_size. So truncate them - */ - ext4_orphan_add(handle, inode); - - if (ret2 < 0) - ret = ret2; - - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; - - if (pos + len > inode->i_size) { - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might still be - * on the orphan list; we need to make sure the inode - * is removed from the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - - return ret ? 
ret : copied; -} - -static int ext4_journalled_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = mapping->host; - int ret = 0, ret2; - int partial = 0; - unsigned from, to; - loff_t new_i_size; - - trace_ext4_journalled_write_end(inode, pos, len, copied); - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - - BUG_ON(!ext4_handle_valid(handle)); - - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - page_zero_new_buffers(page, from+copied, to); - } - - ret = walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); - if (!partial) - SetPageUptodate(page); - new_i_size = pos + copied; - if (new_i_size > inode->i_size) - i_size_write(inode, pos+copied); - ext4_set_inode_state(inode, EXT4_STATE_JDATA); - EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - if (new_i_size > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, new_i_size); - ret2 = ext4_mark_inode_dirty(handle, inode); - if (!ret) - ret = ret2; - } - - unlock_page(page); - page_cache_release(page); - if (pos + len > inode->i_size && ext4_can_truncate(inode)) - /* if we have allocated more blocks and copied - * less. We will have blocks allocated outside - * inode->i_size. So truncate them - */ - ext4_orphan_add(handle, inode); - - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; - if (pos + len > inode->i_size) { - ext4_truncate_failed_write(inode); - /* - * If truncate failed early the inode might still be - * on the orphan list; we need to make sure the inode - * is removed from the orphan list in that case. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - - return ret ? ret : copied; -} - -/* - * Reserve a single cluster located at lblock - */ -static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) -{ - int retries = 0; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned int md_needed; - int ret; - - /* - * recalculate the amount of metadata blocks to reserve - * in order to allocate nrblocks - * worse case is one extent per block - */ -repeat: - spin_lock(&ei->i_block_reservation_lock); - md_needed = EXT4_NUM_B2C(sbi, - ext4_calc_metadata_amount(inode, lblock)); - trace_ext4_da_reserve_space(inode, md_needed); - spin_unlock(&ei->i_block_reservation_lock); - - /* - * We will charge metadata quota at writeout time; this saves - * us from metadata over-estimation, though we may go over by - * a small amount in the end. Here we just reserve for data. - */ - ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); - if (ret) - return ret; - /* - * We do still charge estimated metadata to the sb though; - * we cannot afford to run out of free blocks. 
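[Annotation] A small calculation recurs throughout these write_end paths: splitting a file position into a page index and an offset within the page, as in from = pos & (PAGE_CACHE_SIZE - 1) above. A minimal sketch assuming 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        unsigned long long pos = 10000;   /* byte offset of the write */
        unsigned len = 300;

        unsigned long index = pos / PAGE_SIZE;      /* page number */
        unsigned from = pos & (PAGE_SIZE - 1);      /* start within page */
        unsigned to = from + len;                   /* end within page */

        printf("page %lu, bytes %u..%u\n", index, from, to);
        return 0;   /* prints: page 2, bytes 1808..2108 */
    }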
- */ - if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { - dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - yield(); - goto repeat; - } - return -ENOSPC; - } - spin_lock(&ei->i_block_reservation_lock); - ei->i_reserved_data_blocks++; - ei->i_reserved_meta_blocks += md_needed; - spin_unlock(&ei->i_block_reservation_lock); - - return 0; /* success */ -} - -static void ext4_da_release_space(struct inode *inode, int to_free) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct ext4_inode_info *ei = EXT4_I(inode); - - if (!to_free) - return; /* Nothing to release, exit */ - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - - trace_ext4_da_release_space(inode, to_free); - if (unlikely(to_free > ei->i_reserved_data_blocks)) { - /* - * if there aren't enough reserved blocks, then the - * counter is messed up somewhere. Since this - * function is called from invalidate page, it's - * harmless to return without any action. - */ - ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " - "ino %lu, to_free %d with only %d reserved " - "data blocks", inode->i_ino, to_free, - ei->i_reserved_data_blocks); - WARN_ON(1); - to_free = ei->i_reserved_data_blocks; - } - ei->i_reserved_data_blocks -= to_free; - - if (ei->i_reserved_data_blocks == 0) { - /* - * We can release all of the reserved metadata blocks - * only when we have written all of the delayed - * allocation blocks. - * Note that in case of bigalloc, i_reserved_meta_blocks, - * i_reserved_data_blocks, etc. refer to number of clusters. - */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - ei->i_reserved_meta_blocks); - ei->i_reserved_meta_blocks = 0; - ei->i_da_metadata_calc_len = 0; - } - - /* update fs dirty data blocks counter */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); - - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - - dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); -} - -static void ext4_da_page_release_reservation(struct page *page, - unsigned long offset) -{ - int to_release = 0; - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - struct inode *inode = page->mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int num_clusters; - - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - - if ((offset <= curr_off) && (buffer_delay(bh))) { - to_release++; - clear_buffer_delay(bh); - clear_buffer_da_mapped(bh); - } - curr_off = next_off; - } while ((bh = bh->b_this_page) != head); - - /* If we have released all the blocks belonging to a cluster, then we - * need to release the reserved space for that cluster. */ - num_clusters = EXT4_NUM_B2C(sbi, to_release); - while (num_clusters > 0) { - ext4_fsblk_t lblk; - lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + - ((num_clusters - 1) << sbi->s_cluster_bits); - if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk, 1)) - ext4_da_release_space(inode, 1); - - num_clusters--; - } -} - -/* - * Delayed allocation stuff - */ - -/* - * mpage_da_submit_io - walks through extent of pages and try to write - * them with writepage() call back - * - * @mpd->inode: inode - * @mpd->first_page: first page of the extent - * @mpd->next_page: page after the last page of the extent - * - * By the time mpage_da_submit_io() is called we expect all blocks - * to be allocated. this may be wrong if allocation failed. 
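[Annotation] With bigalloc, the reservation counters above are kept in clusters, so releasing blocks first converts a block count to clusters by rounding up, which is all EXT4_NUM_B2C amounts to. A minimal sketch of that conversion:

    #include <stdio.h>

    /* allocation unit is 2^cluster_bits blocks; round the count up */
    static unsigned blocks_to_clusters(unsigned blocks, unsigned cluster_bits)
    {
        unsigned ratio = 1u << cluster_bits;
        return (blocks + ratio - 1) >> cluster_bits;
    }

    int main(void)
    {
        /* 16 blocks per cluster: 1..16 blocks -> 1 cluster, 17 -> 2 */
        printf("%u %u %u\n",
               blocks_to_clusters(1, 4),
               blocks_to_clusters(16, 4),
               blocks_to_clusters(17, 4));
        return 0;   /* prints: 1 1 2 */
    }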
- * - * As pages are already locked by write_cache_pages(), we can't use it - */ -static int mpage_da_submit_io(struct mpage_da_data *mpd, - struct ext4_map_blocks *map) -{ - struct pagevec pvec; - unsigned long index, end; - int ret = 0, err, nr_pages, i; - struct inode *inode = mpd->inode; - struct address_space *mapping = inode->i_mapping; - loff_t size = i_size_read(inode); - unsigned int len, block_start; - struct buffer_head *bh, *page_bufs = NULL; - int journal_data = ext4_should_journal_data(inode); - sector_t pblock = 0, cur_logical = 0; - struct ext4_io_submit io_submit; - - BUG_ON(mpd->next_page <= mpd->first_page); - memset(&io_submit, 0, sizeof(io_submit)); - /* - * We need to start from the first_page to the next_page - 1 - * to make sure we also write the mapped dirty buffer_heads. - * If we look at mpd->b_blocknr we would only be looking - * at the currently mapped buffer_heads. - */ - index = mpd->first_page; - end = mpd->next_page - 1; - - pagevec_init(&pvec, 0); - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - int commit_write = 0, skip_page = 0; - struct page *page = pvec.pages[i]; - - index = page->index; - if (index > end) - break; - - if (index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - if (map) { - cur_logical = index << (PAGE_CACHE_SHIFT - - inode->i_blkbits); - pblock = map->m_pblk + (cur_logical - - map->m_lblk); - } - index++; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - - /* - * If the page does not have buffers (for - * whatever reason), try to create them using - * __block_write_begin. If this fails, - * skip the page and move on. - */ - if (!page_has_buffers(page)) { - if (__block_write_begin(page, 0, len, - noalloc_get_block_write)) { - skip_page: - unlock_page(page); - continue; - } - commit_write = 1; - } - - bh = page_bufs = page_buffers(page); - block_start = 0; - do { - if (!bh) - goto skip_page; - if (map && (cur_logical >= map->m_lblk) && - (cur_logical <= (map->m_lblk + - (map->m_len - 1)))) { - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock; - } - if (buffer_da_mapped(bh)) - clear_buffer_da_mapped(bh); - if (buffer_unwritten(bh) || - buffer_mapped(bh)) - BUG_ON(bh->b_blocknr != pblock); - if (map->m_flags & EXT4_MAP_UNINIT) - set_buffer_uninit(bh); - clear_buffer_unwritten(bh); - } - - /* - * skip page if block allocation undone and - * block is dirty - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) - skip_page = 1; - bh = bh->b_this_page; - block_start += bh->b_size; - cur_logical++; - pblock++; - } while (bh != page_bufs); - - if (skip_page) - goto skip_page; - - if (commit_write) - /* mark the buffer_heads as dirty & uptodate */ - block_commit_write(page, 0, len); - - clear_page_dirty_for_io(page); - /* - * Delalloc doesn't support data journalling, - * but eventually maybe we'll lift this - * restriction. 
- */ - if (unlikely(journal_data && PageChecked(page))) - err = __ext4_journalled_writepage(page, len); - else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) - err = ext4_bio_write_page(&io_submit, page, - len, mpd->wbc); - else if (buffer_uninit(page_bufs)) { - ext4_set_bh_endio(page_bufs, inode); - err = block_write_full_page_endio(page, - noalloc_get_block_write, - mpd->wbc, ext4_end_io_buffer_write); - } else - err = block_write_full_page(page, - noalloc_get_block_write, mpd->wbc); - - if (!err) - mpd->pages_written++; - /* - * In error case, we have to continue because - * remaining pages are still locked - */ - if (ret == 0) - ret = err; - } - pagevec_release(&pvec); - } - ext4_io_submit(&io_submit); - return ret; -} - -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) -{ - int nr_pages, i; - pgoff_t index, end; - struct pagevec pvec; - struct inode *inode = mpd->inode; - struct address_space *mapping = inode->i_mapping; - - index = mpd->first_page; - end = mpd->next_page - 1; - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - if (page->index > end) - break; - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - block_invalidatepage(page, 0); - ClearPageUptodate(page); - unlock_page(page); - } - index = pvec.pages[nr_pages - 1]->index + 1; - pagevec_release(&pvec); - } - return; -} - -static void ext4_print_free_blocks(struct inode *inode) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct super_block *sb = inode->i_sb; - - ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", - EXT4_C2B(EXT4_SB(inode->i_sb), - ext4_count_free_clusters(inode->i_sb))); - ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); - ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), - percpu_counter_sum(&sbi->s_freeclusters_counter))); - ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", - (long long) EXT4_C2B(EXT4_SB(inode->i_sb), - percpu_counter_sum(&sbi->s_dirtyclusters_counter))); - ext4_msg(sb, KERN_CRIT, "Block reservation details"); - ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", - EXT4_I(inode)->i_reserved_data_blocks); - ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", - EXT4_I(inode)->i_reserved_meta_blocks); - return; -} - -/* - * mpage_da_map_and_submit - go through given space, map them - * if necessary, and then submit them for I/O - * - * @mpd - bh describing space - * - * The function skips space we know is already mapped to disk blocks. - * - */ -static void mpage_da_map_and_submit(struct mpage_da_data *mpd) -{ - int err, blks, get_blocks_flags; - struct ext4_map_blocks map, *mapp = NULL; - sector_t next = mpd->b_blocknr; - unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; - loff_t disksize = EXT4_I(mpd->inode)->i_disksize; - handle_t *handle = NULL; - - /* - * If the blocks are mapped already, or we couldn't accumulate - * any blocks, then proceed immediately to the submission stage. - */ - if ((mpd->b_size == 0) || - ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay)) && - !(mpd->b_state & (1 << BH_Unwritten)))) - goto submit_io; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - - /* - * Call ext4_map_blocks() to allocate any delayed allocation - * blocks, or to convert an uninitialized extent to be - * initialized (in the case where we have written into - * one or more preallocated blocks). 
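[Annotation] mpage_da_map_and_submit(), which continues below, finishes by raising the on-disk size to cover the newly allocated range, clamped so that i_disksize never passes the in-core i_size and only ever moves forward. A sketch of just that clamp (function name invented, 4 KiB blocks in the example):

    #include <stdio.h>

    /* after allocating blks blocks starting at logical block next_lblk,
     * the on-disk size may grow, but only up to the in-core size */
    static long long new_disksize(long long disksize, long long i_size,
                                  unsigned long next_lblk, int blks,
                                  int blkbits)
    {
        long long end = ((long long)next_lblk + blks) << blkbits;
        if (end > i_size)
            end = i_size;                     /* never past i_size */
        return end > disksize ? end : disksize;  /* never shrink */
    }

    int main(void)
    {
        /* 4K blocks: allocated through block 11, file is 45000 bytes */
        printf("%lld\n", new_disksize(16384, 45000, 8, 4, 12)); /* 45000 */
        printf("%lld\n", new_disksize(16384, 45000, 8, 2, 12)); /* 40960 */
        return 0;
    }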
- * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to - * indicate that we are on the delayed allocation path. This - * affects functions in many different parts of the allocation - * call path. This flag exists primarily because we don't - * want to change *many* call functions, so ext4_map_blocks() - * will set the EXT4_STATE_DELALLOC_RESERVED flag once the - * inode's allocation semaphore is taken. - * - * If the blocks in questions were delalloc blocks, set - * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting - * variables are updated after the blocks have been allocated. - */ - map.m_lblk = next; - map.m_len = max_blocks; - get_blocks_flags = EXT4_GET_BLOCKS_CREATE; - if (ext4_should_dioread_nolock(mpd->inode)) - get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (mpd->b_state & (1 << BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; - - blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); - if (blks < 0) { - struct super_block *sb = mpd->inode->i_sb; - - err = blks; - /* - * If get block returns EAGAIN or ENOSPC and there - * appears to be free blocks we will just let - * mpage_da_submit_io() unlock all of the pages. - */ - if (err == -EAGAIN) - goto submit_io; - - if (err == -ENOSPC && ext4_count_free_clusters(sb)) { - mpd->retval = err; - goto submit_io; - } - - /* - * get block failure will cause us to loop in - * writepages, because a_ops->writepage won't be able - * to make progress. The page will be redirtied by - * writepage and writepages will again try to write - * the same. - */ - if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { - ext4_msg(sb, KERN_CRIT, - "delayed block allocation failed for inode %lu " - "at logical offset %llu with max blocks %zd " - "with error %d", mpd->inode->i_ino, - (unsigned long long) next, - mpd->b_size >> mpd->inode->i_blkbits, err); - ext4_msg(sb, KERN_CRIT, - "This should not happen!! Data will be lost\n"); - if (err == -ENOSPC) - ext4_print_free_blocks(mpd->inode); - } - /* invalidate all the pages */ - ext4_da_block_invalidatepages(mpd); - - /* Mark this page range as having been completed */ - mpd->io_done = 1; - return; - } - BUG_ON(blks == 0); - - mapp = ↦ - if (map.m_flags & EXT4_MAP_NEW) { - struct block_device *bdev = mpd->inode->i_sb->s_bdev; - int i; - - for (i = 0; i < map.m_len; i++) - unmap_underlying_metadata(bdev, map.m_pblk + i); - - if (ext4_should_order_data(mpd->inode)) { - err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) { - /* Only if the journal is aborted */ - mpd->retval = err; - goto submit_io; - } - } - } - - /* - * Update on-disk size along with block allocation. - */ - disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; - if (disksize > i_size_read(mpd->inode)) - disksize = i_size_read(mpd->inode); - if (disksize > EXT4_I(mpd->inode)->i_disksize) { - ext4_update_i_disksize(mpd->inode, disksize); - err = ext4_mark_inode_dirty(handle, mpd->inode); - if (err) - ext4_error(mpd->inode->i_sb, - "Failed to mark inode %lu dirty", - mpd->inode->i_ino); - } - -submit_io: - mpage_da_submit_io(mpd, mapp); - mpd->io_done = 1; -} - -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ - (1 << BH_Delay) | (1 << BH_Unwritten)) - -/* - * mpage_add_bh_to_extent - try to add one more block to extent of blocks - * - * @mpd->lbh - extent of blocks - * @logical - logical number of the block in the file - * @bh - bh of the block (used to access block's state) - * - * the function is used to collect contig. 
blocks in same state - */ -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, - sector_t logical, size_t b_size, - unsigned long b_state) -{ - sector_t next; - int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; - - /* - * XXX Don't go larger than mballoc is willing to allocate - * This is a stopgap solution. We eventually need to fold - * mpage_da_submit_io() into this function and then call - * ext4_map_blocks() multiple times in a loop - */ - if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) - goto flush_it; - - /* check if thereserved journal credits might overflow */ - if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { - if (nrblocks >= EXT4_MAX_TRANS_DATA) { - /* - * With non-extent format we are limited by the journal - * credit available. Total credit needed to insert - * nrblocks contiguous blocks is dependent on the - * nrblocks. So limit nrblocks. - */ - goto flush_it; - } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > - EXT4_MAX_TRANS_DATA) { - /* - * Adding the new buffer_head would make it cross the - * allowed limit for which we have journal credit - * reserved. So limit the new bh->b_size - */ - b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << - mpd->inode->i_blkbits; - /* we will do mpage_da_submit_io in the next loop */ - } - } - /* - * First block in the extent - */ - if (mpd->b_size == 0) { - mpd->b_blocknr = logical; - mpd->b_size = b_size; - mpd->b_state = b_state & BH_FLAGS; - return; - } - - next = mpd->b_blocknr + nrblocks; - /* - * Can we merge the block to our big extent? - */ - if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { - mpd->b_size += b_size; - return; - } - -flush_it: - /* - * We couldn't merge the block to our extent, so we - * need to flush current extent and start new one - */ - mpage_da_map_and_submit(mpd); - return; -} - -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) -{ - return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); -} - -/* - * This function is grabs code from the very beginning of - * ext4_map_blocks, but assumes that the caller is from delayed write - * time. This function looks up the requested blocks and sets the - * buffer delay bit under the protection of i_data_sem. - */ -static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, - struct ext4_map_blocks *map, - struct buffer_head *bh) -{ - int retval; - sector_t invalid_block = ~((sector_t) 0xffff); - - if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) - invalid_block = ~0; - - map->m_flags = 0; - ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," - "logical block %lu\n", inode->i_ino, map->m_len, - (unsigned long) map->m_lblk); - /* - * Try to see if we can get the block without requesting a new - * file system block. - */ - down_read((&EXT4_I(inode)->i_data_sem)); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); - else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); - - if (retval == 0) { - /* - * XXX: __block_prepare_write() unmaps passed block, - * is it OK? - */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ - if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { - retval = ext4_da_reserve_space(inode, iblock); - if (retval) - /* not enough space to reserve */ - goto out_unlock; - } - - /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served - * and it should not appear on the bh->b_state. 
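[Annotation] mpage_add_bh_to_extent() above is an accumulate-or-flush loop: keep extending the pending extent while incoming blocks are contiguous and share buffer state, and submit the moment either condition breaks. A standalone miniature of that pattern, with a toy bitmask standing in for BH_FLAGS and a printf standing in for the map-and-submit step:

    #include <stdio.h>

    struct extent {
        unsigned long start, len;
        unsigned state;
    };

    static void flush(struct extent *e)
    {
        if (e->len)
            printf("write %lu blocks at %lu (state %x)\n",
                   e->len, e->start, e->state);
        e->len = 0;
    }

    static void add_block(struct extent *e, unsigned long lblk,
                          unsigned state)
    {
        if (e->len == 0) {                          /* first block */
            e->start = lblk;
            e->len = 1;
            e->state = state;
            return;
        }
        if (lblk == e->start + e->len && state == e->state) {
            e->len++;                               /* merge */
            return;
        }
        flush(e);                                   /* cannot merge */
        e->start = lblk;
        e->len = 1;
        e->state = state;
    }

    int main(void)
    {
        struct extent e = { 0, 0, 0 };
        add_block(&e, 10, 1);
        add_block(&e, 11, 1);   /* merges */
        add_block(&e, 13, 1);   /* gap: flushes 10..11 */
        add_block(&e, 14, 2);   /* state change: flushes 13 */
        flush(&e);
        return 0;
    }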
- */ - map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; - - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); - } - -out_unlock: - up_read((&EXT4_I(inode)->i_data_sem)); - - return retval; -} - -/* - * This is a special get_blocks_t callback which is used by - * ext4_da_write_begin(). It will either return mapped block or - * reserve space for a single block. - * - * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. - * We also have b_blocknr = -1 and b_bdev initialized properly - * - * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. - * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev - * initialized properly. - */ -static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - struct ext4_map_blocks map; - int ret = 0; - - BUG_ON(create == 0); - BUG_ON(bh->b_size != inode->i_sb->s_blocksize); - - map.m_lblk = iblock; - map.m_len = 1; - - /* - * first, we need to know whether the block is allocated already - * preallocated blocks are unmapped but should treated - * the same as allocated blocks. - */ - ret = ext4_da_map_blocks(inode, iblock, &map, bh); - if (ret <= 0) - return ret; - - map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - - if (buffer_unwritten(bh)) { - /* A delayed write to unwritten bh should be marked - * new and mapped. Mapped ensures that we don't do - * get_block multiple times when we write to the same - * offset and new ensures that we do proper zero out - * for partial write. - */ - set_buffer_new(bh); - set_buffer_mapped(bh); - } - return 0; -} - -/* - * This function is used as a standard get_block_t calback function - * when there is no desire to allocate any blocks. It is used as a - * callback function for block_write_begin() and block_write_full_page(). - * These functions should only try to map a single block at a time. - * - * Since this function doesn't do block allocations even if the caller - * requests it by passing in create=1, it is critically important that - * any caller checks to make sure that any buffer heads are returned - * by this function are either all already mapped or marked for - * delayed allocation before calling block_write_full_page(). Otherwise, - * b_blocknr could be left unitialized, and the page write functions will - * be taken by surprise. 
- */ -static int noalloc_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); - return _ext4_get_block(inode, iblock, bh_result, 0); -} - -static int bget_one(handle_t *handle, struct buffer_head *bh) -{ - get_bh(bh); - return 0; -} - -static int bput_one(handle_t *handle, struct buffer_head *bh) -{ - put_bh(bh); - return 0; -} - -static int __ext4_journalled_writepage(struct page *page, - unsigned int len) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - - ClearPageChecked(page); - page_bufs = page_buffers(page); - BUG_ON(!page_bufs); - walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); - /* As soon as we unlock the page, it can go away, but we have - * references to buffers so we are safe */ - unlock_page(page); - - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - BUG_ON(!ext4_handle_valid(handle)); - - ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); - - err = walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - - walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); - ext4_set_inode_state(inode, EXT4_STATE_JDATA); -out: - return ret; -} - -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode); -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); - -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), no one guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * This function can get called via... - * - ext4_da_writepages after taking page lock (have journal handle) - * - journal_submit_inode_data_buffers (no journal handle) - * - shrink_page_list via pdflush (no journal handle) - * - grab_page_cache when doing write_begin (have journal handle) - * - * We don't do any block allocation in this function. If we have page with - * multiple blocks we need to write those buffer_heads that are mapped. This - * is important for mmaped based write. So if we do with blocksize 1K - * truncate(f, 1024); - * a = mmap(f, 0, 4096); - * a[0] = 'a'; - * truncate(f, 4096); - * we have in the page first buffer_head mapped via page_mkwrite call back - * but other buffer_heads would be unmapped but dirty (dirty done via the - * do_wp_page). So writepage should write the first block. If we modify - * the mmap area beyond 1024 we will again get a page_fault and the - * page_mkwrite callback will do the block allocation and mark the - * buffer_heads mapped. - * - * We redirty the page if we have any buffer_heads that is either delay or - * unwritten in the page. - * - * We can get recursively called as show below. 
- * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * But since we don't do any block allocation we should not deadlock. - * Page also have the dirty flag cleared so we don't get recurive page_lock. - */ -static int ext4_writepage(struct page *page, - struct writeback_control *wbc) -{ - int ret = 0, commit_write = 0; - loff_t size; - unsigned int len; - struct buffer_head *page_bufs = NULL; - struct inode *inode = page->mapping->host; - - trace_ext4_writepage(page); - size = i_size_read(inode); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - /* - * If the page does not have buffers (for whatever reason), - * try to create them using __block_write_begin. If this - * fails, redirty the page and move on. - */ - if (!page_has_buffers(page)) { - if (__block_write_begin(page, 0, len, - noalloc_get_block_write)) { - redirty_page: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - commit_write = 1; - } - page_bufs = page_buffers(page); - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - /* - * We don't want to do block allocation, so redirty - * the page and return. We may reach here when we do - * a journal commit via journal_submit_inode_data_buffers. - * We can also reach here via shrink_page_list but it - * should never be for direct reclaim so warn if that - * happens - */ - WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC); - goto redirty_page; - } - if (commit_write) - /* now mark the buffer_heads as dirty and uptodate */ - block_commit_write(page, 0, len); - - if (PageChecked(page) && ext4_should_journal_data(inode)) - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - return __ext4_journalled_writepage(page, len); - - if (buffer_uninit(page_bufs)) { - ext4_set_bh_endio(page_bufs, inode); - ret = block_write_full_page_endio(page, noalloc_get_block_write, - wbc, ext4_end_io_buffer_write); - } else - ret = block_write_full_page(page, noalloc_get_block_write, - wbc); - - return ret; -} - -/* - * This is called via ext4_da_writepages() to - * calculate the total number of credits to reserve to fit - * a single extent allocation into a single transaction, - * ext4_da_writpeages() will loop calling this before - * the block allocation. - */ - -static int ext4_da_writepages_trans_blocks(struct inode *inode) -{ - int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; - - /* - * With non-extent format the journal credit needed to - * insert nrblocks contiguous block is dependent on - * number of contiguous block. So we will limit - * number of contiguous block to a sane value - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && - (max_blocks > EXT4_MAX_TRANS_DATA)) - max_blocks = EXT4_MAX_TRANS_DATA; - - return ext4_chunk_trans_blocks(inode, max_blocks); -} - -/* - * write_cache_pages_da - walk the list of dirty pages of the given - * address space and accumulate pages that need writing, and call - * mpage_da_map_and_submit to map a single contiguous memory region - * and then write them. 
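[Annotation] The first thing ext4_writepage() above computes is how much of the page lies inside the file: every page is full except the one straddling EOF. A minimal sketch of that length calculation, assuming 4 KiB pages:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE (1UL << PAGE_SHIFT)

    /* bytes of page `index` that are inside a file of `size` bytes */
    static unsigned long bytes_in_page(unsigned long long size,
                                       unsigned long index)
    {
        if (index == size >> PAGE_SHIFT)
            return size & (PAGE_SIZE - 1);   /* partial tail page */
        return PAGE_SIZE;
    }

    int main(void)
    {
        unsigned long long size = 10000;     /* file is 10000 bytes */
        printf("%lu %lu\n", bytes_in_page(size, 1),   /* 4096 */
                            bytes_in_page(size, 2));  /* 1808 */
        return 0;
    }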
- */ -static int write_cache_pages_da(struct address_space *mapping, - struct writeback_control *wbc, - struct mpage_da_data *mpd, - pgoff_t *done_index) -{ - struct buffer_head *bh, *head; - struct inode *inode = mapping->host; - struct pagevec pvec; - unsigned int nr_pages; - sector_t logical; - pgoff_t index, end; - long nr_to_write = wbc->nr_to_write; - int i, tag, ret = 0; - - memset(mpd, 0, sizeof(struct mpage_da_data)); - mpd->wbc = wbc; - mpd->inode = inode; - pagevec_init(&pvec, 0); - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag = PAGECACHE_TAG_TOWRITE; - else - tag = PAGECACHE_TAG_DIRTY; - - *done_index = index; - while (index <= end) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - return 0; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) - goto out; - - *done_index = page->index + 1; - - /* - * If we can't merge this page, and we have - * accumulated an contiguous region, write it - */ - if ((mpd->next_page != page->index) && - (mpd->next_page != mpd->first_page)) { - mpage_da_map_and_submit(mpd); - goto ret_extent_tail; - } - - lock_page(page); - - /* - * If the page is no longer dirty, or its - * mapping no longer corresponds to inode we - * are writing (which means it has been - * truncated or invalidated), or the page is - * already under writeback and we are not - * doing a data integrity writeback, skip the page - */ - if (!PageDirty(page) || - (PageWriteback(page) && - (wbc->sync_mode == WB_SYNC_NONE)) || - unlikely(page->mapping != mapping)) { - unlock_page(page); - continue; - } - - wait_on_page_writeback(page); - BUG_ON(PageWriteback(page)); - - if (mpd->next_page != page->index) - mpd->first_page = page->index; - mpd->next_page = page->index + 1; - logical = (sector_t) page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); - - if (!page_has_buffers(page)) { - mpage_add_bh_to_extent(mpd, logical, - PAGE_CACHE_SIZE, - (1 << BH_Dirty) | (1 << BH_Uptodate)); - if (mpd->io_done) - goto ret_extent_tail; - } else { - /* - * Page with regular buffer heads, - * just add all dirty ones - */ - head = page_buffers(page); - bh = head; - do { - BUG_ON(buffer_locked(bh)); - /* - * We need to try to allocate - * unmapped blocks in the same page. - * Otherwise we won't make progress - * with the page in ext4_writepage - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_size, - bh->b_state); - if (mpd->io_done) - goto ret_extent_tail; - } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { - /* - * mapped dirty buffer. We need - * to update the b_state - * because we look at b_state - * in mpage_da_map_blocks. We - * don't update b_size because - * if we find an unmapped - * buffer_head later we need to - * use the b_state flag of that - * buffer_head. 
- */ - if (mpd->b_size == 0) - mpd->b_state = bh->b_state & BH_FLAGS; - } - logical++; - } while ((bh = bh->b_this_page) != head); - } - - if (nr_to_write > 0) { - nr_to_write--; - if (nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) - /* - * We stop writing back only if we are - * not doing integrity sync. In case of - * integrity sync we have to keep going - * because someone may be concurrently - * dirtying pages, and we might have - * synced a lot of newly appeared dirty - * pages, but have not synced all of the - * old dirty pages. - */ - goto out; - } - } - pagevec_release(&pvec); - cond_resched(); - } - return 0; -ret_extent_tail: - ret = MPAGE_DA_EXTENT_TAIL; -out: - pagevec_release(&pvec); - cond_resched(); - return ret; -} - - -static int ext4_da_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - pgoff_t index; - int range_whole = 0; - handle_t *handle = NULL; - struct mpage_da_data mpd; - struct inode *inode = mapping->host; - int pages_written = 0; - unsigned int max_pages; - int range_cyclic, cycled = 1, io_done = 0; - int needed_blocks, ret = 0; - long desired_nr_to_write, nr_to_writebump = 0; - loff_t range_start = wbc->range_start; - struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - pgoff_t done_index = 0; - pgoff_t end; - struct blk_plug plug; - - trace_ext4_da_writepages(inode, wbc); - - /* - * No pages to write? This is mainly a kludge to avoid starting - * a transaction for special inodes like journal inode on last iput() - * because that could violate lock ordering on umount - */ - if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; - - /* - * If the filesystem has aborted, it is read-only, so return - * right away instead of dumping stack traces later on that - * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because - * the latter could be true if the filesystem is mounted - * read-only, and in that case, ext4_da_writepages should - * *never* be called, so if that ever happens, we would want - * the stack trace. - */ - if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) - return -EROFS; - - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - - range_cyclic = wbc->range_cyclic; - if (wbc->range_cyclic) { - index = mapping->writeback_index; - if (index) - cycled = 0; - wbc->range_start = index << PAGE_CACHE_SHIFT; - wbc->range_end = LLONG_MAX; - wbc->range_cyclic = 0; - end = -1; - } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - } - - /* - * This works around two forms of stupidity. The first is in - * the writeback code, which caps the maximum number of pages - * written to be 1024 pages. This is wrong on multiple - * levels; different architectues have a different page size, - * which changes the maximum amount of data which gets - * written. Secondly, 4 megabytes is way too small. XFS - * forces this value to be 16 megabytes by multiplying - * nr_to_write parameter by four, and then relies on its - * allocator to allocate larger extents to make them - * contiguous. Unfortunately this brings us to the second - * stupidity, which is that ext4's mballoc code only allocates - * at most 2048 blocks. So we force contiguous writes up to - * the number of dirty blocks in the inode, or - * sbi->max_writeback_mb_bump whichever is smaller. 
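- * - * [Editor's aside, not part of the original source — worked numbers, - * assuming the default s_max_writeback_mb_bump of 128 (an assumption; it is - * set in super.c) and 4K pages: max_pages = 128 << (20 - 12) = 32768 pages, - * i.e. 128 MiB. A whole-range flush that arrives with nr_to_write = 1024 is - * bumped to desired_nr_to_write = 1024 * 8 = 8192 (still below max_pages), - * and the bump of 8192 - 1024 = 7168 is subtracted back out at - * out_writepages so the caller's accounting stays consistent.]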
- */ - max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); - if (!range_cyclic && range_whole) { - if (wbc->nr_to_write == LONG_MAX) - desired_nr_to_write = wbc->nr_to_write; - else - desired_nr_to_write = wbc->nr_to_write * 8; - } else - desired_nr_to_write = ext4_num_dirty_pages(inode, index, - max_pages); - if (desired_nr_to_write > max_pages) - desired_nr_to_write = max_pages; - - if (wbc->nr_to_write < desired_nr_to_write) { - nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; - wbc->nr_to_write = desired_nr_to_write; - } - -retry: - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); - - blk_start_plug(&plug); - while (!ret && wbc->nr_to_write > 0) { - - /* - * we insert one extent at a time, so we need the - * credits for a single extent allocation. - * Journalled mode is currently not supported - * by delalloc. - */ - BUG_ON(ext4_should_journal_data(inode)); - needed_blocks = ext4_da_writepages_trans_blocks(inode); - - /* start a new transaction */ - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " - "%ld pages, ino %lu; err %d", __func__, - wbc->nr_to_write, inode->i_ino, ret); - blk_finish_plug(&plug); - goto out_writepages; - } - - /* - * Now call write_cache_pages_da() to find the next - * contiguous region of logical blocks that need - * blocks to be allocated by ext4 and submit them. - */ - ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); - /* - * If we have a contiguous extent of pages and we - * haven't done the I/O yet, map the blocks and submit - * them for I/O. - */ - if (!mpd.io_done && mpd.next_page != mpd.first_page) { - mpage_da_map_and_submit(&mpd); - ret = MPAGE_DA_EXTENT_TAIL; - } - trace_ext4_da_write_pages(inode, &mpd); - wbc->nr_to_write -= mpd.pages_written; - - ext4_journal_stop(handle); - - if ((mpd.retval == -ENOSPC) && sbi->s_journal) { - /* commit the transaction which would - * free blocks released in the transaction - * and try again - */ - jbd2_journal_force_commit_nested(sbi->s_journal); - ret = 0; - } else if (ret == MPAGE_DA_EXTENT_TAIL) { - /* - * Got one extent now try with rest of the pages. - * If mpd.retval is set -EIO, journal is aborted. - * So we don't need to write any more. - */ - pages_written += mpd.pages_written; - ret = mpd.retval; - io_done = 1; - } else if (wbc->nr_to_write) - /* - * There is no more writeout needed, - * or we requested a nonblocking writeout - * and found the device congested - */ - break; - } - blk_finish_plug(&plug); - if (!io_done && !cycled) { - cycled = 1; - index = 0; - wbc->range_start = index << PAGE_CACHE_SHIFT; - wbc->range_end = mapping->writeback_index - 1; - goto retry; - } - - /* Update index */ - wbc->range_cyclic = range_cyclic; - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - /* - * set the writeback_index so that range_cyclic - * mode will write it back later - */ - mapping->writeback_index = done_index; - -out_writepages: - wbc->nr_to_write -= nr_to_writebump; - wbc->range_start = range_start; - trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); - return ret; -} - -#define FALL_BACK_TO_NONDELALLOC 1 -static int ext4_nonda_switch(struct super_block *sb) -{ - s64 free_blocks, dirty_blocks; - struct ext4_sb_info *sbi = EXT4_SB(sb); - - /* - * switch to non delalloc mode if we are running low - * on free blocks. The free block accounting via percpu - * counters can get slightly wrong, with percpu_counter_batch getting - * accumulated on each CPU without updating the global counters. - * Delalloc needs accurate free block accounting, so switch - * to non delalloc when we are near the error range. - */ - free_blocks = EXT4_C2B(sbi, - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); - if (2 * free_blocks < 3 * dirty_blocks || - free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { - /* - * free block count is less than 150% of dirty blocks, - * or free blocks are less than the watermark - * (e.g. with 100 free and 80 dirty blocks, - * 2 * 100 < 3 * 80, so we fall back to non delalloc) - */ - return 1; - } - /* - * Even if we don't switch but are nearing capacity, - * start pushing delalloc when 1/2 of free blocks are dirty. - */ - if (free_blocks < 2 * dirty_blocks) - writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); - - return 0; -} - -static int ext4_da_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - int ret, retries = 0; - struct page *page; - pgoff_t index; - struct inode *inode = mapping->host; - handle_t *handle; - - index = pos >> PAGE_CACHE_SHIFT; - - if (ext4_nonda_switch(inode->i_sb)) { - *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; - return ext4_write_begin(file, mapping, pos, - len, flags, pagep, fsdata); - } - *fsdata = (void *)0; - trace_ext4_da_write_begin(inode, pos, len, flags); -retry: - /* - * With delayed allocation, we don't log the i_disksize update - * if there is delayed block allocation. But we still need - * to journal the i_disksize update if the write goes to the - * end of a file whose buffer is already mapped. - */ - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - /* We cannot recurse into the filesystem as the transaction is already - * started */ - flags |= AOP_FLAG_NOFS; - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - ext4_journal_stop(handle); - ret = -ENOMEM; - goto out; - } - *pagep = page; - - ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); - if (ret < 0) { - unlock_page(page); - ext4_journal_stop(handle); - page_cache_release(page); - /* - * block_write_begin may have instantiated a few blocks - * outside i_size. Trim these off again. We don't need - * i_size_read because we hold i_mutex.
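- * - * [Editor's aside, not part of the original source: note how the fallback - * chosen at the top of this function travels. ext4_da_write_begin() stores - * FALL_BACK_TO_NONDELALLOC through the opaque *fsdata cookie; the VFS hands - * the same cookie back to ->write_end, where ext4_da_write_end() recovers it - * with int write_mode = (int)(unsigned long)fsdata; and routes the - * completion to ext4_ordered_write_end() or ext4_writeback_write_end(), so - * begin and end stay paired without any state living in the inode.]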
- */ - if (pos + len > inode->i_size) - ext4_truncate_failed_write(inode); - } - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: - return ret; -} - -/* - * Check if we should update i_disksize - * when the write is to the end of file but does not require block allocation - */ -static int ext4_da_should_update_i_disksize(struct page *page, - unsigned long offset) -{ - struct buffer_head *bh; - struct inode *inode = page->mapping->host; - unsigned int idx; - int i; - - bh = page_buffers(page); - idx = offset >> inode->i_blkbits; - - for (i = 0; i < idx; i++) - bh = bh->b_this_page; - - if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) - return 0; - return 1; -} - -static int ext4_da_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = mapping->host; - int ret = 0, ret2; - handle_t *handle = ext4_journal_current_handle(); - loff_t new_i_size; - unsigned long start, end; - int write_mode = (int)(unsigned long)fsdata; - - if (write_mode == FALL_BACK_TO_NONDELALLOC) { - switch (ext4_inode_journal_mode(inode)) { - case EXT4_INODE_ORDERED_DATA_MODE: - return ext4_ordered_write_end(file, mapping, pos, - len, copied, page, fsdata); - case EXT4_INODE_WRITEBACK_DATA_MODE: - return ext4_writeback_write_end(file, mapping, pos, - len, copied, page, fsdata); - default: - BUG(); - } - } - - trace_ext4_da_write_end(inode, pos, len, copied); - start = pos & (PAGE_CACHE_SIZE - 1); - end = start + copied - 1; - - /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. - */ - - new_i_size = pos + copied; - if (copied && new_i_size > EXT4_I(inode)->i_disksize) { - if (ext4_da_should_update_i_disksize(page, end)) { - down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) { - /* - * Updating i_disksize when extending the file - * without needing block allocation - */ - if (ext4_should_order_data(inode)) - ret = ext4_jbd2_file_inode(handle, - inode); - - EXT4_I(inode)->i_disksize = new_i_size; - } - up_write(&EXT4_I(inode)->i_data_sem); - /* We need to mark the inode dirty even if - * new_i_size is less than inode->i_size - * but greater than i_disksize. (hint: delalloc) - */ - ext4_mark_inode_dirty(handle, inode); - } - } - ret2 = generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - copied = ret2; - if (ret2 < 0) - ret = ret2; - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; - - return ret ? ret : copied; -} - -static void ext4_da_invalidatepage(struct page *page, unsigned long offset) -{ - /* - * Drop reserved blocks - */ - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - goto out; - - ext4_da_page_release_reservation(page, offset); - -out: - ext4_invalidatepage(page, offset); - - return; -} - -/* - * Force all delayed allocation blocks to be allocated for a given inode. - */ -int ext4_alloc_da_blocks(struct inode *inode) -{ - trace_ext4_alloc_da_blocks(inode); - - if (!EXT4_I(inode)->i_reserved_data_blocks && - !EXT4_I(inode)->i_reserved_meta_blocks) - return 0; - - /* - * We do something simple for now. The filemap_flush() will - * also start triggering a write of the data blocks, which is - * not strictly speaking necessary (and for users of - * laptop_mode, not even desirable).
However, to do otherwise - * would require replicating code paths in: - * - * ext4_da_writepages() -> - * write_cache_pages() ---> (via passed in callback function) - * __mpage_da_writepage() --> - * mpage_add_bh_to_extent() - * mpage_da_map_blocks() - * - * The problem is that write_cache_pages(), located in - * mm/page-writeback.c, marks pages clean in preparation for - * doing I/O, which is not desirable if we're not planning on - * doing I/O at all. - * - * We could call write_cache_pages(), and then redirty all of - * the pages by calling redirty_page_for_writepage() but that - * would be ugly in the extreme. So instead we would need to - * replicate parts of the code in the above functions, - * simplifying them because we wouldn't actually intend to - * write out the pages, but rather only collect contiguous - * logical block extents, call the multi-block allocator, and - * then update the buffer heads with the block allocations. - * - * For now, though, we'll cheat by calling filemap_flush(), - * which will map the blocks, and start the I/O, but not - * actually wait for the I/O to complete. - */ - return filemap_flush(inode->i_mapping); -} - -/* - * bmap() is special. It gets used by applications such as lilo and by - * the swapper to find the on-disk block of a specific piece of data. - * - * Naturally, this is dangerous if the block concerned is still in the - * journal. If somebody makes a swapfile on an ext4 data-journaling - * filesystem and enables swap, then they may get a nasty shock when the - * data getting swapped to that swapfile suddenly gets overwritten by - * the original zero's written out previously to the journal and - * awaiting writeback in the kernel's buffer cache. - * - * So, if we see any bmap calls here on a modified, data-journaled file, - * take extra steps to flush any blocks which might be in the cache. - */ -static sector_t ext4_bmap(struct address_space *mapping, sector_t block) -{ - struct inode *inode = mapping->host; - journal_t *journal; - int err; - - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - test_opt(inode->i_sb, DELALLOC)) { - /* - * With delalloc we want to sync the file - * so that we can make sure we allocate - * blocks for file - */ - filemap_write_and_wait(mapping); - } - - if (EXT4_JOURNAL(inode) && - ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { - /* - * This is a REALLY heavyweight approach, but the use of - * bmap on dirty files is expected to be extremely rare: - * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. - * - * (bmap requires CAP_SYS_RAWIO so this does not - * represent an unprivileged user DOS attack --- we'd be - * in trouble if mortal users could trigger this path at - * will.) - * - * NB. EXT4_STATE_JDATA is not set on files other than - * regular files. If somebody wants to bmap a directory - * or symlink and gets confused because the buffer - * hasn't yet been flushed to disk, they deserve - * everything they get. 
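- * - * [Editor's aside, not part of the original source: the classic caller of - * this path is the FIBMAP ioctl, roughly: - * - * int blk = 123; - * ioctl(fd, FIBMAP, &blk); - * - * where blk holds the logical block number on entry and the physical block - * (0 for a hole) on return; this is the CAP_SYS_RAWIO path mentioned above, - * and the journal flush below is what keeps that answer trustworthy for - * data=journal files.]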
- */ - - ext4_clear_inode_state(inode, EXT4_STATE_JDATA); - journal = EXT4_JOURNAL(inode); - jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal); - jbd2_journal_unlock_updates(journal); - - if (err) - return 0; - } - - return generic_block_bmap(mapping, block, ext4_get_block); -} - -static int ext4_readpage(struct file *file, struct page *page) -{ - trace_ext4_readpage(page); - return mpage_readpage(page, ext4_get_block); -} - -static int -ext4_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); -} - -static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset) -{ - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - - if (!page_has_buffers(page)) - return; - head = bh = page_buffers(page); - do { - if (offset <= curr_off && test_clear_buffer_uninit(bh) - && bh->b_private) { - ext4_free_io_end(bh->b_private); - bh->b_private = NULL; - bh->b_end_io = NULL; - } - curr_off = curr_off + bh->b_size; - bh = bh->b_this_page; - } while (bh != head); -} - -static void ext4_invalidatepage(struct page *page, unsigned long offset) -{ - journal_t *journal = EXT4_JOURNAL(page->mapping->host); - - trace_ext4_invalidatepage(page, offset); - - /* - * free any io_end structure allocated for buffers to be discarded - */ - if (ext4_should_dioread_nolock(page->mapping->host)) - ext4_invalidatepage_free_endio(page, offset); - /* - * If it's a full truncate we just forget about the pending dirtying - */ - if (offset == 0) - ClearPageChecked(page); - - if (journal) - jbd2_journal_invalidatepage(journal, page, offset); - else - block_invalidatepage(page, offset); -} - -static int ext4_releasepage(struct page *page, gfp_t wait) -{ - journal_t *journal = EXT4_JOURNAL(page->mapping->host); - - trace_ext4_releasepage(page); - - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) - return 0; - if (journal) - return jbd2_journal_try_to_free_buffers(journal, page, wait); - else - return try_to_free_buffers(page); -} - -/* - * ext4_get_block used when preparing for a DIO write or buffer write. - * We allocate an uinitialized extent if blocks haven't been allocated. - * The extent will be converted to initialized after the IO is complete. 
- */ -static int ext4_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", - inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_IO_CREATE_EXT); -} - -static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private, int ret, - bool is_async) -{ - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; - ext4_io_end_t *io_end = iocb->private; - struct workqueue_struct *wq; - unsigned long flags; - struct ext4_inode_info *ei; - - /* if this is not async direct IO, or a dio with a 0 byte write, just return */ - if (!io_end || !size) - goto out; - - ext_debug("ext4_end_io_dio(): io_end 0x%p " - "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", - iocb->private, io_end->inode->i_ino, iocb, offset, - size); - - iocb->private = NULL; - - /* if this is not an aio dio with unwritten extents, just free the io_end and return */ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); -out: - if (is_async) - aio_complete(iocb, ret, 0); - inode_dio_done(inode); - return; - } - - io_end->offset = offset; - io_end->size = size; - if (is_async) { - io_end->iocb = iocb; - io_end->result = ret; - } - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; - - /* Add the io_end to the per-inode completed aio dio list */ - ei = EXT4_I(io_end->inode); - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &ei->i_completed_io_list); - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); -} - -static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) -{ - ext4_io_end_t *io_end = bh->b_private; - struct workqueue_struct *wq; - struct inode *inode; - unsigned long flags; - - if (!test_clear_buffer_uninit(bh) || !io_end) - goto out; - - if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) { - ext4_msg(io_end->inode->i_sb, KERN_INFO, - "sb umounted, discard end_io request for inode %lu", - io_end->inode->i_ino); - ext4_free_io_end(io_end); - goto out; - } - - /* - * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, - * but being more careful is always safe for future changes. - */ - inode = io_end->inode; - ext4_set_io_unwritten_flag(inode, io_end); - - /* Add the io_end to the per-inode completed io list */ - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); - - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); -out: - bh->b_private = NULL; - bh->b_end_io = NULL; - clear_buffer_uninit(bh); - end_buffer_async_write(bh, uptodate); -} - -static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode) -{ - ext4_io_end_t *io_end; - struct page *page = bh->b_page; - loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT; - size_t size = bh->b_size; - -retry: - io_end = ext4_init_io_end(inode, GFP_ATOMIC); - if (!io_end) { - pr_warn_ratelimited("%s: allocation fail\n", __func__); - schedule(); - goto retry; - } - io_end->offset = offset; - io_end->size = size; - /* - * We need to hold a reference to the page to make sure it - * doesn't get evicted before ext4_end_io_work() has a chance - * to convert the extent from unwritten to written.
- */ - io_end->page = page; - get_page(io_end->page); - - bh->b_private = io_end; - bh->b_end_io = ext4_end_io_buffer_write; - return 0; -} - -/* - * For ext4 extent files, ext4 will do direct-io writes to holes, - * preallocated extents, and writes that extend the file, with no need to - * fall back to buffered IO. - * - * For holes, we fallocate those blocks and mark them as uninitialized. - * If those blocks were preallocated, we make sure they are split, but - * still keep the range to write as uninitialized. - * - * The unwritten extents will be converted to written when the DIO is - * completed. For async direct IO, since the IO may still be pending on - * return, we set up an end_io callback function, which will do the - * conversion when the async direct IO is completed. - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - */ -static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - ssize_t ret; - size_t count = iov_length(iov, nr_segs); - - loff_t final_size = offset + count; - if (rw == WRITE && final_size <= inode->i_size) { - /* - * We can write directly to holes and fallocated extents. - * - * Allocated blocks to fill the hole are marked as uninitialized - * to prevent a parallel buffered read from exposing the stale - * data before the DIO completes the data IO. - * - * For previously fallocated extents, ext4 get_block - * will simply mark the buffer mapped but still - * keep the extents uninitialized. - * - * In the non-AIO case, we will convert those unwritten extents - * to written after returning from blockdev_direct_IO. - * - * For async DIO, the conversion needs to be deferred until - * the IO is completed. The ext4 end_io callback function - * will be called to take care of the conversion work. - * Here, for the async case, we allocate an io_end structure - * to hook to the iocb. - */ - iocb->private = NULL; - EXT4_I(inode)->cur_aio_dio = NULL; - if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = - ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) - return -ENOMEM; - io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; - /* - * we save the io structure for the current async - * direct IO, so that ext4_map_blocks() can later - * flag in the io structure whether there are - * unwritten extents that need to be converted - * when the IO is completed. - */ - EXT4_I(inode)->cur_aio_dio = iocb->private; - } - - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block_write, - ext4_end_io_dio, - NULL, - DIO_LOCKING); - if (iocb->private) - EXT4_I(inode)->cur_aio_dio = NULL; - /* - * The io_end structure takes a reference to the inode; - * that structure needs to be destroyed and the - * reference to the inode needs to be dropped when the IO - * is complete, even for a 0-byte write or a failure. - * - * In the successful AIO DIO case, the io_end structure will - * be destroyed and the reference to the inode will be - * dropped after the end_io callback function is called. - * - * In the 0-byte write or error case, since - * VFS direct IO won't invoke the end_io callback function, - * we need to free the io_end structure here.
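- * - * [Editor's aside, not part of the original source — the io_end ownership - * rules in one place: ext4_init_io_end() takes a reference on the inode; a - * return of -EIOCBQUEUED means the async path owns it and ext4_end_io_dio() - * will either free it (no unwritten extents) or queue it on - * i_completed_io_list for the dio_unwritten_wq worker; any other ret <= 0 - * leaves it with this caller, hence the ext4_free_io_end() just below; and a - * synchronous ret > 0 with EXT4_STATE_DIO_UNWRITTEN set converts the extents - * inline via ext4_convert_unwritten_extents().]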
- */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); - iocb->private = NULL; - } else if (ret > 0 && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; - /* - * for the non-AIO case, since the IO is already - * completed, we can do the conversion right here - */ - err = ext4_convert_unwritten_extents(inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } - return ret; - } - - /* for writes past the end of file, we fall back to the old way */ - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); -} - -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - ssize_t ret; - - /* - * If we are doing data journalling, we don't support O_DIRECT - */ - if (ext4_should_journal_data(inode)) - return 0; - - trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); - else - ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); - trace_ext4_direct_IO_exit(inode, offset, - iov_length(iov, nr_segs), rw, ret); - return ret; -} - -/* - * Pages can be marked dirty completely asynchronously from ext4's journalling - * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do - * much here because ->set_page_dirty is called under VFS locks. The page is - * not necessarily locked. - * - * We cannot just dirty the page and leave attached buffers clean, because the - * buffers' dirty state is "definitive". We cannot just set the buffers dirty - * or jbddirty because all the journalling code will explode. - * - * So what we do is mark the page "pending dirty" and, the next time writepage - * is called, propagate that into the buffers appropriately.
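- * - * [Editor's aside, not part of the original source — the handshake in code - * terms: ->set_page_dirty does SetPageChecked(page) plus - * __set_page_dirty_nobuffers(); later, ext4_writepage() tests - * PageChecked(page) && ext4_should_journal_data(inode) and diverts to - * __ext4_journalled_writepage(), which starts with ClearPageChecked(page) - * and journals every buffer in the range; a full-page invalidate - * (offset == 0 in ext4_invalidatepage) also clears the flag, dropping the - * pending-dirty state.]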
- */ -static int ext4_journalled_set_page_dirty(struct page *page) -{ - SetPageChecked(page); - return __set_page_dirty_nobuffers(page); -} - -static const struct address_space_operations ext4_ordered_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writepage, - .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext4_writeback_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writepage, - .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext4_journalled_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writepage, - .write_begin = ext4_write_begin, - .write_end = ext4_journalled_write_end, - .set_page_dirty = ext4_journalled_set_page_dirty, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -static const struct address_space_operations ext4_da_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writepage, - .writepages = ext4_da_writepages, - .write_begin = ext4_da_write_begin, - .write_end = ext4_da_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_da_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, -}; - -void ext4_set_aops(struct inode *inode) -{ - switch (ext4_inode_journal_mode(inode)) { - case EXT4_INODE_ORDERED_DATA_MODE: - if (test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else - inode->i_mapping->a_ops = &ext4_ordered_aops; - break; - case EXT4_INODE_WRITEBACK_DATA_MODE: - if (test_opt(inode->i_sb, DELALLOC)) - inode->i_mapping->a_ops = &ext4_da_aops; - else - inode->i_mapping->a_ops = &ext4_writeback_aops; - break; - case EXT4_INODE_JOURNAL_DATA_MODE: - inode->i_mapping->a_ops = &ext4_journalled_aops; - break; - default: - BUG(); - } -} - - -/* - * ext4_discard_partial_page_buffers() - * Wrapper function for ext4_discard_partial_page_buffers_no_lock. - * This function finds and locks the page containing the offset - * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. - * Calling functions that already have the page locked should call - * ext4_discard_partial_page_buffers_no_lock directly. 
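- * - * [Editor's aside, not part of the original source: a typical call, with - * hypothetical values, is - * - * err = ext4_discard_partial_page_buffers(handle, mapping, offset, len, 0); - * - * The wrapper grabs the page with mapping_gfp_mask(mapping) & ~__GFP_FS so - * that the page allocation cannot recurse into the filesystem while a - * journal handle is held, calls the locked variant, then unlocks and - * releases the page itself.]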
- */ -int ext4_discard_partial_page_buffers(handle_t *handle, - struct address_space *mapping, loff_t from, - loff_t length, int flags) -{ - struct inode *inode = mapping->host; - struct page *page; - int err = 0; - - page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, - mapping_gfp_mask(mapping) & ~__GFP_FS); - if (!page) - return -ENOMEM; - - err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, - from, length, flags); - - unlock_page(page); - page_cache_release(page); - return err; -} - -/* - * ext4_discard_partial_page_buffers_no_lock() - * Zeros a page range of length 'length' starting from offset 'from'. - * Buffer heads that correspond to the block aligned regions of the - * zeroed range will be unmapped. Non-block-aligned regions - * will have the corresponding buffer head mapped if needed so that - * that region of the page can be updated with the partial zero out. - * - * This function assumes that the page has already been locked. The - * range to be discarded must be contained within the given page. - * If the specified range exceeds the end of the page it will be shortened - * to the end of the page that corresponds to 'from'. This function is - * appropriate for updating a page and its buffer heads to be unmapped and - * zeroed for blocks that have either been released or are going to be - * released. - * - * handle: The journal handle - * inode: The file's inode - * page: A locked page that contains the offset "from" - * from: The starting byte offset (from the beginning of the file) - * to begin discarding - * len: The length of bytes to discard - * flags: Optional flags that may be used: - * - * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED - * Only zero the regions of the page whose buffer heads - * have already been unmapped. This flag is appropriate - * for updating the contents of a page whose blocks may - * have already been released, and we only want to zero - * out the regions that correspond to those released blocks. - * - * Returns zero on success or negative on failure.
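- * - * [Editor's aside, not part of the original source — a worked example with - * hypothetical values: with a 1K block size and a 4K page, from = 1500 and - * length = 3000 are first clamped: offset = 1500, max = 4096 - 1500 = 2596, - * so length becomes 2596 and the affected range is [1500, 4096). The search - * loop stops at the buffer head covering [1024, 2048); the first pass zeroes - * the 548 bytes up to that block's end (reading the block in first if - * needed, since it is being kept), while the next two passes cover - * [2048, 3072) and [3072, 4096) exactly, so those buffers are simply - * unmapped and zeroed.]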
- */ -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags) -{ - ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned int offset = from & (PAGE_CACHE_SIZE-1); - unsigned int blocksize, max, pos; - ext4_lblk_t iblock; - struct buffer_head *bh; - int err = 0; - - blocksize = inode->i_sb->s_blocksize; - max = PAGE_CACHE_SIZE - offset; - - if (index != page->index) - return -EINVAL; - - /* - * correct the length if it does not fall between - * 'from' and the end of the page - */ - if (length > max || length < 0) - length = max; - - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - /* Find the buffer that contains "offset" */ - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - pos = offset; - while (pos < offset + length) { - unsigned int end_of_block, range_to_discard; - - err = 0; - - /* The length of space left to zero and unmap */ - range_to_discard = offset + length - pos; - - /* The length of space until the end of the block */ - end_of_block = blocksize - (pos & (blocksize-1)); - - /* - * Do not unmap or zero past end of block - * for this buffer head - */ - if (range_to_discard > end_of_block) - range_to_discard = end_of_block; - - - /* - * Skip this buffer head if we are only zeroing unmapped - * regions of the page - */ - if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && - buffer_mapped(bh)) - goto next; - - /* If the range is block aligned, unmap */ - if (range_to_discard == blocksize) { - clear_buffer_dirty(bh); - bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); - clear_buffer_uptodate(bh); - zero_user(page, pos, range_to_discard); - BUFFER_TRACE(bh, "Buffer discarded"); - goto next; - } - - /* - * If this block is not completely contained in the range - * to be discarded, then it is not going to be released. Because - * we need to keep this block, we need to make sure this part - * of the page is uptodate before we modify it by writing - * partial zeros on it. - */ - if (!buffer_mapped(bh)) { - /* - * Buffer head must be mapped before we can read - * from the block - */ - BUFFER_TRACE(bh, "unmapped"); - ext4_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto next; - } - } - - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt. */ - if (!buffer_uptodate(bh)) - goto next; - } - - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto next; - } - - zero_user(page, pos, range_to_discard); - - err = 0; - if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); - } else - mark_buffer_dirty(bh); - - BUFFER_TRACE(bh, "Partial buffer zeroed"); -next: - bh = bh->b_this_page; - iblock++; - pos += range_to_discard; - } - - return err; -} - -int ext4_can_truncate(struct inode *inode) -{ - if (S_ISREG(inode->i_mode)) - return 1; - if (S_ISDIR(inode->i_mode)) - return 1; - if (S_ISLNK(inode->i_mode)) - return !ext4_inode_is_fast_symlink(inode); - return 0; -} - -/* - * ext4_punch_hole: punches a hole in a file by releasing the blocks - * associated with the given offset and length - * - * @inode: File inode - * @offset: The offset where the hole will begin - * @len: The length of the hole - * - * Returns: 0 on success or negative on failure - */ - -int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) -{ - struct inode *inode = file->f_path.dentry->d_inode; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - /* TODO: Add support for non extent hole punching */ - return -EOPNOTSUPP; - } - - if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { - /* TODO: Add support for bigalloc file systems */ - return -EOPNOTSUPP; - } - - return ext4_ext_punch_hole(file, offset, length); -} - -/* - * ext4_truncate() - * - * We block out ext4_get_block() block instantiations across the entire - * transaction, and VFS/VM ensures that ext4_truncate() cannot run - * simultaneously on behalf of the same inode. - * - * As we work through the truncate and commit bits of it to the journal there - * is one core, guiding principle: the file's tree must always be consistent on - * disk. We must be able to restart the truncate after a crash. - * - * The file's tree may be transiently inconsistent in memory (although it - * probably isn't), but whenever we close off and commit a journal transaction, - * the contents of (the filesystem + the journal) must be consistent and - * restartable. It's pretty simple, really: bottom up, right to left (although - * left-to-right works OK too). - * - * Note that at recovery time, journal replay occurs *before* the restart of - * truncate against the orphan inode list. - * - * The committed inode has the new, desired i_size (which is the same as - * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see - * that this inode's truncate did not complete and it will again call - * ext4_truncate() to have another go. So there will be instantiated blocks - * to the right of the truncation point in a crashed ext4 filesystem. But - * that's fine - as long as they are linked from the inode, the post-crash - * ext4_truncate() run will find them and release them.
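- * - * [Editor's aside, not part of the original source: ext4_punch_hole() above - * is reached from userspace via - * - * fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len); - * - * and, per the checks above, fails with -EOPNOTSUPP in this tree for - * non-extent files and for bigalloc (s_cluster_ratio > 1) filesystems.]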
- */ -void ext4_truncate(struct inode *inode) -{ - trace_ext4_truncate_enter(inode); - - if (!ext4_can_truncate(inode)) - return; - - ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - - if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) - ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(inode); - else - ext4_ind_truncate(inode); - - trace_ext4_truncate_exit(inode); -} - -/* - * ext4_get_inode_loc returns with an extra refcount against the inode's - * underlying buffer_head on success. If 'in_mem' is true, we have all - * data in memory that is needed to recreate the on-disk version of this - * inode. - */ -static int __ext4_get_inode_loc(struct inode *inode, - struct ext4_iloc *iloc, int in_mem) -{ - struct ext4_group_desc *gdp; - struct buffer_head *bh; - struct super_block *sb = inode->i_sb; - ext4_fsblk_t block; - int inodes_per_block, inode_offset; - - iloc->bh = NULL; - if (!ext4_valid_inum(sb, inode->i_ino)) - return -EIO; - - iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); - gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); - if (!gdp) - return -EIO; - - /* - * Figure out the offset within the block group inode table - */ - inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; - inode_offset = ((inode->i_ino - 1) % - EXT4_INODES_PER_GROUP(sb)); - block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); - iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); - - bh = sb_getblk(sb, block); - if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, block, - "unable to read itable block"); - return -EIO; - } - if (!buffer_uptodate(bh)) { - lock_buffer(bh); - - /* - * If the buffer has the write error flag, we have failed - * to write out another inode in the same block. In this - * case, we don't have to read the block because we may - * read the old inode data successfully. - */ - if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) - set_buffer_uptodate(bh); - - if (buffer_uptodate(bh)) { - /* someone brought it uptodate while we waited */ - unlock_buffer(bh); - goto has_buffer; - } - - /* - * If we have all information of the inode in memory and this - * is the only valid inode in the block, we need not read the - * block. - */ - if (in_mem) { - struct buffer_head *bitmap_bh; - int i, start; - - start = inode_offset & ~(inodes_per_block - 1); - - /* Is the inode bitmap in cache? */ - bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); - if (!bitmap_bh) - goto make_io; - - /* - * If the inode bitmap isn't in cache then the - * optimisation may end up performing two reads instead - * of one, so skip it. - */ - if (!buffer_uptodate(bitmap_bh)) { - brelse(bitmap_bh); - goto make_io; - } - for (i = start; i < start + inodes_per_block; i++) { - if (i == inode_offset) - continue; - if (ext4_test_bit(i, bitmap_bh->b_data)) - break; - } - brelse(bitmap_bh); - if (i == start + inodes_per_block) { - /* all other inodes are free, so skip I/O */ - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - unlock_buffer(bh); - goto has_buffer; - } - } - -make_io: - /* - * If we need to do any I/O, try to pre-readahead extra - * blocks from the inode table. 
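- * - * [Editor's aside, not part of the original source — a worked example of the - * location math near the top of this function, with assumed geometry of 4K - * blocks, 256-byte inodes (so inodes_per_block = 16) and 8192 inodes per - * group: for ino 1000, block_group = 999 / 8192 = 0 and - * inode_offset = 999 % 8192 = 999, so the inode lives in block - * ext4_inode_table(sb, gdp) + 999 / 16 = table + 62, at byte offset - * (999 % 16) * 256 = 1792 within that block.]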
- */ - if (EXT4_SB(sb)->s_inode_readahead_blks) { - ext4_fsblk_t b, end, table; - unsigned num; - - table = ext4_inode_table(sb, gdp); - /* s_inode_readahead_blks is always a power of 2 */ - b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); - if (table > b) - b = table; - end = b + EXT4_SB(sb)->s_inode_readahead_blks; - num = EXT4_INODES_PER_GROUP(sb); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) - num -= ext4_itable_unused_count(sb, gdp); - table += num / inodes_per_block; - if (end > table) - end = table; - while (b <= end) - sb_breadahead(sb, b++); - } - - /* - * There are other valid inodes in the buffer, this inode - * has in-inode xattrs, or we don't have this inode in memory. - * Read the block from disk. - */ - trace_ext4_load_inode(inode); - get_bh(bh); - bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META | REQ_PRIO, bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - EXT4_ERROR_INODE_BLOCK(inode, block, - "unable to read itable block"); - brelse(bh); - return -EIO; - } - } -has_buffer: - iloc->bh = bh; - return 0; -} - -int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) -{ - /* We have all inode data except xattrs in memory here. */ - return __ext4_get_inode_loc(inode, iloc, - !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); -} - -void ext4_set_inode_flags(struct inode *inode) -{ - unsigned int flags = EXT4_I(inode)->i_flags; - - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - if (flags & EXT4_SYNC_FL) - inode->i_flags |= S_SYNC; - if (flags & EXT4_APPEND_FL) - inode->i_flags |= S_APPEND; - if (flags & EXT4_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - if (flags & EXT4_NOATIME_FL) - inode->i_flags |= S_NOATIME; - if (flags & EXT4_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; -} - -/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ -void ext4_get_inode_flags(struct ext4_inode_info *ei) -{ - unsigned int vfs_fl; - unsigned long old_fl, new_fl; - - do { - vfs_fl = ei->vfs_inode.i_flags; - old_fl = ei->i_flags; - new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| - EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| - EXT4_DIRSYNC_FL); - if (vfs_fl & S_SYNC) - new_fl |= EXT4_SYNC_FL; - if (vfs_fl & S_APPEND) - new_fl |= EXT4_APPEND_FL; - if (vfs_fl & S_IMMUTABLE) - new_fl |= EXT4_IMMUTABLE_FL; - if (vfs_fl & S_NOATIME) - new_fl |= EXT4_NOATIME_FL; - if (vfs_fl & S_DIRSYNC) - new_fl |= EXT4_DIRSYNC_FL; - } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); -} - -static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) -{ - blkcnt_t i_blocks ; - struct inode *inode = &(ei->vfs_inode); - struct super_block *sb = inode->i_sb; - - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { - /* we are using combined 48 bit field */ - i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | - le32_to_cpu(raw_inode->i_blocks_lo); - if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { - /* i_blocks represent file system block size */ - return i_blocks << (inode->i_blkbits - 9); - } else { - return i_blocks; - } - } else { - return le32_to_cpu(raw_inode->i_blocks_lo); - } -} - -struct inode *ext4_iget(struct super_block *sb, unsigned long ino) -{ - struct ext4_iloc iloc; - struct ext4_inode *raw_inode; - struct ext4_inode_info *ei; - struct inode *inode; - journal_t *journal = EXT4_SB(sb)->s_journal; - long ret; - int block; - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return 
inode; - - ei = EXT4_I(inode); - iloc.bh = NULL; - - ret = __ext4_get_inode_loc(inode, &iloc, 0); - if (ret < 0) - goto bad_inode; - raw_inode = ext4_raw_inode(&iloc); - inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); - if (!(test_opt(inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; - } - set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); - - ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ - ei->i_dir_start_lookup = 0; - ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); - /* We now have enough fields to check if the inode was active or not. - * This is needed because nfsd might try to access dead inodes - * the test is that same one that e2fsck uses - * NeilBrown 1999oct15 - */ - if (inode->i_nlink == 0) { - if (inode->i_mode == 0 || - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { - /* this inode is deleted */ - ret = -ESTALE; - goto bad_inode; - } - /* The only unlinked inodes we let through here have - * valid i_mode and are being read by the orphan - * recovery code: that's fine, we're about to complete - * the process of deleting those. */ - } - ei->i_flags = le32_to_cpu(raw_inode->i_flags); - inode->i_blocks = ext4_inode_blocks(raw_inode, ei); - ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) - ei->i_file_acl |= - ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - inode->i_size = ext4_isize(raw_inode); - ei->i_disksize = inode->i_size; -#ifdef CONFIG_QUOTA - ei->i_reserved_quota = 0; -#endif - inode->i_generation = le32_to_cpu(raw_inode->i_generation); - ei->i_block_group = iloc.block_group; - ei->i_last_alloc_group = ~0; - /* - * NOTE! The in-memory inode i_data array is in little-endian order - * even on big-endian machines: we do NOT byteswap the block numbers! - */ - for (block = 0; block < EXT4_N_BLOCKS; block++) - ei->i_data[block] = raw_inode->i_block[block]; - INIT_LIST_HEAD(&ei->i_orphan); - - /* - * Set transaction id's of transactions that have to be committed - * to finish f[data]sync. We set them to currently running transaction - * as we cannot be sure that the inode or some of its metadata isn't - * part of the transaction - the inode could have been reclaimed and - * now it is reread from disk. - */ - if (journal) { - transaction_t *transaction; - tid_t tid; - - read_lock(&journal->j_state_lock); - if (journal->j_running_transaction) - transaction = journal->j_running_transaction; - else - transaction = journal->j_committing_transaction; - if (transaction) - tid = transaction->t_tid; - else - tid = journal->j_commit_sequence; - read_unlock(&journal->j_state_lock); - ei->i_sync_tid = tid; - ei->i_datasync_tid = tid; - } - - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); - if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) { - ret = -EIO; - goto bad_inode; - } - if (ei->i_extra_isize == 0) { - /* The extra space is currently unused. Use it. 
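- * [Editor's aside, not part of the original source: layout-wise, with a - * 256-byte on-disk inode the first EXT4_GOOD_OLD_INODE_SIZE (128) bytes are - * the classic inode, i_extra_isize says how many bytes past 128 the extended - * fields occupy, and in-inode xattrs begin immediately after; that is why - * the code below probes for EXT4_XATTR_MAGIC at - * raw_inode + EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize before setting - * EXT4_STATE_XATTR.]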
*/ - ei->i_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - } else { - __le32 *magic = (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + - ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ext4_set_inode_state(inode, EXT4_STATE_XATTR); - } - } else - ei->i_extra_isize = 0; - - EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); - EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); - EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - inode->i_version |= - (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; - } - - ret = 0; - if (ei->i_file_acl && - !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", - ei->i_file_acl); - ret = -EIO; - goto bad_inode; - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) - /* Validate extents which are part of the inode */ - ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of the inode */ - ret = ext4_ind_check_inode(inode); - } - if (ret) - goto bad_inode; - - if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; - ext4_set_aops(inode); - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &ext4_dir_inode_operations; - inode->i_fop = &ext4_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { - if (ext4_inode_is_fast_symlink(inode)) { - inode->i_op = &ext4_fast_symlink_inode_operations; - nd_terminate_link(ei->i_data, inode->i_size, - sizeof(ei->i_data) - 1); - } else { - inode->i_op = &ext4_symlink_inode_operations; - ext4_set_aops(inode); - } - } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || - S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - inode->i_op = &ext4_special_inode_operations; - if (raw_inode->i_block[0]) - init_special_inode(inode, inode->i_mode, - old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); - else - init_special_inode(inode, inode->i_mode, - new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); - } else { - ret = -EIO; - EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); - goto bad_inode; - } - brelse(iloc.bh); - ext4_set_inode_flags(inode); - unlock_new_inode(inode); - return inode; - -bad_inode: - brelse(iloc.bh); - iget_failed(inode); - return ERR_PTR(ret); -} - -static int ext4_inode_blocks_set(handle_t *handle, - struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) -{ - struct inode *inode = &(ei->vfs_inode); - u64 i_blocks = inode->i_blocks; - struct super_block *sb = inode->i_sb; - - if (i_blocks <= ~0U) { - /* - * i_blocks can be represented in a 32 bit variable - * as a multiple of 512 bytes - */ - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = 0; - ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); - return 0; - } - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) - return -EFBIG; - - if (i_blocks <= 0xffffffffffffULL) { - /* - * i_blocks can be represented in a 48 bit variable - * as a multiple of 512 bytes - */ - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
- raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); - } else { - ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); - /* i_blocks is stored in units of the file system block size */ - i_blocks = i_blocks >> (inode->i_blkbits - 9); - raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); - raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - } - return 0; -} - -/* - * Post the struct inode info into an on-disk inode location in the - * buffer-cache. This gobbles the caller's reference to the - * buffer_head in the inode location struct. - * - * The caller must have write access to iloc->bh. - */ -static int ext4_do_update_inode(handle_t *handle, - struct inode *inode, - struct ext4_iloc *iloc) -{ - struct ext4_inode *raw_inode = ext4_raw_inode(iloc); - struct ext4_inode_info *ei = EXT4_I(inode); - struct buffer_head *bh = iloc->bh; - int err = 0, rc, block; - - /* For fields not tracked in the in-memory inode, - * initialise them to zero for new inodes. */ - if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) - memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - - ext4_get_inode_flags(ei); - raw_inode->i_mode = cpu_to_le16(inode->i_mode); - if (!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); -/* - * Fix up interoperability with old kernels. Otherwise, old inodes get - * re-used with the upper 16 bits of the uid/gid intact - */ - if (!ei->i_dtime) { - raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(inode->i_uid)); - raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(inode->i_gid)); - } else { - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } - } else { - raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(inode->i_uid)); - raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(inode->i_gid)); - raw_inode->i_uid_high = 0; - raw_inode->i_gid_high = 0; - } - raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); - EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - - if (ext4_inode_blocks_set(handle, raw_inode, ei)) - goto out_brelse; - raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) - raw_inode->i_file_acl_high = - cpu_to_le16(ei->i_file_acl >> 32); - raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - ext4_isize_set(raw_inode, ei->i_disksize); - if (ei->i_disksize > 0x7fffffffULL) { - struct super_block *sb = inode->i_sb; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || - EXT4_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT4_GOOD_OLD_REV)) { - /* If this is the first large file - * created, add a flag to the superblock.
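- * - * [Editor's aside, not part of the original source: the 0x7fffffffULL test - * above is 2 GiB - 1, the largest size a signed 32-bit i_size can hold; the - * first file to grow past it sets EXT4_FEATURE_RO_COMPAT_LARGE_FILE below, - * and ext4_handle_sync() makes the commit synchronous so the feature flag is - * durable promptly. Kernels that predate a ro-compat flag can still mount - * the filesystem, but only read-only.]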
- err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - goto out_brelse; - ext4_update_dynamic_rev(sb); - EXT4_SET_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE); - ext4_handle_sync(handle); - err = ext4_handle_dirty_super(handle, sb); - } - } - raw_inode->i_generation = cpu_to_le32(inode->i_generation); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - raw_inode->i_block[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - raw_inode->i_block[1] = 0; - } else { - raw_inode->i_block[0] = 0; - raw_inode->i_block[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - raw_inode->i_block[2] = 0; - } - } else - for (block = 0; block < EXT4_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; - - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(inode->i_version >> 32); - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); - } - - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - rc = ext4_handle_dirty_metadata(handle, NULL, bh); - if (!err) - err = rc; - ext4_clear_inode_state(inode, EXT4_STATE_NEW); - - ext4_update_inode_fsync_trans(handle, inode, 0); -out_brelse: - brelse(bh); - ext4_std_error(inode->i_sb, err); - return err; -} - -/* - * ext4_write_inode() - * - * We are called from a few places: - * - * - Within generic_file_write() for O_SYNC files. - * Here, there will be no transaction running. We wait for any running - * transaction to commit. - * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if told to. - * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. - * - * In all cases it is actually safe for us to return without doing anything, - * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. - * - * Note that we are absolutely dependent upon all inode dirtiers doing the - * right thing: they *must* call mark_inode_dirty() after dirtying info in - * which we are interested. - * - * It would be a bug for them to not do this. The code: - * - * mark_inode_dirty(inode) - * stuff(); - * inode->i_size = expr; - * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. - */ -int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - int err; - - if (current->flags & PF_MEMALLOC) - return 0; - - if (EXT4_SB(inode->i_sb)->s_journal) { - if (ext4_journal_current_handle()) { - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); - dump_stack(); - return -EIO; - } - - if (wbc->sync_mode != WB_SYNC_ALL) - return 0; - - err = ext4_force_commit(inode->i_sb); - } else { - struct ext4_iloc iloc; - - err = __ext4_get_inode_loc(inode, &iloc, 0); - if (err) - return err; - if (wbc->sync_mode == WB_SYNC_ALL) - sync_dirty_buffer(iloc.bh); - if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, - "IO error syncing inode"); - err = -EIO; - } - brelse(iloc.bh); - } - return err; -}
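/*
 * Editor's note -- hypothetical caller sketch (not from this file) for the
 * semantics of ext4_write_inode() above: with WB_SYNC_ALL a journalled
 * filesystem forces a commit, while a no-journal one waits on the inode's
 * buffer. demo_sync_inode() is an illustrative name.
 */
#if 0
static int demo_sync_inode(struct inode *inode)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,	/* wait for the write to finish */
	};

	return ext4_write_inode(inode, &wbc);
}
#endif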
-/* - * ext4_setattr() - * - * Called from notify_change. - * - * We want to trap VFS attempts to truncate the file as soon as - * possible. In particular, we want to make sure that when the VFS - * shrinks i_size, we put the inode on the orphan list and modify - * i_disksize immediately, so that during the subsequent flushing of - * dirty pages and freeing of disk blocks, we can guarantee that any - * commit will leave the blocks being flushed in an unused state on - * disk. (On recovery, the inode will get truncated and the blocks will - * be freed, so we have a strong guarantee that no future commit will - * leave these blocks visible to the user.) - * - * Another thing we have to assure is that if we are in ordered mode - * and inode is still attached to the committing transaction, we must - * start writeout of all the dirty pages which are being truncated. - * This way we are sure that all the data written in the previous - * transaction are already on disk (truncate waits for pages under - * writeback). - * - * Called with inode->i_mutex down. - */ -int ext4_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = dentry->d_inode; - int error, rc = 0; - int orphan = 0; - const unsigned int ia_valid = attr->ia_valid; - - error = inode_change_ok(inode, attr); - if (error) - return error; - - if (is_quota_modification(inode, attr)) - dquot_initialize(inode); - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - handle_t *handle; - - /* (user+group)*(old+new) structure, inode write (sb, - * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ - EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - error = dquot_transfer(inode, attr); - if (error) { - ext4_journal_stop(handle); - return error; - } - /* Update corresponding info in inode so that everything is in - * one transaction */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; - error = ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - } - - if (attr->ia_valid & ATTR_SIZE) { - inode_dio_wait(inode); - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - if (attr->ia_size > sbi->s_bitmap_maxbytes) - return -EFBIG; - } - } - - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && - (attr->ia_size < inode->i_size)) { - handle_t *handle; - - handle = ext4_journal_start(inode, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - if (ext4_handle_valid(handle)) { - error = ext4_orphan_add(handle, inode); - orphan = 1; - } - EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; - ext4_journal_stop(handle); - - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, - attr->ia_size); - if (error) { - /* Do as much error cleanup as possible */ - handle = ext4_journal_start(inode, 3); - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); - goto err_out; - } - ext4_orphan_del(handle, inode); - orphan = 0; - ext4_journal_stop(handle); - goto err_out; - } - } - } - - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) - truncate_setsize(inode, attr->ia_size); - ext4_truncate(inode); - } - - if (!rc) { - setattr_copy(inode, attr); - mark_inode_dirty(inode); - } - - /* - * If the call to ext4_truncate failed to get a transaction handle at - * all, we need to clean up the in-core orphan list manually. - */ - if (orphan && inode->i_nlink) - ext4_orphan_del(NULL, inode); - - if (!rc && (ia_valid & ATTR_MODE)) - rc = ext4_acl_chmod(inode); - -err_out: - ext4_std_error(inode->i_sb, error); - if (!error) - error = rc; - return error; -}
-int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct inode *inode; - unsigned long delalloc_blocks; - - inode = dentry->d_inode; - generic_fillattr(inode, stat); - - /* - * We can't update i_blocks if the block allocation is delayed; - * otherwise, in the case of a system crash before the real block - * allocation is done, we would have i_blocks inconsistent with the - * on-disk file blocks. - * We always keep i_blocks updated together with real - * allocation. But so as not to confuse users, stat - * will return blocks that include the delayed allocation - * blocks for this file. - */ - delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; - - stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; - return 0; -} - -static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) -{ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return ext4_ind_trans_blocks(inode, nrblocks, chunk); - return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); -} - -/* - * Account for index blocks, block group bitmaps and block group - * descriptor blocks when modifying data blocks and index blocks; - * in the worst case, the index blocks spread over different block groups. - * - * If data blocks are discontiguous, they may spread over - * different block groups too. If they are contiguous, with flexbg - * they could still cross a block group boundary. - * - * Also account for superblock, inode, quota and xattr blocks - */ -static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) -{ - ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); - int gdpblocks; - int idxblocks; - int ret = 0; - - /* - * How many index blocks do we need to touch to modify nrblocks? - * The "Chunk" flag indicates whether nrblocks is - * physically contiguous on disk. - * - * Direct IO and fallocate call get_block to allocate - * one single extent at a time, so they can set the "Chunk" flag - */ - idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); - - ret = idxblocks; - - /* - * Now let's see how many group bitmaps and group descriptors we need - * to account for - */ - groups = idxblocks; - if (chunk) - groups += 1; - else - groups += nrblocks; - - gdpblocks = groups; - if (groups > ngroups) - groups = ngroups; - if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) - gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; - - /* bitmaps and block group descriptor blocks */ - ret += groups + gdpblocks; - - /* Blocks for super block, inode, quota and xattr blocks */ - ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); - - return ret; -}
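/*
 * Editor's note -- a rough, illustrative walk-through of the estimate above
 * for a discontiguous request (chunk == 0) of nrblocks == 4 on a large fs:
 * if ext4_index_trans_blocks() returned idxblocks = 5, then groups =
 * gdpblocks = 5 + 4 = 9 (each capped by ngroups and s_gdb_count), so
 * ret = 5 + 9 + 9 + EXT4_META_TRANS_BLOCKS(sb) credits.
 */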
-/* - * Calculate the total number of credits to reserve to fit - * the modification of a single page into a single transaction, - * which may include multiple chunks of block allocations. - * - * This could be called via ext4_write_begin() - * - * We need to consider the worst case, when we allocate - * one new block per extent. - */ -int ext4_writepage_trans_blocks(struct inode *inode) -{ - int bpp = ext4_journal_blocks_per_page(inode); - int ret; - - ret = ext4_meta_trans_blocks(inode, bpp, 0); - - /* Account for data blocks for journalled mode */ - if (ext4_should_journal_data(inode)) - ret += bpp; - return ret; -} - -/* - * Calculate the journal credits for a chunk of data modification. - * - * This is called from DIO, fallocate, or whoever calls - * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. - * - * journal buffers for data blocks are not included here, as DIO - * and fallocate do not need to journal data buffers. - */ -int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) -{ - return ext4_meta_trans_blocks(inode, nrblocks, 1); -} - -/* - * The caller must have previously called ext4_reserve_inode_write(). - * Given this, we know that the caller already has write access to iloc->bh. - */ -int ext4_mark_iloc_dirty(handle_t *handle, - struct inode *inode, struct ext4_iloc *iloc) -{ - int err = 0; - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); - - /* the do_update_inode consumes one bh->b_count */ - get_bh(iloc->bh); - - /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ - err = ext4_do_update_inode(handle, inode, iloc); - put_bh(iloc->bh); - return err; -} - -/* - * On success, we end up with an outstanding reference count against - * iloc->bh. This _must_ be cleaned up later. - */ - -int -ext4_reserve_inode_write(handle_t *handle, struct inode *inode, - struct ext4_iloc *iloc) -{ - int err; - - err = ext4_get_inode_loc(inode, iloc); - if (!err) { - BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); - if (err) { - brelse(iloc->bh); - iloc->bh = NULL; - } - } - ext4_std_error(inode->i_sb, err); - return err; -} - -/* - * Expand an inode by new_extra_isize bytes. - * Returns 0 on success or negative error number on failure. - */ -static int ext4_expand_extra_isize(struct inode *inode, - unsigned int new_extra_isize, - struct ext4_iloc iloc, - handle_t *handle) -{ - struct ext4_inode *raw_inode; - struct ext4_xattr_ibody_header *header; - - if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) - return 0; - - raw_inode = ext4_raw_inode(&iloc); - - header = IHDR(inode, raw_inode); - - /* No extended attributes present */ - if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || - header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { - memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, - new_extra_isize); - EXT4_I(inode)->i_extra_isize = new_extra_isize; - return 0; - } - - /* try to expand with EAs present */ - return ext4_expand_extra_isize_ea(inode, new_extra_isize, - raw_inode, handle); -} - -/* - * What we do here is to mark the in-core inode as clean with respect to inode - * dirtiness (it may still be data-dirty). - * This means that the in-core inode may be reaped by prune_icache - * without having to perform any I/O. This is a very good thing, - * because *any* task may call prune_icache - even ones which - * have a transaction open against a different journal. - * - * Is this cheating? Not really. Sure, we haven't written the - * inode out, but prune_icache isn't a user-visible syncing function. - * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) - * we start and wait on commits. - * - * Is this efficient/effective? Well, we're being nice to the system - * by cleaning up our inodes proactively so they can be reaped - * without I/O.
But we are potentially leaving up to five seconds' - * worth of inodes floating about which prune_icache wants us to - * write out. One way to fix that would be to get prune_icache() - * to do a write_super() to free up some memory. It has the desired - * effect. - */ -int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) -{ - struct ext4_iloc iloc; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - static unsigned int mnt_count; - int err, ret; - - might_sleep(); - trace_ext4_mark_inode_dirty(inode, _RET_IP_); - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (ext4_handle_valid(handle) && - EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && - !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { - /* - * We need extra buffer credits since we may write into EA block - * with this same handle. If journal_extend fails, then it will - * only result in a minor loss of functionality for that inode. - * If this is felt to be critical, then e2fsck should be run to - * force a large enough s_min_extra_isize. - */ - if ((jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { - ret = ext4_expand_extra_isize(inode, - sbi->s_want_extra_isize, - iloc, handle); - if (ret) { - ext4_set_inode_state(inode, - EXT4_STATE_NO_EXPAND); - if (mnt_count != - le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, - "Unable to expand inode %lu. Delete" - " some EAs or run e2fsck.", - inode->i_ino); - mnt_count = - le16_to_cpu(sbi->s_es->s_mnt_count); - } - } - } - } - if (!err) - err = ext4_mark_iloc_dirty(handle, inode, &iloc); - return err; -} - -/* - * ext4_dirty_inode() is called from __mark_inode_dirty() - * - * We're really interested in the case where a file is being extended. - * i_size has been changed by generic_commit_write() and we thus need - * to include the updated inode in the current transaction. - * - * Also, dquot_alloc_block() will always dirty the inode when blocks - * are allocated to the file. - * - * If the inode is marked synchronous, we don't honour that here - doing - * so would cause a commit on atime updates, which we don't bother doing. - * We handle synchronous inodes at the highest possible level. - */ -void ext4_dirty_inode(struct inode *inode, int flags) -{ - handle_t *handle; - - handle = ext4_journal_start(inode, 2); - if (IS_ERR(handle)) - goto out; - - ext4_mark_inode_dirty(handle, inode); - - ext4_journal_stop(handle); -out: - return; -} - -#if 0 -/* - * Bind an inode's backing buffer_head into this transaction, to prevent - * it from being flushed to disk early. Unlike - * ext4_reserve_inode_write, this leaves behind no bh reference and - * returns no iloc structure, so the caller needs to repeat the iloc - * lookup to mark the inode dirty later. - */ -static int ext4_pin_inode(handle_t *handle, struct inode *inode) -{ - struct ext4_iloc iloc; - - int err = 0; - if (handle) { - err = ext4_get_inode_loc(inode, &iloc); - if (!err) { - BUFFER_TRACE(iloc.bh, "get_write_access"); - err = jbd2_journal_get_write_access(handle, iloc.bh); - if (!err) - err = ext4_handle_dirty_metadata(handle, - NULL, - iloc.bh); - brelse(iloc.bh); - } - } - ext4_std_error(inode->i_sb, err); - return err; -} -#endif - -int ext4_change_inode_journal_flag(struct inode *inode, int val) -{ - journal_t *journal; - handle_t *handle; - int err; - - /* - * We have to be very careful here: changing a data block's - * journaling status dynamically is dangerous. 
If we write a - * data block to the journal, change the status and then delete - * that block, we risk forgetting to revoke the old log record - * from the journal and so a subsequent replay can corrupt data. - * So, first we make sure that the journal is empty and that - * nobody is changing anything. - */ - - journal = EXT4_JOURNAL(inode); - if (!journal) - return 0; - if (is_journal_aborted(journal)) - return -EROFS; - /* We have to allocate physical blocks for delalloc blocks - * before flushing journal. otherwise delalloc blocks can not - * be allocated any more. even more truncate on delalloc blocks - * could trigger BUG by flushing delalloc blocks in journal. - * There is no delalloc block in non-journal data mode. - */ - if (val && test_opt(inode->i_sb, DELALLOC)) { - err = ext4_alloc_da_blocks(inode); - if (err < 0) - return err; - } - - jbd2_journal_lock_updates(journal); - - /* - * OK, there are no updates running now, and all cached data is - * synced to disk. We are now in a completely consistent state - * which doesn't have anything in the journal, and we know that - * no filesystem updates are running, so it is safe to modify - * the inode's in-core data-journaling state flag now. - */ - - if (val) - ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); - else { - jbd2_journal_flush(journal); - ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); - } - ext4_set_aops(inode); - - jbd2_journal_unlock_updates(journal); - - /* Finally we can mark the inode as dirty. */ - - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - err = ext4_mark_inode_dirty(handle, inode); - ext4_handle_sync(handle); - ext4_journal_stop(handle); - ext4_std_error(inode->i_sb, err); - - return err; -} - -static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) -{ - return !buffer_mapped(bh); -} - -int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page = vmf->page; - loff_t size; - unsigned long len; - int ret; - struct file *file = vma->vm_file; - struct inode *inode = file->f_path.dentry->d_inode; - struct address_space *mapping = inode->i_mapping; - handle_t *handle; - get_block_t *get_block; - int retries = 0; - - /* - * This check is racy but catches the common case. We rely on - * __block_page_mkwrite() to do a reliable check. - */ - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - /* Delalloc case is easy... */ - if (test_opt(inode->i_sb, DELALLOC) && - !ext4_should_journal_data(inode) && - !ext4_nonda_switch(inode->i_sb)) { - do { - ret = __block_page_mkwrite(vma, vmf, - ext4_da_get_block_prep); - } while (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)); - goto out_ret; - } - - lock_page(page); - size = i_size_read(inode); - /* Page got truncated from under us? */ - if (page->mapping != mapping || page_offset(page) > size) { - unlock_page(page); - ret = VM_FAULT_NOPAGE; - goto out; - } - - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - /* - * Return if we have all the buffers mapped. This avoids the need to do - * journal_start/journal_stop which can block and take a long time - */ - if (page_has_buffers(page)) { - if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped)) { - /* Wait so that we don't change page under IO */ - wait_on_page_writeback(page); - ret = VM_FAULT_LOCKED; - goto out; - } - } - unlock_page(page); - /* OK, we need to fill the hole... 
*/ - if (ext4_should_dioread_nolock(inode)) - get_block = ext4_get_block_write; - else - get_block = ext4_get_block; -retry_alloc: - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = VM_FAULT_SIGBUS; - goto out; - } - ret = __block_page_mkwrite(vma, vmf, get_block); - if (!ret && ext4_should_journal_data(inode)) { - if (walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { - unlock_page(page); - ret = VM_FAULT_SIGBUS; - ext4_journal_stop(handle); - goto out; - } - ext4_set_inode_state(inode, EXT4_STATE_JDATA); - } - ext4_journal_stop(handle); - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_alloc; -out_ret: - ret = block_page_mkwrite_return(ret); -out: - return ret; -} diff --git a/ANDROID_3.4.5/fs/ext4/ioctl.c b/ANDROID_3.4.5/fs/ext4/ioctl.c deleted file mode 100644 index 1365903a..00000000 --- a/ANDROID_3.4.5/fs/ext4/ioctl.c +++ /dev/null @@ -1,509 +0,0 @@ -/* - * linux/fs/ext4/ioctl.c - * - * Copyright (C) 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - */ - -#include <linux/fs.h> -#include <linux/jbd2.h> -#include <linux/capability.h> -#include <linux/time.h> -#include <linux/compat.h> -#include <linux/mount.h> -#include <linux/file.h> -#include <asm/uaccess.h> -#include "ext4_jbd2.h" -#include "ext4.h" - -#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) - -long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = filp->f_dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned int flags; - - ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); - - switch (cmd) { - case EXT4_IOC_GETFLAGS: - ext4_get_inode_flags(ei); - flags = ei->i_flags & EXT4_FL_USER_VISIBLE; - return put_user(flags, (int __user *) arg); - case EXT4_IOC_SETFLAGS: { - handle_t *handle = NULL; - int err, migrate = 0; - struct ext4_iloc iloc; - unsigned int oldflags, mask, i; - unsigned int jflag; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - if (get_user(flags, (int __user *) arg)) - return -EFAULT; - - err = mnt_want_write_file(filp); - if (err) - return err; - - flags = ext4_mask_flags(inode->i_mode, flags); - - err = -EPERM; - mutex_lock(&inode->i_mutex); - /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) - goto flags_out; - - oldflags = ei->i_flags; - - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT4_JOURNAL_DATA_FL; - - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - * - * This test looks nicer. Thanks to Pauline Middelink - */ - if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) - goto flags_out; - } - - /* - * The JOURNAL_DATA flag can only be changed by - * the relevant capability. 
- */ - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) - goto flags_out; - } - if (oldflags & EXT4_EXTENTS_FL) { - /* We don't support clearing extent flags */ - if (!(flags & EXT4_EXTENTS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (flags & EXT4_EXTENTS_FL) { - /* migrate the file */ - migrate = 1; - flags &= ~EXT4_EXTENTS_FL; - } - - if (flags & EXT4_EOFBLOCKS_FL) { - /* we don't support adding the EOFBLOCKS flag */ - if (!(oldflags & EXT4_EOFBLOCKS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); - - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto flags_out; - } - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto flags_err; - - for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { - if (!(mask & EXT4_FL_USER_MODIFIABLE)) - continue; - if (mask & flags) - ext4_set_inode_flag(inode, i); - else - ext4_clear_inode_flag(inode, i); - } - - ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); - - err = ext4_mark_iloc_dirty(handle, inode, &iloc); -flags_err: - ext4_journal_stop(handle); - if (err) - goto flags_out; - - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) - err = ext4_change_inode_journal_flag(inode, jflag); - if (err) - goto flags_out; - if (migrate) - err = ext4_ext_migrate(inode); -flags_out: - mutex_unlock(&inode->i_mutex); - mnt_drop_write_file(filp); - return err; - } - case EXT4_IOC_GETVERSION: - case EXT4_IOC_GETVERSION_OLD: - return put_user(inode->i_generation, (int __user *) arg); - case EXT4_IOC_SETVERSION: - case EXT4_IOC_SETVERSION_OLD: { - handle_t *handle; - struct ext4_iloc iloc; - __u32 generation; - int err; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - err = mnt_want_write_file(filp); - if (err) - return err; - if (get_user(generation, (int __user *) arg)) { - err = -EFAULT; - goto setversion_out; - } - - mutex_lock(&inode->i_mutex); - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto unlock_out; - } - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err == 0) { - inode->i_ctime = ext4_current_time(inode); - inode->i_generation = generation; - err = ext4_mark_iloc_dirty(handle, inode, &iloc); - } - ext4_journal_stop(handle); - -unlock_out: - mutex_unlock(&inode->i_mutex); -setversion_out: - mnt_drop_write_file(filp); - return err; - } - case EXT4_IOC_GROUP_EXTEND: { - ext4_fsblk_t n_blocks_count; - int err, err2=0; - - err = ext4_resize_begin(sb); - if (err) - return err; - - if (get_user(n_blocks_count, (__u32 __user *)arg)) { - err = -EFAULT; - goto group_extend_out; - } - - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not supported with bigalloc"); - err = -EOPNOTSUPP; - goto group_extend_out; - } - - err = mnt_want_write_file(filp); - if (err) - goto group_extend_out; - - err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); - if (EXT4_SB(sb)->s_journal) { - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } - if (err == 0) - err = err2; - mnt_drop_write_file(filp); -group_extend_out: - ext4_resize_end(sb); - return err; - }
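/*
 * Editor's note -- hypothetical user-space sketch (not from this file) of
 * the EXT4_IOC_MOVE_EXT call handled below; error handling elided, and
 * orig_fd/donor_fd are illustrative names. The donor file supplies the
 * replacement extents for the original file's block range.
 */
#if 0
struct move_extent me = {
	.donor_fd    = donor_fd,	/* donor opened read-write */
	.orig_start  = 0,		/* first block to exchange */
	.donor_start = 0,
	.len         = 1024,		/* number of blocks */
};

if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) == 0)
	printf("defragmented %llu blocks\n",
	       (unsigned long long)me.moved_len);
#endif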
- - case EXT4_IOC_MOVE_EXT: { - struct move_extent me; - struct file *donor_filp; - int err; - - if (!(filp->f_mode & FMODE_READ) || - !(filp->f_mode & FMODE_WRITE)) - return -EBADF; - - if (copy_from_user(&me, - (struct move_extent __user *)arg, sizeof(me))) - return -EFAULT; - me.moved_len = 0; - - donor_filp = fget(me.donor_fd); - if (!donor_filp) - return -EBADF; - - if (!(donor_filp->f_mode & FMODE_WRITE)) { - err = -EBADF; - goto mext_out; - } - - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { - ext4_msg(sb, KERN_ERR, - "Online defrag not supported with bigalloc"); - /* don't leak the donor_filp reference */ - err = -EOPNOTSUPP; - goto mext_out; - } - - err = mnt_want_write_file(filp); - if (err) - goto mext_out; - - err = ext4_move_extents(filp, donor_filp, me.orig_start, - me.donor_start, me.len, &me.moved_len); - mnt_drop_write_file(filp); - - if (copy_to_user((struct move_extent __user *)arg, - &me, sizeof(me))) - err = -EFAULT; -mext_out: - fput(donor_filp); - return err; - } - - case EXT4_IOC_GROUP_ADD: { - struct ext4_new_group_data input; - int err, err2=0; - - err = ext4_resize_begin(sb); - if (err) - return err; - - if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, - sizeof(input))) { - err = -EFAULT; - goto group_add_out; - } - - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not supported with bigalloc"); - err = -EOPNOTSUPP; - goto group_add_out; - } - - err = mnt_want_write_file(filp); - if (err) - goto group_add_out; - - err = ext4_group_add(sb, &input); - if (EXT4_SB(sb)->s_journal) { - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } - if (err == 0) - err = err2; - mnt_drop_write_file(filp); -group_add_out: - ext4_resize_end(sb); - return err; - } - - case EXT4_IOC_MIGRATE: - { - int err; - if (!inode_owner_or_capable(inode)) - return -EACCES; - - err = mnt_want_write_file(filp); - if (err) - return err; - /* - * i_mutex prevents writes and truncates on the file. - * Reads still go through. We take i_data_sem in - * ext4_ext_swap_inode_data before we switch the - * inode format to prevent read.
- */ - mutex_lock(&(inode->i_mutex)); - err = ext4_ext_migrate(inode); - mutex_unlock(&(inode->i_mutex)); - mnt_drop_write_file(filp); - return err; - } - - case EXT4_IOC_ALLOC_DA_BLKS: - { - int err; - if (!inode_owner_or_capable(inode)) - return -EACCES; - - err = mnt_want_write_file(filp); - if (err) - return err; - err = ext4_alloc_da_blocks(inode); - mnt_drop_write_file(filp); - return err; - } - - case EXT4_IOC_RESIZE_FS: { - ext4_fsblk_t n_blocks_count; - struct super_block *sb = inode->i_sb; - int err = 0, err2 = 0; - - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not (yet) supported with bigalloc"); - return -EOPNOTSUPP; - } - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_META_BG)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not (yet) supported with meta_bg"); - return -EOPNOTSUPP; - } - - if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, - sizeof(__u64))) { - return -EFAULT; - } - - if (n_blocks_count > MAX_32_NUM && - !EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_64BIT)) { - ext4_msg(sb, KERN_ERR, - "File system only supports 32-bit block numbers"); - return -EOPNOTSUPP; - } - - err = ext4_resize_begin(sb); - if (err) - return err; - - err = mnt_want_write(filp->f_path.mnt); - if (err) - goto resizefs_out; - - err = ext4_resize_fs(sb, n_blocks_count); - if (EXT4_SB(sb)->s_journal) { - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } - if (err == 0) - err = err2; - mnt_drop_write(filp->f_path.mnt); -resizefs_out: - ext4_resize_end(sb); - return err; - } - - case FITRIM: - { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - struct fstrim_range range; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { - ext4_msg(sb, KERN_ERR, - "FITRIM not supported with bigalloc"); - return -EOPNOTSUPP; - } - - if (copy_from_user(&range, (struct fstrim_range __user *)arg, - sizeof(range))) - return -EFAULT; - - range.minlen = max((unsigned int)range.minlen, - q->limits.discard_granularity); - ret = ext4_trim_fs(sb, &range); - if (ret < 0) - return ret; - - if (copy_to_user((struct fstrim_range __user *)arg, &range, - sizeof(range))) - return -EFAULT; - - return 0; - } - - default: - return -ENOTTY; - } -} - -#ifdef CONFIG_COMPAT -long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - /* These are just misnamed, they actually get/put from/to user an int */ - switch (cmd) { - case EXT4_IOC32_GETFLAGS: - cmd = EXT4_IOC_GETFLAGS; - break; - case EXT4_IOC32_SETFLAGS: - cmd = EXT4_IOC_SETFLAGS; - break; - case EXT4_IOC32_GETVERSION: - cmd = EXT4_IOC_GETVERSION; - break; - case EXT4_IOC32_SETVERSION: - cmd = EXT4_IOC_SETVERSION; - break; - case EXT4_IOC32_GROUP_EXTEND: - cmd = EXT4_IOC_GROUP_EXTEND; - break; - case EXT4_IOC32_GETVERSION_OLD: - cmd = EXT4_IOC_GETVERSION_OLD; - break; - case EXT4_IOC32_SETVERSION_OLD: - cmd = EXT4_IOC_SETVERSION_OLD; - break; - case EXT4_IOC32_GETRSVSZ: - cmd = EXT4_IOC_GETRSVSZ; - break; - case EXT4_IOC32_SETRSVSZ: - cmd = EXT4_IOC_SETRSVSZ; - break; - case EXT4_IOC32_GROUP_ADD: { - struct compat_ext4_new_group_input __user *uinput; - struct ext4_new_group_input input; - mm_segment_t old_fs; - int err; - - uinput = compat_ptr(arg); - err = get_user(input.group, &uinput->group); - 
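/*
 * Editor's note: the err |= get_user() chain above and below folds all of
 * the copy results together -- if any field faults, err becomes non-zero
 * and a single -EFAULT return suffices. The assembled native struct is
 * then passed to ext4_ioctl() under set_fs(KERNEL_DS), so the __user
 * copy there accepts a kernel pointer.
 */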
err |= get_user(input.block_bitmap, &uinput->block_bitmap); - err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); - err |= get_user(input.inode_table, &uinput->inode_table); - err |= get_user(input.blocks_count, &uinput->blocks_count); - err |= get_user(input.reserved_blocks, - &uinput->reserved_blocks); - if (err) - return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, - (unsigned long) &input); - set_fs(old_fs); - return err; - } - case EXT4_IOC_MOVE_EXT: - case FITRIM: - case EXT4_IOC_RESIZE_FS: - break; - default: - return -ENOIOCTLCMD; - } - return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -} -#endif diff --git a/ANDROID_3.4.5/fs/ext4/mballoc.c b/ANDROID_3.4.5/fs/ext4/mballoc.c deleted file mode 100644 index 6b0a57ea..00000000 --- a/ANDROID_3.4.5/fs/ext4/mballoc.c +++ /dev/null @@ -1,5047 +0,0 @@ -/* - * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com - * Written by Alex Tomas <alex@clusterfs.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- - */ - - -/* - * mballoc.c contains the multiblock allocation routines - */ - -#include "ext4_jbd2.h" -#include "mballoc.h" -#include <linux/debugfs.h> -#include <linux/slab.h> -#include <trace/events/ext4.h> - -/* - * MUSTDO: - * - test ext4_ext_search_left() and ext4_ext_search_right() - * - search for metadata in few groups - * - * TODO v4: - * - normalization should take into account whether file is still open - * - discard preallocations if no free space left (policy?) - * - don't normalize tails - * - quota - * - reservation for superuser - * - * TODO v3: - * - bitmap read-ahead (proposed by Oleg Drokin aka green) - * - track min/max extents in each group for better group selection - * - mb_mark_used() may allocate chunk right after splitting buddy - * - tree of groups sorted by number of free blocks - * - error handling - */ - -/* - * An allocation request involves a request for multiple blocks near the - * specified goal block. - * - * During the initialization phase of the allocator we decide to use the - * group preallocation or inode preallocation depending on the size of - * the file. The size of the file could be the resulting file size we - * would have after allocation, or the current file size, whichever - * is larger. If the size is less than sbi->s_mb_stream_request we - * select to use the group preallocation. The default value of - * s_mb_stream_request is 16 blocks. This can also be tuned via - * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in - * terms of number of blocks. - * - * The main motivation for having small files use group preallocation is to - * ensure that we have small files closer together on the disk. - * - * In the first stage the allocator looks at the inode prealloc list, - * ext4_inode_info->i_prealloc_list, which contains the list of prealloc - * spaces for this particular inode.
The inode prealloc space is - * represented as: - * - * pa_lstart -> the logical start block for this prealloc space - * pa_pstart -> the physical start block for this prealloc space - * pa_len -> length for this prealloc space (in clusters) - * pa_free -> free space available in this prealloc space (in clusters) - * - * The inode preallocation space is used by looking at the _logical_ start - * block. Only if the logical file block falls within the range of the - * prealloc space do we consume that prealloc space. This makes sure that - * we have contiguous physical blocks representing the file blocks - * - * The important thing to be noted in case of inode prealloc space is that - * we don't modify the values associated with the inode prealloc space except - * pa_free. - * - * If we are not able to find blocks in the inode prealloc space and if we - * have the group allocation flag set then we look at the locality group - * prealloc space. These are per-CPU prealloc lists, represented as - * - * ext4_sb_info.s_locality_groups[smp_processor_id()] - * - * The reason for having a per-CPU locality group is to reduce the contention - * between CPUs. It is possible to get scheduled at this point. - * - * The locality group prealloc space is used looking at whether we have - * enough free space (pa_free) within the prealloc space. - * - * If we can't allocate blocks via inode prealloc and/or locality group - * prealloc then we look at the buddy cache. The buddy cache is represented - * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets - * mapped to the buddy and bitmap information regarding different - * groups. The buddy information is attached to the buddy cache inode so that - * we can access it through the page cache. The information regarding - * each group is loaded via ext4_mb_load_buddy; it involves the - * block bitmap and buddy information, and is stored in the - * inode as: - * - * { page } - * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... - * - * - * one block each for bitmap and buddy information. So for each group we - * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / - * blocksize) blocks. So it can have information regarding groups_per_page - * which is blocks_per_page/2 - * - * The buddy cache inode is not stored on disk. The inode is thrown - * away when the filesystem is unmounted. - * - * We look for count number of blocks in the buddy cache. If we were able - * to locate that many free blocks we return with additional information - * regarding the rest of the contiguous physical blocks available - * - * Before allocating blocks via the buddy cache we normalize the request - * blocks. This ensures we ask for more blocks than we need. The extra - * blocks that we get after allocation are added to the respective prealloc - * list. In case of inode preallocation we follow a list of heuristics - * based on file size. This can be found in ext4_mb_normalize_request. If - * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is - * dependent on the cluster size; for non-bigalloc file systems, it is - * 512 blocks. This can be tuned via - * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in - * terms of number of blocks.
If we have mounted the file system with the -O - * stripe=<value> option, the group prealloc request is normalized to - * the smallest multiple of the stripe value (sbi->s_stripe) which is - * greater than the default mb_group_prealloc. - * - * The regular allocator (using the buddy cache) supports a few tunables. - * - * /sys/fs/ext4/<partition>/mb_min_to_scan - * /sys/fs/ext4/<partition>/mb_max_to_scan - * /sys/fs/ext4/<partition>/mb_order2_req - * - * The regular allocator uses buddy scan only if the request len is a power of - * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The - * value of s_mb_order2_reqs can be tuned via - * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to - * the stripe size (sbi->s_stripe), we try to search for contiguous blocks in - * stripe size. This should result in better allocation on RAID setups. If - * not, we search in the specific group using the bitmap for best extents. The - * tunables min_to_scan and max_to_scan control the behaviour here. - * min_to_scan indicates how long the mballoc __must__ look for a best - * extent and max_to_scan indicates how long the mballoc __can__ look for a - * best extent in the found extents. Searching for the blocks starts with - * the group specified as the goal value in allocation context via - * ac_g_ex. Each group is first checked on whether it - * can be used for allocation. ext4_mb_good_group explains how the groups are - * checked. - * - * Both prealloc spaces are populated as above. So for the first - * request we will hit the buddy cache, which results in this prealloc - * space getting filled; the prealloc space is then used for - * subsequent requests. - */ - -/* - * mballoc operates on the following data: - * - on-disk bitmap - * - in-core buddy (actually includes buddy and bitmap) - * - preallocation descriptors (PAs) - * - * there are two types of preallocations: - * - inode - * assigned to a specific inode and can be used for this inode only. - * it describes part of inode's space preallocated to specific - * physical blocks. any block from that preallocation can be used - * independently. the descriptor just tracks number of blocks left - * unused. so, before taking some block from descriptor, one must - * make sure the corresponding logical block isn't allocated yet. this - * also means that freeing any block within descriptor's range - * must discard all preallocated blocks. - * - locality group - * assigned to a specific locality group which does not translate to - * a permanent set of inodes: an inode can join and leave a group. space - * from this type of preallocation can be used for any inode. thus - * it's consumed from the beginning to the end. - * - * relation between them can be expressed as: - * in-core buddy = on-disk bitmap + preallocation descriptors - * - * this means the blocks mballoc considers used are: - * - allocated blocks (persistent) - * - preallocated blocks (non-persistent) - * - * consistency in mballoc world means that at any time a block is either - * free or used in ALL structures. notice: "any time" should not be read - * literally -- time is discrete and delimited by locks. - * - * to keep it simple, we don't use block numbers, instead we count numbers of - * blocks: how many blocks are marked used/free in on-disk bitmap, buddy and PA.
- * - * all operations can be expressed as: - * - init buddy: buddy = on-disk + PAs - * - new PA: buddy += N; PA = N - * - use inode PA: on-disk += N; PA -= N - * - discard inode PA: buddy -= on-disk - PA; PA = 0 - * - use locality group PA: on-disk += N; PA -= N - * - discard locality group PA: buddy -= PA; PA = 0 - * note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap - * is used in the real operation because we can't know actual used - * bits from PA, only from the on-disk bitmap - * - * if we follow this strict logic, then all operations above should be atomic. - * given some of them can block, we'd have to use something like semaphores - * killing performance on high-end SMP hardware. let's try to relax it using - * the following knowledge: - * 1) if buddy is referenced, it's already initialized - * 2) while block is used in buddy and the buddy is referenced, - * nobody can re-allocate that block - * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has - * bit set and PA claims same block, it's OK. IOW, one can set bit in - * on-disk bitmap if buddy has same bit set and/or PA covers the corresponding - * block - * - * so, now we're building a concurrency table: - * - init buddy vs. - * - new PA - * blocks for PA are allocated in the buddy, buddy must be referenced - * until PA is linked to allocation group to avoid concurrent buddy init - * - use inode PA - * we need to make sure that either on-disk bitmap or PA has uptodate data - * given (3) we care that PA-=N operation doesn't interfere with init - * - discard inode PA - * the simplest way would be to have buddy initialized by the discard - * - use locality group PA - * again PA-=N must be serialized with init - * - discard locality group PA - * the simplest way would be to have buddy initialized by the discard - * - new PA vs. - * - use inode PA - * i_data_sem serializes them - * - discard inode PA - * discard process must wait until PA isn't used by another process - * - use locality group PA - * some mutex should serialize them - * - discard locality group PA - * discard process must wait until PA isn't used by another process - * - use inode PA - * - use inode PA - * i_data_sem or another mutex should serialize them - * - discard inode PA - * discard process must wait until PA isn't used by another process - * - use locality group PA - * nothing wrong here -- they're different PAs covering different blocks - * - discard locality group PA - * discard process must wait until PA isn't used by another process - * - * now we're ready to draw a few conclusions: - * - while a PA is referenced, no discard of it is possible - * - a PA is referenced until its blocks are marked in the on-disk bitmap - * - a PA changes only after the on-disk bitmap does - * - discard must not compete with init. either init is done before - * any discard or they're serialized somehow - * - buddy init as sum of on-disk bitmap and PAs is done atomically - * - * a special case is when we've used a PA to emptiness:
no need to modify buddy - * in this case, but we should care about concurrent init - * - */ - - /* - * Logic in few words: - * - * - allocation: - * load group - * find blocks - * mark bits in on-disk bitmap - * release group - * - * - use preallocation: - * find proper PA (per-inode or group) - * load group - * mark bits in on-disk bitmap - * release group - * release PA - * - * - free: - * load group - * mark bits in on-disk bitmap - * release group - * - * - discard preallocations in group: - * mark PAs deleted - * move them onto local list - * load on-disk bitmap - * load group - * remove PA from object (inode or locality group) - * mark free blocks in-core - * - * - discard inode's preallocations: - */ - -/* - * Locking rules - * - * Locks: - * - bitlock on a group (group) - * - object (inode/locality) (object) - * - per-pa lock (pa) - * - * Paths: - * - new pa - * object - * group - * - * - find and use pa: - * pa - * - * - release consumed pa: - * pa - * group - * object - * - * - generate in-core bitmap: - * group - * pa - * - * - discard all for given object (inode, locality group): - * object - * pa - * group - * - * - discard all for given group: - * group - * pa - * group - * object - * - */ -static struct kmem_cache *ext4_pspace_cachep; -static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_data_cachep; - -/* We create slab caches for groupinfo data structures based on the - * superblock block size. There will be one per mounted filesystem for - * each unique s_blocksize_bits */ -#define NR_GRPINFO_CACHES 8 -static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; - -static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { - "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", - "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", - "ext4_groupinfo_64k", "ext4_groupinfo_128k" -}; - -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, - ext4_group_t group); -static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, - ext4_group_t group); -static void ext4_free_data_callback(struct super_block *sb, - struct ext4_journal_cb_entry *jce, int rc); - -static inline void *mb_correct_addr_and_bit(int *bit, void *addr) -{ -#if BITS_PER_LONG == 64 - *bit += ((unsigned long) addr & 7UL) << 3; - addr = (void *) ((unsigned long) addr & ~7UL); -#elif BITS_PER_LONG == 32 - *bit += ((unsigned long) addr & 3UL) << 3; - addr = (void *) ((unsigned long) addr & ~3UL); -#else -#error "how many bits you are?!" 
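/*
 * Editor's note -- worked example (illustrative) of the fix-up above for
 * BITS_PER_LONG == 64: with addr = base + 3 and *bit = 5, the function
 * yields *bit = 5 + (3 << 3) = 29 and addr = base, i.e. the same bit
 * re-expressed against the preceding 8-byte-aligned address.
 */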
-#endif - return addr; -} - -static inline int mb_test_bit(int bit, void *addr) -{ - /* - * ext4_test_bit on architecture like powerpc - * needs unsigned long aligned address - */ - addr = mb_correct_addr_and_bit(&bit, addr); - return ext4_test_bit(bit, addr); -} - -static inline void mb_set_bit(int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_set_bit(bit, addr); -} - -static inline void mb_clear_bit(int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_clear_bit(bit, addr); -} - -static inline int mb_find_next_zero_bit(void *addr, int max, int start) -{ - int fix = 0, ret, tmpmax; - addr = mb_correct_addr_and_bit(&fix, addr); - tmpmax = max + fix; - start += fix; - - ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; - if (ret > max) - return max; - return ret; -} - -static inline int mb_find_next_bit(void *addr, int max, int start) -{ - int fix = 0, ret, tmpmax; - addr = mb_correct_addr_and_bit(&fix, addr); - tmpmax = max + fix; - start += fix; - - ret = ext4_find_next_bit(addr, tmpmax, start) - fix; - if (ret > max) - return max; - return ret; -} - -static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) -{ - char *bb; - - BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); - BUG_ON(max == NULL); - - if (order > e4b->bd_blkbits + 1) { - *max = 0; - return NULL; - } - - /* at order 0 we see each particular block */ - if (order == 0) { - *max = 1 << (e4b->bd_blkbits + 3); - return e4b->bd_bitmap; - } - - bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; - *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; - - return bb; -} - -#ifdef DOUBLE_CHECK -static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, - int first, int count) -{ - int i; - struct super_block *sb = e4b->bd_sb; - - if (unlikely(e4b->bd_info->bb_bitmap == NULL)) - return; - assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); - for (i = 0; i < count; i++) { - if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { - ext4_fsblk_t blocknr; - - blocknr = ext4_group_first_block_no(sb, e4b->bd_group); - blocknr += EXT4_C2B(EXT4_SB(sb), first + i); - ext4_grp_locked_error(sb, e4b->bd_group, - inode ? 
inode->i_ino : 0, - blocknr, - "freeing block already freed " - "(bit %u)", - first + i); - } - mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); - } -} - -static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) -{ - int i; - - if (unlikely(e4b->bd_info->bb_bitmap == NULL)) - return; - assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); - for (i = 0; i < count; i++) { - BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); - mb_set_bit(first + i, e4b->bd_info->bb_bitmap); - } -} - -static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) -{ - if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { - unsigned char *b1, *b2; - int i; - b1 = (unsigned char *) e4b->bd_info->bb_bitmap; - b2 = (unsigned char *) bitmap; - for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { - if (b1[i] != b2[i]) { - ext4_msg(e4b->bd_sb, KERN_ERR, - "corruption in group %u " - "at byte %u(%u): %x in copy != %x " - "on disk/prealloc", - e4b->bd_group, i, i * 8, b1[i], b2[i]); - BUG(); - } - } - } -} - -#else -static inline void mb_free_blocks_double(struct inode *inode, - struct ext4_buddy *e4b, int first, int count) -{ - return; -} -static inline void mb_mark_used_double(struct ext4_buddy *e4b, - int first, int count) -{ - return; -} -static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) -{ - return; -} -#endif - -#ifdef AGGRESSIVE_CHECK - -#define MB_CHECK_ASSERT(assert) \ -do { \ - if (!(assert)) { \ - printk(KERN_EMERG \ - "Assertion failure in %s() at %s:%d: \"%s\"\n", \ - function, file, line, # assert); \ - BUG(); \ - } \ -} while (0) - -static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, - const char *function, int line) -{ - struct super_block *sb = e4b->bd_sb; - int order = e4b->bd_blkbits + 1; - int max; - int max2; - int i; - int j; - int k; - int count; - struct ext4_group_info *grp; - int fragments = 0; - int fstart; - struct list_head *cur; - void *buddy; - void *buddy2; - - { - static int mb_check_counter; - if (mb_check_counter++ % 100 != 0) - return 0; - } - - while (order > 1) { - buddy = mb_find_buddy(e4b, order, &max); - MB_CHECK_ASSERT(buddy); - buddy2 = mb_find_buddy(e4b, order - 1, &max2); - MB_CHECK_ASSERT(buddy2); - MB_CHECK_ASSERT(buddy != buddy2); - MB_CHECK_ASSERT(max * 2 == max2); - - count = 0; - for (i = 0; i < max; i++) { - - if (mb_test_bit(i, buddy)) { - /* only single bit in buddy2 may be 1 */ - if (!mb_test_bit(i << 1, buddy2)) { - MB_CHECK_ASSERT( - mb_test_bit((i<<1)+1, buddy2)); - } else if (!mb_test_bit((i << 1) + 1, buddy2)) { - MB_CHECK_ASSERT( - mb_test_bit(i << 1, buddy2)); - } - continue; - } - - /* both bits in buddy2 must be 1 */ - MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); - MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); - - for (j = 0; j < (1 << order); j++) { - k = (i * (1 << order)) + j; - MB_CHECK_ASSERT( - !mb_test_bit(k, e4b->bd_bitmap)); - } - count++; - } - MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); - order--; - } - - fstart = -1; - buddy = mb_find_buddy(e4b, 0, &max); - for (i = 0; i < max; i++) { - if (!mb_test_bit(i, buddy)) { - MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); - if (fstart == -1) { - fragments++; - fstart = i; - } - continue; - } - fstart = -1; - /* check used bits only */ - for (j = 0; j < e4b->bd_blkbits + 1; j++) { - buddy2 = mb_find_buddy(e4b, j, &max2); - k = i >> j; - MB_CHECK_ASSERT(k < max2); - MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); - } - } - MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); - 
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); - - grp = ext4_get_group_info(sb, e4b->bd_group); - list_for_each(cur, &grp->bb_prealloc_list) { - ext4_group_t groupnr; - struct ext4_prealloc_space *pa; - pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); - MB_CHECK_ASSERT(groupnr == e4b->bd_group); - for (i = 0; i < pa->pa_len; i++) - MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); - } - return 0; -} -#undef MB_CHECK_ASSERT -#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ - __FILE__, __func__, __LINE__) -#else -#define mb_check_buddy(e4b) -#endif - -/* - * Divide blocks starting from @first with length @len into - * smaller chunks with power of 2 blocks. - * Clear the bits in the bitmap which the blocks of the chunk(s) cover, - * then increase bb_counters[] for the corresponding chunk size. - */ -static void ext4_mb_mark_free_simple(struct super_block *sb, - void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, - struct ext4_group_info *grp) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_grpblk_t min; - ext4_grpblk_t max; - ext4_grpblk_t chunk; - unsigned short border; - - BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); - - border = 2 << sb->s_blocksize_bits; - - while (len > 0) { - /* find how many blocks can be covered since this position */ - max = ffs(first | border) - 1; - - /* find how many blocks of power 2 we need to mark */ - min = fls(len) - 1; - - if (max < min) - min = max; - chunk = 1 << min; - - /* mark multiblock chunks only */ - grp->bb_counters[min]++; - if (min > 0) - mb_clear_bit(first >> min, - buddy + sbi->s_mb_offsets[min]); - - len -= chunk; - first += chunk; - } -}
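/*
 * Editor's note -- worked example (illustrative) of the chunking above:
 * first = 13, len = 7 is split as one block at 13 (bb_counters[0]++),
 * two blocks at 14 (bb_counters[1]++, bit 14 >> 1 = 7 cleared in the
 * order-1 buddy) and four blocks at 16 (bb_counters[2]++, bit 16 >> 2 = 4
 * cleared in the order-2 buddy).
 */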
-/* - * Cache the order of the largest free extent we have available in this block - * group. - */ -static void -mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) -{ - int i; - int bits; - - grp->bb_largest_free_order = -1; /* uninit */ - - bits = sb->s_blocksize_bits + 1; - for (i = bits; i >= 0; i--) { - if (grp->bb_counters[i] > 0) { - grp->bb_largest_free_order = i; - break; - } - } -} - -static noinline_for_stack -void ext4_mb_generate_buddy(struct super_block *sb, - void *buddy, void *bitmap, ext4_group_t group) -{ - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); - ext4_grpblk_t i = 0; - ext4_grpblk_t first; - ext4_grpblk_t len; - unsigned free = 0; - unsigned fragments = 0; - unsigned long long period = get_cycles(); - - /* initialize buddy from bitmap which is aggregation - * of on-disk bitmap and preallocations */ - i = mb_find_next_zero_bit(bitmap, max, 0); - grp->bb_first_free = i; - while (i < max) { - fragments++; - first = i; - i = mb_find_next_bit(bitmap, max, i); - len = i - first; - free += len; - if (len > 1) - ext4_mb_mark_free_simple(sb, buddy, first, len, grp); - else - grp->bb_counters[0]++; - if (i < max) - i = mb_find_next_zero_bit(bitmap, max, i); - } - grp->bb_fragments = fragments; - - if (free != grp->bb_free) { - ext4_grp_locked_error(sb, group, 0, 0, - "%u clusters in bitmap, %u in gd", - free, grp->bb_free); - /* - * If we intend to continue, we consider the group descriptor - * corrupt and update bb_free using the bitmap value - */ - grp->bb_free = free; - } - mb_set_largest_free_order(sb, grp); - - clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); - - period = get_cycles() - period; - spin_lock(&EXT4_SB(sb)->s_bal_lock); - EXT4_SB(sb)->s_mb_buddies_generated++; - EXT4_SB(sb)->s_mb_generation_time += period; - spin_unlock(&EXT4_SB(sb)->s_bal_lock); -} - -/* The buddy information is attached to the buddy cache inode - * for convenience. The information regarding each group - * is loaded via ext4_mb_load_buddy; it involves the - * block bitmap and buddy information, and is - * stored in the inode as - * - * { page } - * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... - * - * - * one block each for bitmap and buddy information. - * So for each group we take up 2 blocks. A page can - * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. - * So it can have information regarding groups_per_page which - * is blocks_per_page/2 - * - * Locking note: This routine takes the block group lock of all groups - * for this page; do not hold this lock when calling this routine!
- */ - -static int ext4_mb_init_cache(struct page *page, char *incore) -{ - ext4_group_t ngroups; - int blocksize; - int blocks_per_page; - int groups_per_page; - int err = 0; - int i; - ext4_group_t first_group, group; - int first_block; - struct super_block *sb; - struct buffer_head *bhs; - struct buffer_head **bh; - struct inode *inode; - char *data; - char *bitmap; - struct ext4_group_info *grinfo; - - mb_debug(1, "init page %lu\n", page->index); - - inode = page->mapping->host; - sb = inode->i_sb; - ngroups = ext4_get_groups_count(sb); - blocksize = 1 << inode->i_blkbits; - blocks_per_page = PAGE_CACHE_SIZE / blocksize; - - groups_per_page = blocks_per_page >> 1; - if (groups_per_page == 0) - groups_per_page = 1; - - /* allocate buffer_heads to read bitmaps */ - if (groups_per_page > 1) { - i = sizeof(struct buffer_head *) * groups_per_page; - bh = kzalloc(i, GFP_NOFS); - if (bh == NULL) { - err = -ENOMEM; - goto out; - } - } else - bh = &bhs; - - first_group = page->index * blocks_per_page / 2; - - /* read all groups the page covers into the cache */ - for (i = 0, group = first_group; i < groups_per_page; i++, group++) { - if (group >= ngroups) - break; - - grinfo = ext4_get_group_info(sb, group); - /* - * If page is uptodate then we came here after online resize - * which added some new uninitialized group info structs, so - * we must skip all initialized uptodate buddies on the page, - * which may be currently in use by an allocating task. - */ - if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { - bh[i] = NULL; - continue; - } - if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { - err = -ENOMEM; - goto out; - } - mb_debug(1, "read bitmap for group %u\n", group); - } - - /* wait for I/O completion */ - for (i = 0, group = first_group; i < groups_per_page; i++, group++) { - if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { - err = -EIO; - goto out; - } - } - - first_block = page->index * blocks_per_page; - for (i = 0; i < blocks_per_page; i++) { - int group; - - group = (first_block + i) >> 1; - if (group >= ngroups) - break; - - if (!bh[group - first_group]) - /* skip initialized uptodate buddy */ - continue; - - /* - * data carry information regarding this - * particular group in the format specified - * above - * - */ - data = page_address(page) + (i * blocksize); - bitmap = bh[group - first_group]->b_data; - - /* - * We place the buddy block and bitmap block - * close together - */ - if ((first_block + i) & 1) { - /* this is block of buddy */ - BUG_ON(incore == NULL); - mb_debug(1, "put buddy for group %u in page %lu/%x\n", - group, page->index, i * blocksize); - trace_ext4_mb_buddy_bitmap_load(sb, group); - grinfo = ext4_get_group_info(sb, group); - grinfo->bb_fragments = 0; - memset(grinfo->bb_counters, 0, - sizeof(*grinfo->bb_counters) * - (sb->s_blocksize_bits+2)); - /* - * incore got set to the group block bitmap below - */ - ext4_lock_group(sb, group); - /* init the buddy */ - memset(data, 0xff, blocksize); - ext4_mb_generate_buddy(sb, data, incore, group); - ext4_unlock_group(sb, group); - incore = NULL; - } else { - /* this is block of bitmap */ - BUG_ON(incore != NULL); - mb_debug(1, "put bitmap for group %u in page %lu/%x\n", - group, page->index, i * blocksize); - trace_ext4_mb_bitmap_load(sb, group); - - /* see comments in ext4_mb_put_pa() */ - ext4_lock_group(sb, group); - memcpy(data, bitmap, blocksize); - - /* mark all preallocated blks used in in-core bitmap */ - ext4_mb_generate_from_pa(sb, data, group); - 
ext4_mb_generate_from_freelist(sb, data, group);
-			ext4_unlock_group(sb, group);
-
-			/* set incore so that the buddy information can be
-			 * generated using this
-			 */
-			incore = data;
-		}
-	}
-	SetPageUptodate(page);
-
-out:
-	if (bh) {
-		for (i = 0; i < groups_per_page; i++)
-			brelse(bh[i]);
-		if (bh != &bhs)
-			kfree(bh);
-	}
-	return err;
-}
-
-/*
- * Lock the buddy and bitmap pages. This makes sure that parallel init_group
- * calls on the same buddy page cannot happen while holding the buddy page
- * lock. The locked buddy and bitmap pages are returned in the e4b struct.
- * If buddy and bitmap are on the same page, e4b->bd_buddy_page is NULL and
- * the return value is 0.
- */
-static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
-		ext4_group_t group, struct ext4_buddy *e4b)
-{
-	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
-	int block, pnum, poff;
-	int blocks_per_page;
-	struct page *page;
-
-	e4b->bd_buddy_page = NULL;
-	e4b->bd_bitmap_page = NULL;
-
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-	if (!page)
-		return -EIO;
-	BUG_ON(page->mapping != inode->i_mapping);
-	e4b->bd_bitmap_page = page;
-	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
-
-	if (blocks_per_page >= 2) {
-		/* buddy and bitmap are on the same page */
-		return 0;
-	}
-
-	block++;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-	if (!page)
-		return -EIO;
-	BUG_ON(page->mapping != inode->i_mapping);
-	e4b->bd_buddy_page = page;
-	return 0;
-}
-
-static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
-{
-	if (e4b->bd_bitmap_page) {
-		unlock_page(e4b->bd_bitmap_page);
-		page_cache_release(e4b->bd_bitmap_page);
-	}
-	if (e4b->bd_buddy_page) {
-		unlock_page(e4b->bd_buddy_page);
-		page_cache_release(e4b->bd_buddy_page);
-	}
-}
-
-/*
- * Locking note: This routine calls ext4_mb_init_cache(), which takes the
- * block group lock of all groups for this page; do not hold the BG lock when
- * calling this routine!
- */
-static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
-{
-
-	struct ext4_group_info *this_grp;
-	struct ext4_buddy e4b;
-	struct page *page;
-	int ret = 0;
-
-	mb_debug(1, "init group %u\n", group);
-	this_grp = ext4_get_group_info(sb, group);
-	/*
-	 * This ensures that we don't reinit the buddy cache
-	 * page which maps to the group from which we are already
-	 * allocating. If we are looking at the buddy cache we would
-	 * have taken a reference using ext4_mb_load_buddy and that
-	 * would have pinned the buddy page to the page cache.
- */ - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); - if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { - /* - * somebody initialized the group - * return without doing anything - */ - goto err; - } - - page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL); - if (ret) - goto err; - if (!PageUptodate(page)) { - ret = -EIO; - goto err; - } - mark_page_accessed(page); - - if (e4b.bd_buddy_page == NULL) { - /* - * If both the bitmap and buddy are in - * the same page we don't need to force - * init the buddy - */ - ret = 0; - goto err; - } - /* init buddy cache */ - page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap); - if (ret) - goto err; - if (!PageUptodate(page)) { - ret = -EIO; - goto err; - } - mark_page_accessed(page); -err: - ext4_mb_put_buddy_page_lock(&e4b); - return ret; -} - -/* - * Locking note: This routine calls ext4_mb_init_cache(), which takes the - * block group lock of all groups for this page; do not hold the BG lock when - * calling this routine! - */ -static noinline_for_stack int -ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, - struct ext4_buddy *e4b) -{ - int blocks_per_page; - int block; - int pnum; - int poff; - struct page *page; - int ret; - struct ext4_group_info *grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - - mb_debug(1, "load group %u\n", group); - - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - grp = ext4_get_group_info(sb, group); - - e4b->bd_blkbits = sb->s_blocksize_bits; - e4b->bd_info = grp; - e4b->bd_sb = sb; - e4b->bd_group = group; - e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; - - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - /* - * we need full data about the group - * to make a good selection - */ - ret = ext4_mb_init_group(sb, group); - if (ret) - return ret; - } - - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - - /* we could use find_or_create_page(), but it locks page - * what we'd like to avoid in fast path ... */ - page = find_get_page(inode->i_mapping, pnum); - if (page == NULL || !PageUptodate(page)) { - if (page) - /* - * drop the page reference and try - * to get the page with lock. If we - * are not uptodate that implies - * somebody just created the page but - * is yet to initialize the same. So - * wait for it to initialize. 
- */ - page_cache_release(page); - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page) { - BUG_ON(page->mapping != inode->i_mapping); - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL); - if (ret) { - unlock_page(page); - goto err; - } - mb_cmp_bitmaps(e4b, page_address(page) + - (poff * sb->s_blocksize)); - } - unlock_page(page); - } - } - if (page == NULL || !PageUptodate(page)) { - ret = -EIO; - goto err; - } - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); - - block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - - page = find_get_page(inode->i_mapping, pnum); - if (page == NULL || !PageUptodate(page)) { - if (page) - page_cache_release(page); - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page) { - BUG_ON(page->mapping != inode->i_mapping); - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap); - if (ret) { - unlock_page(page); - goto err; - } - } - unlock_page(page); - } - } - if (page == NULL || !PageUptodate(page)) { - ret = -EIO; - goto err; - } - e4b->bd_buddy_page = page; - e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); - - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); - - return 0; - -err: - if (page) - page_cache_release(page); - if (e4b->bd_bitmap_page) - page_cache_release(e4b->bd_bitmap_page); - if (e4b->bd_buddy_page) - page_cache_release(e4b->bd_buddy_page); - e4b->bd_buddy = NULL; - e4b->bd_bitmap = NULL; - return ret; -} - -static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) -{ - if (e4b->bd_bitmap_page) - page_cache_release(e4b->bd_bitmap_page); - if (e4b->bd_buddy_page) - page_cache_release(e4b->bd_buddy_page); -} - - -static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) -{ - int order = 1; - void *bb; - - BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); - BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); - - bb = e4b->bd_buddy; - while (order <= e4b->bd_blkbits + 1) { - block = block >> 1; - if (!mb_test_bit(block, bb)) { - /* this block is part of buddy of order 'order' */ - return order; - } - bb += 1 << (e4b->bd_blkbits - order); - order++; - } - return 0; -} - -static void mb_clear_bits(void *bm, int cur, int len) -{ - __u32 *addr; - - len = cur + len; - while (cur < len) { - if ((cur & 31) == 0 && (len - cur) >= 32) { - /* fast path: clear whole word at once */ - addr = bm + (cur >> 3); - *addr = 0; - cur += 32; - continue; - } - mb_clear_bit(cur, bm); - cur++; - } -} - -void ext4_set_bits(void *bm, int cur, int len) -{ - __u32 *addr; - - len = cur + len; - while (cur < len) { - if ((cur & 31) == 0 && (len - cur) >= 32) { - /* fast path: set whole word at once */ - addr = bm + (cur >> 3); - *addr = 0xffffffff; - cur += 32; - continue; - } - mb_set_bit(cur, bm); - cur++; - } -} - -static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, - int first, int count) -{ - int block = 0; - int max = 0; - int order; - void *buddy; - void *buddy2; - struct super_block *sb = e4b->bd_sb; - - BUG_ON(first + count > (sb->s_blocksize << 3)); - assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); - mb_check_buddy(e4b); - mb_free_blocks_double(inode, e4b, first, count); - - e4b->bd_info->bb_free += count; - if (first < e4b->bd_info->bb_first_free) - e4b->bd_info->bb_first_free = first; - - /* let's maintain fragments counter */ - if (first != 0) - block = !mb_test_bit(first - 1, 
e4b->bd_bitmap);
-	if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
-		max = !mb_test_bit(first + count, e4b->bd_bitmap);
-	if (block && max)
-		e4b->bd_info->bb_fragments--;
-	else if (!block && !max)
-		e4b->bd_info->bb_fragments++;
-
-	/* let's maintain buddy itself */
-	while (count-- > 0) {
-		block = first++;
-		order = 0;
-
-		if (!mb_test_bit(block, e4b->bd_bitmap)) {
-			ext4_fsblk_t blocknr;
-
-			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
-			blocknr += EXT4_C2B(EXT4_SB(sb), block);
-			ext4_grp_locked_error(sb, e4b->bd_group,
-					      inode ? inode->i_ino : 0,
-					      blocknr,
-					      "freeing already freed block "
-					      "(bit %u)", block);
-		}
-		mb_clear_bit(block, e4b->bd_bitmap);
-		e4b->bd_info->bb_counters[order]++;
-
-		/* start of the buddy */
-		buddy = mb_find_buddy(e4b, order, &max);
-
-		/* merge with the buddy, climbing one order per iteration,
-		 * for as long as both halves of the pair are free */
-		do {
-			block &= ~1UL;
-			if (mb_test_bit(block, buddy) ||
-			    mb_test_bit(block + 1, buddy))
-				break;
-
-			/* both the buddies are free, try to coalesce them */
-			buddy2 = mb_find_buddy(e4b, order + 1, &max);
-
-			if (!buddy2)
-				break;
-
-			if (order > 0) {
-				/* for special purposes, we don't set
-				 * free bits in bitmap */
-				mb_set_bit(block, buddy);
-				mb_set_bit(block + 1, buddy);
-			}
-			e4b->bd_info->bb_counters[order]--;
-			e4b->bd_info->bb_counters[order]--;
-
-			block = block >> 1;
-			order++;
-			e4b->bd_info->bb_counters[order]++;
-
-			mb_clear_bit(block, buddy2);
-			buddy = buddy2;
-		} while (1);
-	}
-	mb_set_largest_free_order(sb, e4b->bd_info);
-	mb_check_buddy(e4b);
-}
-
-static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
-				int needed, struct ext4_free_extent *ex)
-{
-	int next = block;
-	int max;
-	void *buddy;
-
-	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
-	BUG_ON(ex == NULL);
-
-	buddy = mb_find_buddy(e4b, order, &max);
-	BUG_ON(buddy == NULL);
-	BUG_ON(block >= max);
-	if (mb_test_bit(block, buddy)) {
-		ex->fe_len = 0;
-		ex->fe_start = 0;
-		ex->fe_group = 0;
-		return 0;
-	}
-
-	/* FIXME drop order completely?
*/ - if (likely(order == 0)) { - /* find actual order */ - order = mb_find_order_for_block(e4b, block); - block = block >> order; - } - - ex->fe_len = 1 << order; - ex->fe_start = block << order; - ex->fe_group = e4b->bd_group; - - /* calc difference from given start */ - next = next - ex->fe_start; - ex->fe_len -= next; - ex->fe_start += next; - - while (needed > ex->fe_len && - (buddy = mb_find_buddy(e4b, order, &max))) { - - if (block + 1 >= max) - break; - - next = (block + 1) * (1 << order); - if (mb_test_bit(next, e4b->bd_bitmap)) - break; - - order = mb_find_order_for_block(e4b, next); - - block = next >> order; - ex->fe_len += 1 << order; - } - - BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); - return ex->fe_len; -} - -static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) -{ - int ord; - int mlen = 0; - int max = 0; - int cur; - int start = ex->fe_start; - int len = ex->fe_len; - unsigned ret = 0; - int len0 = len; - void *buddy; - - BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); - BUG_ON(e4b->bd_group != ex->fe_group); - assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); - mb_check_buddy(e4b); - mb_mark_used_double(e4b, start, len); - - e4b->bd_info->bb_free -= len; - if (e4b->bd_info->bb_first_free == start) - e4b->bd_info->bb_first_free += len; - - /* let's maintain fragments counter */ - if (start != 0) - mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); - if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) - max = !mb_test_bit(start + len, e4b->bd_bitmap); - if (mlen && max) - e4b->bd_info->bb_fragments++; - else if (!mlen && !max) - e4b->bd_info->bb_fragments--; - - /* let's maintain buddy itself */ - while (len) { - ord = mb_find_order_for_block(e4b, start); - - if (((start >> ord) << ord) == start && len >= (1 << ord)) { - /* the whole chunk may be allocated at once! */ - mlen = 1 << ord; - buddy = mb_find_buddy(e4b, ord, &max); - BUG_ON((start >> ord) >= max); - mb_set_bit(start >> ord, buddy); - e4b->bd_info->bb_counters[ord]--; - start += mlen; - len -= mlen; - BUG_ON(len < 0); - continue; - } - - /* store for history */ - if (ret == 0) - ret = len | (ord << 16); - - /* we have to split large buddy */ - BUG_ON(ord <= 0); - buddy = mb_find_buddy(e4b, ord, &max); - mb_set_bit(start >> ord, buddy); - e4b->bd_info->bb_counters[ord]--; - - ord--; - cur = (start >> ord) & ~1U; - buddy = mb_find_buddy(e4b, ord, &max); - mb_clear_bit(cur, buddy); - mb_clear_bit(cur + 1, buddy); - e4b->bd_info->bb_counters[ord]++; - e4b->bd_info->bb_counters[ord]++; - } - mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - - ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); - mb_check_buddy(e4b); - - return ret; -} - -/* - * Must be called under group lock! - */ -static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, - struct ext4_buddy *e4b) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - int ret; - - BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); - BUG_ON(ac->ac_status == AC_STATUS_FOUND); - - ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); - ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; - ret = mb_mark_used(e4b, &ac->ac_b_ex); - - /* preallocation can change ac_b_ex, thus we store actually - * allocated blocks for history */ - ac->ac_f_ex = ac->ac_b_ex; - - ac->ac_status = AC_STATUS_FOUND; - ac->ac_tail = ret & 0xffff; - ac->ac_buddy = ret >> 16; - - /* - * take the page reference. 
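The splitting branch of mb_mark_used() above has a simple net effect on the per-order counters: carving one order-lo block out of a free order-hi chunk leaves exactly one newly free buddy at every order in between. A minimal sketch of that effect (editor's example, not kernel code; the counter array and demo values are assumptions):

#include <stdio.h>

/*
 * Editor's sketch: the cumulative counter update that the splitting loop
 * in mb_mark_used() performs when one order-'lo' block is allocated out
 * of a free order-'hi' chunk.
 */
static void split_demo(int hi, int lo, unsigned int counters[])
{
	counters[hi]--;			/* the big chunk is consumed */
	while (hi > lo) {
		hi--;
		counters[hi]++;		/* one half stays free ... */
		/* ... the other half is split further (or finally used) */
	}
}

int main(void)
{
	/* start with a single free order-5 chunk */
	unsigned int counters[8] = { 0, 0, 0, 0, 0, 1, 0, 0 };

	split_demo(5, 2, counters);	/* allocate one order-2 block */
	for (int i = 0; i < 8; i++)
		printf("order %d: %u free\n", i, counters[i]);
	return 0;
}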
We want the page to be pinned - * so that we don't get a ext4_mb_init_cache_call for this - * group until we update the bitmap. That would mean we - * double allocate blocks. The reference is dropped - * in ext4_mb_release_context - */ - ac->ac_bitmap_page = e4b->bd_bitmap_page; - get_page(ac->ac_bitmap_page); - ac->ac_buddy_page = e4b->bd_buddy_page; - get_page(ac->ac_buddy_page); - /* store last allocated for subsequent stream allocation */ - if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - spin_lock(&sbi->s_md_lock); - sbi->s_mb_last_group = ac->ac_f_ex.fe_group; - sbi->s_mb_last_start = ac->ac_f_ex.fe_start; - spin_unlock(&sbi->s_md_lock); - } -} - -/* - * regular allocator, for general purposes allocation - */ - -static void ext4_mb_check_limits(struct ext4_allocation_context *ac, - struct ext4_buddy *e4b, - int finish_group) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_free_extent *bex = &ac->ac_b_ex; - struct ext4_free_extent *gex = &ac->ac_g_ex; - struct ext4_free_extent ex; - int max; - - if (ac->ac_status == AC_STATUS_FOUND) - return; - /* - * We don't want to scan for a whole year - */ - if (ac->ac_found > sbi->s_mb_max_to_scan && - !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { - ac->ac_status = AC_STATUS_BREAK; - return; - } - - /* - * Haven't found good chunk so far, let's continue - */ - if (bex->fe_len < gex->fe_len) - return; - - if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) - && bex->fe_group == e4b->bd_group) { - /* recheck chunk's availability - we don't know - * when it was found (within this lock-unlock - * period or not) */ - max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); - if (max >= gex->fe_len) { - ext4_mb_use_best_found(ac, e4b); - return; - } - } -} - -/* - * The routine checks whether found extent is good enough. If it is, - * then the extent gets marked used and flag is set to the context - * to stop scanning. Otherwise, the extent is compared with the - * previous found extent and if new one is better, then it's stored - * in the context. Later, the best found extent will be used, if - * mballoc can't find good enough extent. - * - * FIXME: real allocation policy is to be designed yet! 
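The selection rule implemented by ext4_mb_measure_extent() below can be stated as a small pure function (editor's paraphrase; the function and parameter names are illustrative, not from the original file):

#include <stdbool.h>

/*
 * Editor's sketch of the policy below: an exact goal-sized extent always
 * wins; until the goal is met, a bigger extent is better; once the goal
 * has been exceeded, the smallest extent that still satisfies it wins.
 */
static bool candidate_is_better(int best_len, int cand_len, int goal_len)
{
	if (best_len == 0)
		return true;			/* first extent found */
	if (cand_len == goal_len)
		return true;			/* exact fit */
	if (best_len < goal_len)
		return cand_len > best_len;	/* still hunting: grow */
	return cand_len >= goal_len && cand_len < best_len; /* shrink toward goal */
}

int main(void)
{
	/* goal 8: a 6-block extent beats a 4-block one; 9 beats 12 */
	return !(candidate_is_better(4, 6, 8) &&
		 candidate_is_better(12, 9, 8));
}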
- */ -static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, - struct ext4_free_extent *ex, - struct ext4_buddy *e4b) -{ - struct ext4_free_extent *bex = &ac->ac_b_ex; - struct ext4_free_extent *gex = &ac->ac_g_ex; - - BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); - BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); - BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); - - ac->ac_found++; - - /* - * The special case - take what you catch first - */ - if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { - *bex = *ex; - ext4_mb_use_best_found(ac, e4b); - return; - } - - /* - * Let's check whether the chuck is good enough - */ - if (ex->fe_len == gex->fe_len) { - *bex = *ex; - ext4_mb_use_best_found(ac, e4b); - return; - } - - /* - * If this is first found extent, just store it in the context - */ - if (bex->fe_len == 0) { - *bex = *ex; - return; - } - - /* - * If new found extent is better, store it in the context - */ - if (bex->fe_len < gex->fe_len) { - /* if the request isn't satisfied, any found extent - * larger than previous best one is better */ - if (ex->fe_len > bex->fe_len) - *bex = *ex; - } else if (ex->fe_len > gex->fe_len) { - /* if the request is satisfied, then we try to find - * an extent that still satisfy the request, but is - * smaller than previous one */ - if (ex->fe_len < bex->fe_len) - *bex = *ex; - } - - ext4_mb_check_limits(ac, e4b, 0); -} - -static noinline_for_stack -int ext4_mb_try_best_found(struct ext4_allocation_context *ac, - struct ext4_buddy *e4b) -{ - struct ext4_free_extent ex = ac->ac_b_ex; - ext4_group_t group = ex.fe_group; - int max; - int err; - - BUG_ON(ex.fe_len <= 0); - err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); - if (err) - return err; - - ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); - - if (max > 0) { - ac->ac_b_ex = ex; - ext4_mb_use_best_found(ac, e4b); - } - - ext4_unlock_group(ac->ac_sb, group); - ext4_mb_unload_buddy(e4b); - - return 0; -} - -static noinline_for_stack -int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, - struct ext4_buddy *e4b) -{ - ext4_group_t group = ac->ac_g_ex.fe_group; - int max; - int err; - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_free_extent ex; - - if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) - return 0; - - err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); - if (err) - return err; - - ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, - ac->ac_g_ex.fe_len, &ex); - - if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { - ext4_fsblk_t start; - - start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + - ex.fe_start; - /* use do_div to get remainder (would be 64-bit modulo) */ - if (do_div(start, sbi->s_stripe) == 0) { - ac->ac_found++; - ac->ac_b_ex = ex; - ext4_mb_use_best_found(ac, e4b); - } - } else if (max >= ac->ac_g_ex.fe_len) { - BUG_ON(ex.fe_len <= 0); - BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); - BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); - ac->ac_found++; - ac->ac_b_ex = ex; - ext4_mb_use_best_found(ac, e4b); - } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { - /* Sometimes, caller may want to merge even small - * number of blocks to an existing extent */ - BUG_ON(ex.fe_len <= 0); - BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); - BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); - ac->ac_found++; - ac->ac_b_ex = ex; - ext4_mb_use_best_found(ac, e4b); - } - ext4_unlock_group(ac->ac_sb, group); - 
ext4_mb_unload_buddy(e4b);
-
-	return 0;
-}
-
-/*
- * The routine scans buddy structures (not the bitmap!) from the given order
- * up to the max order, trying to find a chunk big enough to satisfy the
- * request.
- */
-static noinline_for_stack
-void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
-					struct ext4_buddy *e4b)
-{
-	struct super_block *sb = ac->ac_sb;
-	struct ext4_group_info *grp = e4b->bd_info;
-	void *buddy;
-	int i;
-	int k;
-	int max;
-
-	BUG_ON(ac->ac_2order <= 0);
-	for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
-		if (grp->bb_counters[i] == 0)
-			continue;
-
-		buddy = mb_find_buddy(e4b, i, &max);
-		BUG_ON(buddy == NULL);
-
-		k = mb_find_next_zero_bit(buddy, max, 0);
-		BUG_ON(k >= max);
-
-		ac->ac_found++;
-
-		ac->ac_b_ex.fe_len = 1 << i;
-		ac->ac_b_ex.fe_start = k << i;
-		ac->ac_b_ex.fe_group = e4b->bd_group;
-
-		ext4_mb_use_best_found(ac, e4b);
-
-		BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
-
-		if (EXT4_SB(sb)->s_mb_stats)
-			atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
-
-		break;
-	}
-}
-
-/*
- * The routine scans the group and measures all found extents.
- * In order to optimize scanning, the caller must pass the number of
- * free blocks in the group, so the routine can know the upper limit.
- */
-static noinline_for_stack
-void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
-					struct ext4_buddy *e4b)
-{
-	struct super_block *sb = ac->ac_sb;
-	void *bitmap = e4b->bd_bitmap;
-	struct ext4_free_extent ex;
-	int i;
-	int free;
-
-	free = e4b->bd_info->bb_free;
-	BUG_ON(free <= 0);
-
-	i = e4b->bd_info->bb_first_free;
-
-	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
-		i = mb_find_next_zero_bit(bitmap,
-						EXT4_CLUSTERS_PER_GROUP(sb), i);
-		if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
-			/*
-			 * If the bitmap is corrupt, we won't find any
-			 * free blocks even though the group info says we
-			 * have free blocks
-			 */
-			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
-					"%d free clusters as per "
-					"group info. But bitmap says 0",
-					free);
-			break;
-		}
-
-		mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
-		BUG_ON(ex.fe_len <= 0);
-		if (free < ex.fe_len) {
-			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
-					"%d free clusters as per "
-					"group info. But got %d blocks",
-					free, ex.fe_len);
-			/*
-			 * The number of free blocks differs. This mostly
-			 * indicates that the bitmap is corrupt. So exit
-			 * without claiming the space.
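The scan loop in ext4_mb_complex_scan_group() above alternates between finding the next zero bit and measuring the free extent that starts there. A userspace sketch of the same walk over a toy bitmap (editor's example; a '0'/'1' string stands in for the in-core bitmap):

#include <stdio.h>

/*
 * Editor's sketch: enumerate the free extents of a bitmap the way the
 * complex scan does -- jump to the next zero bit, measure the run of
 * zeros, then continue past it.
 */
static void scan_free_extents(const char *bitmap)
{
	int i = 0;

	while (bitmap[i]) {
		while (bitmap[i] == '1')
			i++;			/* find next zero bit */
		if (!bitmap[i])
			break;
		int start = i;
		while (bitmap[i] == '0')
			i++;			/* measure the extent */
		printf("free extent: start=%d len=%d\n", start, i - start);
	}
}

int main(void)
{
	scan_free_extents("1100011110001");	/* prints (2,3) and (9,3) */
	return 0;
}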
- */ - break; - } - - ext4_mb_measure_extent(ac, &ex, e4b); - - i += ex.fe_len; - free -= ex.fe_len; - } - - ext4_mb_check_limits(ac, e4b, 1); -} - -/* - * This is a special case for storages like raid5 - * we try to find stripe-aligned chunks for stripe-size-multiple requests - */ -static noinline_for_stack -void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, - struct ext4_buddy *e4b) -{ - struct super_block *sb = ac->ac_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - void *bitmap = e4b->bd_bitmap; - struct ext4_free_extent ex; - ext4_fsblk_t first_group_block; - ext4_fsblk_t a; - ext4_grpblk_t i; - int max; - - BUG_ON(sbi->s_stripe == 0); - - /* find first stripe-aligned block in group */ - first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); - - a = first_group_block + sbi->s_stripe - 1; - do_div(a, sbi->s_stripe); - i = (a * sbi->s_stripe) - first_group_block; - - while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { - if (!mb_test_bit(i, bitmap)) { - max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); - if (max >= sbi->s_stripe) { - ac->ac_found++; - ac->ac_b_ex = ex; - ext4_mb_use_best_found(ac, e4b); - break; - } - } - i += sbi->s_stripe; - } -} - -/* This is now called BEFORE we load the buddy bitmap. */ -static int ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) -{ - unsigned free, fragments; - int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); - struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); - - BUG_ON(cr < 0 || cr >= 4); - - /* We only do this if the grp has never been initialized */ - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - int ret = ext4_mb_init_group(ac->ac_sb, group); - if (ret) - return 0; - } - - free = grp->bb_free; - fragments = grp->bb_fragments; - if (free == 0) - return 0; - if (fragments == 0) - return 0; - - switch (cr) { - case 0: - BUG_ON(ac->ac_2order == 0); - - if (grp->bb_largest_free_order < ac->ac_2order) - return 0; - - /* Avoid using the first bg of a flexgroup for data files */ - if ((ac->ac_flags & EXT4_MB_HINT_DATA) && - (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && - ((group % flex_size) == 0)) - return 0; - - return 1; - case 1: - if ((free / fragments) >= ac->ac_g_ex.fe_len) - return 1; - break; - case 2: - if (free >= ac->ac_g_ex.fe_len) - return 1; - break; - case 3: - return 1; - default: - BUG(); - } - - return 0; -} - -static noinline_for_stack int -ext4_mb_regular_allocator(struct ext4_allocation_context *ac) -{ - ext4_group_t ngroups, group, i; - int cr; - int err = 0; - struct ext4_sb_info *sbi; - struct super_block *sb; - struct ext4_buddy e4b; - - sb = ac->ac_sb; - sbi = EXT4_SB(sb); - ngroups = ext4_get_groups_count(sb); - /* non-extent files are limited to low blocks/groups */ - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) - ngroups = sbi->s_blockfile_groups; - - BUG_ON(ac->ac_status == AC_STATUS_FOUND); - - /* first, try the goal */ - err = ext4_mb_find_by_goal(ac, &e4b); - if (err || ac->ac_status == AC_STATUS_FOUND) - goto out; - - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) - goto out; - - /* - * ac->ac2_order is set only if the fe_len is a power of 2 - * if ac2_order is set we also set criteria to 0 so that we - * try exact allocation using buddy. 
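The power-of-two check described above (the field is ac_2order) is performed with fls() just below: if clearing the highest set bit of fe_len leaves zero, the length is an exact power of two. An equivalent userspace sketch (editor's example; __builtin_clz stands in for fls, and len > 0 is assumed):

#include <stdio.h>

/*
 * Editor's sketch of the order computation below: return the buddy order
 * for an exact power-of-two length, or 0 when no order applies (as in the
 * kernel, where small and non-power-of-two requests leave ac_2order at 0).
 */
static int request_order(unsigned int len)
{
	int i = 32 - __builtin_clz(len);	/* fls(len) for len > 0 */

	if ((len & ~(1u << (i - 1))) == 0)
		return i - 1;			/* len == 2^(i-1) */
	return 0;				/* not a power of two */
}

int main(void)
{
	printf("%d %d %d\n", request_order(8),	/* 3 */
	       request_order(12),		/* 0 */
	       request_order(1));		/* 0 */
	return 0;
}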
- */ - i = fls(ac->ac_g_ex.fe_len); - ac->ac_2order = 0; - /* - * We search using buddy data only if the order of the request - * is greater than equal to the sbi_s_mb_order2_reqs - * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req - */ - if (i >= sbi->s_mb_order2_reqs) { - /* - * This should tell if fe_len is exactly power of 2 - */ - if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) - ac->ac_2order = i - 1; - } - - /* if stream allocation is enabled, use global goal */ - if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - /* TBD: may be hot point */ - spin_lock(&sbi->s_md_lock); - ac->ac_g_ex.fe_group = sbi->s_mb_last_group; - ac->ac_g_ex.fe_start = sbi->s_mb_last_start; - spin_unlock(&sbi->s_md_lock); - } - - /* Let's just scan groups to find more-less suitable blocks */ - cr = ac->ac_2order ? 0 : 1; - /* - * cr == 0 try to get exact allocation, - * cr == 3 try to get anything - */ -repeat: - for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { - ac->ac_criteria = cr; - /* - * searching for the right group start - * from the goal value specified - */ - group = ac->ac_g_ex.fe_group; - - for (i = 0; i < ngroups; group++, i++) { - if (group == ngroups) - group = 0; - - /* This now checks without needing the buddy page */ - if (!ext4_mb_good_group(ac, group, cr)) - continue; - - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) - goto out; - - ext4_lock_group(sb, group); - - /* - * We need to check again after locking the - * block group - */ - if (!ext4_mb_good_group(ac, group, cr)) { - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - continue; - } - - ac->ac_groups_scanned++; - if (cr == 0) - ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && sbi->s_stripe && - !(ac->ac_g_ex.fe_len % sbi->s_stripe)) - ext4_mb_scan_aligned(ac, &e4b); - else - ext4_mb_complex_scan_group(ac, &e4b); - - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - - if (ac->ac_status != AC_STATUS_CONTINUE) - break; - } - } - - if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && - !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { - /* - * We've been searching too long. Let's try to allocate - * the best chunk we've found so far - */ - - ext4_mb_try_best_found(ac, &e4b); - if (ac->ac_status != AC_STATUS_FOUND) { - /* - * Someone more lucky has already allocated it. 
- * The only thing we can do is just take first - * found block(s) - printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); - */ - ac->ac_b_ex.fe_group = 0; - ac->ac_b_ex.fe_start = 0; - ac->ac_b_ex.fe_len = 0; - ac->ac_status = AC_STATUS_CONTINUE; - ac->ac_flags |= EXT4_MB_HINT_FIRST; - cr = 3; - atomic_inc(&sbi->s_mb_lost_chunks); - goto repeat; - } - } -out: - return err; -} - -static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) -{ - struct super_block *sb = seq->private; - ext4_group_t group; - - if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) - return NULL; - group = *pos + 1; - return (void *) ((unsigned long) group); -} - -static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct super_block *sb = seq->private; - ext4_group_t group; - - ++*pos; - if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) - return NULL; - group = *pos + 1; - return (void *) ((unsigned long) group); -} - -static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) -{ - struct super_block *sb = seq->private; - ext4_group_t group = (ext4_group_t) ((unsigned long) v); - int i; - int err; - struct ext4_buddy e4b; - struct sg { - struct ext4_group_info info; - ext4_grpblk_t counters[16]; - } sg; - - group--; - if (group == 0) - seq_printf(seq, "#%-5s: %-5s %-5s %-5s " - "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " - "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", - "group", "free", "frags", "first", - "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", - "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); - - i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + - sizeof(struct ext4_group_info); - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) { - seq_printf(seq, "#%-5u: I/O error\n", group); - return 0; - } - ext4_lock_group(sb, group); - memcpy(&sg, ext4_get_group_info(sb, group), i); - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - - seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, - sg.info.bb_fragments, sg.info.bb_first_free); - for (i = 0; i <= 13; i++) - seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? - sg.info.bb_counters[i] : 0); - seq_printf(seq, " ]\n"); - - return 0; -} - -static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations ext4_mb_seq_groups_ops = { - .start = ext4_mb_seq_groups_start, - .next = ext4_mb_seq_groups_next, - .stop = ext4_mb_seq_groups_stop, - .show = ext4_mb_seq_groups_show, -}; - -static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) -{ - struct super_block *sb = PDE(inode)->data; - int rc; - - rc = seq_open(file, &ext4_mb_seq_groups_ops); - if (rc == 0) { - struct seq_file *m = file->private_data; - m->private = sb; - } - return rc; - -} - -static const struct file_operations ext4_mb_seq_groups_fops = { - .owner = THIS_MODULE, - .open = ext4_mb_seq_groups_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) -{ - int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; - struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; - - BUG_ON(!cachep); - return cachep; -} - -/* Create and initialize ext4_group_info data for the given group. 
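ext4_mb_add_groupinfo() below keeps the group info in a two-level table so that no single allocation has to cover every group. The index arithmetic, as a sketch (editor's example; descs_per_block_bits plays the role of EXT4_DESC_PER_BLOCK_BITS(sb)):

#include <stdio.h>

/*
 * Editor's sketch: with 2^bits descriptors per block, group g lives at
 *   table[g >> bits][g & ((1 << bits) - 1)]
 * which is exactly how the function below indexes s_group_info.
 */
static void locate(unsigned int group, unsigned int descs_per_block_bits)
{
	unsigned int outer = group >> descs_per_block_bits;
	unsigned int inner = group & ((1u << descs_per_block_bits) - 1);

	printf("group %u -> table[%u][%u]\n", group, outer, inner);
}

int main(void)
{
	locate(300, 7);		/* e.g. 128 descriptors per block */
	return 0;
}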
*/ -int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, - struct ext4_group_desc *desc) -{ - int i; - int metalen = 0; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_info **meta_group_info; - struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); - - /* - * First check if this group is the first of a reserved block. - * If it's true, we have to allocate a new table of pointers - * to ext4_group_info structures - */ - if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { - metalen = sizeof(*meta_group_info) << - EXT4_DESC_PER_BLOCK_BITS(sb); - meta_group_info = kmalloc(metalen, GFP_KERNEL); - if (meta_group_info == NULL) { - ext4_msg(sb, KERN_ERR, "can't allocate mem " - "for a buddy group"); - goto exit_meta_group_info; - } - sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = - meta_group_info; - } - - meta_group_info = - sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; - i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); - - meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); - if (meta_group_info[i] == NULL) { - ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); - goto exit_group_info; - } - memset(meta_group_info[i], 0, kmem_cache_size(cachep)); - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, - &(meta_group_info[i]->bb_state)); - - /* - * initialize bb_free to be able to skip - * empty groups without initialization - */ - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - meta_group_info[i]->bb_free = - ext4_free_clusters_after_init(sb, group, desc); - } else { - meta_group_info[i]->bb_free = - ext4_free_group_clusters(sb, desc); - } - - INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); - init_rwsem(&meta_group_info[i]->alloc_sem); - meta_group_info[i]->bb_free_root = RB_ROOT; - meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ - -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[i]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); - BUG_ON(meta_group_info[i]->bb_bitmap == NULL); - bh = ext4_read_block_bitmap(sb, group); - BUG_ON(bh == NULL); - memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, - sb->s_blocksize); - put_bh(bh); - } -#endif - - return 0; - -exit_group_info: - /* If a meta_group_info table has been allocated, release it now */ - if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { - kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); - sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; - } -exit_meta_group_info: - return -ENOMEM; -} /* ext4_mb_add_groupinfo */ - -static int ext4_mb_init_backend(struct super_block *sb) -{ - ext4_group_t ngroups = ext4_get_groups_count(sb); - ext4_group_t i; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - int num_meta_group_infos; - int num_meta_group_infos_max; - int array_size; - struct ext4_group_desc *desc; - struct kmem_cache *cachep; - - /* This is the number of blocks used by GDT */ - num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); - - /* - * This is the total number of blocks used by GDT including - * the number of reserved blocks for GDT. - * The s_group_info array is allocated with this value - * to allow a clean online resize without a complex - * manipulation of pointer. 
- * The drawback is the unused memory when no resize - * occurs but it's very low in terms of pages - * (see comments below) - * Need to handle this properly when META_BG resizing is allowed - */ - num_meta_group_infos_max = num_meta_group_infos + - le16_to_cpu(es->s_reserved_gdt_blocks); - - /* - * array_size is the size of s_group_info array. We round it - * to the next power of two because this approximation is done - * internally by kmalloc so we can have some more memory - * for free here (e.g. may be used for META_BG resize). - */ - array_size = 1; - while (array_size < sizeof(*sbi->s_group_info) * - num_meta_group_infos_max) - array_size = array_size << 1; - /* An 8TB filesystem with 64-bit pointers requires a 4096 byte - * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. - * So a two level scheme suffices for now. */ - sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); - if (sbi->s_group_info == NULL) { - ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); - return -ENOMEM; - } - sbi->s_buddy_cache = new_inode(sb); - if (sbi->s_buddy_cache == NULL) { - ext4_msg(sb, KERN_ERR, "can't get new inode"); - goto err_freesgi; - } - /* To avoid potentially colliding with an valid on-disk inode number, - * use EXT4_BAD_INO for the buddy cache inode number. This inode is - * not in the inode hash, so it should never be found by iget(), but - * this will avoid confusion if it ever shows up during debugging. */ - sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; - EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; - for (i = 0; i < ngroups; i++) { - desc = ext4_get_group_desc(sb, i, NULL); - if (desc == NULL) { - ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); - goto err_freebuddy; - } - if (ext4_mb_add_groupinfo(sb, i, desc) != 0) - goto err_freebuddy; - } - - return 0; - -err_freebuddy: - cachep = get_groupinfo_cache(sb->s_blocksize_bits); - while (i-- > 0) - kmem_cache_free(cachep, ext4_get_group_info(sb, i)); - i = num_meta_group_infos; - while (i-- > 0) - kfree(sbi->s_group_info[i]); - iput(sbi->s_buddy_cache); -err_freesgi: - ext4_kvfree(sbi->s_group_info); - return -ENOMEM; -} - -static void ext4_groupinfo_destroy_slabs(void) -{ - int i; - - for (i = 0; i < NR_GRPINFO_CACHES; i++) { - if (ext4_groupinfo_caches[i]) - kmem_cache_destroy(ext4_groupinfo_caches[i]); - ext4_groupinfo_caches[i] = NULL; - } -} - -static int ext4_groupinfo_create_slab(size_t size) -{ - static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); - int slab_size; - int blocksize_bits = order_base_2(size); - int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; - struct kmem_cache *cachep; - - if (cache_index >= NR_GRPINFO_CACHES) - return -EINVAL; - - if (unlikely(cache_index < 0)) - cache_index = 0; - - mutex_lock(&ext4_grpinfo_slab_create_mutex); - if (ext4_groupinfo_caches[cache_index]) { - mutex_unlock(&ext4_grpinfo_slab_create_mutex); - return 0; /* Already created */ - } - - slab_size = offsetof(struct ext4_group_info, - bb_counters[blocksize_bits + 2]); - - cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], - slab_size, 0, SLAB_RECLAIM_ACCOUNT, - NULL); - - ext4_groupinfo_caches[cache_index] = cachep; - - mutex_unlock(&ext4_grpinfo_slab_create_mutex); - if (!cachep) { - printk(KERN_EMERG - "EXT4-fs: no memory for groupinfo slab cache\n"); - return -ENOMEM; - } - - return 0; -} - -int ext4_mb_init(struct super_block *sb, int needs_recovery) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned i, j; - unsigned offset; - unsigned max; - int ret; - - i = 
(sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); - - sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); - if (sbi->s_mb_offsets == NULL) { - ret = -ENOMEM; - goto out; - } - - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); - sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); - if (sbi->s_mb_maxs == NULL) { - ret = -ENOMEM; - goto out; - } - - ret = ext4_groupinfo_create_slab(sb->s_blocksize); - if (ret < 0) - goto out; - - /* order 0 is regular bitmap */ - sbi->s_mb_maxs[0] = sb->s_blocksize << 3; - sbi->s_mb_offsets[0] = 0; - - i = 1; - offset = 0; - max = sb->s_blocksize << 2; - do { - sbi->s_mb_offsets[i] = offset; - sbi->s_mb_maxs[i] = max; - offset += 1 << (sb->s_blocksize_bits - i); - max = max >> 1; - i++; - } while (i <= sb->s_blocksize_bits + 1); - - spin_lock_init(&sbi->s_md_lock); - spin_lock_init(&sbi->s_bal_lock); - - sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; - sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; - sbi->s_mb_stats = MB_DEFAULT_STATS; - sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; - sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - /* - * The default group preallocation is 512, which for 4k block - * sizes translates to 2 megabytes. However for bigalloc file - * systems, this is probably too big (i.e, if the cluster size - * is 1 megabyte, then group preallocation size becomes half a - * gigabyte!). As a default, we will keep a two megabyte - * group pralloc size for cluster sizes up to 64k, and after - * that, we will force a minimum group preallocation size of - * 32 clusters. This translates to 8 megs when the cluster - * size is 256k, and 32 megs when the cluster size is 1 meg, - * which seems reasonable as a default. - */ - sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> - sbi->s_cluster_bits, 32); - /* - * If there is a s_stripe > 1, then we set the s_mb_group_prealloc - * to the lowest multiple of s_stripe which is bigger than - * the s_mb_group_prealloc as determined above. We want - * the preallocation size to be an exact multiple of the - * RAID stripe size so that preallocations don't fragment - * the stripes. 
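For a 4K block size, the s_mb_offsets/s_mb_maxs tables built by the loop above work out as in this sketch (editor's example reproducing the same arithmetic in userspace; the guard on the offset update merely avoids a negative shift in the demo):

#include <stdio.h>

/*
 * Editor's sketch: for each buddy order, the byte offset where that
 * order's bitmap starts inside the buddy block (s_mb_offsets) and the
 * number of bits it holds (s_mb_maxs), for blocksize_bits == 12.
 */
int main(void)
{
	int bits = 12;
	unsigned int offset = 0;
	unsigned int max = 1u << (bits + 2);	/* blocksize << 2 */

	/* order 0 is the plain block bitmap: blocksize << 3 bits */
	printf("order 0: offset 0 max %u\n", 1u << (bits + 3));
	for (int i = 1; i <= bits + 1; i++) {
		printf("order %d: offset %u max %u\n", i, offset, max);
		if (i <= bits)
			offset += 1u << (bits - i);
		max >>= 1;
	}
	return 0;
}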
- */ - if (sbi->s_stripe > 1) { - sbi->s_mb_group_prealloc = roundup( - sbi->s_mb_group_prealloc, sbi->s_stripe); - } - - sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); - if (sbi->s_locality_groups == NULL) { - ret = -ENOMEM; - goto out_free_groupinfo_slab; - } - for_each_possible_cpu(i) { - struct ext4_locality_group *lg; - lg = per_cpu_ptr(sbi->s_locality_groups, i); - mutex_init(&lg->lg_mutex); - for (j = 0; j < PREALLOC_TB_SIZE; j++) - INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); - spin_lock_init(&lg->lg_prealloc_lock); - } - - /* init file for buddy data */ - ret = ext4_mb_init_backend(sb); - if (ret != 0) - goto out_free_locality_groups; - - if (sbi->s_proc) - proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_groups_fops, sb); - - return 0; - -out_free_locality_groups: - free_percpu(sbi->s_locality_groups); - sbi->s_locality_groups = NULL; -out_free_groupinfo_slab: - ext4_groupinfo_destroy_slabs(); -out: - kfree(sbi->s_mb_offsets); - sbi->s_mb_offsets = NULL; - kfree(sbi->s_mb_maxs); - sbi->s_mb_maxs = NULL; - return ret; -} - -/* need to called with the ext4 group lock held */ -static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) -{ - struct ext4_prealloc_space *pa; - struct list_head *cur, *tmp; - int count = 0; - - list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); - list_del(&pa->pa_group_list); - count++; - kmem_cache_free(ext4_pspace_cachep, pa); - } - if (count) - mb_debug(1, "mballoc: %u PAs left\n", count); - -} - -int ext4_mb_release(struct super_block *sb) -{ - ext4_group_t ngroups = ext4_get_groups_count(sb); - ext4_group_t i; - int num_meta_group_infos; - struct ext4_group_info *grinfo; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); - - if (sbi->s_proc) - remove_proc_entry("mb_groups", sbi->s_proc); - - if (sbi->s_group_info) { - for (i = 0; i < ngroups; i++) { - grinfo = ext4_get_group_info(sb, i); -#ifdef DOUBLE_CHECK - kfree(grinfo->bb_bitmap); -#endif - ext4_lock_group(sb, i); - ext4_mb_cleanup_pa(grinfo); - ext4_unlock_group(sb, i); - kmem_cache_free(cachep, grinfo); - } - num_meta_group_infos = (ngroups + - EXT4_DESC_PER_BLOCK(sb) - 1) >> - EXT4_DESC_PER_BLOCK_BITS(sb); - for (i = 0; i < num_meta_group_infos; i++) - kfree(sbi->s_group_info[i]); - ext4_kvfree(sbi->s_group_info); - } - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - if (sbi->s_buddy_cache) - iput(sbi->s_buddy_cache); - if (sbi->s_mb_stats) { - ext4_msg(sb, KERN_INFO, - "mballoc: %u blocks %u reqs (%u success)", - atomic_read(&sbi->s_bal_allocated), - atomic_read(&sbi->s_bal_reqs), - atomic_read(&sbi->s_bal_success)); - ext4_msg(sb, KERN_INFO, - "mballoc: %u extents scanned, %u goal hits, " - "%u 2^N hits, %u breaks, %u lost", - atomic_read(&sbi->s_bal_ex_scanned), - atomic_read(&sbi->s_bal_goals), - atomic_read(&sbi->s_bal_2orders), - atomic_read(&sbi->s_bal_breaks), - atomic_read(&sbi->s_mb_lost_chunks)); - ext4_msg(sb, KERN_INFO, - "mballoc: %lu generated and it took %Lu", - sbi->s_mb_buddies_generated, - sbi->s_mb_generation_time); - ext4_msg(sb, KERN_INFO, - "mballoc: %u preallocated, %u discarded", - atomic_read(&sbi->s_mb_preallocated), - atomic_read(&sbi->s_mb_discarded)); - } - - free_percpu(sbi->s_locality_groups); - - return 0; -} - -static inline int ext4_issue_discard(struct super_block *sb, - ext4_group_t block_group, ext4_grpblk_t cluster, int count) -{ - ext4_fsblk_t discard_block; - - discard_block = 
(EXT4_C2B(EXT4_SB(sb), cluster) + - ext4_group_first_block_no(sb, block_group)); - count = EXT4_C2B(EXT4_SB(sb), count); - trace_ext4_discard_blocks(sb, - (unsigned long long) discard_block, count); - return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); -} - -/* - * This function is called by the jbd2 layer once the commit has finished, - * so we know we can free the blocks that were released with that commit. - */ -static void ext4_free_data_callback(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc) -{ - struct ext4_free_data *entry = (struct ext4_free_data *)jce; - struct ext4_buddy e4b; - struct ext4_group_info *db; - int err, count = 0, count2 = 0; - - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", - entry->efd_count, entry->efd_group, entry); - - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, entry->efd_count); - - err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); - /* we expect to find existing buddy because it's pinned */ - BUG_ON(err != 0); - - - db = e4b.bd_info; - /* there are blocks to put in buddy to make them really free */ - count += entry->efd_count; - count2++; - ext4_lock_group(sb, entry->efd_group); - /* Take it out of per group rb tree */ - rb_erase(&entry->efd_node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); - - /* - * Clear the trimmed flag for the group so that the next - * ext4_trim_fs can trim it. - * If the volume is mounted with -o discard, online discard - * is supported and the free blocks will be trimmed online. - */ - if (!test_opt(sb, DISCARD)) - EXT4_MB_GRP_CLEAR_TRIMMED(db); - - if (!db->bb_free_root.rb_node) { - /* No more items in the per group rb tree - * balance refcounts from ext4_mb_free_metadata() - */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - } - ext4_unlock_group(sb, entry->efd_group); - kmem_cache_free(ext4_free_data_cachep, entry); - ext4_mb_unload_buddy(&e4b); - - mb_debug(1, "freed %u blocks in %u structures\n", count, count2); -} - -#ifdef CONFIG_EXT4_DEBUG -u8 mb_enable_debug __read_mostly; - -static struct dentry *debugfs_dir; -static struct dentry *debugfs_debug; - -static void __init ext4_create_debugfs_entry(void) -{ - debugfs_dir = debugfs_create_dir("ext4", NULL); - if (debugfs_dir) - debugfs_debug = debugfs_create_u8("mballoc-debug", - S_IRUGO | S_IWUSR, - debugfs_dir, - &mb_enable_debug); -} - -static void ext4_remove_debugfs_entry(void) -{ - debugfs_remove(debugfs_debug); - debugfs_remove(debugfs_dir); -} - -#else - -static void __init ext4_create_debugfs_entry(void) -{ -} - -static void ext4_remove_debugfs_entry(void) -{ -} - -#endif - -int __init ext4_init_mballoc(void) -{ - ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, - SLAB_RECLAIM_ACCOUNT); - if (ext4_pspace_cachep == NULL) - return -ENOMEM; - - ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, - SLAB_RECLAIM_ACCOUNT); - if (ext4_ac_cachep == NULL) { - kmem_cache_destroy(ext4_pspace_cachep); - return -ENOMEM; - } - - ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, - SLAB_RECLAIM_ACCOUNT); - if (ext4_free_data_cachep == NULL) { - kmem_cache_destroy(ext4_pspace_cachep); - kmem_cache_destroy(ext4_ac_cachep); - return -ENOMEM; - } - ext4_create_debugfs_entry(); - return 0; -} - -void ext4_exit_mballoc(void) -{ - /* - * Wait for completion of call_rcu()'s on ext4_pspace_cachep - * before destroying the slab cache. 
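The block arithmetic in ext4_issue_discard() above, spelled out as a sketch (editor's example; first_data_block, blocks_per_group and cluster_bits stand in for the corresponding superblock fields, and the formula for a group's first block is assumed from the usual ext4 layout):

#include <stdio.h>

typedef unsigned long long u64;

/*
 * Editor's sketch: convert a (group, cluster, count) triple into the
 * physical block range handed to the discard request.
 */
static void discard_range(u64 group, u64 cluster, u64 count,
			  u64 first_data_block, u64 blocks_per_group,
			  unsigned int cluster_bits)
{
	u64 start = first_data_block + group * blocks_per_group
			+ (cluster << cluster_bits);

	printf("discard blocks %llu..%llu\n",
	       start, start + (count << cluster_bits) - 1);
}

int main(void)
{
	/* 4K blocks, no bigalloc (one block per cluster) */
	discard_range(2, 100, 8, 0, 32768, 0);
	return 0;
}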
- */ - rcu_barrier(); - kmem_cache_destroy(ext4_pspace_cachep); - kmem_cache_destroy(ext4_ac_cachep); - kmem_cache_destroy(ext4_free_data_cachep); - ext4_groupinfo_destroy_slabs(); - ext4_remove_debugfs_entry(); -} - - -/* - * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps - * Returns 0 if success or error code - */ -static noinline_for_stack int -ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned int reserv_clstrs) -{ - struct buffer_head *bitmap_bh = NULL; - struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; - struct ext4_sb_info *sbi; - struct super_block *sb; - ext4_fsblk_t block; - int err, len; - - BUG_ON(ac->ac_status != AC_STATUS_FOUND); - BUG_ON(ac->ac_b_ex.fe_len <= 0); - - sb = ac->ac_sb; - sbi = EXT4_SB(sb); - - err = -EIO; - bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); - if (!bitmap_bh) - goto out_err; - - err = ext4_journal_get_write_access(handle, bitmap_bh); - if (err) - goto out_err; - - err = -EIO; - gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); - if (!gdp) - goto out_err; - - ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, - ext4_free_group_clusters(sb, gdp)); - - err = ext4_journal_get_write_access(handle, gdp_bh); - if (err) - goto out_err; - - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - - len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (!ext4_data_block_valid(sbi, block, len)) { - ext4_error(sb, "Allocating blocks %llu-%llu which overlap " - "fs metadata", block, block+len); - /* File system mounted not to panic on error - * Fix the bitmap and repeat the block allocation - * We leak some of the blocks here. - */ - ext4_lock_group(sb, ac->ac_b_ex.fe_group); - ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - if (!err) - err = -EAGAIN; - goto out_err; - } - - ext4_lock_group(sb, ac->ac_b_ex.fe_group); -#ifdef AGGRESSIVE_CHECK - { - int i; - for (i = 0; i < ac->ac_b_ex.fe_len; i++) { - BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, - bitmap_bh->b_data)); - } - } -#endif - ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, - ac->ac_b_ex.fe_group, gdp)); - } - len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; - ext4_free_group_clusters_set(sb, gdp, len); - gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); - - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); - /* - * Now reduce the dirty block count also. 
Should not go negative - */ - if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) - /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - reserv_clstrs); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, - ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_clusters); - } - - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - if (err) - goto out_err; - err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); - -out_err: - ext4_mark_super_dirty(sb); - brelse(bitmap_bh); - return err; -} - -/* - * here we normalize request for locality group - * Group request are normalized to s_mb_group_prealloc, which goes to - * s_strip if we set the same via mount option. - * s_mb_group_prealloc can be configured via - * /sys/fs/ext4/<partition>/mb_group_prealloc - * - * XXX: should we try to preallocate more than the group has now? - */ -static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) -{ - struct super_block *sb = ac->ac_sb; - struct ext4_locality_group *lg = ac->ac_lg; - - BUG_ON(lg == NULL); - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; - mb_debug(1, "#%u: goal %u blocks for locality group\n", - current->pid, ac->ac_g_ex.fe_len); -} - -/* - * Normalization means making request better in terms of - * size and alignment - */ -static noinline_for_stack void -ext4_mb_normalize_request(struct ext4_allocation_context *ac, - struct ext4_allocation_request *ar) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - int bsbits, max; - ext4_lblk_t end; - loff_t size, start_off; - loff_t orig_size __maybe_unused; - ext4_lblk_t start; - struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); - struct ext4_prealloc_space *pa; - - /* do normalize only data requests, metadata requests - do not need preallocation */ - if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) - return; - - /* sometime caller may want exact blocks */ - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) - return; - - /* caller may indicate that preallocation isn't - * required (it's a tail, for example) */ - if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) - return; - - if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { - ext4_mb_normalize_group_request(ac); - return ; - } - - bsbits = ac->ac_sb->s_blocksize_bits; - - /* first, let's learn actual file size - * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); - size = size << bsbits; - if (size < i_size_read(ac->ac_inode)) - size = i_size_read(ac->ac_inode); - orig_size = size; - - /* max size of free chunks */ - max = 2 << bsbits; - -#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ - (req <= (size) || max <= (chunk_size)) - - /* first, try to predict filesize */ - /* XXX: should this table be tunable? 
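The prediction table that follows rounds small request sizes up a fixed ladder before the megabyte-scale, chunk-aligned branches take over. A compact paraphrase (editor's example; the step values match the code below, while the large-file branches are elided):

#include <stdio.h>

/*
 * Editor's sketch of the rounding ladder below: the preallocation goal
 * for small files is the next step of a fixed table; beyond 1MB the
 * kernel switches to chunk-aligned goals (not reproduced here).
 */
static long long round_goal(long long size)
{
	static const long long steps[] = {
		16 << 10, 32 << 10, 64 << 10, 128 << 10,
		256 << 10, 512 << 10, 1024 << 10,
	};

	for (unsigned int i = 0; i < sizeof(steps) / sizeof(steps[0]); i++)
		if (size <= steps[i])
			return steps[i];
	return size;	/* large files: handled by the MB-sized branches */
}

int main(void)
{
	printf("%lld\n", round_goal(40 << 10));	/* -> 65536 */
	return 0;
}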
*/ - start_off = 0; - if (size <= 16 * 1024) { - size = 16 * 1024; - } else if (size <= 32 * 1024) { - size = 32 * 1024; - } else if (size <= 64 * 1024) { - size = 64 * 1024; - } else if (size <= 128 * 1024) { - size = 128 * 1024; - } else if (size <= 256 * 1024) { - size = 256 * 1024; - } else if (size <= 512 * 1024) { - size = 512 * 1024; - } else if (size <= 1024 * 1024) { - size = 1024 * 1024; - } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { - start_off = ((loff_t)ac->ac_o_ex.fe_logical >> - (21 - bsbits)) << 21; - size = 2 * 1024 * 1024; - } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { - start_off = ((loff_t)ac->ac_o_ex.fe_logical >> - (22 - bsbits)) << 22; - size = 4 * 1024 * 1024; - } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, - (8<<20)>>bsbits, max, 8 * 1024)) { - start_off = ((loff_t)ac->ac_o_ex.fe_logical >> - (23 - bsbits)) << 23; - size = 8 * 1024 * 1024; - } else { - start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; - size = ac->ac_o_ex.fe_len << bsbits; - } - size = size >> bsbits; - start = start_off >> bsbits; - - /* don't cover already allocated blocks in selected range */ - if (ar->pleft && start <= ar->lleft) { - size -= ar->lleft + 1 - start; - start = ar->lleft + 1; - } - if (ar->pright && start + size - 1 >= ar->lright) - size -= start + size - ar->lright; - - end = start + size; - - /* check we don't cross already preallocated blocks */ - rcu_read_lock(); - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - ext4_lblk_t pa_end; - - if (pa->pa_deleted) - continue; - spin_lock(&pa->pa_lock); - if (pa->pa_deleted) { - spin_unlock(&pa->pa_lock); - continue; - } - - pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), - pa->pa_len); - - /* PA must not overlap original request */ - BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || - ac->ac_o_ex.fe_logical < pa->pa_lstart)); - - /* skip PAs this normalized request doesn't overlap with */ - if (pa->pa_lstart >= end || pa_end <= start) { - spin_unlock(&pa->pa_lock); - continue; - } - BUG_ON(pa->pa_lstart <= start && pa_end >= end); - - /* adjust start or end to be adjacent to this pa */ - if (pa_end <= ac->ac_o_ex.fe_logical) { - BUG_ON(pa_end < start); - start = pa_end; - } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { - BUG_ON(pa->pa_lstart > end); - end = pa->pa_lstart; - } - spin_unlock(&pa->pa_lock); - } - rcu_read_unlock(); - size = end - start; - - /* XXX: extra loop to check we really don't overlap preallocations */ - rcu_read_lock(); - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - ext4_lblk_t pa_end; - - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0) { - pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), - pa->pa_len); - BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); - } - spin_unlock(&pa->pa_lock); - } - rcu_read_unlock(); - - if (start + size <= ac->ac_o_ex.fe_logical && - start > ac->ac_o_ex.fe_logical) { - ext4_msg(ac->ac_sb, KERN_ERR, - "start %lu, size %lu, fe_logical %lu", - (unsigned long) start, (unsigned long) size, - (unsigned long) ac->ac_o_ex.fe_logical); - } - BUG_ON(start + size <= ac->ac_o_ex.fe_logical && - start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); - - /* now prepare goal request */ - - /* XXX: is it better to align blocks WRT to logical - * placement or satisfy big request as is */ - ac->ac_g_ex.fe_logical = start; - ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); - - /* define goal start in order to merge */ - if (ar->pright && (ar->lright == 
(start + size))) { - /* merge to the right */ - ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, - &ac->ac_f_ex.fe_group, - &ac->ac_f_ex.fe_start); - ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; - } - if (ar->pleft && (ar->lleft + 1 == start)) { - /* merge to the left */ - ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, - &ac->ac_f_ex.fe_group, - &ac->ac_f_ex.fe_start); - ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; - } - - mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, - (unsigned) orig_size, (unsigned) start); -} - -static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - - if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { - atomic_inc(&sbi->s_bal_reqs); - atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); - if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) - atomic_inc(&sbi->s_bal_success); - atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); - if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && - ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) - atomic_inc(&sbi->s_bal_goals); - if (ac->ac_found > sbi->s_mb_max_to_scan) - atomic_inc(&sbi->s_bal_breaks); - } - - if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) - trace_ext4_mballoc_alloc(ac); - else - trace_ext4_mballoc_prealloc(ac); -} - -/* - * Called on failure; free up any blocks from the inode PA for this - * context. We don't need this for MB_GROUP_PA because we only change - * pa_free in ext4_mb_release_context(), but on failure, we've already - * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. - */ -static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) -{ - struct ext4_prealloc_space *pa = ac->ac_pa; - int len; - - if (pa && pa->pa_type == MB_INODE_PA) { - len = ac->ac_b_ex.fe_len; - pa->pa_free += len; - } - -} - -/* - * use blocks preallocated to inode - */ -static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, - struct ext4_prealloc_space *pa) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - ext4_fsblk_t start; - ext4_fsblk_t end; - int len; - - /* found preallocated blocks, use them */ - start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); - end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), - start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); - len = EXT4_NUM_B2C(sbi, end - start); - ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, - &ac->ac_b_ex.fe_start); - ac->ac_b_ex.fe_len = len; - ac->ac_status = AC_STATUS_FOUND; - ac->ac_pa = pa; - - BUG_ON(start < pa->pa_pstart); - BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); - BUG_ON(pa->pa_free < len); - pa->pa_free -= len; - - mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); -} - -/* - * use blocks preallocated to locality group - */ -static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, - struct ext4_prealloc_space *pa) -{ - unsigned int len = ac->ac_o_ex.fe_len; - - ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, - &ac->ac_b_ex.fe_group, - &ac->ac_b_ex.fe_start); - ac->ac_b_ex.fe_len = len; - ac->ac_status = AC_STATUS_FOUND; - ac->ac_pa = pa; - - /* we don't correct pa_pstart or pa_plen here to avoid - * possible race when the group is being loaded concurrently - * instead we correct pa later, after blocks are marked - * in on-disk bitmap -- see ext4_mb_release_context() - * Other CPUs are prevented from allocating from this pa by lg_mutex - */ - mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); -} - -/* - * Return the prealloc space that have 
minimal distance - * from the goal block. @cpa is the prealloc - * space that is having currently known minimal distance - * from the goal block. - */ -static struct ext4_prealloc_space * -ext4_mb_check_group_pa(ext4_fsblk_t goal_block, - struct ext4_prealloc_space *pa, - struct ext4_prealloc_space *cpa) -{ - ext4_fsblk_t cur_distance, new_distance; - - if (cpa == NULL) { - atomic_inc(&pa->pa_count); - return pa; - } - cur_distance = abs(goal_block - cpa->pa_pstart); - new_distance = abs(goal_block - pa->pa_pstart); - - if (cur_distance <= new_distance) - return cpa; - - /* drop the previous reference */ - atomic_dec(&cpa->pa_count); - atomic_inc(&pa->pa_count); - return pa; -} - -/* - * search goal blocks in preallocated space - */ -static noinline_for_stack int -ext4_mb_use_preallocated(struct ext4_allocation_context *ac) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - int order, i; - struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); - struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa, *cpa = NULL; - ext4_fsblk_t goal_block; - - /* only data can be preallocated */ - if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) - return 0; - - /* first, try per-file preallocation */ - rcu_read_lock(); - list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - - /* all fields in this condition don't change, - * so we can skip locking for them */ - if (ac->ac_o_ex.fe_logical < pa->pa_lstart || - ac->ac_o_ex.fe_logical >= (pa->pa_lstart + - EXT4_C2B(sbi, pa->pa_len))) - continue; - - /* non-extent files can't have physical blocks past 2^32 */ - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && - (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > - EXT4_MAX_BLOCK_FILE_PHYS)) - continue; - - /* found preallocated blocks, use them */ - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0 && pa->pa_free) { - atomic_inc(&pa->pa_count); - ext4_mb_use_inode_pa(ac, pa); - spin_unlock(&pa->pa_lock); - ac->ac_criteria = 10; - rcu_read_unlock(); - return 1; - } - spin_unlock(&pa->pa_lock); - } - rcu_read_unlock(); - - /* can we use group allocation? */ - if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) - return 0; - - /* inode may have no locality group for some reason */ - lg = ac->ac_lg; - if (lg == NULL) - return 0; - order = fls(ac->ac_o_ex.fe_len) - 1; - if (order > PREALLOC_TB_SIZE - 1) - /* The max size of hash table is PREALLOC_TB_SIZE */ - order = PREALLOC_TB_SIZE - 1; - - goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); - /* - * search for the prealloc space that is having - * minimal distance from the goal block. - */ - for (i = order; i < PREALLOC_TB_SIZE; i++) { - rcu_read_lock(); - list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], - pa_inode_list) { - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0 && - pa->pa_free >= ac->ac_o_ex.fe_len) { - - cpa = ext4_mb_check_group_pa(goal_block, - pa, cpa); - } - spin_unlock(&pa->pa_lock); - } - rcu_read_unlock(); - } - if (cpa) { - ext4_mb_use_group_pa(ac, cpa); - ac->ac_criteria = 20; - return 1; - } - return 0; -} - -/* - * the function goes through all block freed in the group - * but not yet committed and marks them used in in-core bitmap. 
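The distance rule in ext4_mb_check_group_pa is worth a standalone illustration. Below is a minimal userspace sketch of the same selection, assuming plain uint64_t block numbers and dropping the locking and pa_count reference juggling that the kernel code needs:

    #include <stdio.h>
    #include <stdint.h>

    struct pa { uint64_t pa_pstart; };   /* cut-down stand-in for ext4_prealloc_space */

    static uint64_t dist(uint64_t a, uint64_t b)
    {
        return a > b ? a - b : b - a;    /* abs() for unsigned block numbers */
    }

    /* keep the candidate whose start is nearer the goal; ties keep the current best */
    static struct pa *check_group_pa(uint64_t goal, struct pa *pa, struct pa *cpa)
    {
        if (cpa == NULL)
            return pa;
        return dist(goal, cpa->pa_pstart) <= dist(goal, pa->pa_pstart) ? cpa : pa;
    }

    int main(void)
    {
        struct pa a = { 1000 }, b = { 1900 }, *best = NULL;
        best = check_group_pa(2048, &a, best);
        best = check_group_pa(2048, &b, best);
        printf("best starts at %llu\n", (unsigned long long)best->pa_pstart); /* 1900 */
        return 0;
    }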
- * buddy must be generated from this bitmap - * Needs to be called with the ext4 group lock held - */ -static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, - ext4_group_t group) -{ - struct rb_node *n; - struct ext4_group_info *grp; - struct ext4_free_data *entry; - - grp = ext4_get_group_info(sb, group); - n = rb_first(&(grp->bb_free_root)); - - while (n) { - entry = rb_entry(n, struct ext4_free_data, efd_node); - ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); - n = rb_next(n); - } - return; -} - -/* - * the function goes through all preallocations in this group and marks them - * used in the in-core bitmap. buddy must be generated from this bitmap - * Needs to be called with the ext4 group lock held - */ -static noinline_for_stack -void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, - ext4_group_t group) -{ - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - struct ext4_prealloc_space *pa; - struct list_head *cur; - ext4_group_t groupnr; - ext4_grpblk_t start; - int preallocated = 0; - int len; - - /* all forms of preallocation discard first load the group, - * so the only competing code is preallocation use. - * we don't need any locking here - * note that we do NOT ignore preallocations with pa_deleted - * otherwise we could leave used blocks available for - * allocation in buddy when concurrent ext4_mb_put_pa() - * is dropping preallocation - */ - list_for_each(cur, &grp->bb_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); - spin_lock(&pa->pa_lock); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, - &groupnr, &start); - len = pa->pa_len; - spin_unlock(&pa->pa_lock); - if (unlikely(len == 0)) - continue; - BUG_ON(groupnr != group); - ext4_set_bits(bitmap, start, len); - preallocated += len; - } - mb_debug(1, "preallocated %u for group %u\n", preallocated, group); -} - -static void ext4_mb_pa_callback(struct rcu_head *head) -{ - struct ext4_prealloc_space *pa; - pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); - kmem_cache_free(ext4_pspace_cachep, pa); -} - -/* - * drops a reference to the preallocated space descriptor - * if this was the last reference and the space is consumed - */ -static void ext4_mb_put_pa(struct ext4_allocation_context *ac, - struct super_block *sb, struct ext4_prealloc_space *pa) -{ - ext4_group_t grp; - ext4_fsblk_t grp_blk; - - if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) - return; - - /* in this short window concurrent discard can set pa_deleted */ - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 1) { - spin_unlock(&pa->pa_lock); - return; - } - - pa->pa_deleted = 1; - spin_unlock(&pa->pa_lock); - - grp_blk = pa->pa_pstart; - /* - * If doing group-based preallocation, pa_pstart may be in the - * next group when pa is used up - */ - if (pa->pa_type == MB_GROUP_PA) - grp_blk--; - - ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); - - /* - * possible race: - * - * P1 (buddy init) P2 (regular allocation) - * find block B in PA - * copy on-disk bitmap to buddy - * mark B in on-disk bitmap - * drop PA from group - * mark all PAs in buddy - * - * thus, P1 initializes buddy with B available. 
to prevent this - * we make "copy" and "mark all PAs" atomic and serialize "drop PA" - * against that pair - */ - ext4_lock_group(sb, grp); - list_del(&pa->pa_group_list); - ext4_unlock_group(sb, grp); - - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); - - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); -} - -/* - * creates new preallocated space for given inode - */ -static noinline_for_stack int -ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) -{ - struct super_block *sb = ac->ac_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_prealloc_space *pa; - struct ext4_group_info *grp; - struct ext4_inode_info *ei; - - /* preallocate only when found space is larger then requested */ - BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); - BUG_ON(ac->ac_status != AC_STATUS_FOUND); - BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); - - pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); - if (pa == NULL) - return -ENOMEM; - - if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { - int winl; - int wins; - int win; - int offs; - - /* we can't allocate as much as normalizer wants. - * so, found space must get proper lstart - * to cover original request */ - BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); - BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); - - /* we're limited by original request in that - * logical block must be covered any way - * winl is window we can move our chunk within */ - winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; - - /* also, we should cover whole original request */ - wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); - - /* the smallest one defines real window */ - win = min(winl, wins); - - offs = ac->ac_o_ex.fe_logical % - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (offs && offs < win) - win = offs; - - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - - EXT4_B2C(sbi, win); - BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); - BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); - } - - /* preallocation can change ac_b_ex, thus we store actually - * allocated blocks for history */ - ac->ac_f_ex = ac->ac_b_ex; - - pa->pa_lstart = ac->ac_b_ex.fe_logical; - pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - pa->pa_len = ac->ac_b_ex.fe_len; - pa->pa_free = pa->pa_len; - atomic_set(&pa->pa_count, 1); - spin_lock_init(&pa->pa_lock); - INIT_LIST_HEAD(&pa->pa_inode_list); - INIT_LIST_HEAD(&pa->pa_group_list); - pa->pa_deleted = 0; - pa->pa_type = MB_INODE_PA; - - mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); - trace_ext4_mb_new_inode_pa(ac, pa); - - ext4_mb_use_inode_pa(ac, pa); - atomic_add(pa->pa_free, &sbi->s_mb_preallocated); - - ei = EXT4_I(ac->ac_inode); - grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); - - pa->pa_obj_lock = &ei->i_prealloc_lock; - pa->pa_inode = ac->ac_inode; - - ext4_lock_group(sb, ac->ac_b_ex.fe_group); - list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - - spin_lock(pa->pa_obj_lock); - list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_obj_lock); - - return 0; -} - -/* - * creates new preallocated space for locality group inodes belongs to - */ -static noinline_for_stack int -ext4_mb_new_group_pa(struct ext4_allocation_context *ac) -{ - struct super_block *sb = ac->ac_sb; - struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa; - struct ext4_group_info *grp; - - /* preallocate only when found space is larger then 
requested */ - BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); - BUG_ON(ac->ac_status != AC_STATUS_FOUND); - BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); - - BUG_ON(ext4_pspace_cachep == NULL); - pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); - if (pa == NULL) - return -ENOMEM; - - /* preallocation can change ac_b_ex, thus we store actually - * allocated blocks for history */ - ac->ac_f_ex = ac->ac_b_ex; - - pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - pa->pa_lstart = pa->pa_pstart; - pa->pa_len = ac->ac_b_ex.fe_len; - pa->pa_free = pa->pa_len; - atomic_set(&pa->pa_count, 1); - spin_lock_init(&pa->pa_lock); - INIT_LIST_HEAD(&pa->pa_inode_list); - INIT_LIST_HEAD(&pa->pa_group_list); - pa->pa_deleted = 0; - pa->pa_type = MB_GROUP_PA; - - mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); - trace_ext4_mb_new_group_pa(ac, pa); - - ext4_mb_use_group_pa(ac, pa); - atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); - - grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); - lg = ac->ac_lg; - BUG_ON(lg == NULL); - - pa->pa_obj_lock = &lg->lg_prealloc_lock; - pa->pa_inode = NULL; - - ext4_lock_group(sb, ac->ac_b_ex.fe_group); - list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - - /* - * We will later add the new pa to the right bucket - * after updating the pa_free in ext4_mb_release_context - */ - return 0; -} - -static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) -{ - int err; - - if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) - err = ext4_mb_new_group_pa(ac); - else - err = ext4_mb_new_inode_pa(ac); - return err; -} - -/* - * finds all unused blocks in on-disk bitmap, frees them in - * in-core bitmap and buddy. - * @pa must be unlinked from inode and group lists, so that - * nobody else can find/use it. - * the caller MUST hold group/inode locks. - * TODO: optimize the case when there are no in-core structures yet - */ -static noinline_for_stack int -ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, - struct ext4_prealloc_space *pa) -{ - struct super_block *sb = e4b->bd_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned int end; - unsigned int next; - ext4_group_t group; - ext4_grpblk_t bit; - unsigned long long grp_blk_start; - int err = 0; - int free = 0; - - BUG_ON(pa->pa_deleted == 0); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); - BUG_ON(group != e4b->bd_group && pa->pa_len != 0); - end = bit + pa->pa_len; - - while (bit < end) { - bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); - if (bit >= end) - break; - next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - mb_debug(1, " free preallocated %u/%u in group %u\n", - (unsigned) ext4_group_first_block_no(sb, group) + bit, - (unsigned) next - bit, (unsigned) group); - free += next - bit; - - trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); - trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + - EXT4_C2B(sbi, bit)), - next - bit); - mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); - bit = next + 1; - } - if (free != pa->pa_free) { - ext4_msg(e4b->bd_sb, KERN_CRIT, - "pa %p: logic %lu, phys. 
%lu, len %lu", - pa, (unsigned long) pa->pa_lstart, - (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); - ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", - free, pa->pa_free); - /* - * pa is already deleted so we use the value obtained - * from the bitmap and continue. - */ - } - atomic_add(free, &sbi->s_mb_discarded); - - return err; -} - -static noinline_for_stack int -ext4_mb_release_group_pa(struct ext4_buddy *e4b, - struct ext4_prealloc_space *pa) -{ - struct super_block *sb = e4b->bd_sb; - ext4_group_t group; - ext4_grpblk_t bit; - - trace_ext4_mb_release_group_pa(sb, pa); - BUG_ON(pa->pa_deleted == 0); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - BUG_ON(group != e4b->bd_group && pa->pa_len != 0); - mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); - atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); - trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); - - return 0; -} - -/* - * releases all preallocations in given group - * - * first, we need to decide discard policy: - * - when do we discard - * 1) ENOSPC - * - how many do we discard - * 1) how many requested - */ -static noinline_for_stack int -ext4_mb_discard_group_preallocations(struct super_block *sb, - ext4_group_t group, int needed) -{ - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - struct buffer_head *bitmap_bh = NULL; - struct ext4_prealloc_space *pa, *tmp; - struct list_head list; - struct ext4_buddy e4b; - int err; - int busy = 0; - int free = 0; - - mb_debug(1, "discard preallocation for group %u\n", group); - - if (list_empty(&grp->bb_prealloc_list)) - return 0; - - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (bitmap_bh == NULL) { - ext4_error(sb, "Error reading block bitmap for %u", group); - return 0; - } - - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) { - ext4_error(sb, "Error loading buddy information for %u", group); - put_bh(bitmap_bh); - return 0; - } - - if (needed == 0) - needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; - - INIT_LIST_HEAD(&list); -repeat: - ext4_lock_group(sb, group); - list_for_each_entry_safe(pa, tmp, - &grp->bb_prealloc_list, pa_group_list) { - spin_lock(&pa->pa_lock); - if (atomic_read(&pa->pa_count)) { - spin_unlock(&pa->pa_lock); - busy = 1; - continue; - } - if (pa->pa_deleted) { - spin_unlock(&pa->pa_lock); - continue; - } - - /* seems this one can be freed ... */ - pa->pa_deleted = 1; - - /* we can trust pa_free ... */ - free += pa->pa_free; - - spin_unlock(&pa->pa_lock); - - list_del(&pa->pa_group_list); - list_add(&pa->u.pa_tmp_list, &list); - } - - /* if we still need more blocks and some PAs were used, try again */ - if (free < needed && busy) { - busy = 0; - ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); - goto repeat; - } - - /* found anything to free? 
*/ - if (list_empty(&list)) { - BUG_ON(free != 0); - goto out; - } - - /* now free all selected PAs */ - list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { - - /* remove from object (inode or locality group) */ - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); - - if (pa->pa_type == MB_GROUP_PA) - ext4_mb_release_group_pa(&e4b, pa); - else - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); - - list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); - } - -out: - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - put_bh(bitmap_bh); - return free; -} - -/* - * releases all non-used preallocated blocks for given inode - * - * It's important to discard preallocations under i_data_sem - * We don't want another block to be served from the prealloc - * space when we are discarding the inode prealloc space. - * - * FIXME!! Make sure it is valid at all the call sites - */ -void ext4_discard_preallocations(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct super_block *sb = inode->i_sb; - struct buffer_head *bitmap_bh = NULL; - struct ext4_prealloc_space *pa, *tmp; - ext4_group_t group = 0; - struct list_head list; - struct ext4_buddy e4b; - int err; - - if (!S_ISREG(inode->i_mode)) { - /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ - return; - } - - mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); - trace_ext4_discard_preallocations(inode); - - INIT_LIST_HEAD(&list); - -repeat: - /* first, collect all pa's in the inode */ - spin_lock(&ei->i_prealloc_lock); - while (!list_empty(&ei->i_prealloc_list)) { - pa = list_entry(ei->i_prealloc_list.next, - struct ext4_prealloc_space, pa_inode_list); - BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); - spin_lock(&pa->pa_lock); - if (atomic_read(&pa->pa_count)) { - /* this shouldn't happen often - nobody should - * use preallocation while we're discarding it */ - spin_unlock(&pa->pa_lock); - spin_unlock(&ei->i_prealloc_lock); - ext4_msg(sb, KERN_ERR, - "uh-oh! used pa while discarding"); - WARN_ON(1); - schedule_timeout_uninterruptible(HZ); - goto repeat; - - } - if (pa->pa_deleted == 0) { - pa->pa_deleted = 1; - spin_unlock(&pa->pa_lock); - list_del_rcu(&pa->pa_inode_list); - list_add(&pa->u.pa_tmp_list, &list); - continue; - } - - /* someone is deleting pa right now */ - spin_unlock(&pa->pa_lock); - spin_unlock(&ei->i_prealloc_lock); - - /* we have to wait here because pa_deleted - * doesn't mean pa is already unlinked from - * the list. 
as we might be called from - * ->clear_inode() the inode will get freed - * and concurrent thread which is unlinking - * pa from inode's list may access already - * freed memory, bad-bad-bad */ - - /* XXX: if this happens too often, we can - * add a flag to force wait only in case - * of ->clear_inode(), but not in case of - * regular truncate */ - schedule_timeout_uninterruptible(HZ); - goto repeat; - } - spin_unlock(&ei->i_prealloc_lock); - - list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { - BUG_ON(pa->pa_type != MB_INODE_PA); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); - - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) { - ext4_error(sb, "Error loading buddy information for %u", - group); - continue; - } - - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (bitmap_bh == NULL) { - ext4_error(sb, "Error reading block bitmap for %u", - group); - ext4_mb_unload_buddy(&e4b); - continue; - } - - ext4_lock_group(sb, group); - list_del(&pa->pa_group_list); - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); - ext4_unlock_group(sb, group); - - ext4_mb_unload_buddy(&e4b); - put_bh(bitmap_bh); - - list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); - } -} - -#ifdef CONFIG_EXT4_DEBUG -static void ext4_mb_show_ac(struct ext4_allocation_context *ac) -{ - struct super_block *sb = ac->ac_sb; - ext4_group_t ngroups, i; - - if (!mb_enable_debug || - (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) - return; - - ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" - " Allocation context details:"); - ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", - ac->ac_status, ac->ac_flags); - ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " - "goal %lu/%lu/%lu@%lu, " - "best %lu/%lu/%lu@%lu cr %d", - (unsigned long)ac->ac_o_ex.fe_group, - (unsigned long)ac->ac_o_ex.fe_start, - (unsigned long)ac->ac_o_ex.fe_len, - (unsigned long)ac->ac_o_ex.fe_logical, - (unsigned long)ac->ac_g_ex.fe_group, - (unsigned long)ac->ac_g_ex.fe_start, - (unsigned long)ac->ac_g_ex.fe_len, - (unsigned long)ac->ac_g_ex.fe_logical, - (unsigned long)ac->ac_b_ex.fe_group, - (unsigned long)ac->ac_b_ex.fe_start, - (unsigned long)ac->ac_b_ex.fe_len, - (unsigned long)ac->ac_b_ex.fe_logical, - (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", - ac->ac_ex_scanned, ac->ac_found); - ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); - ngroups = ext4_get_groups_count(sb); - for (i = 0; i < ngroups; i++) { - struct ext4_group_info *grp = ext4_get_group_info(sb, i); - struct ext4_prealloc_space *pa; - ext4_grpblk_t start; - struct list_head *cur; - ext4_lock_group(sb, i); - list_for_each(cur, &grp->bb_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, - pa_group_list); - spin_lock(&pa->pa_lock); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, - NULL, &start); - spin_unlock(&pa->pa_lock); - printk(KERN_ERR "PA:%u:%d:%u \n", i, - start, pa->pa_len); - } - ext4_unlock_group(sb, i); - - if (grp->bb_free == 0) - continue; - printk(KERN_ERR "%u: %d/%d \n", - i, grp->bb_free, grp->bb_fragments); - } - printk(KERN_ERR "\n"); -} -#else -static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) -{ - return; -} -#endif - -/* - * We use locality group preallocation for small size file. 
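The repeat-after-sleep control flow in ext4_discard_preallocations above reduces to: if any entry is still referenced, back off and rescan from the top. A toy sketch of just that flow, with the sleep and locking stubbed out (nothing here is the kernel API):

    #include <stdio.h>

    struct pa { int count; int deleted; };

    static int drain(struct pa *list, int n)
    {
        int restarts = 0;
    repeat:
        for (int i = 0; i < n; i++) {
            if (list[i].deleted)
                continue;
            if (list[i].count) {     /* someone still holds a reference */
                list[i].count = 0;   /* stand-in for "sleep until it is released" */
                restarts++;
                goto repeat;         /* rescan from the top, like the kernel loop */
            }
            list[i].deleted = 1;
        }
        return restarts;
    }

    int main(void)
    {
        struct pa l[3] = { {0, 0}, {2, 0}, {0, 0} };
        printf("restarted %d time(s)\n", drain(l, 3)); /* 1 */
        return 0;
    }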
The size of the - * file is determined by the current size or the resulting size after - * allocation which ever is larger - * - * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req - */ -static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - int bsbits = ac->ac_sb->s_blocksize_bits; - loff_t size, isize; - - if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) - return; - - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) - return; - - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); - isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) - >> bsbits; - - if ((size == isize) && - !ext4_fs_is_busy(sbi) && - (atomic_read(&ac->ac_inode->i_writecount) == 0)) { - ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; - return; - } - - if (sbi->s_mb_group_prealloc <= 0) { - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; - return; - } - - /* don't use group allocation for large files */ - size = max(size, isize); - if (size > sbi->s_mb_stream_request) { - ac->ac_flags |= EXT4_MB_STREAM_ALLOC; - return; - } - - BUG_ON(ac->ac_lg != NULL); - /* - * locality group prealloc space are per cpu. The reason for having - * per cpu locality group is to reduce the contention between block - * request from multiple CPUs. - */ - ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); - - /* we're going to use group allocation */ - ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; - - /* serialize all allocations in the group */ - mutex_lock(&ac->ac_lg->lg_mutex); -} - -static noinline_for_stack int -ext4_mb_initialize_context(struct ext4_allocation_context *ac, - struct ext4_allocation_request *ar) -{ - struct super_block *sb = ar->inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - ext4_group_t group; - unsigned int len; - ext4_fsblk_t goal; - ext4_grpblk_t block; - - /* we can't allocate > group size */ - len = ar->len; - - /* just a dirty hack to filter too big requests */ - if (len >= EXT4_CLUSTERS_PER_GROUP(sb) - 10) - len = EXT4_CLUSTERS_PER_GROUP(sb) - 10; - - /* start searching from the goal */ - goal = ar->goal; - if (goal < le32_to_cpu(es->s_first_data_block) || - goal >= ext4_blocks_count(es)) - goal = le32_to_cpu(es->s_first_data_block); - ext4_get_group_no_and_offset(sb, goal, &group, &block); - - /* set up allocation goals */ - memset(ac, 0, sizeof(struct ext4_allocation_context)); - ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); - ac->ac_status = AC_STATUS_CONTINUE; - ac->ac_sb = sb; - ac->ac_inode = ar->inode; - ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; - ac->ac_o_ex.fe_group = group; - ac->ac_o_ex.fe_start = block; - ac->ac_o_ex.fe_len = len; - ac->ac_g_ex = ac->ac_o_ex; - ac->ac_flags = ar->flags; - - /* we have to define context: we'll we work with a file or - * locality group. this is a policy, actually */ - ext4_mb_group_or_file(ac); - - mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " - "left: %u/%u, right %u/%u to %swritable\n", - (unsigned) ar->len, (unsigned) ar->logical, - (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, - (unsigned) ar->lleft, (unsigned) ar->pleft, - (unsigned) ar->lright, (unsigned) ar->pright, - atomic_read(&ar->inode->i_writecount) ? 
"" : "non-"); - return 0; - -} - -static noinline_for_stack void -ext4_mb_discard_lg_preallocations(struct super_block *sb, - struct ext4_locality_group *lg, - int order, int total_entries) -{ - ext4_group_t group = 0; - struct ext4_buddy e4b; - struct list_head discard_list; - struct ext4_prealloc_space *pa, *tmp; - - mb_debug(1, "discard locality group preallocation\n"); - - INIT_LIST_HEAD(&discard_list); - - spin_lock(&lg->lg_prealloc_lock); - list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], - pa_inode_list) { - spin_lock(&pa->pa_lock); - if (atomic_read(&pa->pa_count)) { - /* - * This is the pa that we just used - * for block allocation. So don't - * free that - */ - spin_unlock(&pa->pa_lock); - continue; - } - if (pa->pa_deleted) { - spin_unlock(&pa->pa_lock); - continue; - } - /* only lg prealloc space */ - BUG_ON(pa->pa_type != MB_GROUP_PA); - - /* seems this one can be freed ... */ - pa->pa_deleted = 1; - spin_unlock(&pa->pa_lock); - - list_del_rcu(&pa->pa_inode_list); - list_add(&pa->u.pa_tmp_list, &discard_list); - - total_entries--; - if (total_entries <= 5) { - /* - * we want to keep only 5 entries - * allowing it to grow to 8. This - * mak sure we don't call discard - * soon for this list. - */ - break; - } - } - spin_unlock(&lg->lg_prealloc_lock); - - list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { - - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); - if (ext4_mb_load_buddy(sb, group, &e4b)) { - ext4_error(sb, "Error loading buddy information for %u", - group); - continue; - } - ext4_lock_group(sb, group); - list_del(&pa->pa_group_list); - ext4_mb_release_group_pa(&e4b, pa); - ext4_unlock_group(sb, group); - - ext4_mb_unload_buddy(&e4b); - list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); - } -} - -/* - * We have incremented pa_count. So it cannot be freed at this - * point. Also we hold lg_mutex. So no parallel allocation is - * possible from this lg. That means pa_free cannot be updated. - * - * A parallel ext4_mb_discard_group_preallocations is possible. - * which can cause the lg_prealloc_list to be updated. 
- */ - -static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) -{ - int order, added = 0, lg_prealloc_count = 1; - struct super_block *sb = ac->ac_sb; - struct ext4_locality_group *lg = ac->ac_lg; - struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; - - order = fls(pa->pa_free) - 1; - if (order > PREALLOC_TB_SIZE - 1) - /* The max size of hash table is PREALLOC_TB_SIZE */ - order = PREALLOC_TB_SIZE - 1; - /* Add the prealloc space to lg */ - rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], - pa_inode_list) { - spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted) { - spin_unlock(&tmp_pa->pa_lock); - continue; - } - if (!added && pa->pa_free < tmp_pa->pa_free) { - /* Add to the tail of the previous entry */ - list_add_tail_rcu(&pa->pa_inode_list, - &tmp_pa->pa_inode_list); - added = 1; - /* - * we want to count the total - * number of entries in the list - */ - } - spin_unlock(&tmp_pa->pa_lock); - lg_prealloc_count++; - } - if (!added) - list_add_tail_rcu(&pa->pa_inode_list, - &lg->lg_prealloc_list[order]); - rcu_read_unlock(); - - /* Now trim the list to be not more than 8 elements */ - if (lg_prealloc_count > 8) { - ext4_mb_discard_lg_preallocations(sb, lg, - order, lg_prealloc_count); - return; - } - return ; -} - -/* - * release all resource we used in allocation - */ -static int ext4_mb_release_context(struct ext4_allocation_context *ac) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_prealloc_space *pa = ac->ac_pa; - if (pa) { - if (pa->pa_type == MB_GROUP_PA) { - /* see comment in ext4_mb_use_group_pa() */ - spin_lock(&pa->pa_lock); - pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - pa->pa_free -= ac->ac_b_ex.fe_len; - pa->pa_len -= ac->ac_b_ex.fe_len; - spin_unlock(&pa->pa_lock); - } - } - if (pa) { - /* - * We want to add the pa to the right bucket. - * Remove it from the list and while adding - * make sure the list to which we are adding - * doesn't grow big. 
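The bucket index used by ext4_mb_add_n_trim, and by the lookup side in ext4_mb_use_preallocated, is fls(pa_free) - 1 clamped to PREALLOC_TB_SIZE - 1, so bucket i holds PAs whose free size falls in [2^i, 2^(i+1)). A userspace check of that mapping, substituting GCC's __builtin_clz for the kernel's fls() (a toolchain assumption):

    #include <stdio.h>

    #define PREALLOC_TB_SIZE 10

    /* fls(x) = position of highest set bit, 1-based; 0 for x == 0 */
    static int fls_u(unsigned x)
    {
        return x ? 32 - __builtin_clz(x) : 0;
    }

    static int bucket(unsigned pa_free)
    {
        int order = fls_u(pa_free) - 1;
        if (order > PREALLOC_TB_SIZE - 1)
            order = PREALLOC_TB_SIZE - 1;   /* clamp: the table has 10 buckets */
        return order;
    }

    int main(void)
    {
        printf("pa_free 1    -> bucket %d\n", bucket(1));    /* 0 */
        printf("pa_free 100  -> bucket %d\n", bucket(100));  /* 6, since 64 <= 100 < 128 */
        printf("pa_free 4096 -> bucket %d\n", bucket(4096)); /* clamped to 9 */
        return 0;
    }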
- */ - if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); - ext4_mb_add_n_trim(ac); - } - ext4_mb_put_pa(ac, ac->ac_sb, pa); - } - if (ac->ac_bitmap_page) - page_cache_release(ac->ac_bitmap_page); - if (ac->ac_buddy_page) - page_cache_release(ac->ac_buddy_page); - if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) - mutex_unlock(&ac->ac_lg->lg_mutex); - ext4_mb_collect_stats(ac); - return 0; -} - -static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) -{ - ext4_group_t i, ngroups = ext4_get_groups_count(sb); - int ret; - int freed = 0; - - trace_ext4_mb_discard_preallocations(sb, needed); - for (i = 0; i < ngroups && needed > 0; i++) { - ret = ext4_mb_discard_group_preallocations(sb, i, needed); - freed += ret; - needed -= ret; - } - - return freed; -} - -/* - * Main entry point into mballoc to allocate blocks - * it tries to use preallocation first, then falls back - * to usual allocation - */ -ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, - struct ext4_allocation_request *ar, int *errp) -{ - int freed; - struct ext4_allocation_context *ac = NULL; - struct ext4_sb_info *sbi; - struct super_block *sb; - ext4_fsblk_t block = 0; - unsigned int inquota = 0; - unsigned int reserv_clstrs = 0; - - sb = ar->inode->i_sb; - sbi = EXT4_SB(sb); - - trace_ext4_request_blocks(ar); - - /* Allow to use superuser reservation for quota file */ - if (IS_NOQUOTA(ar->inode)) - ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; - - /* - * For delayed allocation, we could skip the ENOSPC and - * EDQUOT check, as blocks and quotas have been already - * reserved when data being copied into pagecache. - */ - if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) - ar->flags |= EXT4_MB_DELALLOC_RESERVED; - else { - /* Without delayed allocation we need to verify - * there is enough free blocks to do block allocation - * and verify allocation doesn't exceed the quota limits. 
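For a group PA, ext4_mb_release_context slides the preallocation window forward only after the blocks have been marked in the on-disk bitmap: pa_pstart and pa_lstart advance past the chunk just consumed while pa_free and pa_len shrink by the same amount. A small sketch of that bookkeeping with plain unsigned block units (the cluster conversion is omitted):

    #include <stdio.h>
    #include <assert.h>

    struct pa { unsigned pstart, lstart, len, free; };

    /* consume 'used' blocks from the front of a group PA */
    static void pa_consume(struct pa *pa, unsigned used)
    {
        assert(used <= pa->free);
        pa->pstart += used;   /* the next allocation starts past what we used */
        pa->lstart += used;
        pa->free   -= used;
        pa->len    -= used;
    }

    int main(void)
    {
        struct pa pa = { .pstart = 8192, .lstart = 8192, .len = 512, .free = 512 };
        pa_consume(&pa, 64);
        printf("pstart=%u free=%u len=%u\n", pa.pstart, pa.free, pa.len);
        /* pstart=8256 free=448 len=448 */
        return 0;
    }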
- */ - while (ar->len && - ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { - - /* let others free the space */ - yield(); - ar->len = ar->len >> 1; - } - if (!ar->len) { - *errp = -ENOSPC; - return 0; - } - reserv_clstrs = ar->len; - if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { - dquot_alloc_block_nofail(ar->inode, - EXT4_C2B(sbi, ar->len)); - } else { - while (ar->len && - dquot_alloc_block(ar->inode, - EXT4_C2B(sbi, ar->len))) { - - ar->flags |= EXT4_MB_HINT_NOPREALLOC; - ar->len--; - } - } - inquota = ar->len; - if (ar->len == 0) { - *errp = -EDQUOT; - goto out; - } - } - - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (!ac) { - ar->len = 0; - *errp = -ENOMEM; - goto out; - } - - *errp = ext4_mb_initialize_context(ac, ar); - if (*errp) { - ar->len = 0; - goto out; - } - - ac->ac_op = EXT4_MB_HISTORY_PREALLOC; - if (!ext4_mb_use_preallocated(ac)) { - ac->ac_op = EXT4_MB_HISTORY_ALLOC; - ext4_mb_normalize_request(ac, ar); -repeat: - /* allocate space in core */ - *errp = ext4_mb_regular_allocator(ac); - if (*errp) - goto errout; - - /* as we've just preallocated more space than - * user requested originally, we store allocated - * space in a special descriptor */ - if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) - ext4_mb_new_preallocation(ac); - } - if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); - if (*errp == -EAGAIN) { - /* - * drop the reference that we took - * in ext4_mb_use_best_found - */ - ext4_mb_release_context(ac); - ac->ac_b_ex.fe_group = 0; - ac->ac_b_ex.fe_start = 0; - ac->ac_b_ex.fe_len = 0; - ac->ac_status = AC_STATUS_CONTINUE; - goto repeat; - } else if (*errp) - errout: - ext4_discard_allocated_blocks(ac); - else { - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - ar->len = ac->ac_b_ex.fe_len; - } - } else { - freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); - if (freed) - goto repeat; - *errp = -ENOSPC; - } - - if (*errp) { - ac->ac_b_ex.fe_len = 0; - ar->len = 0; - ext4_mb_show_ac(ac); - } - ext4_mb_release_context(ac); -out: - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); - if (inquota && ar->len < inquota) - dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); - if (!ar->len) { - if (!ext4_test_inode_state(ar->inode, - EXT4_STATE_DELALLOC_RESERVED)) - /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyclusters_counter, - reserv_clstrs); - } - - trace_ext4_allocate_blocks(ar, (unsigned long long)block); - - return block; -} - -/* - * We can merge two free data extents only if the physical blocks - * are contiguous, AND the extents were freed by the same transaction, - * AND the blocks are associated with the same group. 
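The three merge conditions just stated are easy to exercise in isolation. Here is a userspace copy of the predicate with sample extents; the struct is cut down to the fields the test reads, and efd_tid is a plain int for the example:

    #include <stdio.h>

    struct efd { int tid; unsigned group, start, count; };

    /* mergeable iff same transaction, same group, physically contiguous */
    static int can_merge(const struct efd *a, const struct efd *b)
    {
        return a->tid == b->tid &&
               a->group == b->group &&
               a->start + a->count == b->start;
    }

    int main(void)
    {
        struct efd left  = { .tid = 7, .group = 3, .start = 100, .count = 20 };
        struct efd right = { .tid = 7, .group = 3, .start = 120, .count = 10 };
        struct efd other = { .tid = 8, .group = 3, .start = 130, .count = 4  };
        printf("left+right:  %d\n", can_merge(&left, &right));  /* 1 */
        printf("right+other: %d\n", can_merge(&right, &other)); /* 0: different tid */
        return 0;
    }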
- */ -static int can_merge(struct ext4_free_data *entry1, - struct ext4_free_data *entry2) -{ - if ((entry1->efd_tid == entry2->efd_tid) && - (entry1->efd_group == entry2->efd_group) && - ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) - return 1; - return 0; -} - -static noinline_for_stack int -ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, - struct ext4_free_data *new_entry) -{ - ext4_group_t group = e4b->bd_group; - ext4_grpblk_t cluster; - struct ext4_free_data *entry; - struct ext4_group_info *db = e4b->bd_info; - struct super_block *sb = e4b->bd_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct rb_node **n = &db->bb_free_root.rb_node, *node; - struct rb_node *parent = NULL, *new_node; - - BUG_ON(!ext4_handle_valid(handle)); - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); - - new_node = &new_entry->efd_node; - cluster = new_entry->efd_start_cluster; - - if (!*n) { - /* first free block exent. We need to - protect buddy cache from being freed, - * otherwise we'll refresh it from - * on-disk bitmap and lose not-yet-available - * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); - } - while (*n) { - parent = *n; - entry = rb_entry(parent, struct ext4_free_data, efd_node); - if (cluster < entry->efd_start_cluster) - n = &(*n)->rb_left; - else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) - n = &(*n)->rb_right; - else { - ext4_grp_locked_error(sb, group, 0, - ext4_group_first_block_no(sb, group) + - EXT4_C2B(sbi, cluster), - "Block already on to-be-freed list"); - return 0; - } - } - - rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &db->bb_free_root); - - /* Now try to see the extent can be merged to left and right */ - node = rb_prev(new_node); - if (node) { - entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(entry, new_entry)) { - new_entry->efd_start_cluster = entry->efd_start_cluster; - new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); - ext4_journal_callback_del(handle, &entry->efd_jce); - kmem_cache_free(ext4_free_data_cachep, entry); - } - } - - node = rb_next(new_node); - if (node) { - entry = rb_entry(node, struct ext4_free_data, efd_node); - if (can_merge(new_entry, entry)) { - new_entry->efd_count += entry->efd_count; - rb_erase(node, &(db->bb_free_root)); - ext4_journal_callback_del(handle, &entry->efd_jce); - kmem_cache_free(ext4_free_data_cachep, entry); - } - } - /* Add the extent to transaction's private list */ - ext4_journal_callback_add(handle, ext4_free_data_callback, - &new_entry->efd_jce); - return 0; -} - -/** - * ext4_free_blocks() -- Free given blocks and update quota - * @handle: handle for this transaction - * @inode: inode - * @block: start physical block to free - * @count: number of blocks to count - * @flags: flags used by ext4_free_blocks - */ -void ext4_free_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t block, - unsigned long count, int flags) -{ - struct buffer_head *bitmap_bh = NULL; - struct super_block *sb = inode->i_sb; - struct ext4_group_desc *gdp; - unsigned long freed = 0; - unsigned int overflow; - ext4_grpblk_t bit; - struct buffer_head *gd_bh; - ext4_group_t block_group; - struct ext4_sb_info *sbi; - struct ext4_buddy e4b; - unsigned int count_clusters; - int err = 0; - int ret; - - if (bh) { - if (block) - BUG_ON(block != bh->b_blocknr); - else - block = bh->b_blocknr; - } - - sbi = EXT4_SB(sb); - if 
(!(flags & EXT4_FREE_BLOCKS_VALIDATED) && - !ext4_data_block_valid(sbi, block, count)) { - ext4_error(sb, "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); - goto error_return; - } - - ext4_debug("freeing block %llu\n", block); - trace_ext4_free_blocks(inode, block, count, flags); - - if (flags & EXT4_FREE_BLOCKS_FORGET) { - struct buffer_head *tbh = bh; - int i; - - BUG_ON(bh && (count > 1)); - - for (i = 0; i < count; i++) { - if (!bh) - tbh = sb_find_get_block(inode->i_sb, - block + i); - if (unlikely(!tbh)) - continue; - ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, - inode, tbh, block + i); - } - } - - /* - * We need to make sure we don't reuse the freed block until - * after the transaction is committed, which we can do by - * treating the block as metadata, below. We make an - * exception if the inode is to be written in writeback mode - * since writeback mode has weak data consistency guarantees. - */ - if (!ext4_should_writeback_data(inode)) - flags |= EXT4_FREE_BLOCKS_METADATA; - - /* - * If the extent to be freed does not begin on a cluster - * boundary, we need to deal with partial clusters at the - * beginning and end of the extent. Normally we will free - * blocks at the beginning or the end unless we are explicitly - * requested to avoid doing so. - */ - overflow = block & (sbi->s_cluster_ratio - 1); - if (overflow) { - if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { - overflow = sbi->s_cluster_ratio - overflow; - block += overflow; - if (count > overflow) - count -= overflow; - else - return; - } else { - block -= overflow; - count += overflow; - } - } - overflow = count & (sbi->s_cluster_ratio - 1); - if (overflow) { - if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { - if (count > overflow) - count -= overflow; - else - return; - } else - count += sbi->s_cluster_ratio - overflow; - } - -do_more: - overflow = 0; - ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - - /* - * Check to see if we are freeing blocks across a group - * boundary. - */ - if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = EXT4_C2B(sbi, bit) + count - - EXT4_BLOCKS_PER_GROUP(sb); - count -= overflow; - } - count_clusters = EXT4_B2C(sbi, count); - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) { - err = -EIO; - goto error_return; - } - gdp = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!gdp) { - err = -EIO; - goto error_return; - } - - if (in_range(ext4_block_bitmap(sb, gdp), block, count) || - in_range(ext4_inode_bitmap(sb, gdp), block, count) || - in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || - in_range(block + count - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { - - ext4_error(sb, "Freeing blocks in system zone - " - "Block = %llu, count = %lu", block, count); - /* err = 0. ext4_std_error should be a no op */ - goto error_return; - } - - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. 
Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); - if (err) - goto error_return; -#ifdef AGGRESSIVE_CHECK - { - int i; - for (i = 0; i < count_clusters; i++) - BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); - } -#endif - trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); - - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - - if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { - struct ext4_free_data *new_entry; - /* - * blocks being freed are metadata. these blocks shouldn't - * be used until this transaction is committed - */ - new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); - if (!new_entry) { - ext4_mb_unload_buddy(&e4b); - err = -ENOMEM; - goto error_return; - } - new_entry->efd_start_cluster = bit; - new_entry->efd_group = block_group; - new_entry->efd_count = count_clusters; - new_entry->efd_tid = handle->h_transaction->t_tid; - - ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); - ext4_mb_free_metadata(handle, &e4b, new_entry); - } else { - /* need to update group_info->bb_free and bitmap - * with group lock held. generate_buddy look at - * them with group lock_held - */ - ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); - mb_free_blocks(inode, &e4b, bit, count_clusters); - } - - ret = ext4_free_group_clusters(sb, gdp) + count_clusters; - ext4_free_group_clusters_set(sb, gdp, ret); - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count_clusters, - &sbi->s_flex_groups[flex_group].free_clusters); - } - - ext4_mb_unload_buddy(&e4b); - - freed += count; - - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) - dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - - if (overflow && !err) { - block += count; - count = overflow; - put_bh(bitmap_bh); - goto do_more; - } - ext4_mark_super_dirty(sb); -error_return: - brelse(bitmap_bh); - ext4_std_error(sb, err); - return; -} - -/** - * ext4_group_add_blocks() -- Add given blocks to an existing group - * @handle: handle to this transaction - * @sb: super block - * @block: start physcial block to add to the block group - * @count: number of blocks to free - * - * This marks the blocks as free in the bitmap and buddy. 
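The partial-cluster handling in ext4_free_blocks above is pure bit arithmetic: with s_cluster_ratio a power of two, block & (ratio - 1) is the offset within a cluster, and the extent is widened to cluster boundaries before the bitmap work. A standalone sketch of the head-alignment step (the NOFREE_FIRST_CLUSTER variant is left out):

    #include <stdio.h>
    #include <stdint.h>

    /* widen [block, block+count) downward so it starts on a cluster boundary;
     * ratio must be a power of two, as s_cluster_ratio is */
    static void align_head(uint64_t *block, uint64_t *count, unsigned ratio)
    {
        uint64_t overflow = *block & (ratio - 1);
        *block -= overflow;
        *count += overflow;
    }

    int main(void)
    {
        uint64_t block = 1003, count = 10;
        align_head(&block, &count, 8);   /* 8 blocks per cluster */
        printf("freeing [%llu, +%llu)\n",
               (unsigned long long)block, (unsigned long long)count);
        /* freeing [1000, +13): rounded down to the cluster start */
        return 0;
    }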
- */ -int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; - ext4_group_t block_group; - ext4_grpblk_t bit; - unsigned int i; - struct ext4_group_desc *desc; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_buddy e4b; - int err = 0, ret, blk_free_count; - ext4_grpblk_t blocks_freed; - - ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); - - if (count == 0) - return 0; - - ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - /* - * Check to see if we are freeing blocks across a group - * boundary. - */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - ext4_warning(sb, "too much blocks added to group %u\n", - block_group); - err = -EINVAL; - goto error_return; - } - - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) { - err = -EIO; - goto error_return; - } - - desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) { - err = -EIO; - goto error_return; - } - - if (in_range(ext4_block_bitmap(sb, desc), block, count) || - in_range(ext4_inode_bitmap(sb, desc), block, count) || - in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || - in_range(block + count - 1, ext4_inode_table(sb, desc), - sbi->s_itb_per_group)) { - ext4_error(sb, "Adding blocks in system zones - " - "Block = %llu, count = %lu", - block, count); - err = -EINVAL; - goto error_return; - } - - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); - if (err) - goto error_return; - - for (i = 0, blocks_freed = 0; i < count; i++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { - ext4_error(sb, "bit already cleared for block %llu", - (ext4_fsblk_t)(block + i)); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - blocks_freed++; - } - } - - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - - /* - * need to update group_info->bb_free and bitmap - * with group lock held. 
generate_buddy look at - * them with group lock_held - */ - ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(NULL, &e4b, bit, count); - blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); - ext4_free_group_clusters_set(sb, desc, blk_free_count); - desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, blocks_freed)); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(EXT4_B2C(sbi, blocks_freed), - &sbi->s_flex_groups[flex_group].free_clusters); - } - - ext4_mb_unload_buddy(&e4b); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - -error_return: - brelse(bitmap_bh); - ext4_std_error(sb, err); - return err; -} - -/** - * ext4_trim_extent -- function to TRIM one single free extent in the group - * @sb: super block for the file system - * @start: starting block of the free extent in the alloc. group - * @count: number of blocks to TRIM - * @group: alloc. group we are working with - * @e4b: ext4 buddy for the group - * - * Trim "count" blocks starting at "start" in the "group". To assure that no - * one will allocate those blocks, mark it as used in buddy bitmap. This must - * be called with under the group lock. - */ -static void ext4_trim_extent(struct super_block *sb, int start, int count, - ext4_group_t group, struct ext4_buddy *e4b) -{ - struct ext4_free_extent ex; - - trace_ext4_trim_extent(sb, group, start, count); - - assert_spin_locked(ext4_group_lock_ptr(sb, group)); - - ex.fe_start = start; - ex.fe_group = group; - ex.fe_len = count; - - /* - * Mark blocks used, so no one can reuse them while - * being trimmed. - */ - mb_mark_used(e4b, &ex); - ext4_unlock_group(sb, group); - ext4_issue_discard(sb, group, start, count); - ext4_lock_group(sb, group); - mb_free_blocks(NULL, e4b, start, ex.fe_len); -} - -/** - * ext4_trim_all_free -- function to trim all free space in alloc. group - * @sb: super block for file system - * @group: group to be trimmed - * @start: first group block to examine - * @max: last group block to examine - * @minblocks: minimum extent block count - * - * ext4_trim_all_free walks through group's buddy bitmap searching for free - * extents. When the free block is found, ext4_trim_extent is called to TRIM - * the extent. - * - * - * ext4_trim_all_free walks through group's block bitmap searching for free - * extents. When the free extent is found, mark it as used in group buddy - * bitmap. Then issue a TRIM command on this extent and free the extent in - * the group buddy bitmap. This is done until whole group is scanned. 
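The scan described above is the classic run-length walk over a bitmap: find the next zero bit, then the next set bit after it, and the gap in between is one free extent. A userspace version over a byte-per-bit array, with the mb_find_next_* helpers replaced by trivial loops (an obvious simplification):

    #include <stdio.h>

    /* report every run of >= minlen free (zero) bits in [0, max] */
    static void scan_free_runs(const unsigned char *bmp, int max, int minlen)
    {
        int start = 0;
        while (start <= max) {
            while (start <= max && bmp[start]) start++;      /* next zero bit */
            if (start > max) break;
            int next = start;
            while (next <= max && !bmp[next]) next++;        /* next set bit */
            if (next - start >= minlen)
                printf("trim extent [%d, +%d)\n", start, next - start);
            start = next + 1;
        }
    }

    int main(void)
    {
        unsigned char bmp[16] = {1,0,0,0,1,1,0,0, 0,0,0,1,0,0,1,1};
        scan_free_runs(bmp, 15, 3);  /* prints [1, +3) and [6, +5) */
        return 0;
    }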
- */ -static ext4_grpblk_t -ext4_trim_all_free(struct super_block *sb, ext4_group_t group, - ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks) -{ - void *bitmap; - ext4_grpblk_t next, count = 0, free_count = 0; - struct ext4_buddy e4b; - int ret; - - trace_ext4_trim_all_free(sb, group, start, max); - - ret = ext4_mb_load_buddy(sb, group, &e4b); - if (ret) { - ext4_error(sb, "Error in loading buddy " - "information for %u", group); - return ret; - } - bitmap = e4b.bd_bitmap; - - ext4_lock_group(sb, group); - if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && - minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) - goto out; - - start = (e4b.bd_info->bb_first_free > start) ? - e4b.bd_info->bb_first_free : start; - - while (start <= max) { - start = mb_find_next_zero_bit(bitmap, max + 1, start); - if (start > max) - break; - next = mb_find_next_bit(bitmap, max + 1, start); - - if ((next - start) >= minblocks) { - ext4_trim_extent(sb, start, - next - start, group, &e4b); - count += next - start; - } - free_count += next - start; - start = next + 1; - - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } - - if (need_resched()) { - ext4_unlock_group(sb, group); - cond_resched(); - ext4_lock_group(sb, group); - } - - if ((e4b.bd_info->bb_free - free_count) < minblocks) - break; - } - - if (!ret) - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); -out: - ext4_unlock_group(sb, group); - ext4_mb_unload_buddy(&e4b); - - ext4_debug("trimmed %d blocks in the group %d\n", - count, group); - - return count; -} - -/** - * ext4_trim_fs() -- trim ioctl handle function - * @sb: superblock for filesystem - * @range: fstrim_range structure - * - * start: First Byte to trim - * len: number of Bytes to trim from start - * minlen: minimum extent length in Bytes - * ext4_trim_fs goes through all allocation groups containing Bytes from - * start to start+len. For each such a group ext4_trim_all_free function - * is invoked to trim all free space. 
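Before any group is touched, ext4_trim_fs converts the byte-based fstrim_range into block numbers and then into (group, offset) pairs; the conversion is a few shifts plus a division. A sketch with hypothetical geometry, assuming 4 KiB blocks and 32768 blocks per group purely for the example:

    #include <stdio.h>
    #include <stdint.h>

    #define BLKBITS          12       /* 4 KiB blocks (example value) */
    #define BLOCKS_PER_GROUP 32768    /* example geometry */

    int main(void)
    {
        uint64_t range_start = 1ULL << 30;   /* trim from 1 GiB ... */
        uint64_t range_len   = 1ULL << 29;   /* ... for 512 MiB */

        uint64_t first_blk = range_start >> BLKBITS;
        uint64_t last_blk  = first_blk + (range_len >> BLKBITS) - 1;

        /* group number and offset within the group, in the spirit of
         * ext4_get_group_no_and_offset() */
        uint64_t first_group = first_blk / BLOCKS_PER_GROUP;
        uint64_t first_off   = first_blk % BLOCKS_PER_GROUP;
        uint64_t last_group  = last_blk / BLOCKS_PER_GROUP;

        printf("blocks %llu..%llu, groups %llu..%llu (offset %llu)\n",
               (unsigned long long)first_blk, (unsigned long long)last_blk,
               (unsigned long long)first_group, (unsigned long long)last_group,
               (unsigned long long)first_off);
        return 0;
    }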
- */ -int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) -{ - struct ext4_group_info *grp; - ext4_group_t group, first_group, last_group; - ext4_grpblk_t cnt = 0, first_cluster, last_cluster; - uint64_t start, end, minlen, trimmed = 0; - ext4_fsblk_t first_data_blk = - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); - int ret = 0; - - start = range->start >> sb->s_blocksize_bits; - end = start + (range->len >> sb->s_blocksize_bits) - 1; - minlen = range->minlen >> sb->s_blocksize_bits; - - if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) || - unlikely(start >= max_blks)) - return -EINVAL; - if (end >= max_blks) - end = max_blks - 1; - if (end <= first_data_blk) - goto out; - if (start < first_data_blk) - start = first_data_blk; - - /* Determine first and last group to examine based on start and end */ - ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, - &first_group, &first_cluster); - ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, - &last_group, &last_cluster); - - /* end now represents the last cluster to discard in this group */ - end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - - for (group = first_group; group <= last_group; group++) { - grp = ext4_get_group_info(sb, group); - /* We only do this if the grp has never been initialized */ - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - ret = ext4_mb_init_group(sb, group); - if (ret) - break; - } - - /* - * For all the groups except the last one, last cluster will - * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to - * change it for the last group, note that last_cluster is - * already computed earlier by ext4_get_group_no_and_offset() - */ - if (group == last_group) - end = last_cluster; - - if (grp->bb_free >= minlen) { - cnt = ext4_trim_all_free(sb, group, first_cluster, - end, minlen); - if (cnt < 0) { - ret = cnt; - break; - } - trimmed += cnt; - } - - /* - * For every group except the first one, we are sure - * that the first cluster to discard will be cluster #0. - */ - first_cluster = 0; - } - - if (!ret) - atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); - -out: - range->len = trimmed * sb->s_blocksize; - return ret; -} diff --git a/ANDROID_3.4.5/fs/ext4/mballoc.h b/ANDROID_3.4.5/fs/ext4/mballoc.h deleted file mode 100644 index c070618c..00000000 --- a/ANDROID_3.4.5/fs/ext4/mballoc.h +++ /dev/null @@ -1,222 +0,0 @@ -/* - * fs/ext4/mballoc.h - * - * Written by: Alex Tomas <alex@clusterfs.com> - * - */ -#ifndef _EXT4_MBALLOC_H -#define _EXT4_MBALLOC_H - -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/namei.h> -#include <linux/quotaops.h> -#include <linux/buffer_head.h> -#include <linux/module.h> -#include <linux/swap.h> -#include <linux/proc_fs.h> -#include <linux/pagemap.h> -#include <linux/seq_file.h> -#include <linux/blkdev.h> -#include <linux/mutex.h> -#include "ext4_jbd2.h" -#include "ext4.h" - -/* - * with AGGRESSIVE_CHECK allocator runs consistency checks over - * structures. these checks slow things down a lot - */ -#define AGGRESSIVE_CHECK__ - -/* - * with DOUBLE_CHECK defined mballoc creates persistent in-core - * bitmaps, maintains and uses them to check for double allocations - */ -#define DOUBLE_CHECK__ - -/* - */ -#ifdef CONFIG_EXT4_DEBUG -extern u8 mb_enable_debug; - -#define mb_debug(n, fmt, a...) 
\
-	do {								\
-		if ((n) <= mb_enable_debug) {				\
-			printk(KERN_DEBUG "(%s, %d): %s: ",		\
-				__FILE__, __LINE__, __func__);		\
-			printk(fmt, ## a);				\
-		}							\
-	} while (0)
-#else
-#define mb_debug(n, fmt, a...)
-#endif
-
-#define EXT4_MB_HISTORY_ALLOC		1	/* allocation */
-#define EXT4_MB_HISTORY_PREALLOC	2	/* preallocated blocks used */
-
-/*
- * How long mballoc can look for a best extent (in found extents)
- */
-#define MB_DEFAULT_MAX_TO_SCAN		200
-
-/*
- * How long mballoc must look for a best extent
- */
-#define MB_DEFAULT_MIN_TO_SCAN		10
-
-/*
- * How many groups mballoc will scan looking for the best chunk
- */
-#define MB_DEFAULT_MAX_GROUPS_TO_SCAN	5
-
-/*
- * with 'ext4_mb_stats' the allocator will collect stats that will be
- * shown at umount. The collection has a cost, though!
- */
-#define MB_DEFAULT_STATS		0
-
-/*
- * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
- * by the stream allocator, whose purpose is to pack requests
- * as close to each other as possible to produce smooth I/O traffic.
- * We use locality group prealloc space for stream requests.
- * We can tune the same via /proc/fs/ext4/<partition>/stream_req
- */
-#define MB_DEFAULT_STREAM_THRESHOLD	16	/* 64K */
-
-/*
- * requests at or above this size order use the 2^N buddy search
- */
-#define MB_DEFAULT_ORDER2_REQS		2
-
-/*
- * default group prealloc size 512 blocks
- */
-#define MB_DEFAULT_GROUP_PREALLOC	512
-
-
-struct ext4_free_data {
-	/* MUST be the first member */
-	struct ext4_journal_cb_entry	efd_jce;
-
-	/* ext4_free_data private data starts from here */
-
-	/* this links the free block information from group_info */
-	struct rb_node			efd_node;
-
-	/* group which free block extent belongs */
-	ext4_group_t			efd_group;
-
-	/* free block extent */
-	ext4_grpblk_t			efd_start_cluster;
-	ext4_grpblk_t			efd_count;
-
-	/* transaction which freed this extent */
-	tid_t				efd_tid;
-};
-
-struct ext4_prealloc_space {
-	struct list_head	pa_inode_list;
-	struct list_head	pa_group_list;
-	union {
-		struct list_head pa_tmp_list;
-		struct rcu_head	pa_rcu;
-	} u;
-	spinlock_t		pa_lock;
-	atomic_t		pa_count;
-	unsigned		pa_deleted;
-	ext4_fsblk_t		pa_pstart;	/* phys. block */
-	ext4_lblk_t		pa_lstart;	/* log. block */
-	ext4_grpblk_t		pa_len;		/* len of preallocated chunk */
-	ext4_grpblk_t		pa_free;	/* how many blocks are free */
-	unsigned short		pa_type;	/* pa type. inode or group */
-	spinlock_t		*pa_obj_lock;
-	struct inode		*pa_inode;	/* hack, for history only */
-};
-
-enum {
-	MB_INODE_PA = 0,
-	MB_GROUP_PA = 1
-};
-
-struct ext4_free_extent {
-	ext4_lblk_t fe_logical;
-	ext4_grpblk_t fe_start;	/* In cluster units */
-	ext4_group_t fe_group;
-	ext4_grpblk_t fe_len;	/* In cluster units */
-};
-
-/*
- * Locality group:
- *   we try to group all related changes together
- *   so that writeback can flush/allocate them together as well
- * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
- * (512). We store prealloc space into the hash based on the pa_free block
- * order, i.e. fls(pa_free) - 1.
- */
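To make the hashing rule above concrete, here is a small sketch (editor's illustration; lg_prealloc_bucket is a hypothetical name, fls() is the kernel's find-last-set helper) of how a preallocation's free-block count maps to a lg_prealloc_list bucket:

/* Bucket index for a locality-group preallocation with 'pa_free'
 * free blocks: the order is fls(pa_free) - 1, clamped to the table. */
static inline int lg_prealloc_bucket(int pa_free)
{
	int order = fls(pa_free) - 1;	/* e.g. pa_free = 200 -> order 7 */

	if (order > PREALLOC_TB_SIZE - 1)
		order = PREALLOC_TB_SIZE - 1;
	return order;
}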
-#define PREALLOC_TB_SIZE 10
-struct ext4_locality_group {
-	/* for allocator */
-	/* to serialize allocates */
-	struct mutex		lg_mutex;
-	/* list of preallocations */
-	struct list_head	lg_prealloc_list[PREALLOC_TB_SIZE];
-	spinlock_t		lg_prealloc_lock;
-};
-
-struct ext4_allocation_context {
-	struct inode *ac_inode;
-	struct super_block *ac_sb;
-
-	/* original request */
-	struct ext4_free_extent ac_o_ex;
-
-	/* goal request (normalized ac_o_ex) */
-	struct ext4_free_extent ac_g_ex;
-
-	/* the best found extent */
-	struct ext4_free_extent ac_b_ex;
-
-	/* copy of the best found extent taken before preallocation efforts */
-	struct ext4_free_extent ac_f_ex;
-
-	/* number of iterations done. we have to track to limit searching */
-	unsigned long ac_ex_scanned;
-	__u16 ac_groups_scanned;
-	__u16 ac_found;
-	__u16 ac_tail;
-	__u16 ac_buddy;
-	__u16 ac_flags;		/* allocation hints */
-	__u8 ac_status;
-	__u8 ac_criteria;
-	__u8 ac_2order;		/* if request is to allocate 2^N blocks and
-				 * N > 0, the field stores N, otherwise 0 */
-	__u8 ac_op;		/* operation, for history only */
-	struct page *ac_bitmap_page;
-	struct page *ac_buddy_page;
-	struct ext4_prealloc_space *ac_pa;
-	struct ext4_locality_group *ac_lg;
-};
-
-#define AC_STATUS_CONTINUE	1
-#define AC_STATUS_FOUND		2
-#define AC_STATUS_BREAK		3
-
-struct ext4_buddy {
-	struct page *bd_buddy_page;
-	void *bd_buddy;
-	struct page *bd_bitmap_page;
-	void *bd_bitmap;
-	struct ext4_group_info *bd_info;
-	struct super_block *bd_sb;
-	__u16 bd_blkbits;
-	ext4_group_t bd_group;
-};
-
-static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
-					struct ext4_free_extent *fex)
-{
-	return ext4_group_first_block_no(sb, fex->fe_group) +
-		(fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
-}
-#endif
diff --git a/ANDROID_3.4.5/fs/ext4/migrate.c b/ANDROID_3.4.5/fs/ext4/migrate.c
deleted file mode 100644
index f39f80f8..00000000
--- a/ANDROID_3.4.5/fs/ext4/migrate.c
+++ /dev/null
@@ -1,604 +0,0 @@
-/*
- * Copyright IBM Corporation, 2007
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-#include <linux/slab.h>
-#include "ext4_jbd2.h"
-
-/*
- * Details of a range of contiguous blocks that can be
- * represented by a single extent
- */
-struct migrate_struct {
-	ext4_lblk_t first_block, last_block, curr_block;
-	ext4_fsblk_t first_pblock, last_pblock;
-};
-
-static int finish_range(handle_t *handle, struct inode *inode,
-				struct migrate_struct *lb)
-
-{
-	int retval = 0, needed;
-	struct ext4_extent newext;
-	struct ext4_ext_path *path;
-	if (lb->first_pblock == 0)
-		return 0;
-
-	/* Add the extent to the temp inode */
-	newext.ee_block = cpu_to_le32(lb->first_block);
-	newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
-	ext4_ext_store_pblock(&newext, lb->first_pblock);
-	path = ext4_ext_find_extent(inode, lb->first_block, NULL);
-
-	if (IS_ERR(path)) {
-		retval = PTR_ERR(path);
-		path = NULL;
-		goto err_out;
-	}
-
-	/*
-	 * Calculate the credits needed to insert this extent.
-	 * Since we are doing this in a loop we may accumulate extra
-	 * credits, but below we try not to accumulate too many of
-	 * them by restarting the journal.
-	 */
-	needed = ext4_ext_calc_credits_for_single_extent(inode,
-		    lb->last_block - lb->first_block + 1, path);
-
-	/*
-	 * Make sure the credits we have accumulated are not excessively high
-	 */
-	if (needed && ext4_handle_has_enough_credits(handle,
-						EXT4_RESERVE_TRANS_BLOCKS)) {
-		retval = ext4_journal_restart(handle, needed);
-		if (retval)
-			goto err_out;
-	} else if (needed) {
-		retval = ext4_journal_extend(handle, needed);
-		if (retval) {
-			/*
-			 * If we cannot extend the journal, restart it
-			 */
-			retval = ext4_journal_restart(handle, needed);
-			if (retval)
-				goto err_out;
-		}
-	}
-	retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
-err_out:
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
-	lb->first_pblock = 0;
-	return retval;
-}
-
-static int update_extent_range(handle_t *handle, struct inode *inode,
-			       ext4_fsblk_t pblock, struct migrate_struct *lb)
-{
-	int retval;
-	/*
-	 * See if we can add on to the existing range (if it exists)
-	 */
-	if (lb->first_pblock &&
-		(lb->last_pblock+1 == pblock) &&
-		(lb->last_block+1 == lb->curr_block)) {
-		lb->last_pblock = pblock;
-		lb->last_block = lb->curr_block;
-		lb->curr_block++;
-		return 0;
-	}
-	/*
-	 * Start a new range.
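The accumulate-then-flush pattern above is easiest to see in isolation. A self-contained sketch (editor's illustration, not kernel code; it also ignores the holes that the kernel skips via curr_block++) that collapses a block map into (logical start, physical start, length) runs the same way migrate_struct does:

#include <stdio.h>

/* Walk a logical->physical block map and print each maximal
 * contiguous run, mirroring update_extent_range()/finish_range(). */
static void print_runs(const unsigned long *map, int n)
{
	int i, first = 0;

	for (i = 1; i <= n; i++) {
		/* extend the current run while the mapping stays contiguous */
		if (i < n && map[i] == map[i - 1] + 1)
			continue;
		printf("logical %d..%d -> physical %lu..%lu\n",
		       first, i - 1, map[first], map[i - 1]);
		first = i;	/* start a new range */
	}
}

int main(void)
{
	unsigned long map[] = { 100, 101, 102, 500, 501, 900 };

	print_runs(map, 6);	/* prints three runs */
	return 0;
}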
- */ - retval = finish_range(handle, inode, lb); - lb->first_pblock = lb->last_pblock = pblock; - lb->first_block = lb->last_block = lb->curr_block; - lb->curr_block++; - return retval; -} - -static int update_ind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, - struct migrate_struct *lb) -{ - struct buffer_head *bh; - __le32 *i_data; - int i, retval = 0; - unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; - - i_data = (__le32 *)bh->b_data; - for (i = 0; i < max_entries; i++) { - if (i_data[i]) { - retval = update_extent_range(handle, inode, - le32_to_cpu(i_data[i]), lb); - if (retval) - break; - } else { - lb->curr_block++; - } - } - put_bh(bh); - return retval; - -} - -static int update_dind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, - struct migrate_struct *lb) -{ - struct buffer_head *bh; - __le32 *i_data; - int i, retval = 0; - unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; - - i_data = (__le32 *)bh->b_data; - for (i = 0; i < max_entries; i++) { - if (i_data[i]) { - retval = update_ind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), lb); - if (retval) - break; - } else { - /* Only update the file block number */ - lb->curr_block += max_entries; - } - } - put_bh(bh); - return retval; - -} - -static int update_tind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, - struct migrate_struct *lb) -{ - struct buffer_head *bh; - __le32 *i_data; - int i, retval = 0; - unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; - - i_data = (__le32 *)bh->b_data; - for (i = 0; i < max_entries; i++) { - if (i_data[i]) { - retval = update_dind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), lb); - if (retval) - break; - } else { - /* Only update the file block number */ - lb->curr_block += max_entries * max_entries; - } - } - put_bh(bh); - return retval; - -} - -static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) -{ - int retval = 0, needed; - - if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) - return 0; - /* - * We are freeing a blocks. During this we touch - * superblock, group descriptor and block bitmap. - * So allocate a credit of 3. We may update - * quota (user and group). 
- */ - needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - - if (ext4_journal_extend(handle, needed) != 0) - retval = ext4_journal_restart(handle, needed); - - return retval; -} - -static int free_dind_blocks(handle_t *handle, - struct inode *inode, __le32 i_data) -{ - int i; - __le32 *tmp_idata; - struct buffer_head *bh; - unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - - bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); - if (!bh) - return -EIO; - - tmp_idata = (__le32 *)bh->b_data; - for (i = 0; i < max_entries; i++) { - if (tmp_idata[i]) { - extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, NULL, - le32_to_cpu(tmp_idata[i]), 1, - EXT4_FREE_BLOCKS_METADATA | - EXT4_FREE_BLOCKS_FORGET); - } - } - put_bh(bh); - extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, - EXT4_FREE_BLOCKS_METADATA | - EXT4_FREE_BLOCKS_FORGET); - return 0; -} - -static int free_tind_blocks(handle_t *handle, - struct inode *inode, __le32 i_data) -{ - int i, retval = 0; - __le32 *tmp_idata; - struct buffer_head *bh; - unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - - bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); - if (!bh) - return -EIO; - - tmp_idata = (__le32 *)bh->b_data; - for (i = 0; i < max_entries; i++) { - if (tmp_idata[i]) { - retval = free_dind_blocks(handle, - inode, tmp_idata[i]); - if (retval) { - put_bh(bh); - return retval; - } - } - } - put_bh(bh); - extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, - EXT4_FREE_BLOCKS_METADATA | - EXT4_FREE_BLOCKS_FORGET); - return 0; -} - -static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) -{ - int retval; - - /* ei->i_data[EXT4_IND_BLOCK] */ - if (i_data[0]) { - extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, NULL, - le32_to_cpu(i_data[0]), 1, - EXT4_FREE_BLOCKS_METADATA | - EXT4_FREE_BLOCKS_FORGET); - } - - /* ei->i_data[EXT4_DIND_BLOCK] */ - if (i_data[1]) { - retval = free_dind_blocks(handle, inode, i_data[1]); - if (retval) - return retval; - } - - /* ei->i_data[EXT4_TIND_BLOCK] */ - if (i_data[2]) { - retval = free_tind_blocks(handle, inode, i_data[2]); - if (retval) - return retval; - } - return 0; -} - -static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, - struct inode *tmp_inode) -{ - int retval; - __le32 i_data[3]; - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); - - /* - * One credit accounted for writing the - * i_data field of the original inode - */ - retval = ext4_journal_extend(handle, 1); - if (retval) { - retval = ext4_journal_restart(handle, 1); - if (retval) - goto err_out; - } - - i_data[0] = ei->i_data[EXT4_IND_BLOCK]; - i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; - i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; - - down_write(&EXT4_I(inode)->i_data_sem); - /* - * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation - * happened after we started the migrate. We need to - * fail the migrate - */ - if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { - retval = -EAGAIN; - up_write(&EXT4_I(inode)->i_data_sem); - goto err_out; - } else - ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); - /* - * We have the extent map build with the tmp inode. 
- * Now copy the i_data across - */ - ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); - memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); - - /* - * Update i_blocks with the new blocks that got - * allocated while adding extents for extent index - * blocks. - * - * While converting to extents we need not - * update the orignal inode i_blocks for extent blocks - * via quota APIs. The quota update happened via tmp_inode already. - */ - spin_lock(&inode->i_lock); - inode->i_blocks += tmp_inode->i_blocks; - spin_unlock(&inode->i_lock); - up_write(&EXT4_I(inode)->i_data_sem); - - /* - * We mark the inode dirty after, because we decrement the - * i_blocks when freeing the indirect meta-data blocks - */ - retval = free_ind_block(handle, inode, i_data); - ext4_mark_inode_dirty(handle, inode); - -err_out: - return retval; -} - -static int free_ext_idx(handle_t *handle, struct inode *inode, - struct ext4_extent_idx *ix) -{ - int i, retval = 0; - ext4_fsblk_t block; - struct buffer_head *bh; - struct ext4_extent_header *eh; - - block = ext4_idx_pblock(ix); - bh = sb_bread(inode->i_sb, block); - if (!bh) - return -EIO; - - eh = (struct ext4_extent_header *)bh->b_data; - if (eh->eh_depth != 0) { - ix = EXT_FIRST_INDEX(eh); - for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { - retval = free_ext_idx(handle, inode, ix); - if (retval) - break; - } - } - put_bh(bh); - extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, NULL, block, 1, - EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - return retval; -} - -/* - * Free the extent meta data blocks only - */ -static int free_ext_block(handle_t *handle, struct inode *inode) -{ - int i, retval = 0; - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; - struct ext4_extent_idx *ix; - if (eh->eh_depth == 0) - /* - * No extra blocks allocated for extent meta data - */ - return 0; - ix = EXT_FIRST_INDEX(eh); - for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { - retval = free_ext_idx(handle, inode, ix); - if (retval) - return retval; - } - return retval; - -} - -int ext4_ext_migrate(struct inode *inode) -{ - handle_t *handle; - int retval = 0, i; - __le32 *i_data; - struct ext4_inode_info *ei; - struct inode *tmp_inode = NULL; - struct migrate_struct lb; - unsigned long max_entries; - __u32 goal; - uid_t owner[2]; - - /* - * If the filesystem does not support extents, or the inode - * already is extent-based, error out. - */ - if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_INCOMPAT_EXTENTS) || - (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return -EINVAL; - - if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) - /* - * don't migrate fast symlink - */ - return retval; - - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) - + 1); - if (IS_ERR(handle)) { - retval = PTR_ERR(handle); - return retval; - } - goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * - EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; - owner[0] = inode->i_uid; - owner[1] = inode->i_gid; - tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, - S_IFREG, NULL, goal, owner); - if (IS_ERR(tmp_inode)) { - retval = PTR_ERR(tmp_inode); - ext4_journal_stop(handle); - return retval; - } - i_size_write(tmp_inode, i_size_read(inode)); - /* - * Set the i_nlink to zero so it will be deleted later - * when we drop inode reference. 
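The curr_block skips used in the block walk below follow from simple arithmetic: each indirection level covers max_entries times more logical blocks than the previous one. A quick sketch (editor's illustration) for a 4 KiB block size:

#include <stdio.h>

int main(void)
{
	unsigned long max_entries = 4096 >> 2;	/* 1024 block pointers per block */

	/* Logical blocks covered by each part of an ext2/3-style inode */
	printf("direct: %d\n", 12);			/* EXT4_NDIR_BLOCKS */
	printf("single indirect: %lu\n", max_entries);
	printf("double indirect: %lu\n", max_entries * max_entries);
	printf("triple indirect: %lu\n",
	       max_entries * max_entries * max_entries);
	return 0;
}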
- */
-	clear_nlink(tmp_inode);
-
-	ext4_ext_tree_init(handle, tmp_inode);
-	ext4_orphan_add(handle, tmp_inode);
-	ext4_journal_stop(handle);
-
-	/*
-	 * start with one credit accounted for
-	 * superblock modification.
-	 *
-	 * For the tmp_inode we already have committed the
-	 * transaction that created the inode. Later, as and
-	 * when we add extents, we extend the journal
-	 */
-	/*
-	 * Even though we take i_mutex we can still cause block
-	 * allocation via mmap write to holes. If we have allocated
-	 * new blocks we fail migrate. New block allocation will
-	 * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
-	 * with i_data_sem held to prevent racing with block
-	 * allocation.
-	 */
-	down_read((&EXT4_I(inode)->i_data_sem));
-	ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
-	up_read((&EXT4_I(inode)->i_data_sem));
-
-	handle = ext4_journal_start(inode, 1);
-	if (IS_ERR(handle)) {
-		/*
-		 * It is impossible to update on-disk structures without
-		 * a handle, so just rollback in-core changes and leave the
-		 * remaining work to orphan_list_cleanup()
-		 */
-		ext4_orphan_del(NULL, tmp_inode);
-		retval = PTR_ERR(handle);
-		goto out;
-	}
-
-	ei = EXT4_I(inode);
-	i_data = ei->i_data;
-	memset(&lb, 0, sizeof(lb));
-
-	/* 32 bit block address 4 bytes */
-	max_entries = inode->i_sb->s_blocksize >> 2;
-	for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
-		if (i_data[i]) {
-			retval = update_extent_range(handle, tmp_inode,
-						le32_to_cpu(i_data[i]), &lb);
-			if (retval)
-				goto err_out;
-		} else
-			lb.curr_block++;
-	}
-	if (i_data[EXT4_IND_BLOCK]) {
-		retval = update_ind_extent_range(handle, tmp_inode,
-				le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
-			if (retval)
-				goto err_out;
-	} else
-		lb.curr_block += max_entries;
-	if (i_data[EXT4_DIND_BLOCK]) {
-		retval = update_dind_extent_range(handle, tmp_inode,
-				le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
-			if (retval)
-				goto err_out;
-	} else
-		lb.curr_block += max_entries * max_entries;
-	if (i_data[EXT4_TIND_BLOCK]) {
-		retval = update_tind_extent_range(handle, tmp_inode,
-				le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
-			if (retval)
-				goto err_out;
-	}
-	/*
-	 * Build the last extent
-	 */
-	retval = finish_range(handle, tmp_inode, &lb);
-err_out:
-	if (retval)
-		/*
-		 * On failure, delete the extent information attached to
-		 * the tmp_inode
-		 */
-		free_ext_block(handle, tmp_inode);
-	else {
-		retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode);
-		if (retval)
-			/*
-			 * if we fail to swap the inode data, free the
-			 * extent details of the tmp inode
-			 */
-			free_ext_block(handle, tmp_inode);
-	}
-
-	/* We mark the tmp_inode dirty via ext4_ext_tree_init. */
-	if (ext4_journal_extend(handle, 1) != 0)
-		ext4_journal_restart(handle, 1);
-
-	/*
-	 * Mark the tmp_inode as of size zero
-	 */
-	i_size_write(tmp_inode, 0);
-
-	/*
-	 * set the i_blocks count to zero
-	 * so that the ext4_delete_inode does the
-	 * right job
-	 *
-	 * We don't need to take the i_lock because
-	 * the inode is not visible to user space.
- */ - tmp_inode->i_blocks = 0; - - /* Reset the extent details */ - ext4_ext_tree_init(handle, tmp_inode); - ext4_journal_stop(handle); -out: - unlock_new_inode(tmp_inode); - iput(tmp_inode); - - return retval; -} diff --git a/ANDROID_3.4.5/fs/ext4/mmp.c b/ANDROID_3.4.5/fs/ext4/mmp.c deleted file mode 100644 index ed6548d8..00000000 --- a/ANDROID_3.4.5/fs/ext4/mmp.c +++ /dev/null @@ -1,353 +0,0 @@ -#include <linux/fs.h> -#include <linux/random.h> -#include <linux/buffer_head.h> -#include <linux/utsname.h> -#include <linux/kthread.h> - -#include "ext4.h" - -/* - * Write the MMP block using WRITE_SYNC to try to get the block on-disk - * faster. - */ -static int write_mmp_block(struct buffer_head *bh) -{ - mark_buffer_dirty(bh); - lock_buffer(bh); - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - submit_bh(WRITE_SYNC, bh); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - return 1; - - return 0; -} - -/* - * Read the MMP block. It _must_ be read from disk and hence we clear the - * uptodate flag on the buffer. - */ -static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, - ext4_fsblk_t mmp_block) -{ - struct mmp_struct *mmp; - - if (*bh) - clear_buffer_uptodate(*bh); - - /* This would be sb_bread(sb, mmp_block), except we need to be sure - * that the MD RAID device cache has been bypassed, and that the read - * is not blocked in the elevator. */ - if (!*bh) - *bh = sb_getblk(sb, mmp_block); - if (*bh) { - get_bh(*bh); - lock_buffer(*bh); - (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC, *bh); - wait_on_buffer(*bh); - if (!buffer_uptodate(*bh)) { - brelse(*bh); - *bh = NULL; - } - } - if (!*bh) { - ext4_warning(sb, "Error while reading MMP block %llu", - mmp_block); - return -EIO; - } - - mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) - return -EINVAL; - - return 0; -} - -/* - * Dump as much information as possible to help the admin. - */ -void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, - const char *function, unsigned int line, const char *msg) -{ - __ext4_warning(sb, function, line, msg); - __ext4_warning(sb, function, line, - "MMP failure info: last update time: %llu, last update " - "node: %s, last update device: %s\n", - (long long unsigned int) le64_to_cpu(mmp->mmp_time), - mmp->mmp_nodename, mmp->mmp_bdevname); -} - -/* - * kmmpd will update the MMP sequence every s_mmp_update_interval seconds - */ -static int kmmpd(void *data) -{ - struct super_block *sb = ((struct mmpd_data *) data)->sb; - struct buffer_head *bh = ((struct mmpd_data *) data)->bh; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - struct mmp_struct *mmp; - ext4_fsblk_t mmp_block; - u32 seq = 0; - unsigned long failed_writes = 0; - int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); - unsigned mmp_check_interval; - unsigned long last_update_time; - unsigned long diff; - int retval; - - mmp_block = le64_to_cpu(es->s_mmp_block); - mmp = (struct mmp_struct *)(bh->b_data); - mmp->mmp_time = cpu_to_le64(get_seconds()); - /* - * Start with the higher mmp_check_interval and reduce it if - * the MMP block is being updated on time. 
- */ - mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, - EXT4_MMP_MIN_CHECK_INTERVAL); - mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); - bdevname(bh->b_bdev, mmp->mmp_bdevname); - - memcpy(mmp->mmp_nodename, init_utsname()->nodename, - sizeof(mmp->mmp_nodename)); - - while (!kthread_should_stop()) { - if (++seq > EXT4_MMP_SEQ_MAX) - seq = 1; - - mmp->mmp_seq = cpu_to_le32(seq); - mmp->mmp_time = cpu_to_le64(get_seconds()); - last_update_time = jiffies; - - retval = write_mmp_block(bh); - /* - * Don't spew too many error messages. Print one every - * (s_mmp_update_interval * 60) seconds. - */ - if (retval) { - if ((failed_writes % 60) == 0) - ext4_error(sb, "Error writing to MMP block"); - failed_writes++; - } - - if (!(le32_to_cpu(es->s_feature_incompat) & - EXT4_FEATURE_INCOMPAT_MMP)) { - ext4_warning(sb, "kmmpd being stopped since MMP feature" - " has been disabled."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; - } - - if (sb->s_flags & MS_RDONLY) { - ext4_warning(sb, "kmmpd being stopped since filesystem " - "has been remounted as readonly."); - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; - } - - diff = jiffies - last_update_time; - if (diff < mmp_update_interval * HZ) - schedule_timeout_interruptible(mmp_update_interval * - HZ - diff); - - /* - * We need to make sure that more than mmp_check_interval - * seconds have not passed since writing. If that has happened - * we need to check if the MMP block is as we left it. - */ - diff = jiffies - last_update_time; - if (diff > mmp_check_interval * HZ) { - struct buffer_head *bh_check = NULL; - struct mmp_struct *mmp_check; - - retval = read_mmp_block(sb, &bh_check, mmp_block); - if (retval) { - ext4_error(sb, "error reading MMP data: %d", - retval); - - EXT4_SB(sb)->s_mmp_tsk = NULL; - goto failed; - } - - mmp_check = (struct mmp_struct *)(bh_check->b_data); - if (mmp->mmp_seq != mmp_check->mmp_seq || - memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, - sizeof(mmp->mmp_nodename))) { - dump_mmp_msg(sb, mmp_check, - "Error while updating MMP info. " - "The filesystem seems to have been" - " multiply mounted."); - ext4_error(sb, "abort"); - goto failed; - } - put_bh(bh_check); - } - - /* - * Adjust the mmp_check_interval depending on how much time - * it took for the MMP block to be written. - */ - mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, - EXT4_MMP_MAX_CHECK_INTERVAL), - EXT4_MMP_MIN_CHECK_INTERVAL); - mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); - } - - /* - * Unmount seems to be clean. - */ - mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); - mmp->mmp_time = cpu_to_le64(get_seconds()); - - retval = write_mmp_block(bh); - -failed: - kfree(data); - brelse(bh); - return retval; -} - -/* - * Get a random new sequence number but make sure it is not greater than - * EXT4_MMP_SEQ_MAX. - */ -static unsigned int mmp_new_seq(void) -{ - u32 new_seq; - - do { - get_random_bytes(&new_seq, sizeof(u32)); - } while (new_seq > EXT4_MMP_SEQ_MAX); - - return new_seq; -} - -/* - * Protect the filesystem from being mounted more than once. 
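The protection scheme implemented below is a simple sequence-number handshake. A compressed sketch of the mount-time check (editor's pseudo-implementation over hypothetical read_seq()/write_seq()/new_random_seq() helpers standing in for read_mmp_block() and friends; the real code also handles the CLEAN and FSCK sentinel values):

#include <unistd.h>

extern unsigned int read_seq(void);		/* hypothetical helpers */
extern unsigned int new_random_seq(void);
extern void write_seq(unsigned int seq);

/* Returns 0 if it is safe to mount, -1 if another node holds the fs. */
static int mmp_handshake(unsigned int wait_secs)
{
	unsigned int seq = read_seq();

	sleep(wait_secs);		/* a live kmmpd would advance it */
	if (read_seq() != seq)
		return -1;		/* device active on another node */

	seq = new_random_seq();
	write_seq(seq);			/* stake a claim */
	sleep(wait_secs);		/* did anyone else claim it too? */
	if (read_seq() != seq)
		return -1;
	return 0;			/* safe: now start periodic kmmpd updates */
}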
- */ -int ext4_multi_mount_protect(struct super_block *sb, - ext4_fsblk_t mmp_block) -{ - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - struct buffer_head *bh = NULL; - struct mmp_struct *mmp = NULL; - struct mmpd_data *mmpd_data; - u32 seq; - unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); - unsigned int wait_time = 0; - int retval; - - if (mmp_block < le32_to_cpu(es->s_first_data_block) || - mmp_block >= ext4_blocks_count(es)) { - ext4_warning(sb, "Invalid MMP block in superblock"); - goto failed; - } - - retval = read_mmp_block(sb, &bh, mmp_block); - if (retval) - goto failed; - - mmp = (struct mmp_struct *)(bh->b_data); - - if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) - mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; - - /* - * If check_interval in MMP block is larger, use that instead of - * update_interval from the superblock. - */ - if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) - mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); - - seq = le32_to_cpu(mmp->mmp_seq); - if (seq == EXT4_MMP_SEQ_CLEAN) - goto skip; - - if (seq == EXT4_MMP_SEQ_FSCK) { - dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); - goto failed; - } - - wait_time = min(mmp_check_interval * 2 + 1, - mmp_check_interval + 60); - - /* Print MMP interval if more than 20 secs. */ - if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) - ext4_warning(sb, "MMP interval %u higher than expected, please" - " wait.\n", wait_time * 2); - - if (schedule_timeout_interruptible(HZ * wait_time) != 0) { - ext4_warning(sb, "MMP startup interrupted, failing mount\n"); - goto failed; - } - - retval = read_mmp_block(sb, &bh, mmp_block); - if (retval) - goto failed; - mmp = (struct mmp_struct *)(bh->b_data); - if (seq != le32_to_cpu(mmp->mmp_seq)) { - dump_mmp_msg(sb, mmp, - "Device is already active on another node."); - goto failed; - } - -skip: - /* - * write a new random sequence number. - */ - seq = mmp_new_seq(); - mmp->mmp_seq = cpu_to_le32(seq); - - retval = write_mmp_block(bh); - if (retval) - goto failed; - - /* - * wait for MMP interval and check mmp_seq. - */ - if (schedule_timeout_interruptible(HZ * wait_time) != 0) { - ext4_warning(sb, "MMP startup interrupted, failing mount\n"); - goto failed; - } - - retval = read_mmp_block(sb, &bh, mmp_block); - if (retval) - goto failed; - mmp = (struct mmp_struct *)(bh->b_data); - if (seq != le32_to_cpu(mmp->mmp_seq)) { - dump_mmp_msg(sb, mmp, - "Device is already active on another node."); - goto failed; - } - - mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); - if (!mmpd_data) { - ext4_warning(sb, "not enough memory for mmpd_data"); - goto failed; - } - mmpd_data->sb = sb; - mmpd_data->bh = bh; - - /* - * Start a kernel thread to update the MMP block periodically. - */ - EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", - bdevname(bh->b_bdev, - mmp->mmp_bdevname)); - if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { - EXT4_SB(sb)->s_mmp_tsk = NULL; - kfree(mmpd_data); - ext4_warning(sb, "Unable to create kmmpd thread for %s.", - sb->s_id); - goto failed; - } - - return 0; - -failed: - brelse(bh); - return 1; -} - - diff --git a/ANDROID_3.4.5/fs/ext4/move_extent.c b/ANDROID_3.4.5/fs/ext4/move_extent.c deleted file mode 100644 index c5826c62..00000000 --- a/ANDROID_3.4.5/fs/ext4/move_extent.c +++ /dev/null @@ -1,1423 +0,0 @@ -/* - * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. 
- * Written by Takashi Sato <t-sato@yk.jp.nec.com> - * Akira Fujita <a-fujita@rs.jp.nec.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include <linux/fs.h> -#include <linux/quotaops.h> -#include <linux/slab.h> -#include "ext4_jbd2.h" -#include "ext4.h" - -/** - * get_ext_path - Find an extent path for designated logical block number. - * - * @inode: an inode which is searched - * @lblock: logical block number to find an extent path - * @path: pointer to an extent path pointer (for output) - * - * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value - * on failure. - */ -static inline int -get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **path) -{ - int ret = 0; - - *path = ext4_ext_find_extent(inode, lblock, *path); - if (IS_ERR(*path)) { - ret = PTR_ERR(*path); - *path = NULL; - } else if ((*path)[ext_depth(inode)].p_ext == NULL) - ret = -ENODATA; - - return ret; -} - -/** - * copy_extent_status - Copy the extent's initialization status - * - * @src: an extent for getting initialize status - * @dest: an extent to be set the status - */ -static void -copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) -{ - if (ext4_ext_is_uninitialized(src)) - ext4_ext_mark_uninitialized(dest); - else - dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); -} - -/** - * mext_next_extent - Search for the next extent and set it to "extent" - * - * @inode: inode which is searched - * @path: this will obtain data for the next extent - * @extent: pointer to the next extent we have just gotten - * - * Search the next extent in the array of ext4_ext_path structure (@path) - * and set it to ext4_extent structure (@extent). In addition, the member of - * @path (->p_ext) also points the next extent. Return 0 on success, 1 if - * ext4_ext_path structure refers to the last extent, or a negative error - * value on failure. 
- */ -static int -mext_next_extent(struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent **extent) -{ - struct ext4_extent_header *eh; - int ppos, leaf_ppos = path->p_depth; - - ppos = leaf_ppos; - if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { - /* leaf block */ - *extent = ++path[ppos].p_ext; - path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); - return 0; - } - - while (--ppos >= 0) { - if (EXT_LAST_INDEX(path[ppos].p_hdr) > - path[ppos].p_idx) { - int cur_ppos = ppos; - - /* index block */ - path[ppos].p_idx++; - path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); - if (path[ppos+1].p_bh) - brelse(path[ppos+1].p_bh); - path[ppos+1].p_bh = - sb_bread(inode->i_sb, path[ppos].p_block); - if (!path[ppos+1].p_bh) - return -EIO; - path[ppos+1].p_hdr = - ext_block_hdr(path[ppos+1].p_bh); - - /* Halfway index block */ - while (++cur_ppos < leaf_ppos) { - path[cur_ppos].p_idx = - EXT_FIRST_INDEX(path[cur_ppos].p_hdr); - path[cur_ppos].p_block = - ext4_idx_pblock(path[cur_ppos].p_idx); - if (path[cur_ppos+1].p_bh) - brelse(path[cur_ppos+1].p_bh); - path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, - path[cur_ppos].p_block); - if (!path[cur_ppos+1].p_bh) - return -EIO; - path[cur_ppos+1].p_hdr = - ext_block_hdr(path[cur_ppos+1].p_bh); - } - - path[leaf_ppos].p_ext = *extent = NULL; - - eh = path[leaf_ppos].p_hdr; - if (le16_to_cpu(eh->eh_entries) == 0) - /* empty leaf is found */ - return -ENODATA; - - /* leaf block */ - path[leaf_ppos].p_ext = *extent = - EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); - path[leaf_ppos].p_block = - ext4_ext_pblock(path[leaf_ppos].p_ext); - return 0; - } - } - /* We found the last extent */ - return 1; -} - -/** - * mext_check_null_inode - NULL check for two inodes - * - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. - */ -static int -mext_check_null_inode(struct inode *inode1, struct inode *inode2, - const char *function, unsigned int line) -{ - int ret = 0; - - if (inode1 == NULL) { - __ext4_error(inode2->i_sb, function, line, - "Both inodes should not be NULL: " - "inode1 NULL inode2 %lu", inode2->i_ino); - ret = -EIO; - } else if (inode2 == NULL) { - __ext4_error(inode1->i_sb, function, line, - "Both inodes should not be NULL: " - "inode1 %lu inode2 NULL", inode1->i_ino); - ret = -EIO; - } - return ret; -} - -/** - * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem - * - * @orig_inode: original inode structure - * @donor_inode: donor inode structure - * Acquire write lock of i_data_sem of the two inodes (orig and donor) by - * i_ino order. - */ -static void -double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) -{ - struct inode *first = orig_inode, *second = donor_inode; - - /* - * Use the inode number to provide the stable locking order instead - * of its address, because the C language doesn't guarantee you can - * compare pointers that don't come from the same array. - */ - if (donor_inode->i_ino < orig_inode->i_ino) { - first = donor_inode; - second = orig_inode; - } - - down_write(&EXT4_I(first)->i_data_sem); - down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); -} - -/** - * double_up_write_data_sem - Release two inodes' write lock of i_data_sem - * - * @orig_inode: original inode structure to be released its lock first - * @donor_inode: donor inode structure to be released its lock second - * Release write lock of i_data_sem of two inodes (orig and donor). 
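The i_ino-ordered locking used by double_down_write_data_sem() above is the standard recipe for taking two locks without an ABBA deadlock. A generic sketch (editor's illustration with plain pthreads):

#include <pthread.h>

struct obj {
	unsigned long id;	/* stable ordering key, like i_ino */
	pthread_mutex_t lock;
};

/* Always lock the smaller id first, so two threads locking the same
 * pair (in either argument order) can never deadlock. */
static void lock_pair(struct obj *a, struct obj *b)
{
	struct obj *first = a, *second = b;

	if (b->id < a->id) {
		first = b;
		second = a;
	}
	pthread_mutex_lock(&first->lock);
	if (second != first)	/* tolerate a == b */
		pthread_mutex_lock(&second->lock);
}

Comparing the stable id keys rather than pointer addresses mirrors the comment in double_down_write_data_sem(): C does not guarantee a meaningful ordering of pointers that do not come from the same array.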
- */ -static void -double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) -{ - up_write(&EXT4_I(orig_inode)->i_data_sem); - up_write(&EXT4_I(donor_inode)->i_data_sem); -} - -/** - * mext_insert_across_blocks - Insert extents across leaf block - * - * @handle: journal handle - * @orig_inode: original inode - * @o_start: first original extent to be changed - * @o_end: last original extent to be changed - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * - * Allocate a new leaf block and insert extents into it. Return 0 on success, - * or a negative error value on failure. - */ -static int -mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, - struct ext4_extent *o_start, struct ext4_extent *o_end, - struct ext4_extent *start_ext, struct ext4_extent *new_ext, - struct ext4_extent *end_ext) -{ - struct ext4_ext_path *orig_path = NULL; - ext4_lblk_t eblock = 0; - int new_flag = 0; - int end_flag = 0; - int err = 0; - - if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { - if (o_start == o_end) { - - /* start_ext new_ext end_ext - * donor |---------|-----------|--------| - * orig |------------------------------| - */ - end_flag = 1; - } else { - - /* start_ext new_ext end_ext - * donor |---------|----------|---------| - * orig |---------------|--------------| - */ - o_end->ee_block = end_ext->ee_block; - o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); - } - - o_start->ee_len = start_ext->ee_len; - eblock = le32_to_cpu(start_ext->ee_block); - new_flag = 1; - - } else if (start_ext->ee_len && new_ext->ee_len && - !end_ext->ee_len && o_start == o_end) { - - /* start_ext new_ext - * donor |--------------|---------------| - * orig |------------------------------| - */ - o_start->ee_len = start_ext->ee_len; - eblock = le32_to_cpu(start_ext->ee_block); - new_flag = 1; - - } else if (!start_ext->ee_len && new_ext->ee_len && - end_ext->ee_len && o_start == o_end) { - - /* new_ext end_ext - * donor |--------------|---------------| - * orig |------------------------------| - */ - o_end->ee_block = end_ext->ee_block; - o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); - - /* - * Set 0 to the extent block if new_ext was - * the first block. - */ - if (new_ext->ee_block) - eblock = le32_to_cpu(new_ext->ee_block); - - new_flag = 1; - } else { - ext4_debug("ext4 move extent: Unexpected insert case\n"); - return -EIO; - } - - if (new_flag) { - err = get_ext_path(orig_inode, eblock, &orig_path); - if (err) - goto out; - - if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, new_ext, 0)) - goto out; - } - - if (end_flag) { - err = get_ext_path(orig_inode, - le32_to_cpu(end_ext->ee_block) - 1, &orig_path); - if (err) - goto out; - - if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, end_ext, 0)) - goto out; - } -out: - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - - return err; - -} - -/** - * mext_insert_inside_block - Insert new extent to the extent block - * - * @o_start: first original extent to be moved - * @o_end: last original extent to be moved - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * @eh: extent header of target leaf block - * @range_to_move: used to decide how to insert extent - * - * Insert extents into the leaf block. 
The extent (@o_start) is overwritten - * by inserted extents. - */ -static void -mext_insert_inside_block(struct ext4_extent *o_start, - struct ext4_extent *o_end, - struct ext4_extent *start_ext, - struct ext4_extent *new_ext, - struct ext4_extent *end_ext, - struct ext4_extent_header *eh, - int range_to_move) -{ - int i = 0; - unsigned long len; - - /* Move the existing extents */ - if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { - len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - - (unsigned long)(o_end + 1); - memmove(o_end + 1 + range_to_move, o_end + 1, len); - } - - /* Insert start entry */ - if (start_ext->ee_len) - o_start[i++].ee_len = start_ext->ee_len; - - /* Insert new entry */ - if (new_ext->ee_len) { - o_start[i] = *new_ext; - ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); - } - - /* Insert end entry */ - if (end_ext->ee_len) - o_start[i] = *end_ext; - - /* Increment the total entries counter on the extent block */ - le16_add_cpu(&eh->eh_entries, range_to_move); -} - -/** - * mext_insert_extents - Insert new extent - * - * @handle: journal handle - * @orig_inode: original inode - * @orig_path: path indicates first extent to be changed - * @o_start: first original extent to be changed - * @o_end: last original extent to be changed - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * - * Call the function to insert extents. If we cannot add more extents into - * the leaf block, we call mext_insert_across_blocks() to create a - * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 - * on success, or a negative error value on failure. - */ -static int -mext_insert_extents(handle_t *handle, struct inode *orig_inode, - struct ext4_ext_path *orig_path, - struct ext4_extent *o_start, - struct ext4_extent *o_end, - struct ext4_extent *start_ext, - struct ext4_extent *new_ext, - struct ext4_extent *end_ext) -{ - struct ext4_extent_header *eh; - unsigned long need_slots, slots_range; - int range_to_move, depth, ret; - - /* - * The extents need to be inserted - * start_extent + new_extent + end_extent. - */ - need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + - (new_ext->ee_len ? 1 : 0); - - /* The number of slots between start and end */ - slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) - / sizeof(struct ext4_extent); - - /* Range to move the end of extent */ - range_to_move = need_slots - slots_range; - depth = orig_path->p_depth; - orig_path += depth; - eh = orig_path->p_hdr; - - if (depth) { - /* Register to journal */ - ret = ext4_journal_get_write_access(handle, orig_path->p_bh); - if (ret) - return ret; - } - - /* Expansion */ - if (range_to_move > 0 && - (range_to_move > le16_to_cpu(eh->eh_max) - - le16_to_cpu(eh->eh_entries))) { - - ret = mext_insert_across_blocks(handle, orig_inode, o_start, - o_end, start_ext, new_ext, end_ext); - if (ret < 0) - return ret; - } else - mext_insert_inside_block(o_start, o_end, start_ext, new_ext, - end_ext, eh, range_to_move); - - if (depth) { - ret = ext4_handle_dirty_metadata(handle, orig_inode, - orig_path->p_bh); - if (ret) - return ret; - } else { - ret = ext4_mark_inode_dirty(handle, orig_inode); - if (ret < 0) - return ret; - } - - return 0; -} - -/** - * mext_leaf_block - Move one leaf extent block into the inode. 
- * - * @handle: journal handle - * @orig_inode: original inode - * @orig_path: path indicates first extent to be changed - * @dext: donor extent - * @from: start offset on the target file - * - * In order to insert extents into the leaf block, we must divide the extent - * in the leaf block into three extents. The one is located to be inserted - * extents, and the others are located around it. - * - * Therefore, this function creates structures to save extents of the leaf - * block, and inserts extents by calling mext_insert_extents() with - * created extents. Return 0 on success, or a negative error value on failure. - */ -static int -mext_leaf_block(handle_t *handle, struct inode *orig_inode, - struct ext4_ext_path *orig_path, struct ext4_extent *dext, - ext4_lblk_t *from) -{ - struct ext4_extent *oext, *o_start, *o_end, *prev_ext; - struct ext4_extent new_ext, start_ext, end_ext; - ext4_lblk_t new_ext_end; - int oext_alen, new_ext_alen, end_ext_alen; - int depth = ext_depth(orig_inode); - int ret; - - start_ext.ee_block = end_ext.ee_block = 0; - o_start = o_end = oext = orig_path[depth].p_ext; - oext_alen = ext4_ext_get_actual_len(oext); - start_ext.ee_len = end_ext.ee_len = 0; - - new_ext.ee_block = cpu_to_le32(*from); - ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); - new_ext.ee_len = dext->ee_len; - new_ext_alen = ext4_ext_get_actual_len(&new_ext); - new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; - - /* - * Case: original extent is first - * oext |--------| - * new_ext |--| - * start_ext |--| - */ - if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && - le32_to_cpu(new_ext.ee_block) < - le32_to_cpu(oext->ee_block) + oext_alen) { - start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - - le32_to_cpu(oext->ee_block)); - start_ext.ee_block = oext->ee_block; - copy_extent_status(oext, &start_ext); - } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { - prev_ext = oext - 1; - /* - * We can merge new_ext into previous extent, - * if these are contiguous and same extent type. 
- */ - if (ext4_can_extents_be_merged(orig_inode, prev_ext, - &new_ext)) { - o_start = prev_ext; - start_ext.ee_len = cpu_to_le16( - ext4_ext_get_actual_len(prev_ext) + - new_ext_alen); - start_ext.ee_block = oext->ee_block; - copy_extent_status(prev_ext, &start_ext); - new_ext.ee_len = 0; - } - } - - /* - * Case: new_ext_end must be less than oext - * oext |-----------| - * new_ext |-------| - */ - if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { - EXT4_ERROR_INODE(orig_inode, - "new_ext_end(%u) should be less than or equal to " - "oext->ee_block(%u) + oext_alen(%d) - 1", - new_ext_end, le32_to_cpu(oext->ee_block), - oext_alen); - ret = -EIO; - goto out; - } - - /* - * Case: new_ext is smaller than original extent - * oext |---------------| - * new_ext |-----------| - * end_ext |---| - */ - if (le32_to_cpu(oext->ee_block) <= new_ext_end && - new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { - end_ext.ee_len = - cpu_to_le16(le32_to_cpu(oext->ee_block) + - oext_alen - 1 - new_ext_end); - copy_extent_status(oext, &end_ext); - end_ext_alen = ext4_ext_get_actual_len(&end_ext); - ext4_ext_store_pblock(&end_ext, - (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); - end_ext.ee_block = - cpu_to_le32(le32_to_cpu(o_end->ee_block) + - oext_alen - end_ext_alen); - } - - ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, - o_end, &start_ext, &new_ext, &end_ext); -out: - return ret; -} - -/** - * mext_calc_swap_extents - Calculate extents for extent swapping. - * - * @tmp_dext: the extent that will belong to the original inode - * @tmp_oext: the extent that will belong to the donor inode - * @orig_off: block offset of original inode - * @donor_off: block offset of donor inode - * @max_count: the maximum length of extents - * - * Return 0 on success, or a negative error value on failure. - */ -static int -mext_calc_swap_extents(struct ext4_extent *tmp_dext, - struct ext4_extent *tmp_oext, - ext4_lblk_t orig_off, ext4_lblk_t donor_off, - ext4_lblk_t max_count) -{ - ext4_lblk_t diff, orig_diff; - struct ext4_extent dext_old, oext_old; - - BUG_ON(orig_off != donor_off); - - /* original and donor extents have to cover the same block offset */ - if (orig_off < le32_to_cpu(tmp_oext->ee_block) || - le32_to_cpu(tmp_oext->ee_block) + - ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) - return -ENODATA; - - if (orig_off < le32_to_cpu(tmp_dext->ee_block) || - le32_to_cpu(tmp_dext->ee_block) + - ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) - return -ENODATA; - - dext_old = *tmp_dext; - oext_old = *tmp_oext; - - /* When tmp_dext is too large, pick up the target range. 
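The swap-extent calculation below is pure interval arithmetic: shift the donor extent so it starts at the requested offset, then clamp its length. A sketch in plain integers (editor's illustration):

#include <stdio.h>

struct ext {
	unsigned int lblk;	/* first logical block */
	unsigned long pblk;	/* first physical block */
	unsigned int len;	/* length in blocks */
};

/* Trim *e so it begins at 'off' and is at most 'max_count' blocks,
 * mirroring the start of mext_calc_swap_extents(). */
static void trim_to(struct ext *e, unsigned int off, unsigned int max_count)
{
	unsigned int diff = off - e->lblk;	/* caller ensures off >= lblk */

	e->lblk += diff;
	e->pblk += diff;
	e->len -= diff;
	if (e->len > max_count)
		e->len = max_count;
}

int main(void)
{
	struct ext d = { 10, 1000, 8 };

	trim_to(&d, 12, 4);	/* -> lblk 12, pblk 1002, len 4 */
	printf("%u %lu %u\n", d.lblk, d.pblk, d.len);
	return 0;
}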
*/ - diff = donor_off - le32_to_cpu(tmp_dext->ee_block); - - ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); - tmp_dext->ee_block = - cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); - tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); - - if (max_count < ext4_ext_get_actual_len(tmp_dext)) - tmp_dext->ee_len = cpu_to_le16(max_count); - - orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); - ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); - - /* Adjust extent length if donor extent is larger than orig */ - if (ext4_ext_get_actual_len(tmp_dext) > - ext4_ext_get_actual_len(tmp_oext) - orig_diff) - tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - - orig_diff); - - tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); - - copy_extent_status(&oext_old, tmp_dext); - copy_extent_status(&dext_old, tmp_oext); - - return 0; -} - -/** - * mext_replace_branches - Replace original extents with new extents - * - * @handle: journal handle - * @orig_inode: original inode - * @donor_inode: donor inode - * @from: block offset of orig_inode - * @count: block count to be replaced - * @err: pointer to save return value - * - * Replace original inode extents and donor inode extents page by page. - * We implement this replacement in the following three steps: - * 1. Save the block information of original and donor inodes into - * dummy extents. - * 2. Change the block information of original inode to point at the - * donor inode blocks. - * 3. Change the block information of donor inode to point at the saved - * original inode blocks in the dummy extents. - * - * Return replaced block count. - */ -static int -mext_replace_branches(handle_t *handle, struct inode *orig_inode, - struct inode *donor_inode, ext4_lblk_t from, - ext4_lblk_t count, int *err) -{ - struct ext4_ext_path *orig_path = NULL; - struct ext4_ext_path *donor_path = NULL; - struct ext4_extent *oext, *dext; - struct ext4_extent tmp_dext, tmp_oext; - ext4_lblk_t orig_off = from, donor_off = from; - int depth; - int replaced_count = 0; - int dext_alen; - - /* Protect extent trees against block allocations via delalloc */ - double_down_write_data_sem(orig_inode, donor_inode); - - /* Get the original extent for the block "orig_off" */ - *err = get_ext_path(orig_inode, orig_off, &orig_path); - if (*err) - goto out; - - /* Get the donor extent for the head */ - *err = get_ext_path(donor_inode, donor_off, &donor_path); - if (*err) - goto out; - depth = ext_depth(orig_inode); - oext = orig_path[depth].p_ext; - tmp_oext = *oext; - - depth = ext_depth(donor_inode); - dext = donor_path[depth].p_ext; - tmp_dext = *dext; - - *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, count); - if (*err) - goto out; - - /* Loop for the donor extents */ - while (1) { - /* The extent for donor must be found. 
*/ - if (!dext) { - EXT4_ERROR_INODE(donor_inode, - "The extent for donor must be found"); - *err = -EIO; - goto out; - } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { - EXT4_ERROR_INODE(donor_inode, - "Donor offset(%u) and the first block of donor " - "extent(%u) should be equal", - donor_off, - le32_to_cpu(tmp_dext.ee_block)); - *err = -EIO; - goto out; - } - - /* Set donor extent to orig extent */ - *err = mext_leaf_block(handle, orig_inode, - orig_path, &tmp_dext, &orig_off); - if (*err) - goto out; - - /* Set orig extent to donor extent */ - *err = mext_leaf_block(handle, donor_inode, - donor_path, &tmp_oext, &donor_off); - if (*err) - goto out; - - dext_alen = ext4_ext_get_actual_len(&tmp_dext); - replaced_count += dext_alen; - donor_off += dext_alen; - orig_off += dext_alen; - - /* Already moved the expected blocks */ - if (replaced_count >= count) - break; - - if (orig_path) - ext4_ext_drop_refs(orig_path); - *err = get_ext_path(orig_inode, orig_off, &orig_path); - if (*err) - goto out; - depth = ext_depth(orig_inode); - oext = orig_path[depth].p_ext; - tmp_oext = *oext; - - if (donor_path) - ext4_ext_drop_refs(donor_path); - *err = get_ext_path(donor_inode, donor_off, &donor_path); - if (*err) - goto out; - depth = ext_depth(donor_inode); - dext = donor_path[depth].p_ext; - tmp_dext = *dext; - - *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, count - replaced_count); - if (*err) - goto out; - } - -out: - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - if (donor_path) { - ext4_ext_drop_refs(donor_path); - kfree(donor_path); - } - - ext4_ext_invalidate_cache(orig_inode); - ext4_ext_invalidate_cache(donor_inode); - - double_up_write_data_sem(orig_inode, donor_inode); - - return replaced_count; -} - -/** - * move_extent_per_page - Move extent data per page - * - * @o_filp: file structure of original file - * @donor_inode: donor inode - * @orig_page_offset: page index on original file - * @data_offset_in_page: block index where data swapping starts - * @block_len_in_page: the number of blocks to be swapped - * @uninit: orig extent is uninitialized or not - * @err: pointer to save return value - * - * Save the data in original inode blocks and replace original inode extents - * with donor inode extents by calling mext_replace_branches(). - * Finally, write out the saved data in new original inode blocks. Return - * replaced block count. - */ -static int -move_extent_per_page(struct file *o_filp, struct inode *donor_inode, - pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int uninit, int *err) -{ - struct inode *orig_inode = o_filp->f_dentry->d_inode; - struct address_space *mapping = orig_inode->i_mapping; - struct buffer_head *bh; - struct page *page = NULL; - const struct address_space_operations *a_ops = mapping->a_ops; - handle_t *handle; - ext4_lblk_t orig_blk_offset; - long long offs = orig_page_offset << PAGE_CACHE_SHIFT; - unsigned long blocksize = orig_inode->i_sb->s_blocksize; - unsigned int w_flags = 0; - unsigned int tmp_data_size, data_size, replaced_size; - void *fsdata; - int i, jblocks; - int err2 = 0; - int replaced_count = 0; - int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; - - /* - * It needs twice the amount of ordinary journal buffers because - * inode and donor_inode may change each different metadata blocks. 
- */
-	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
-	handle = ext4_journal_start(orig_inode, jblocks);
-	if (IS_ERR(handle)) {
-		*err = PTR_ERR(handle);
-		return 0;
-	}
-
-	if (segment_eq(get_fs(), KERNEL_DS))
-		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
-
-	orig_blk_offset = orig_page_offset * blocks_per_page +
-		data_offset_in_page;
-
-	/*
-	 * If the orig extent is an uninitialized one, there is no
-	 * need to force the page into memory and then force it to
-	 * be written out again.
-	 * Just swap the data blocks between orig and donor.
-	 */
-	if (uninit) {
-		replaced_count = mext_replace_branches(handle, orig_inode,
-						donor_inode, orig_blk_offset,
-						block_len_in_page, err);
-		goto out2;
-	}
-
-	offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
-
-	/* Calculate data_size */
-	if ((orig_blk_offset + block_len_in_page - 1) ==
-	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
-		/* Replace the last block */
-		tmp_data_size = orig_inode->i_size & (blocksize - 1);
-		/*
-		 * If tmp_data_size equals zero, the file size is an exact
-		 * multiple of the blocksize, so use a full block.
-		 */
-		if (tmp_data_size == 0)
-			tmp_data_size = blocksize;
-
-		data_size = tmp_data_size +
-			((block_len_in_page - 1) << orig_inode->i_blkbits);
-	} else
-		data_size = block_len_in_page << orig_inode->i_blkbits;
-
-	replaced_size = data_size;
-
-	*err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
-				 &page, &fsdata);
-	if (unlikely(*err < 0))
-		goto out;
-
-	if (!PageUptodate(page)) {
-		mapping->a_ops->readpage(o_filp, page);
-		lock_page(page);
-	}
-
-	/*
-	 * try_to_release_page() doesn't call releasepage in writeback mode.
-	 * We should care about the order of writing to the same file
-	 * by multiple move extent processes.
-	 * It needs to call wait_on_page_writeback() to wait for the
-	 * writeback of the page.
-	 */
-	wait_on_page_writeback(page);
-
-	/* Release old bh and drop refs */
-	try_to_release_page(page, 0);
-
-	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
-					orig_blk_offset, block_len_in_page,
-					&err2);
-	if (err2) {
-		if (replaced_count) {
-			block_len_in_page = replaced_count;
-			replaced_size =
-				block_len_in_page << orig_inode->i_blkbits;
-		} else
-			goto out;
-	}
-
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
-
-	bh = page_buffers(page);
-	for (i = 0; i < data_offset_in_page; i++)
-		bh = bh->b_this_page;
-
-	for (i = 0; i < block_len_in_page; i++) {
-		*err = ext4_get_block(orig_inode,
-				(sector_t)(orig_blk_offset + i), bh, 0);
-		if (*err < 0)
-			goto out;
-
-		if (bh->b_this_page != NULL)
-			bh = bh->b_this_page;
-	}
-
-	*err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
-			       page, fsdata);
-	page = NULL;
-
-out:
-	if (unlikely(page)) {
-		if (PageLocked(page))
-			unlock_page(page);
-		page_cache_release(page);
-		ext4_journal_stop(handle);
-	}
-out2:
-	ext4_journal_stop(handle);
-
-	if (err2)
-		*err = err2;
-
-	return replaced_count;
-}
-
-/**
- * mext_check_arguments - Check whether move extent can be done
- *
- * @orig_inode:		original inode
- * @donor_inode:	donor inode
- * @orig_start:		logical start offset in block for orig
- * @donor_start:	logical start offset in block for donor
- * @len:		the number of blocks to be moved
- *
- * Check the arguments of ext4_move_extents() to determine whether the
- * files can be exchanged with each other.
- * Return 0 on success, or a negative error value on failure.
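The checks documented above reduce to a handful of range predicates. A condensed sketch (editor's illustration; EXT_MAX_BLOCKS stands in for the 32-bit logical-block limit, and the function name is hypothetical):

#include <stdio.h>

#define EXT_MAX_BLOCKS 0xffffffffU	/* illustrative value of the limit */

/* Returns 0 if a move of *len blocks at orig_start/donor_start is
 * representable, clamping *len to file_blocks where needed. */
static int check_move_range(unsigned long long orig_start,
			    unsigned long long donor_start,
			    unsigned long long *len,
			    unsigned long long file_blocks)
{
	if (orig_start != donor_start)			/* offsets must match */
		return -1;
	if (orig_start >= EXT_MAX_BLOCKS || *len > EXT_MAX_BLOCKS ||
	    orig_start + *len >= EXT_MAX_BLOCKS)	/* logical-block overflow */
		return -1;
	if (orig_start >= file_blocks)			/* starts past EOF */
		return -1;
	if (orig_start + *len > file_blocks)		/* shorten to EOF */
		*len = file_blocks - orig_start;
	return *len ? 0 : -1;
}

int main(void)
{
	unsigned long long len = 100;

	/* file of 64 blocks: the request is clamped from 100 to 14 */
	printf("%d len=%llu\n", check_move_range(50, 50, &len, 64), len);
	return 0;
}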
- */ -static int -mext_check_arguments(struct inode *orig_inode, - struct inode *donor_inode, __u64 orig_start, - __u64 donor_start, __u64 *len) -{ - ext4_lblk_t orig_blocks, donor_blocks; - unsigned int blkbits = orig_inode->i_blkbits; - unsigned int blocksize = 1 << blkbits; - - if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { - ext4_debug("ext4 move extent: suid or sgid is set" - " to donor file [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) - return -EPERM; - - /* Ext4 move extent does not support swapfile */ - if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should " - "not be swapfile [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* Files should be in the same ext4 FS */ - if (orig_inode->i_sb != donor_inode->i_sb) { - ext4_debug("ext4 move extent: The argument files " - "should be in same FS [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* Ext4 move extent supports only extent based file */ - if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { - ext4_debug("ext4 move extent: orig file is not extents " - "based file [ino:orig %lu]\n", orig_inode->i_ino); - return -EOPNOTSUPP; - } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { - ext4_debug("ext4 move extent: donor file is not extents " - "based file [ino:donor %lu]\n", donor_inode->i_ino); - return -EOPNOTSUPP; - } - - if ((!orig_inode->i_size) || (!donor_inode->i_size)) { - ext4_debug("ext4 move extent: File size is 0 byte\n"); - return -EINVAL; - } - - /* Start offset should be same */ - if (orig_start != donor_start) { - ext4_debug("ext4 move extent: orig and donor's start " - "offset are not same [ino:orig %lu, donor %lu]\n", - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - if ((orig_start >= EXT_MAX_BLOCKS) || - (donor_start >= EXT_MAX_BLOCKS) || - (*len > EXT_MAX_BLOCKS) || - (orig_start + *len >= EXT_MAX_BLOCKS)) { - ext4_debug("ext4 move extent: Can't handle over [%u] blocks " - "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - if (orig_inode->i_size > donor_inode->i_size) { - donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; - /* TODO: eliminate this artificial restriction */ - if (orig_start >= donor_blocks) { - ext4_debug("ext4 move extent: orig start offset " - "[%llu] should be less than donor file blocks " - "[%u] [ino:orig %lu, donor %lu]\n", - orig_start, donor_blocks, - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* TODO: eliminate this artificial restriction */ - if (orig_start + *len > donor_blocks) { - ext4_debug("ext4 move extent: End offset [%llu] should " - "be less than donor file blocks [%u]." 
- "So adjust length from %llu to %llu " - "[ino:orig %lu, donor %lu]\n", - orig_start + *len, donor_blocks, - *len, donor_blocks - orig_start, - orig_inode->i_ino, donor_inode->i_ino); - *len = donor_blocks - orig_start; - } - } else { - orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; - if (orig_start >= orig_blocks) { - ext4_debug("ext4 move extent: start offset [%llu] " - "should be less than original file blocks " - "[%u] [ino:orig %lu, donor %lu]\n", - orig_start, orig_blocks, - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - if (orig_start + *len > orig_blocks) { - ext4_debug("ext4 move extent: Adjust length " - "from %llu to %llu. Because it should be " - "less than original file blocks " - "[ino:orig %lu, donor %lu]\n", - *len, orig_blocks - orig_start, - orig_inode->i_ino, donor_inode->i_ino); - *len = orig_blocks - orig_start; - } - } - - if (!*len) { - ext4_debug("ext4 move extent: len should not be 0 " - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, - donor_inode->i_ino); - return -EINVAL; - } - - return 0; -} - -/** - * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 - * - * @inode1: the inode structure - * @inode2: the inode structure - * - * Lock two inodes' i_mutex by i_ino order. - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. - */ -static int -mext_inode_double_lock(struct inode *inode1, struct inode *inode2) -{ - int ret = 0; - - BUG_ON(inode1 == NULL && inode2 == NULL); - - ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); - if (ret < 0) - goto out; - - if (inode1 == inode2) { - mutex_lock(&inode1->i_mutex); - goto out; - } - - if (inode1->i_ino < inode2->i_ino) { - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); - } - -out: - return ret; -} - -/** - * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 - * - * @inode1: the inode that is released first - * @inode2: the inode that is released second - * - * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. - */ - -static int -mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) -{ - int ret = 0; - - BUG_ON(inode1 == NULL && inode2 == NULL); - - ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); - if (ret < 0) - goto out; - - if (inode1) - mutex_unlock(&inode1->i_mutex); - - if (inode2 && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); - -out: - return ret; -} - -/** - * ext4_move_extents - Exchange the specified range of a file - * - * @o_filp: file structure of the original file - * @d_filp: file structure of the donor file - * @orig_start: start offset in block for orig - * @donor_start: start offset in block for donor - * @len: the number of blocks to be moved - * @moved_len: moved block length - * - * This function returns 0 and moved block length is set in moved_len - * if succeed, otherwise returns error value. - * - * Note: ext4_move_extents() proceeds the following order. - * 1:ext4_move_extents() calculates the last block number of moving extent - * function by the start block number (orig_start) and the number of blocks - * to be moved (len) specified as arguments. - * If the {orig, donor}_start points a hole, the extent's start offset - * pointed by ext_cur (current extent), holecheck_path, orig_path are set - * after hole behind. 
- * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
- *   or ext_cur exceeds block_end, which is the last logical block number.
- * 3:To get the length of the contiguous area, call mext_next_extent()
- *   repeatedly on ext_cur (whose initial value comes from holecheck_path)
- *   until it finds a discontiguous extent, the start logical block number
- *   exceeds block_end, or the extent is the last one.
- * 4:Exchange the original inode data with donor inode data
- *   from orig_page_offset to seq_end_page.
- *   The start indexes of the data are specified as arguments.
- *   That of the original inode is orig_page_offset, and that of the donor
- *   inode is also orig_page_offset
- *   (to simplify the blocksize != pagesize case, the donor offset is kept
- *   in block units).
- * 5:Update holecheck_path and orig_path to point to the next extent,
- *   then return to step 2.
- * 6:Release holecheck_path and orig_path, and set moved_len to the number
- *   of moved blocks, so the caller can compute the file offset at which
- *   to start the next move-extent ioctl.
- * 7:Return 0 on success, or a negative error value on failure.
- */
-int
-ext4_move_extents(struct file *o_filp, struct file *d_filp,
-		 __u64 orig_start, __u64 donor_start, __u64 len,
-		 __u64 *moved_len)
-{
-	struct inode *orig_inode = o_filp->f_dentry->d_inode;
-	struct inode *donor_inode = d_filp->f_dentry->d_inode;
-	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
-	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
-	ext4_lblk_t block_start = orig_start;
-	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
-	ext4_lblk_t rest_blocks;
-	pgoff_t orig_page_offset = 0, seq_end_page;
-	int ret1, ret2, depth, last_extent = 0;
-	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
-	int data_offset_in_page;
-	int block_len_in_page;
-	int uninit;
-
-	/* orig and donor should be different files */
-	if (orig_inode->i_ino == donor_inode->i_ino) {
-		ext4_debug("ext4 move extent: The argument files should not "
-			"be same file [ino:orig %lu, donor %lu]\n",
-			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
-	}
-
-	/* Regular file check */
-	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
-		ext4_debug("ext4 move extent: The argument files should be "
-			"regular file [ino:orig %lu, donor %lu]\n",
-			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
-	}
-
-	/* Protect orig and donor inodes against a truncate */
-	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
-	if (ret1 < 0)
-		return ret1;
-
-	/* Protect extent tree against block allocations via delalloc */
-	double_down_write_data_sem(orig_inode, donor_inode);
-	/* Check whether move_extent can be done on these files */
-	ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
-				    donor_start, &len);
-	if (ret1)
-		goto out;
-
-	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
-	block_end = block_start + len - 1;
-	if (file_end < block_end)
-		len -= block_end - file_end;
-
-	ret1 = get_ext_path(orig_inode, block_start, &orig_path);
-	if (ret1)
-		goto out;
-
-	/* Get path structure to check the hole */
-	ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
-	if (ret1)
-		goto out;
-
-	depth = ext_depth(orig_inode);
-	ext_cur = holecheck_path[depth].p_ext;
-
-	/*
-	 * Get proper starting location of block replacement if block_start was
-	 * within the hole.
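The main loop below constantly translates between logical blocks and page-cache indexes. With the shifts used here, and assuming 4 KiB pages over a 1 KiB block size (PAGE_CACHE_SHIFT = 12, i_blkbits = 10), the mapping works out as:

/*
 * blocks_per_page    = PAGE_CACHE_SIZE >> i_blkbits          = 4096 >> 10 = 4
 * page index         = block >> (PAGE_CACHE_SHIFT - i_blkbits)
 *                      e.g. block 4097 >> 2                  = 1024
 * offset within page = block % blocks_per_page, e.g. 4097 % 4 = 1
 */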
- */ - if (le32_to_cpu(ext_cur->ee_block) + - ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { - /* - * The hole exists between extents or the tail of - * original file. - */ - last_extent = mext_next_extent(orig_inode, - holecheck_path, &ext_cur); - if (last_extent < 0) { - ret1 = last_extent; - goto out; - } - last_extent = mext_next_extent(orig_inode, orig_path, - &ext_dummy); - if (last_extent < 0) { - ret1 = last_extent; - goto out; - } - seq_start = le32_to_cpu(ext_cur->ee_block); - } else if (le32_to_cpu(ext_cur->ee_block) > block_start) - /* The hole exists at the beginning of original file. */ - seq_start = le32_to_cpu(ext_cur->ee_block); - else - seq_start = block_start; - - /* No blocks within the specified range. */ - if (le32_to_cpu(ext_cur->ee_block) > block_end) { - ext4_debug("ext4 move extent: The specified range of file " - "may be the hole\n"); - ret1 = -EINVAL; - goto out; - } - - /* Adjust start blocks */ - add_blocks = min(le32_to_cpu(ext_cur->ee_block) + - ext4_ext_get_actual_len(ext_cur), block_end + 1) - - max(le32_to_cpu(ext_cur->ee_block), block_start); - - while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { - seq_blocks += add_blocks; - - /* Adjust tail blocks */ - if (seq_start + seq_blocks - 1 > block_end) - seq_blocks = block_end - seq_start + 1; - - ext_prev = ext_cur; - last_extent = mext_next_extent(orig_inode, holecheck_path, - &ext_cur); - if (last_extent < 0) { - ret1 = last_extent; - break; - } - add_blocks = ext4_ext_get_actual_len(ext_cur); - - /* - * Extend the length of contiguous block (seq_blocks) - * if extents are contiguous. - */ - if (ext4_can_extents_be_merged(orig_inode, - ext_prev, ext_cur) && - block_end >= le32_to_cpu(ext_cur->ee_block) && - !last_extent) - continue; - - /* Is original extent is uninitialized */ - uninit = ext4_ext_is_uninitialized(ext_prev); - - data_offset_in_page = seq_start % blocks_per_page; - - /* - * Calculate data blocks count that should be swapped - * at the first page. - */ - if (data_offset_in_page + seq_blocks > blocks_per_page) { - /* Swapped blocks are across pages */ - block_len_in_page = - blocks_per_page - data_offset_in_page; - } else { - /* Swapped blocks are in a page */ - block_len_in_page = seq_blocks; - } - - orig_page_offset = seq_start >> - (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); - seq_end_page = (seq_start + seq_blocks - 1) >> - (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); - seq_start = le32_to_cpu(ext_cur->ee_block); - rest_blocks = seq_blocks; - - /* - * Up semaphore to avoid following problems: - * a. transaction deadlock among ext4_journal_start, - * ->write_begin via pagefault, and jbd2_journal_commit - * b. racing with ->readpage, ->write_begin, and ext4_get_block - * in move_extent_per_page - */ - double_up_write_data_sem(orig_inode, donor_inode); - - while (orig_page_offset <= seq_end_page) { - - /* Swap original branches with new branches */ - block_len_in_page = move_extent_per_page( - o_filp, donor_inode, - orig_page_offset, - data_offset_in_page, - block_len_in_page, uninit, - &ret1); - - /* Count how many blocks we have exchanged */ - *moved_len += block_len_in_page; - if (ret1 < 0) - break; - if (*moved_len > len) { - EXT4_ERROR_INODE(orig_inode, - "We replaced blocks too much! 
" - "sum of replaced: %llu requested: %llu", - *moved_len, len); - ret1 = -EIO; - break; - } - - orig_page_offset++; - data_offset_in_page = 0; - rest_blocks -= block_len_in_page; - if (rest_blocks > blocks_per_page) - block_len_in_page = blocks_per_page; - else - block_len_in_page = rest_blocks; - } - - double_down_write_data_sem(orig_inode, donor_inode); - if (ret1 < 0) - break; - - /* Decrease buffer counter */ - if (holecheck_path) - ext4_ext_drop_refs(holecheck_path); - ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); - if (ret1) - break; - depth = holecheck_path->p_depth; - - /* Decrease buffer counter */ - if (orig_path) - ext4_ext_drop_refs(orig_path); - ret1 = get_ext_path(orig_inode, seq_start, &orig_path); - if (ret1) - break; - - ext_cur = holecheck_path[depth].p_ext; - add_blocks = ext4_ext_get_actual_len(ext_cur); - seq_blocks = 0; - - } -out: - if (*moved_len) { - ext4_discard_preallocations(orig_inode); - ext4_discard_preallocations(donor_inode); - } - - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - if (holecheck_path) { - ext4_ext_drop_refs(holecheck_path); - kfree(holecheck_path); - } - double_up_write_data_sem(orig_inode, donor_inode); - ret2 = mext_inode_double_unlock(orig_inode, donor_inode); - - if (ret1) - return ret1; - else if (ret2) - return ret2; - - return 0; -} diff --git a/ANDROID_3.4.5/fs/ext4/namei.c b/ANDROID_3.4.5/fs/ext4/namei.c deleted file mode 100644 index 0a94cbbe..00000000 --- a/ANDROID_3.4.5/fs/ext4/namei.c +++ /dev/null @@ -1,2607 +0,0 @@ -/* - * linux/fs/ext4/namei.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/namei.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * Directory entry file type support and forward compatibility hooks - * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 - * Hash Tree Directory indexing (c) - * Daniel Phillips, 2001 - * Hash Tree Directory indexing porting - * Christopher Li, 2002 - * Hash Tree Directory indexing cleanup - * Theodore Ts'o, 2002 - */ - -#include <linux/fs.h> -#include <linux/pagemap.h> -#include <linux/jbd2.h> -#include <linux/time.h> -#include <linux/fcntl.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/quotaops.h> -#include <linux/buffer_head.h> -#include <linux/bio.h> -#include "ext4.h" -#include "ext4_jbd2.h" - -#include "xattr.h" -#include "acl.h" - -#include <trace/events/ext4.h> -/* - * define how far ahead to read directories while searching them. 
- */ -#define NAMEI_RA_CHUNKS 2 -#define NAMEI_RA_BLOCKS 4 -#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) - -static struct buffer_head *ext4_append(handle_t *handle, - struct inode *inode, - ext4_lblk_t *block, int *err) -{ - struct buffer_head *bh; - - *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - - bh = ext4_bread(handle, inode, *block, 1, err); - if (bh) { - inode->i_size += inode->i_sb->s_blocksize; - EXT4_I(inode)->i_disksize = inode->i_size; - *err = ext4_journal_get_write_access(handle, bh); - if (*err) { - brelse(bh); - bh = NULL; - } - } - return bh; -} - -#ifndef assert -#define assert(test) J_ASSERT(test) -#endif - -#ifdef DX_DEBUG -#define dxtrace(command) command -#else -#define dxtrace(command) -#endif - -struct fake_dirent -{ - __le32 inode; - __le16 rec_len; - u8 name_len; - u8 file_type; -}; - -struct dx_countlimit -{ - __le16 limit; - __le16 count; -}; - -struct dx_entry -{ - __le32 hash; - __le32 block; -}; - -/* - * dx_root_info is laid out so that if it should somehow get overlaid by a - * dirent the two low bits of the hash version will be zero. Therefore, the - * hash version mod 4 should never be 0. Sincerely, the paranoia department. - */ - -struct dx_root -{ - struct fake_dirent dot; - char dot_name[4]; - struct fake_dirent dotdot; - char dotdot_name[4]; - struct dx_root_info - { - __le32 reserved_zero; - u8 hash_version; - u8 info_length; /* 8 */ - u8 indirect_levels; - u8 unused_flags; - } - info; - struct dx_entry entries[0]; -}; - -struct dx_node -{ - struct fake_dirent fake; - struct dx_entry entries[0]; -}; - - -struct dx_frame -{ - struct buffer_head *bh; - struct dx_entry *entries; - struct dx_entry *at; -}; - -struct dx_map_entry -{ - u32 hash; - u16 offs; - u16 size; -}; - -static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); -static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); -static inline unsigned dx_get_hash(struct dx_entry *entry); -static void dx_set_hash(struct dx_entry *entry, unsigned value); -static unsigned dx_get_count(struct dx_entry *entries); -static unsigned dx_get_limit(struct dx_entry *entries); -static void dx_set_count(struct dx_entry *entries, unsigned value); -static void dx_set_limit(struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit(struct inode *dir, unsigned infosize); -static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(const struct qstr *d_name, - struct inode *dir, - struct dx_hash_info *hinfo, - struct dx_frame *frame, - int *err); -static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, struct dx_map_entry map[]); -static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, - struct dx_map_entry *offsets, int count, unsigned blocksize); -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); -static void dx_insert_block(struct dx_frame *frame, - u32 hash, ext4_lblk_t block); -static int ext4_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - __u32 *start_hash); -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *err); -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); - -/* - * p 
is at least 6 bytes before the end of page - */ -static inline struct ext4_dir_entry_2 * -ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) -{ - return (struct ext4_dir_entry_2 *)((char *)p + - ext4_rec_len_from_disk(p->rec_len, blocksize)); -} - -/* - * Future: use high four bits of block for coalesce-on-delete flags - * Mask them off for now. - */ - -static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) -{ - return le32_to_cpu(entry->block) & 0x00ffffff; -} - -static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) -{ - entry->block = cpu_to_le32(value); -} - -static inline unsigned dx_get_hash(struct dx_entry *entry) -{ - return le32_to_cpu(entry->hash); -} - -static inline void dx_set_hash(struct dx_entry *entry, unsigned value) -{ - entry->hash = cpu_to_le32(value); -} - -static inline unsigned dx_get_count(struct dx_entry *entries) -{ - return le16_to_cpu(((struct dx_countlimit *) entries)->count); -} - -static inline unsigned dx_get_limit(struct dx_entry *entries) -{ - return le16_to_cpu(((struct dx_countlimit *) entries)->limit); -} - -static inline void dx_set_count(struct dx_entry *entries, unsigned value) -{ - ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); -} - -static inline void dx_set_limit(struct dx_entry *entries, unsigned value) -{ - ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); -} - -static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) -{ - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - - EXT4_DIR_REC_LEN(2) - infosize; - return entry_space / sizeof(struct dx_entry); -} - -static inline unsigned dx_node_limit(struct inode *dir) -{ - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); - return entry_space / sizeof(struct dx_entry); -} - -/* - * Debug - */ -#ifdef DX_DEBUG -static void dx_show_index(char * label, struct dx_entry *entries) -{ - int i, n = dx_get_count (entries); - printk(KERN_DEBUG "%s index ", label); - for (i = 0; i < n; i++) { - printk("%x->%lu ", i ? dx_get_hash(entries + i) : - 0, (unsigned long)dx_get_block(entries + i)); - } - printk("\n"); -} - -struct stats -{ - unsigned names; - unsigned space; - unsigned bcount; -}; - -static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, - int size, int show_names) -{ - unsigned names = 0, space = 0; - char *base = (char *) de; - struct dx_hash_info h = *hinfo; - - printk("names: "); - while ((char *) de < base + size) - { - if (de->inode) - { - if (show_names) - { - int len = de->name_len; - char *name = de->name; - while (len--) printk("%c", *name++); - ext4fs_dirhash(de->name, de->name_len, &h); - printk(":%x.%u ", h.hash, - (unsigned) ((char *) de - base)); - } - space += EXT4_DIR_REC_LEN(de->name_len); - names++; - } - de = ext4_next_entry(de, size); - } - printk("(%i)\n", names); - return (struct stats) { names, space, 1 }; -} - -struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, - struct dx_entry *entries, int levels) -{ - unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count = dx_get_count(entries), names = 0, space = 0, i; - unsigned bcount = 0; - struct buffer_head *bh; - int err; - printk("%i indexed blocks...\n", count); - for (i = 0; i < count; i++, entries++) - { - ext4_lblk_t block = dx_get_block(entries); - ext4_lblk_t hash = i ? dx_get_hash(entries): 0; - u32 range = i < count - 1? 
(dx_get_hash(entries + 1) - hash): ~hash; - struct stats stats; - printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); - if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; - stats = levels? - dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): - dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); - names += stats.names; - space += stats.space; - bcount += stats.bcount; - brelse(bh); - } - if (bcount) - printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", - levels ? "" : " ", names, space/bcount, - (space/bcount)*100/blocksize); - return (struct stats) { names, space, bcount}; -} -#endif /* DX_DEBUG */ - -/* - * Probe for a directory leaf block to search. - * - * dx_probe can return ERR_BAD_DX_DIR, which means there was a format - * error in the directory index, and the caller should fall back to - * searching the directory normally. The callers of dx_probe **MUST** - * check for this error code, and make sure it never gets reflected - * back to userspace. - */ -static struct dx_frame * -dx_probe(const struct qstr *d_name, struct inode *dir, - struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) -{ - unsigned count, indirect; - struct dx_entry *at, *entries, *p, *q, *m; - struct dx_root *root; - struct buffer_head *bh; - struct dx_frame *frame = frame_in; - u32 hash; - - frame->bh = NULL; - if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) - goto fail; - root = (struct dx_root *) bh->b_data; - if (root->info.hash_version != DX_HASH_TEA && - root->info.hash_version != DX_HASH_HALF_MD4 && - root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", - root->info.hash_version); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - hinfo->hash_version = root->info.hash_version; - if (hinfo->hash_version <= DX_HASH_TEA) - hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; - hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; - if (d_name) - ext4fs_dirhash(d_name->name, d_name->len, hinfo); - hash = hinfo->hash; - - if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", - root->info.unused_flags); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - - if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", - root->info.indirect_levels); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - - entries = (struct dx_entry *) (((char *)&root->info) + - root->info.info_length); - - if (dx_get_limit(entries) != dx_root_limit(dir, - root->info.info_length)) { - ext4_warning(dir->i_sb, "dx entry: limit != root limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail; - } - - dxtrace(printk("Look up %x", hash)); - while (1) - { - count = dx_get_count(entries); - if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, - "dx entry: no count or count > limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; - } - - p = entries + 1; - q = entries + count - 1; - while (p <= q) - { - m = p + (q - p)/2; - dxtrace(printk(".")); - if (dx_get_hash(m) > hash) - q = m - 1; - else - p = m + 1; - } - - if (0) // linear search cross check - { - unsigned n = count - 1; - at = entries; - while (n--) - { - dxtrace(printk(",")); - if (dx_get_hash(++at) > hash) - { - at--; - break; - } - } - assert (at == p - 1); - } - - at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); - frame->bh = bh; - frame->entries = entries; - frame->at = at; - if (!indirect--) return frame; - if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) - goto fail2; - at = entries = ((struct dx_node *) bh->b_data)->entries; - if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, - "dx entry: limit != node limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; - } - frame++; - frame->bh = NULL; - } -fail2: - while (frame >= frame_in) { - brelse(frame->bh); - frame--; - } -fail: - if (*err == ERR_BAD_DX_DIR) - ext4_warning(dir->i_sb, - "Corrupt dir inode %lu, running e2fsck is " - "recommended.", dir->i_ino); - return NULL; -} - -static void dx_release (struct dx_frame *frames) -{ - if (frames[0].bh == NULL) - return; - - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) - brelse(frames[1].bh); - brelse(frames[0].bh); -} - -/* - * This function increments the frame pointer to search the next leaf - * block, and reads in the necessary intervening nodes if the search - * should be necessary. Whether or not the search is necessary is - * controlled by the hash parameter. If the hash value is even, then - * the search is only continued if the next block starts with that - * hash value. This is used if we are searching for a specific file. - * - * If the hash value is HASH_NB_ALWAYS, then always go to the next block. - * - * This function returns 1 if the caller should continue to search, - * or 0 if it should not. If there is an error reading one of the - * index blocks, it will a negative error code. - * - * If start_hash is non-null, it will be filled in with the starting - * hash of the next page. - */ -static int ext4_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - __u32 *start_hash) -{ - struct dx_frame *p; - struct buffer_head *bh; - int err, num_frames = 0; - __u32 bhash; - - p = frame; - /* - * Find the next leaf page by incrementing the frame pointer. - * If we run out of entries in the interior node, loop around and - * increment pointer in the parent node. When we break out of - * this loop, num_frames indicates the number of interior - * nodes need to be read. - */ - while (1) { - if (++(p->at) < p->entries + dx_get_count(p->entries)) - break; - if (p == frames) - return 0; - num_frames++; - p--; - } - - /* - * If the hash is 1, then continue only if the next page has a - * continuation hash of any value. This is used for readdir - * handling. Otherwise, check to see if the hash matches the - * desired contiuation hash. If it doesn't, return since - * there's no point to read in the successive index pages. - */ - bhash = dx_get_hash(p->at); - if (start_hash) - *start_hash = bhash; - if ((hash & 1) == 0) { - if ((bhash & ~1) != hash) - return 0; - } - /* - * If the hash is HASH_NB_ALWAYS, we always go to the next - * block so no check is necessary - */ - while (num_frames--) { - if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) - return err; /* Failure */ - p++; - brelse(p->bh); - p->bh = bh; - p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; - } - return 1; -} - - -/* - * This function fills a red-black tree with information from a - * directory block. It returns the number directory entries loaded - * into the tree. If there is an error it is returned in err. 
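ext4_htree_next_block() above relies on a small convention: stored hashes are even, and a set low bit in a leaf's starting hash marks the leaf as a continuation of the previous one. The continue/stop decision therefore reduces to two mask tests; a standalone sketch under that convention (should_walk_next is a hypothetical name, not an in-tree helper):

static int should_walk_next(__u32 wanted, __u32 next_leaf_hash)
{
	if (wanted & 1)		/* HASH_NB_ALWAYS-style readdir scan */
		return 1;	/* unconditionally visit the next leaf */

	/* lookup: continue only if the next leaf can still hold 'wanted',
	 * i.e. it starts at the same hash or at its continuation bit */
	return (next_leaf_hash & ~1) == wanted;
}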
- */ -static int htree_dirblock_to_tree(struct file *dir_file, - struct inode *dir, ext4_lblk_t block, - struct dx_hash_info *hinfo, - __u32 start_hash, __u32 start_minor_hash) -{ - struct buffer_head *bh; - struct ext4_dir_entry_2 *de, *top; - int err, count = 0; - - dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", - (unsigned long)block)); - if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) - return err; - - de = (struct ext4_dir_entry_2 *) bh->b_data; - top = (struct ext4_dir_entry_2 *) ((char *) de + - dir->i_sb->s_blocksize - - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { - if (ext4_check_dir_entry(dir, NULL, de, bh, - (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) - + ((char *)de - bh->b_data))) { - /* On error, skip the f_pos to the next block. */ - dir_file->f_pos = (dir_file->f_pos | - (dir->i_sb->s_blocksize - 1)) + 1; - brelse(bh); - return count; - } - ext4fs_dirhash(de->name, de->name_len, hinfo); - if ((hinfo->hash < start_hash) || - ((hinfo->hash == start_hash) && - (hinfo->minor_hash < start_minor_hash))) - continue; - if (de->inode == 0) - continue; - if ((err = ext4_htree_store_dirent(dir_file, - hinfo->hash, hinfo->minor_hash, de)) != 0) { - brelse(bh); - return err; - } - count++; - } - brelse(bh); - return count; -} - - -/* - * This function fills a red-black tree with information from a - * directory. We start scanning the directory in hash order, starting - * at start_hash and start_minor_hash. - * - * This function returns the number of entries inserted into the tree, - * or a negative error code. - */ -int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, - __u32 start_minor_hash, __u32 *next_hash) -{ - struct dx_hash_info hinfo; - struct ext4_dir_entry_2 *de; - struct dx_frame frames[2], *frame; - struct inode *dir; - ext4_lblk_t block; - int count = 0; - int ret, err; - __u32 hashval; - - dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", - start_hash, start_minor_hash)); - dir = dir_file->f_path.dentry->d_inode; - if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { - hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += - EXT4_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; - count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, - start_hash, start_minor_hash); - *next_hash = ~0; - return count; - } - hinfo.hash = start_hash; - hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir, &hinfo, frames, &err); - if (!frame) - return err; - - /* Add '.' and '..' 
from the htree header */ - if (!start_hash && !start_minor_hash) { - de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; - if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0) - goto errout; - count++; - } - if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { - de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; - de = ext4_next_entry(de, dir->i_sb->s_blocksize); - if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) - goto errout; - count++; - } - - while (1) { - block = dx_get_block(frame->at); - ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, - start_hash, start_minor_hash); - if (ret < 0) { - err = ret; - goto errout; - } - count += ret; - hashval = ~0; - ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, - frame, frames, &hashval); - *next_hash = hashval; - if (ret < 0) { - err = ret; - goto errout; - } - /* - * Stop if: (a) there are no more entries, or - * (b) we have inserted at least one entry and the - * next hash value is not a continuation - */ - if ((ret == 0) || - (count && ((hashval & 1) == 0))) - break; - } - dx_release(frames); - dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " - "next hash: %x\n", count, *next_hash)); - return count; -errout: - dx_release(frames); - return (err); -} - - -/* - * Directory block splitting, compacting - */ - -/* - * Create map of hash values, offsets, and sizes, stored at end of block. - * Returns number of entries mapped. - */ -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, - struct dx_map_entry *map_tail) -{ - int count = 0; - char *base = (char *) de; - struct dx_hash_info h = *hinfo; - - while ((char *) de < base + blocksize) { - if (de->name_len && de->inode) { - ext4fs_dirhash(de->name, de->name_len, &h); - map_tail--; - map_tail->hash = h.hash; - map_tail->offs = ((char *) de - base)>>2; - map_tail->size = le16_to_cpu(de->rec_len); - count++; - cond_resched(); - } - /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = ext4_next_entry(de, blocksize); - } - return count; -} - -/* Sort map by hash value */ -static void dx_sort_map (struct dx_map_entry *map, unsigned count) -{ - struct dx_map_entry *p, *q, *top = map + count - 1; - int more; - /* Combsort until bubble sort doesn't suck */ - while (count > 2) { - count = count*10/13; - if (count - 9 < 2) /* 9, 10 -> 11 */ - count = 11; - for (p = top, q = p - count; q >= map; p--, q--) - if (p->hash < q->hash) - swap(*p, *q); - } - /* Garden variety bubble sort */ - do { - more = 0; - q = top; - while (q-- > map) { - if (q[1].hash >= q[0].hash) - continue; - swap(*(q+1), *q); - more = 1; - } - } while(more); -} - -static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) -{ - struct dx_entry *entries = frame->entries; - struct dx_entry *old = frame->at, *new = old + 1; - int count = dx_get_count(entries); - - assert(count < dx_get_limit(entries)); - assert(old < entries + count); - memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); - dx_set_hash(new, hash); - dx_set_block(new, block); - dx_set_count(entries, count + 1); -} - -static void ext4_update_dx_flag(struct inode *inode) -{ - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) - ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); -} - -/* - * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. - * - * `len <= EXT4_NAME_LEN' is guaranteed by caller. - * `de != NULL' is guaranteed by caller. 
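dx_sort_map() above is a comb sort with the usual 10/13 gap shrink and the classic "9 or 10 becomes 11" fix-up, finished by a bubble pass once the gap collapses; comb sort kills the far-apart inversions cheaply, and the bubble pass mops up. The same schedule in isolation, as a self-contained sketch over bare hash values (not the in-tree routine; swap() is the linux/kernel.h macro):

static void comb_sort_hashes(u32 *a, unsigned int count)
{
	unsigned int gap = count, i;
	int swapped = 1;

	while (gap > 1 || swapped) {
		gap = gap * 10 / 13;		/* shrink by ~1.3 */
		if (gap == 9 || gap == 10)	/* known-bad gap values */
			gap = 11;
		if (gap < 1)
			gap = 1;
		swapped = 0;
		for (i = 0; i + gap < count; i++)
			if (a[i] > a[i + gap]) {
				swap(a[i], a[i + gap]);
				swapped = 1;
			}
	}
}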
- */ -static inline int ext4_match (int len, const char * const name, - struct ext4_dir_entry_2 * de) -{ - if (len != de->name_len) - return 0; - if (!de->inode) - return 0; - return !memcmp(name, de->name, len); -} - -/* - * Returns 0 if not found, -1 on failure, and 1 on success - */ -static inline int search_dirblock(struct buffer_head *bh, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 ** res_dir) -{ - struct ext4_dir_entry_2 * de; - char * dlimit; - int de_len; - const char *name = d_name->name; - int namelen = d_name->len; - - de = (struct ext4_dir_entry_2 *) bh->b_data; - dlimit = bh->b_data + dir->i_sb->s_blocksize; - while ((char *) de < dlimit) { - /* this code is executed quadratically often */ - /* do minimal checking `by hand' */ - - if ((char *) de + namelen <= dlimit && - ext4_match (namelen, name, de)) { - /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) - return -1; - *res_dir = de; - return 1; - } - /* prevent looping on a bad block */ - de_len = ext4_rec_len_from_disk(de->rec_len, - dir->i_sb->s_blocksize); - if (de_len <= 0) - return -1; - offset += de_len; - de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); - } - return 0; -} - - -/* - * ext4_find_entry() - * - * finds an entry in the specified directory with the wanted name. It - * returns the cache buffer in which the entry was found, and the entry - * itself (as a parameter - res_dir). It does NOT read the inode of the - * entry - you'll have to do that yourself if you want to. - * - * The returned buffer_head has ->b_count elevated. The caller is expected - * to brelse() it when appropriate. - */ -static struct buffer_head * ext4_find_entry (struct inode *dir, - const struct qstr *d_name, - struct ext4_dir_entry_2 ** res_dir) -{ - struct super_block *sb; - struct buffer_head *bh_use[NAMEI_RA_SIZE]; - struct buffer_head *bh, *ret = NULL; - ext4_lblk_t start, block, b; - const u8 *name = d_name->name; - int ra_max = 0; /* Number of bh's in the readahead - buffer, bh_use[] */ - int ra_ptr = 0; /* Current index into readahead - buffer */ - int num = 0; - ext4_lblk_t nblocks; - int i, err; - int namelen; - - *res_dir = NULL; - sb = dir->i_sb; - namelen = d_name->len; - if (namelen > EXT4_NAME_LEN) - return NULL; - if ((namelen <= 2) && (name[0] == '.') && - (name[1] == '.' || name[1] == '\0')) { - /* - * "." or ".." will only be in the first block - * NFS may look up ".."; "." should be handled by the VFS - */ - block = start = 0; - nblocks = 1; - goto restart; - } - if (is_dx(dir)) { - bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); - /* - * On success, or if the error was file not found, - * return. Otherwise, fall back to doing a search the - * old fashioned way. - */ - if (bh || (err != ERR_BAD_DX_DIR)) - return bh; - dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " - "falling back\n")); - } - nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); - start = EXT4_I(dir)->i_dir_start_lookup; - if (start >= nblocks) - start = 0; - block = start; -restart: - do { - /* - * We deal with the read-ahead logic here. - */ - if (ra_ptr >= ra_max) { - /* Refill the readahead buffer */ - ra_ptr = 0; - b = block; - for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { - /* - * Terminate if we reach the end of the - * directory and must wrap, or if our - * search has finished at this block. 
- */ - if (b >= nblocks || (num && block == start)) { - bh_use[ra_max] = NULL; - break; - } - num++; - bh = ext4_getblk(NULL, dir, b++, 0, &err); - bh_use[ra_max] = bh; - if (bh) - ll_rw_block(READ | REQ_META | REQ_PRIO, - 1, &bh); - } - } - if ((bh = bh_use[ra_ptr++]) == NULL) - goto next; - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ - EXT4_ERROR_INODE(dir, "reading directory lblock %lu", - (unsigned long) block); - brelse(bh); - goto next; - } - i = search_dirblock(bh, dir, d_name, - block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); - if (i == 1) { - EXT4_I(dir)->i_dir_start_lookup = block; - ret = bh; - goto cleanup_and_exit; - } else { - brelse(bh); - if (i < 0) - goto cleanup_and_exit; - } - next: - if (++block >= nblocks) - block = 0; - } while (block != start); - - /* - * If the directory has grown while we were searching, then - * search the last part of the directory before giving up. - */ - block = nblocks; - nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); - if (block < nblocks) { - start = 0; - goto restart; - } - -cleanup_and_exit: - /* Clean up the read-ahead blocks */ - for (; ra_ptr < ra_max; ra_ptr++) - brelse(bh_use[ra_ptr]); - return ret; -} - -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, int *err) -{ - struct super_block * sb = dir->i_sb; - struct dx_hash_info hinfo; - struct dx_frame frames[2], *frame; - struct buffer_head *bh; - ext4_lblk_t block; - int retval; - - if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) - return NULL; - do { - block = dx_get_block(frame->at); - if (!(bh = ext4_bread(NULL, dir, block, 0, err))) - goto errout; - - retval = search_dirblock(bh, dir, d_name, - block << EXT4_BLOCK_SIZE_BITS(sb), - res_dir); - if (retval == 1) { /* Success! 
*/ - dx_release(frames); - return bh; - } - brelse(bh); - if (retval == -1) { - *err = ERR_BAD_DX_DIR; - goto errout; - } - - /* Check to see if we should continue to search */ - retval = ext4_htree_next_block(dir, hinfo.hash, frame, - frames, NULL); - if (retval < 0) { - ext4_warning(sb, - "error reading index page in directory #%lu", - dir->i_ino); - *err = retval; - goto errout; - } - } while (retval == 1); - - *err = -ENOENT; -errout: - dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); - dx_release (frames); - return NULL; -} - -static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - struct inode *inode; - struct ext4_dir_entry_2 *de; - struct buffer_head *bh; - - if (dentry->d_name.len > EXT4_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - - bh = ext4_find_entry(dir, &dentry->d_name, &de); - inode = NULL; - if (bh) { - __u32 ino = le32_to_cpu(de->inode); - brelse(bh); - if (!ext4_valid_inum(dir->i_sb, ino)) { - EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); - return ERR_PTR(-EIO); - } - if (unlikely(ino == dir->i_ino)) { - EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir", - dentry->d_name.len, - dentry->d_name.name); - return ERR_PTR(-EIO); - } - inode = ext4_iget(dir->i_sb, ino); - if (inode == ERR_PTR(-ESTALE)) { - EXT4_ERROR_INODE(dir, - "deleted inode referenced: %u", - ino); - return ERR_PTR(-EIO); - } - } - return d_splice_alias(inode, dentry); -} - - -struct dentry *ext4_get_parent(struct dentry *child) -{ - __u32 ino; - static const struct qstr dotdot = { - .name = "..", - .len = 2, - }; - struct ext4_dir_entry_2 * de; - struct buffer_head *bh; - - bh = ext4_find_entry(child->d_inode, &dotdot, &de); - if (!bh) - return ERR_PTR(-ENOENT); - ino = le32_to_cpu(de->inode); - brelse(bh); - - if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { - EXT4_ERROR_INODE(child->d_inode, - "bad parent inode number: %u", ino); - return ERR_PTR(-EIO); - } - - return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); -} - -#define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, -}; - -static inline void ext4_set_de_type(struct super_block *sb, - struct ext4_dir_entry_2 *de, - umode_t mode) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; -} - -/* - * Move count entries from end of map between two memory locations. - * Returns pointer to last entry moved. - */ -static struct ext4_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, - unsigned blocksize) -{ - unsigned rec_len = 0; - - while (count--) { - struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) - (from + (map->offs<<2)); - rec_len = EXT4_DIR_REC_LEN(de->name_len); - memcpy (to, de, rec_len); - ((struct ext4_dir_entry_2 *) to)->rec_len = - ext4_rec_len_to_disk(rec_len, blocksize); - de->inode = 0; - map++; - to += rec_len; - } - return (struct ext4_dir_entry_2 *) (to - rec_len); -} - -/* - * Compact each dir entry in the range to the minimal rec_len. - * Returns pointer to last entry in range. 
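Both dx_move_dirents() above and dx_pack_dirents() below lean on EXT4_DIR_REC_LEN(): the minimal on-disk footprint of an entry is its 8-byte header plus the name, rounded up to a 4-byte boundary. Worked out:

/*
 * EXT4_DIR_REC_LEN(len) = (8 + len + 3) & ~3
 *   "a"          (len 1)  -> (8 + 1 + 3)  & ~3 = 12 bytes
 *   "lost+found" (len 10) -> (8 + 10 + 3) & ~3 = 20 bytes
 *
 * Packing rewrites each live entry's rec_len down to this minimum,
 * reclaiming the slack left behind by deleted neighbours; only the
 * final entry keeps a rec_len stretching to the end of the block.
 */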
- */ -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) -{ - struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; - unsigned rec_len = 0; - - prev = to = de; - while ((char*)de < base + blocksize) { - next = ext4_next_entry(de, blocksize); - if (de->inode && de->name_len) { - rec_len = EXT4_DIR_REC_LEN(de->name_len); - if (de > to) - memmove(to, de, rec_len); - to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); - prev = to; - to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); - } - de = next; - } - return prev; -} - -/* - * Split a full leaf block to make room for a new dir entry. - * Allocate a new block, and move entries so that they are approx. equally full. - * Returns pointer to de in block into which the new entry will be inserted. - */ -static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, - struct buffer_head **bh,struct dx_frame *frame, - struct dx_hash_info *hinfo, int *error) -{ - unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count, continued; - struct buffer_head *bh2; - ext4_lblk_t newblock; - u32 hash2; - struct dx_map_entry *map; - char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size; - struct ext4_dir_entry_2 *de = NULL, *de2; - int err = 0, i; - - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) { - brelse(*bh); - *bh = NULL; - goto errout; - } - - BUFFER_TRACE(*bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, *bh); - if (err) - goto journal_error; - - BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, frame->bh); - if (err) - goto journal_error; - - data2 = bh2->b_data; - - /* create map in the end of data2 block */ - map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map((struct ext4_dir_entry_2 *) data1, - blocksize, hinfo, map); - map -= count; - dx_sort_map(map, count); - /* Split the existing block in the middle, size-wise */ - size = 0; - move = 0; - for (i = count-1; i >= 0; i--) { - /* is more than half of this entry in 2nd half of the block? */ - if (size + map[i].size/2 > blocksize/2) - break; - size += map[i].size; - move++; - } - /* map index at which we will split */ - split = count - move; - hash2 = map[split].hash; - continued = hash2 == map[split - 1].hash; - dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", - (unsigned long)dx_get_block(frame->at), - hash2, split, count-split)); - - /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); - de = dx_pack_dirents(data1, blocksize); - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, - blocksize); - de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, - blocksize); - dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); - dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); - - /* Which block gets the new entry? 
*/ - if (hinfo->hash >= hash2) - { - swap(*bh, bh2); - de = de2; - } - dx_insert_block(frame, hash2 + continued, newblock); - err = ext4_handle_dirty_metadata(handle, dir, bh2); - if (err) - goto journal_error; - err = ext4_handle_dirty_metadata(handle, dir, frame->bh); - if (err) - goto journal_error; - brelse(bh2); - dxtrace(dx_show_index("frame", frame->entries)); - return de; - -journal_error: - brelse(*bh); - brelse(bh2); - *bh = NULL; - ext4_std_error(dir->i_sb, err); -errout: - *error = err; - return NULL; -} - -/* - * Add a new entry into a directory (leaf) block. If de is non-NULL, - * it points to a directory entry which is guaranteed to be large - * enough for new directory entry. If de is NULL, then - * add_dirent_to_buf will attempt search the directory block for - * space. It will return -ENOSPC if no space is available, and -EIO - * and -EEXIST if directory entry already exists. - */ -static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, - struct inode *inode, struct ext4_dir_entry_2 *de, - struct buffer_head *bh) -{ - struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - unsigned int offset = 0; - unsigned int blocksize = dir->i_sb->s_blocksize; - unsigned short reclen; - int nlen, rlen, err; - char *top; - - reclen = EXT4_DIR_REC_LEN(namelen); - if (!de) { - de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + blocksize - reclen; - while ((char *) de <= top) { - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if ((de->inode? rlen - nlen: rlen) >= reclen) - break; - de = (struct ext4_dir_entry_2 *)((char *)de + rlen); - offset += rlen; - } - if ((char *) de > top) - return -ENOSPC; - } - BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) { - ext4_std_error(dir->i_sb, err); - return err; - } - - /* By now the buffer is marked for journaling */ - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if (de->inode) { - struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); - de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); - de = de1; - } - de->file_type = EXT4_FT_UNKNOWN; - if (inode) { - de->inode = cpu_to_le32(inode->i_ino); - ext4_set_de_type(dir->i_sb, de, inode->i_mode); - } else - de->inode = 0; - de->name_len = namelen; - memcpy(de->name, name, namelen); - /* - * XXX shouldn't update any times until successful - * completion of syscall, but too many callers depend - * on this. - * - * XXX similarly, too many callers depend on - * ext4_new_inode() setting the times, but error - * recovery deletes the inode, so the worst that can - * happen is that the times are slightly out of date - * and/or different from the directory change time. - */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); - ext4_update_dx_flag(dir); - dir->i_version++; - ext4_mark_inode_dirty(handle, dir); - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); - if (err) - ext4_std_error(dir->i_sb, err); - return 0; -} - -/* - * This converts a one block unindexed directory to a 3 block indexed - * directory, and adds the dentry to the indexed directory. 
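The scan in add_dirent_to_buf() above accepts the first entry whose slack can host the new dirent: a live entry donates rlen - nlen, while a deleted one (inode == 0) donates its whole rlen. A worked example with the 4-byte-aligned sizes from EXT4_DIR_REC_LEN(), for a hypothetical new name "foo":

/*
 * reclen("foo") = (8 + 3 + 3) & ~3 = 12
 *
 * live entry, name_len 4 (nlen = 12), rec_len 28:
 *     usable = rlen - nlen = 28 - 12 = 16 >= 12   -> split its slack
 * deleted entry, rec_len 12:
 *     usable = rlen = 12 >= 12                    -> reuse it whole
 */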
- */ -static int make_indexed_dir(handle_t *handle, struct dentry *dentry, - struct inode *inode, struct buffer_head *bh) -{ - struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct buffer_head *bh2; - struct dx_root *root; - struct dx_frame frames[2], *frame; - struct dx_entry *entries; - struct ext4_dir_entry_2 *de, *de2; - char *data1, *top; - unsigned len; - int retval; - unsigned blocksize; - struct dx_hash_info hinfo; - ext4_lblk_t block; - struct fake_dirent *fde; - - blocksize = dir->i_sb->s_blocksize; - dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); - retval = ext4_journal_get_write_access(handle, bh); - if (retval) { - ext4_std_error(dir->i_sb, retval); - brelse(bh); - return retval; - } - root = (struct dx_root *) bh->b_data; - - /* The 0th block becomes the root, move the dirents out */ - fde = &root->dotdot; - de = (struct ext4_dir_entry_2 *)((char *)fde + - ext4_rec_len_from_disk(fde->rec_len, blocksize)); - if ((char *) de >= (((char *) root) + blocksize)) { - EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); - brelse(bh); - return -EIO; - } - len = ((char *) root) + blocksize - (char *) de; - - /* Allocate new block for the 0th block's dirents */ - bh2 = ext4_append(handle, dir, &block, &retval); - if (!(bh2)) { - brelse(bh); - return retval; - } - ext4_set_inode_flag(dir, EXT4_INODE_INDEX); - data1 = bh2->b_data; - - memcpy (data1, de, len); - de = (struct ext4_dir_entry_2 *) data1; - top = data1 + len; - while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) - de = de2; - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, - blocksize); - /* Initialize the root; the dot dirents already exist */ - de = (struct ext4_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), - blocksize); - memset (&root->info, 0, sizeof(root->info)); - root->info.info_length = sizeof(root->info); - root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; - entries = root->entries; - dx_set_block(entries, 1); - dx_set_count(entries, 1); - dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); - - /* Initialize as for dx_probe */ - hinfo.hash_version = root->info.hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; - ext4fs_dirhash(name, namelen, &hinfo); - frame = frames; - frame->entries = entries; - frame->at = entries; - frame->bh = bh; - bh = bh2; - - ext4_handle_dirty_metadata(handle, dir, frame->bh); - ext4_handle_dirty_metadata(handle, dir, bh); - - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - if (!de) { - /* - * Even if the block split failed, we have to properly write - * out all the changes we did so far. Otherwise we can end up - * with corrupted filesystem. - */ - ext4_mark_inode_dirty(handle, dir); - dx_release(frames); - return retval; - } - dx_release(frames); - - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); - brelse(bh); - return retval; -} - -/* - * ext4_add_entry() - * - * adds a file entry to the specified directory, using the same - * semantics as ext4_find_entry(). It returns NULL if it failed. - * - * NOTE!! The inode part of 'de' is left at 0 - which means you - * may not sleep between calling this and putting something into - * the entry, as someone else might have used it while you slept. 
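After make_indexed_dir() above succeeds, the directory has the three-block shape that dx_probe() expects; roughly, assuming a single split at hash value H and 4 KiB blocks (about 508 root entries, per dx_root_limit() above):

/*
 * block 0 (dx_root): "." ".."  dx_root_info  dx_entry[] (limit ~508)
 * block 1 (leaf):    dirents hashing below H
 * block 2 (leaf):    dirents hashing at or above H
 */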
- */ -static int ext4_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) -{ - struct inode *dir = dentry->d_parent->d_inode; - struct buffer_head *bh; - struct ext4_dir_entry_2 *de; - struct super_block *sb; - int retval; - int dx_fallback=0; - unsigned blocksize; - ext4_lblk_t block, blocks; - - sb = dir->i_sb; - blocksize = sb->s_blocksize; - if (!dentry->d_name.len) - return -EINVAL; - if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, dentry, inode); - if (!retval || (retval != ERR_BAD_DX_DIR)) - return retval; - ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); - dx_fallback++; - ext4_mark_inode_dirty(handle, dir); - } - blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0; block < blocks; block++) { - bh = ext4_bread(handle, dir, block, 0, &retval); - if(!bh) - return retval; - retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) { - brelse(bh); - return retval; - } - - if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) - return make_indexed_dir(handle, dentry, inode, bh); - brelse(bh); - } - bh = ext4_append(handle, dir, &block, &retval); - if (!bh) - return retval; - de = (struct ext4_dir_entry_2 *) bh->b_data; - de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); - brelse(bh); - if (retval == 0) - ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); - return retval; -} - -/* - * Returns 0 for success, or a negative error value - */ -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) -{ - struct dx_frame frames[2], *frame; - struct dx_entry *entries, *at; - struct dx_hash_info hinfo; - struct buffer_head *bh; - struct inode *dir = dentry->d_parent->d_inode; - struct super_block *sb = dir->i_sb; - struct ext4_dir_entry_2 *de; - int err; - - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); - if (!frame) - return err; - entries = frame->entries; - at = frame->at; - - if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) - goto cleanup; - - BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto journal_error; - - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (err != -ENOSPC) - goto cleanup; - - /* Block full, should compress but for now just split */ - dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", - dx_get_count(entries), dx_get_limit(entries))); - /* Need to split index? 
*/ - if (dx_get_count(entries) == dx_get_limit(entries)) { - ext4_lblk_t newblock; - unsigned icount = dx_get_count(entries); - int levels = frame - frames; - struct dx_entry *entries2; - struct dx_node *node2; - struct buffer_head *bh2; - - if (levels && (dx_get_count(frames->entries) == - dx_get_limit(frames->entries))) { - ext4_warning(sb, "Directory index full!"); - err = -ENOSPC; - goto cleanup; - } - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) - goto cleanup; - node2 = (struct dx_node *)(bh2->b_data); - entries2 = node2->entries; - memset(&node2->fake, 0, sizeof(struct fake_dirent)); - node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, - sb->s_blocksize); - BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, frame->bh); - if (err) - goto journal_error; - if (levels) { - unsigned icount1 = icount/2, icount2 = icount - icount1; - unsigned hash2 = dx_get_hash(entries + icount1); - dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", - icount1, icount2)); - - BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ - err = ext4_journal_get_write_access(handle, - frames[0].bh); - if (err) - goto journal_error; - - memcpy((char *) entries2, (char *) (entries + icount1), - icount2 * sizeof(struct dx_entry)); - dx_set_count(entries, icount1); - dx_set_count(entries2, icount2); - dx_set_limit(entries2, dx_node_limit(dir)); - - /* Which index block gets the new entry? */ - if (at - entries >= icount1) { - frame->at = at = at - entries - icount1 + entries2; - frame->entries = entries = entries2; - swap(frame->bh, bh2); - } - dx_insert_block(frames + 0, hash2, newblock); - dxtrace(dx_show_index("node", frames[1].entries)); - dxtrace(dx_show_index("node", - ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, dir, bh2); - if (err) - goto journal_error; - brelse (bh2); - } else { - dxtrace(printk(KERN_DEBUG - "Creating second level index...\n")); - memcpy((char *) entries2, (char *) entries, - icount * sizeof(struct dx_entry)); - dx_set_limit(entries2, dx_node_limit(dir)); - - /* Set up root */ - dx_set_count(entries, 1); - dx_set_block(entries + 0, newblock); - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; - - /* Add new access path frame */ - frame = frames + 1; - frame->at = at = at - entries + entries2; - frame->entries = entries = entries2; - frame->bh = bh2; - err = ext4_journal_get_write_access(handle, - frame->bh); - if (err) - goto journal_error; - } - err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); - if (err) { - ext4_std_error(inode->i_sb, err); - goto cleanup; - } - } - de = do_split(handle, dir, &bh, frame, &hinfo, &err); - if (!de) - goto cleanup; - err = add_dirent_to_buf(handle, dentry, inode, de, bh); - goto cleanup; - -journal_error: - ext4_std_error(dir->i_sb, err); -cleanup: - if (bh) - brelse(bh); - dx_release(frames); - return err; -} - -/* - * ext4_delete_entry deletes a directory entry by merging it with the - * previous entry - */ -static int ext4_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh) -{ - struct ext4_dir_entry_2 *de, *pde; - unsigned int blocksize = dir->i_sb->s_blocksize; - int i, err; - - i = 0; - pde = NULL; - de = (struct ext4_dir_entry_2 *) bh->b_data; - while (i < bh->b_size) { - if (ext4_check_dir_entry(dir, NULL, de, bh, i)) - return -EIO; - if (de == de_del) { - BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); - 
if (unlikely(err)) { - ext4_std_error(dir->i_sb, err); - return err; - } - if (pde) - pde->rec_len = ext4_rec_len_to_disk( - ext4_rec_len_from_disk(pde->rec_len, - blocksize) + - ext4_rec_len_from_disk(de->rec_len, - blocksize), - blocksize); - else - de->inode = 0; - dir->i_version++; - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); - if (unlikely(err)) { - ext4_std_error(dir->i_sb, err); - return err; - } - return 0; - } - i += ext4_rec_len_from_disk(de->rec_len, blocksize); - pde = de; - de = ext4_next_entry(de, blocksize); - } - return -ENOENT; -} - -/* - * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, - * since this indicates that nlinks count was previously 1. - */ -static void ext4_inc_count(handle_t *handle, struct inode *inode) -{ - inc_nlink(inode); - if (is_dx(inode) && inode->i_nlink > 1) { - /* limit is 16-bit i_links_count */ - if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { - set_nlink(inode, 1); - EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_DIR_NLINK); - } - } -} - -/* - * If a directory had nlink == 1, then we should let it be 1. This indicates - * directory has >EXT4_LINK_MAX subdirs. - */ -static void ext4_dec_count(handle_t *handle, struct inode *inode) -{ - if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) - drop_nlink(inode); -} - - -static int ext4_add_nondir(handle_t *handle, - struct dentry *dentry, struct inode *inode) -{ - int err = ext4_add_entry(handle, dentry, inode); - if (!err) { - ext4_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); - unlock_new_inode(inode); - return 0; - } - drop_nlink(inode); - unlock_new_inode(inode); - iput(inode); - return err; -} - -/* - * By the time this is called, we already have created - * the directory cache entry for the new file, but it - * is so far negative - it has no inode. - * - * If the create succeeds, we fill in the inode information - * with d_instantiate(). 
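/*
 * ext4_delete_entry (above) never compacts a block. Reusing the toy
 * record layout from the walker sketch earlier, its two cases collapse
 * to a few lines: fold the victim's rec_len into its predecessor, or,
 * for the first record in the block, clear the inode field so the
 * slot reads as unused.
 */
static void toy_delete_entry(struct toy_dirent *prev, struct toy_dirent *victim)
{
    if (prev)
        prev->rec_len += victim->rec_len;  /* absorb the dead record */
    else
        victim->inode = 0;                 /* head of block: mark unused */
}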
- */ -static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) -{ - handle_t *handle; - struct inode *inode; - int err, retries = 0; - - dquot_initialize(dir); - -retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; - ext4_set_aops(inode); - err = ext4_add_nondir(handle, dentry, inode); - } - ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -static int ext4_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) -{ - handle_t *handle; - struct inode *inode; - int err, retries = 0; - - if (!new_valid_dev(rdev)) - return -EINVAL; - - dquot_initialize(dir); - -retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, inode->i_mode, rdev); -#ifdef CONFIG_EXT4_FS_XATTR - inode->i_op = &ext4_special_inode_operations; -#endif - err = ext4_add_nondir(handle, dentry, inode); - } - ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - handle_t *handle; - struct inode *inode; - struct buffer_head *dir_block = NULL; - struct ext4_dir_entry_2 *de; - unsigned int blocksize = dir->i_sb->s_blocksize; - int err, retries = 0; - - if (EXT4_DIR_LINK_MAX(dir)) - return -EMLINK; - - dquot_initialize(dir); - -retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, S_IFDIR | mode, - &dentry->d_name, 0, NULL); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; - - inode->i_op = &ext4_dir_inode_operations; - inode->i_fop = &ext4_dir_operations; - inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); - if (!dir_block) - goto out_clear_inode; - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); - if (err) - goto out_clear_inode; - de = (struct ext4_dir_entry_2 *) dir_block->b_data; - de->inode = cpu_to_le32(inode->i_ino); - de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), - blocksize); - strcpy(de->name, "."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext4_next_entry(de, blocksize); - de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), - blocksize); - de->name_len = 2; - strcpy(de->name, ".."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); - set_nlink(inode, 2); - BUFFER_TRACE(dir_block, "call 
ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, dir_block); - if (err) - goto out_clear_inode; - err = ext4_mark_inode_dirty(handle, inode); - if (!err) - err = ext4_add_entry(handle, dentry, inode); - if (err) { -out_clear_inode: - clear_nlink(inode); - unlock_new_inode(inode); - ext4_mark_inode_dirty(handle, inode); - iput(inode); - goto out_stop; - } - ext4_inc_count(handle, dir); - ext4_update_dx_flag(dir); - err = ext4_mark_inode_dirty(handle, dir); - if (err) - goto out_clear_inode; - d_instantiate(dentry, inode); - unlock_new_inode(inode); -out_stop: - brelse(dir_block); - ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -/* - * routine to check that the specified directory is empty (for rmdir) - */ -static int empty_dir(struct inode *inode) -{ - unsigned int offset; - struct buffer_head *bh; - struct ext4_dir_entry_2 *de, *de1; - struct super_block *sb; - int err = 0; - - sb = inode->i_sb; - if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || - !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { - if (err) - EXT4_ERROR_INODE(inode, - "error %d reading directory lblock 0", err); - else - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no data block", - inode->i_ino); - return 1; - } - de = (struct ext4_dir_entry_2 *) bh->b_data; - de1 = ext4_next_entry(de, sb->s_blocksize); - if (le32_to_cpu(de->inode) != inode->i_ino || - !le32_to_cpu(de1->inode) || - strcmp(".", de->name) || - strcmp("..", de1->name)) { - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no `.' or `..'", - inode->i_ino); - brelse(bh); - return 1; - } - offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + - ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); - de = ext4_next_entry(de1, sb->s_blocksize); - while (offset < inode->i_size) { - if (!bh || - (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { - unsigned int lblock; - err = 0; - brelse(bh); - lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); - bh = ext4_bread(NULL, inode, lblock, 0, &err); - if (!bh) { - if (err) - EXT4_ERROR_INODE(inode, - "error %d reading directory " - "lblock %u", err, lblock); - offset += sb->s_blocksize; - continue; - } - de = (struct ext4_dir_entry_2 *) bh->b_data; - } - if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { - de = (struct ext4_dir_entry_2 *)(bh->b_data + - sb->s_blocksize); - offset = (offset | (sb->s_blocksize - 1)) + 1; - continue; - } - if (le32_to_cpu(de->inode)) { - brelse(bh); - return 0; - } - offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); - de = ext4_next_entry(de, sb->s_blocksize); - } - brelse(bh); - return 1; -} - -/* ext4_orphan_add() links an unlinked or truncated inode into a list of - * such inodes, starting at the superblock, in case we crash before the - * file is closed/deleted, or in case the inode truncate spans multiple - * transactions and the last transaction is not recovered after a crash. - * - * At filesystem recovery time, we walk this list deleting unlinked - * inodes and truncating linked inodes in ext4_orphan_cleanup(). 
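/*
 * The on-disk structure the next function maintains is an intrusive
 * singly linked list of inode numbers: the superblock's s_last_orphan
 * is the head, and each orphan stores its successor in the field that
 * NEXT_ORPHAN() names. A toy model with hypothetical stand-in types:
 */
#include <stdio.h>

struct toy_sb    { unsigned last_orphan; };          /* s_last_orphan */
struct toy_inode { unsigned ino, next_orphan; };     /* NEXT_ORPHAN slot */

static void toy_orphan_add(struct toy_sb *sb, struct toy_inode *inode)
{
    /* push at the head, the same link order ext4_orphan_add uses */
    inode->next_orphan = sb->last_orphan;
    sb->last_orphan = inode->ino;
}

int main(void)
{
    struct toy_sb sb = {0};
    struct toy_inode a = {12, 0}, b = {34, 0};
    toy_orphan_add(&sb, &a);
    toy_orphan_add(&sb, &b);
    printf("head=%u -> %u -> %u (0 terminates)\n",
           sb.last_orphan, b.next_orphan, a.next_orphan);
    return 0;
}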
- */ -int ext4_orphan_add(handle_t *handle, struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct ext4_iloc iloc; - int err = 0, rc; - - if (!ext4_handle_valid(handle)) - return 0; - - mutex_lock(&EXT4_SB(sb)->s_orphan_lock); - if (!list_empty(&EXT4_I(inode)->i_orphan)) - goto out_unlock; - - /* - * Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. Note that we either - * hold i_mutex, or the inode can not be referenced from outside, - * so i_nlink should not be bumped due to race - */ - J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); - - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); - if (err) - goto out_unlock; - - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out_unlock; - /* - * Due to previous errors inode may be already a part of on-disk - * orphan list. If so skip on-disk list modification. - */ - if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= - (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) - goto mem_insert; - - /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); - rc = ext4_mark_iloc_dirty(handle, inode, &iloc); - if (!err) - err = rc; - - /* Only add to the head of the in-memory list if all the - * previous operations succeeded. If the orphan_add is going to - * fail (possibly taking the journal offline), we can't risk - * leaving the inode on the orphan list: stray orphan-list - * entries can cause panics at unmount time. - * - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. */ -mem_insert: - if (!err) - list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - - jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); - jbd_debug(4, "orphan inode %lu will point to %d\n", - inode->i_ino, NEXT_ORPHAN(inode)); -out_unlock: - mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); - ext4_std_error(inode->i_sb, err); - return err; -} - -/* - * ext4_orphan_del() removes an unlinked or truncated inode from the list - * of such inodes stored on disk, because it is finally being cleaned up. - */ -int ext4_orphan_del(handle_t *handle, struct inode *inode) -{ - struct list_head *prev; - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi; - __u32 ino_next; - struct ext4_iloc iloc; - int err = 0; - - /* ext4_handle_valid() assumes a valid handle_t pointer */ - if (handle && !ext4_handle_valid(handle)) - return 0; - - mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); - if (list_empty(&ei->i_orphan)) - goto out; - - ino_next = NEXT_ORPHAN(inode); - prev = ei->i_orphan.prev; - sbi = EXT4_SB(inode->i_sb); - - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); - - list_del_init(&ei->i_orphan); - - /* If we're on an error path, we may not have a valid - * transaction handle with which to update the orphan list on - * disk, but we still need to remove the inode from the linked - * list in memory. 
*/ - if (sbi->s_journal && !handle) - goto out; - - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out_err; - - if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %u\n", ino_next); - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) - goto out_brelse; - sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); - } else { - struct ext4_iloc iloc2; - struct inode *i_prev = - &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - - jbd_debug(4, "orphan inode %lu will point to %u\n", - i_prev->i_ino, ino_next); - err = ext4_reserve_inode_write(handle, i_prev, &iloc2); - if (err) - goto out_brelse; - NEXT_ORPHAN(i_prev) = ino_next; - err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); - } - if (err) - goto out_brelse; - NEXT_ORPHAN(inode) = 0; - err = ext4_mark_iloc_dirty(handle, inode, &iloc); - -out_err: - ext4_std_error(inode->i_sb, err); -out: - mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); - return err; - -out_brelse: - brelse(iloc.bh); - goto out_err; -} - -static int ext4_rmdir(struct inode *dir, struct dentry *dentry) -{ - int retval; - struct inode *inode; - struct buffer_head *bh; - struct ext4_dir_entry_2 *de; - handle_t *handle; - - /* Initialize quotas before so that eventual writes go in - * separate transaction */ - dquot_initialize(dir); - dquot_initialize(dentry->d_inode); - - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de); - if (!bh) - goto end_rmdir; - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = dentry->d_inode; - - retval = -EIO; - if (le32_to_cpu(de->inode) != inode->i_ino) - goto end_rmdir; - - retval = -ENOTEMPTY; - if (!empty_dir(inode)) - goto end_rmdir; - - retval = ext4_delete_entry(handle, dir, de, bh); - if (retval) - goto end_rmdir; - if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning(inode->i_sb, - "empty directory has too many links (%d)", - inode->i_nlink); - inode->i_version++; - clear_nlink(inode); - /* There's no need to set i_disksize: the fact that i_nlink is - * zero will ensure that the right thing happens during any - * recovery. 
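/*
 * ext4_orphan_del's two on-disk cases (above) map onto the same toy
 * list: when the deleted inode is at the head, the superblock takes
 * its successor; otherwise the previous orphan's NEXT_ORPHAN does.
 * Reusing the toy types from the earlier orphan sketch:
 */
static void toy_orphan_del(struct toy_sb *sb, struct toy_inode *prev,
                           struct toy_inode *inode)
{
    if (!prev)                       /* the prev == &sbi->s_orphan case */
        sb->last_orphan = inode->next_orphan;
    else                             /* middle of the list */
        prev->next_orphan = inode->next_orphan;
    inode->next_orphan = 0;          /* NEXT_ORPHAN(inode) = 0 */
}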
*/ - inode->i_size = 0; - ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - ext4_dec_count(handle, dir); - ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); - -end_rmdir: - ext4_journal_stop(handle); - brelse(bh); - return retval; -} - -static int ext4_unlink(struct inode *dir, struct dentry *dentry) -{ - int retval; - struct inode *inode; - struct buffer_head *bh; - struct ext4_dir_entry_2 *de; - handle_t *handle; - - trace_ext4_unlink_enter(dir, dentry); - /* Initialize quotas before so that eventual writes go - * in separate transaction */ - dquot_initialize(dir); - dquot_initialize(dentry->d_inode); - - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de); - if (!bh) - goto end_unlink; - - inode = dentry->d_inode; - - retval = -EIO; - if (le32_to_cpu(de->inode) != inode->i_ino) - goto end_unlink; - - if (!inode->i_nlink) { - ext4_warning(inode->i_sb, - "Deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); - set_nlink(inode, 1); - } - retval = ext4_delete_entry(handle, dir, de, bh); - if (retval) - goto end_unlink; - dir->i_ctime = dir->i_mtime = ext4_current_time(dir); - ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); - drop_nlink(inode); - if (!inode->i_nlink) - ext4_orphan_add(handle, inode); - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - retval = 0; - -end_unlink: - ext4_journal_stop(handle); - brelse(bh); - trace_ext4_unlink_exit(dentry, retval); - return retval; -} - -static int ext4_symlink(struct inode *dir, - struct dentry *dentry, const char *symname) -{ - handle_t *handle; - struct inode *inode; - int l, err, retries = 0; - int credits; - - l = strlen(symname)+1; - if (l > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; - - dquot_initialize(dir); - - if (l > EXT4_N_BLOCKS * 4) { - /* - * For non-fast symlinks, we just allocate inode and put it on - * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks, and - * possibly selinux xattr blocks. - */ - credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + - EXT4_XATTR_TRANS_BLOCKS; - } else { - /* - * Fast symlink. We have to add entry to directory - * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), - * allocate new inode (bitmap, group descriptor, inode block, - * quota blocks, sb is already counted in previous macros). - */ - credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); - } -retry: - handle = ext4_journal_start(dir, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, - &dentry->d_name, 0, NULL); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; - - if (l > EXT4_N_BLOCKS * 4) { - inode->i_op = &ext4_symlink_inode_operations; - ext4_set_aops(inode); - /* - * We cannot call page_symlink() with transaction started - * because it calls into ext4_write_begin() which can wait - * for transaction commit if we are running out of space - * and thus we deadlock. So we have to stop transaction now - * and restart it when symlink contents is written. 
- * - * To keep fs consistent in case of crash, we have to put inode - * to orphan list in the mean time. - */ - drop_nlink(inode); - err = ext4_orphan_add(handle, inode); - ext4_journal_stop(handle); - if (err) - goto err_drop_inode; - err = __page_symlink(inode, symname, l, 1); - if (err) - goto err_drop_inode; - /* - * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS - * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified - */ - handle = ext4_journal_start(dir, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto err_drop_inode; - } - set_nlink(inode, 1); - err = ext4_orphan_del(handle, inode); - if (err) { - ext4_journal_stop(handle); - clear_nlink(inode); - goto err_drop_inode; - } - } else { - /* clear the extent format for fast symlink */ - ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - inode->i_op = &ext4_fast_symlink_inode_operations; - memcpy((char *)&EXT4_I(inode)->i_data, symname, l); - inode->i_size = l-1; - } - EXT4_I(inode)->i_disksize = inode->i_size; - err = ext4_add_nondir(handle, dentry, inode); -out_stop: - ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -err_drop_inode: - unlock_new_inode(inode); - iput(inode); - return err; -} - -static int ext4_link(struct dentry *old_dentry, - struct inode *dir, struct dentry *dentry) -{ - handle_t *handle; - struct inode *inode = old_dentry->d_inode; - int err, retries = 0; - - if (inode->i_nlink >= EXT4_LINK_MAX) - return -EMLINK; - - dquot_initialize(dir); - -retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - ext4_handle_sync(handle); - - inode->i_ctime = ext4_current_time(inode); - ext4_inc_count(handle, inode); - ihold(inode); - - err = ext4_add_entry(handle, dentry, inode); - if (!err) { - ext4_mark_inode_dirty(handle, inode); - d_instantiate(dentry, inode); - } else { - drop_nlink(inode); - iput(inode); - } - ext4_journal_stop(handle); - if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) - goto retry; - return err; -} - -#define PARENT_INO(buffer, size) \ - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) - -/* - * Anybody can rename anything with this: the permission checks are left to the - * higher-level routines. - */ -static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - handle_t *handle; - struct inode *old_inode, *new_inode; - struct buffer_head *old_bh, *new_bh, *dir_bh; - struct ext4_dir_entry_2 *old_de, *new_de; - int retval, force_da_alloc = 0; - - dquot_initialize(old_dir); - dquot_initialize(new_dir); - - old_bh = new_bh = dir_bh = NULL; - - /* Initialize quotas before so that eventual writes go - * in separate transaction */ - if (new_dentry->d_inode) - dquot_initialize(new_dentry->d_inode); - handle = ext4_journal_start(old_dir, 2 * - EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - ext4_handle_sync(handle); - - old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); - /* - * Check for inode number is _not_ due to possible IO errors. 
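/*
 * The branch in ext4_symlink (above) turns on a single constant: a
 * target whose length, counting the trailing NUL as l does, fits in
 * EXT4_N_BLOCKS * 4 bytes (the 15 32-bit slots of i_data, so 60
 * bytes) is stored in the inode itself as a "fast" symlink and needs
 * no data block. A trivial standalone check of that rule:
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define TOY_EXT4_N_BLOCKS 15    /* 12 direct + indirect, double, triple */

static bool is_fast_symlink_target(const char *symname)
{
    return strlen(symname) + 1 <= TOY_EXT4_N_BLOCKS * 4;
}

int main(void)
{
    printf("'/etc/passwd' fast? %d\n", is_fast_symlink_target("/etc/passwd"));
    return 0;
}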
- * We might rmdir the source, keep it as pwd of some process - * and merrily kill the link to whatever was created under the - * same name. Goodbye sticky bit ;-< - */ - old_inode = old_dentry->d_inode; - retval = -ENOENT; - if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) - goto end_rename; - - new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); - if (new_bh) { - if (!new_inode) { - brelse(new_bh); - new_bh = NULL; - } - } - if (S_ISDIR(old_inode->i_mode)) { - if (new_inode) { - retval = -ENOTEMPTY; - if (!empty_dir(new_inode)) - goto end_rename; - } - retval = -EIO; - dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); - if (!dir_bh) - goto end_rename; - if (le32_to_cpu(PARENT_INO(dir_bh->b_data, - old_dir->i_sb->s_blocksize)) != old_dir->i_ino) - goto end_rename; - retval = -EMLINK; - if (!new_inode && new_dir != old_dir && - EXT4_DIR_LINK_MAX(new_dir)) - goto end_rename; - BUFFER_TRACE(dir_bh, "get_write_access"); - retval = ext4_journal_get_write_access(handle, dir_bh); - if (retval) - goto end_rename; - } - if (!new_bh) { - retval = ext4_add_entry(handle, new_dentry, old_inode); - if (retval) - goto end_rename; - } else { - BUFFER_TRACE(new_bh, "get write access"); - retval = ext4_journal_get_write_access(handle, new_bh); - if (retval) - goto end_rename; - new_de->inode = cpu_to_le32(old_inode->i_ino); - if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, - EXT4_FEATURE_INCOMPAT_FILETYPE)) - new_de->file_type = old_de->file_type; - new_dir->i_version++; - new_dir->i_ctime = new_dir->i_mtime = - ext4_current_time(new_dir); - ext4_mark_inode_dirty(handle, new_dir); - BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); - if (unlikely(retval)) { - ext4_std_error(new_dir->i_sb, retval); - goto end_rename; - } - brelse(new_bh); - new_bh = NULL; - } - - /* - * Like most other Unix systems, set the ctime for inodes on a - * rename. - */ - old_inode->i_ctime = ext4_current_time(old_inode); - ext4_mark_inode_dirty(handle, old_inode); - - /* - * ok, that's it - */ - if (le32_to_cpu(old_de->inode) != old_inode->i_ino || - old_de->name_len != old_dentry->d_name.len || - strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || - (retval = ext4_delete_entry(handle, old_dir, - old_de, old_bh)) == -ENOENT) { - /* old_de could have moved from under us during htree split, so - * make sure that we are deleting the right entry. We might - * also be pointing to a stale entry in the unused part of - * old_bh so just checking inum and the name isn't enough. 
*/ - struct buffer_head *old_bh2; - struct ext4_dir_entry_2 *old_de2; - - old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); - if (old_bh2) { - retval = ext4_delete_entry(handle, old_dir, - old_de2, old_bh2); - brelse(old_bh2); - } - } - if (retval) { - ext4_warning(old_dir->i_sb, - "Deleting old file (%lu), %d, error=%d", - old_dir->i_ino, old_dir->i_nlink, retval); - } - - if (new_inode) { - ext4_dec_count(handle, new_inode); - new_inode->i_ctime = ext4_current_time(new_inode); - } - old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); - ext4_update_dx_flag(old_dir); - if (dir_bh) { - PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = - cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); - if (retval) { - ext4_std_error(old_dir->i_sb, retval); - goto end_rename; - } - ext4_dec_count(handle, old_dir); - if (new_inode) { - /* checked empty_dir above, can't have another parent, - * ext4_dec_count() won't work for many-linked dirs */ - clear_nlink(new_inode); - } else { - ext4_inc_count(handle, new_dir); - ext4_update_dx_flag(new_dir); - ext4_mark_inode_dirty(handle, new_dir); - } - } - ext4_mark_inode_dirty(handle, old_dir); - if (new_inode) { - ext4_mark_inode_dirty(handle, new_inode); - if (!new_inode->i_nlink) - ext4_orphan_add(handle, new_inode); - if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) - force_da_alloc = 1; - } - retval = 0; - -end_rename: - brelse(dir_bh); - brelse(old_bh); - brelse(new_bh); - ext4_journal_stop(handle); - if (retval == 0 && force_da_alloc) - ext4_alloc_da_blocks(old_inode); - return retval; -} - -/* - * directories can handle most operations... - */ -const struct inode_operations ext4_dir_inode_operations = { - .create = ext4_create, - .lookup = ext4_lookup, - .link = ext4_link, - .unlink = ext4_unlink, - .symlink = ext4_symlink, - .mkdir = ext4_mkdir, - .rmdir = ext4_rmdir, - .mknod = ext4_mknod, - .rename = ext4_rename, - .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, -#endif - .get_acl = ext4_get_acl, - .fiemap = ext4_fiemap, -}; - -const struct inode_operations ext4_special_inode_operations = { - .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, -#endif - .get_acl = ext4_get_acl, -}; diff --git a/ANDROID_3.4.5/fs/ext4/page-io.c b/ANDROID_3.4.5/fs/ext4/page-io.c deleted file mode 100644 index dcdeef16..00000000 --- a/ANDROID_3.4.5/fs/ext4/page-io.c +++ /dev/null @@ -1,433 +0,0 @@ -/* - * linux/fs/ext4/page-io.c - * - * This contains the new page_io functions for ext4 - * - * Written by Theodore Ts'o, 2010. 
- */ - -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/jbd2.h> -#include <linux/highuid.h> -#include <linux/pagemap.h> -#include <linux/quotaops.h> -#include <linux/string.h> -#include <linux/buffer_head.h> -#include <linux/writeback.h> -#include <linux/pagevec.h> -#include <linux/mpage.h> -#include <linux/namei.h> -#include <linux/uio.h> -#include <linux/bio.h> -#include <linux/workqueue.h> -#include <linux/kernel.h> -#include <linux/slab.h> - -#include "ext4_jbd2.h" -#include "xattr.h" -#include "acl.h" -#include "ext4_extents.h" - -static struct kmem_cache *io_page_cachep, *io_end_cachep; - -int __init ext4_init_pageio(void) -{ - io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); - if (io_page_cachep == NULL) - return -ENOMEM; - io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); - if (io_end_cachep == NULL) { - kmem_cache_destroy(io_page_cachep); - return -ENOMEM; - } - return 0; -} - -void ext4_exit_pageio(void) -{ - kmem_cache_destroy(io_end_cachep); - kmem_cache_destroy(io_page_cachep); -} - -void ext4_ioend_wait(struct inode *inode) -{ - wait_queue_head_t *wq = ext4_ioend_wq(inode); - - wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); -} - -static void put_io_page(struct ext4_io_page *io_page) -{ - if (atomic_dec_and_test(&io_page->p_count)) { - end_page_writeback(io_page->p_page); - put_page(io_page->p_page); - kmem_cache_free(io_page_cachep, io_page); - } -} - -void ext4_free_io_end(ext4_io_end_t *io) -{ - int i; - - BUG_ON(!io); - if (io->page) - put_page(io->page); - for (i = 0; i < io->num_io_pages; i++) - put_io_page(io->pages[i]); - io->num_io_pages = 0; - if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) - wake_up_all(ext4_ioend_wq(io->inode)); - kmem_cache_free(io_end_cachep, io); -} - -/* - * check a range of space and convert unwritten extents to written. - * - * Called with inode->i_mutex; we depend on this when we manipulate - * io->flag, since we could otherwise race with ext4_flush_completed_IO() - */ -int ext4_end_io_nolock(ext4_io_end_t *io) -{ - struct inode *inode = io->inode; - loff_t offset = io->offset; - ssize_t size = io->size; - int ret = 0; - - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," - "list->prev 0x%p\n", - io, inode->i_ino, io->list.next, io->list.prev); - - ret = ext4_convert_unwritten_extents(inode, offset, size); - if (ret < 0) { - ext4_msg(inode->i_sb, KERN_EMERG, - "failed to convert unwritten extents to written " - "extents -- potential data loss! 
" - "(inode %lu, offset %llu, size %zd, error %d)", - inode->i_ino, offset, size, ret); - } - - if (io->iocb) - aio_complete(io->iocb, io->result, 0); - - if (io->flag & EXT4_IO_END_DIRECT) - inode_dio_done(inode); - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten)) - wake_up_all(ext4_ioend_wq(io->inode)); - return ret; -} - -/* - * work on completed aio dio IO, to convert unwritten extents to extents - */ -static void ext4_end_io_work(struct work_struct *work) -{ - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); - struct inode *inode = io->inode; - struct ext4_inode_info *ei = EXT4_I(inode); - unsigned long flags; - - spin_lock_irqsave(&ei->i_completed_io_lock, flags); - if (io->flag & EXT4_IO_END_IN_FSYNC) - goto requeue; - if (list_empty(&io->list)) { - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - goto free; - } - - if (!mutex_trylock(&inode->i_mutex)) { - bool was_queued; -requeue: - was_queued = !!(io->flag & EXT4_IO_END_QUEUED); - io->flag |= EXT4_IO_END_QUEUED; - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - /* - * Requeue the work instead of waiting so that the work - * items queued after this can be processed. - */ - queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); - /* - * To prevent the ext4-dio-unwritten thread from keeping - * requeueing end_io requests and occupying cpu for too long, - * yield the cpu if it sees an end_io request that has already - * been requeued. - */ - if (was_queued) - yield(); - return; - } - list_del_init(&io->list); - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); - (void) ext4_end_io_nolock(io); - mutex_unlock(&inode->i_mutex); -free: - ext4_free_io_end(io); -} - -ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) -{ - ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); - if (io) { - atomic_inc(&EXT4_I(inode)->i_ioend_count); - io->inode = inode; - INIT_WORK(&io->work, ext4_end_io_work); - INIT_LIST_HEAD(&io->list); - } - return io; -} - -/* - * Print an buffer I/O error compatible with the fs/buffer.c. This - * provides compatibility with dmesg scrapers that look for a specific - * buffer I/O error message. We really need a unified error reporting - * structure to userspace ala Digital Unix's uerf system, but it's - * probably not going to happen in my lifetime, due to LKML politics... 
- */ -static void buffer_io_error(struct buffer_head *bh) -{ - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", - bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); -} - -static void ext4_end_bio(struct bio *bio, int error) -{ - ext4_io_end_t *io_end = bio->bi_private; - struct workqueue_struct *wq; - struct inode *inode; - unsigned long flags; - int i; - sector_t bi_sector = bio->bi_sector; - - BUG_ON(!io_end); - bio->bi_private = NULL; - bio->bi_end_io = NULL; - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = 0; - bio_put(bio); - - for (i = 0; i < io_end->num_io_pages; i++) { - struct page *page = io_end->pages[i]->p_page; - struct buffer_head *bh, *head; - loff_t offset; - loff_t io_end_offset; - - if (error) { - SetPageError(page); - set_bit(AS_EIO, &page->mapping->flags); - head = page_buffers(page); - BUG_ON(!head); - - io_end_offset = io_end->offset + io_end->size; - - offset = (sector_t) page->index << PAGE_CACHE_SHIFT; - bh = head; - do { - if ((offset >= io_end->offset) && - (offset+bh->b_size <= io_end_offset)) - buffer_io_error(bh); - - offset += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); - } - - put_io_page(io_end->pages[i]); - } - io_end->num_io_pages = 0; - inode = io_end->inode; - - if (error) { - io_end->flag |= EXT4_IO_END_ERROR; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " - "(offset %llu size %ld starting block %llu)", - inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, - (unsigned long long) - bi_sector >> (inode->i_blkbits - 9)); - } - - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); - return; - } - - /* Add the io_end to per-inode completed io list*/ - spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); - list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); - spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); - - wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; - /* queue the work to convert unwritten extents to written */ - queue_work(wq, &io_end->work); -} - -void ext4_io_submit(struct ext4_io_submit *io) -{ - struct bio *bio = io->io_bio; - - if (bio) { - bio_get(io->io_bio); - submit_bio(io->io_op, io->io_bio); - BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); - bio_put(io->io_bio); - } - io->io_bio = NULL; - io->io_op = 0; - io->io_end = NULL; -} - -static int io_submit_init(struct ext4_io_submit *io, - struct inode *inode, - struct writeback_control *wbc, - struct buffer_head *bh) -{ - ext4_io_end_t *io_end; - struct page *page = bh->b_page; - int nvecs = bio_get_nr_vecs(bh->b_bdev); - struct bio *bio; - - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) - return -ENOMEM; - bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - bio->bi_private = io->io_end = io_end; - bio->bi_end_io = ext4_end_bio; - - io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); - - io->io_bio = bio; - io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE); - io->io_next_block = bh->b_blocknr; - return 0; -} - -static int io_submit_add_bh(struct ext4_io_submit *io, - struct ext4_io_page *io_page, - struct inode *inode, - struct writeback_control *wbc, - struct buffer_head *bh) -{ - ext4_io_end_t *io_end; - int ret; - - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); - } - - if (!buffer_mapped(bh) || buffer_delay(bh)) { - if (!buffer_mapped(bh)) - clear_buffer_dirty(bh); - if (io->io_bio) - ext4_io_submit(io); - return 0; - } - - if (io->io_bio && bh->b_blocknr != io->io_next_block) { -submit_and_retry: - ext4_io_submit(io); - } - if (io->io_bio == NULL) { - ret = io_submit_init(io, inode, wbc, bh); - if (ret) - return ret; - } - io_end = io->io_end; - if ((io_end->num_io_pages >= MAX_IO_PAGES) && - (io_end->pages[io_end->num_io_pages-1] != io_page)) - goto submit_and_retry; - if (buffer_uninit(bh)) - ext4_set_io_unwritten_flag(inode, io_end); - io->io_end->size += bh->b_size; - io->io_next_block++; - ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); - if (ret != bh->b_size) - goto submit_and_retry; - if ((io_end->num_io_pages == 0) || - (io_end->pages[io_end->num_io_pages-1] != io_page)) { - io_end->pages[io_end->num_io_pages++] = io_page; - atomic_inc(&io_page->p_count); - } - return 0; -} - -int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - unsigned block_start, block_end, blocksize; - struct ext4_io_page *io_page; - struct buffer_head *bh, *head; - int ret = 0; - - blocksize = 1 << inode->i_blkbits; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - - io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); - if (!io_page) { - set_page_dirty(page); - unlock_page(page); - return -ENOMEM; - } - io_page->p_page = page; - atomic_set(&io_page->p_count, 1); - get_page(page); - set_page_writeback(page); - ClearPageError(page); - - for (bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start = block_end, bh = bh->b_this_page) { - - block_end = block_start + blocksize; - if (block_start >= len) { - /* - * Comments copied from block_write_full_page_endio: - * - * The page straddles i_size. It must be zeroed out on - * each and every writepage invocation because it may - * be mmapped. "A file is mapped in multiples of the - * page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when - * mapped, and writes to that region are not written - * out to the file." - */ - zero_user_segment(page, block_start, block_end); - clear_buffer_dirty(bh); - set_buffer_uptodate(bh); - continue; - } - clear_buffer_dirty(bh); - ret = io_submit_add_bh(io, io_page, inode, wbc, bh); - if (ret) { - /* - * We only get here on ENOMEM. Not much else - * we can do but mark the page as dirty, and - * better luck next time. - */ - set_page_dirty(page); - break; - } - } - unlock_page(page); - /* - * If the page was truncated before we could do the writeback, - * or we had a memory allocation error while trying to write - * the first buffer head, we won't have submitted any pages for - * I/O. In that case we need to make sure we've cleared the - * PageWriteback bit from the page to prevent the system from - * wedging later on. 
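/*
 * io_submit_add_bh (above) grows a single bio only while buffers stay
 * physically contiguous -- it tracks io_next_block and submits the
 * moment a buffer's block number breaks the run (or bio_add_page
 * refuses the page). The batching rule, reduced to a toy loop over
 * assumed block numbers:
 */
#include <stdio.h>

int main(void)
{
    unsigned long blocks[] = { 100, 101, 102, 200, 201, 500 };
    unsigned long next = 0;
    int open = 0;

    for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
        if (open && blocks[i] != next) {      /* run broken: submit */
            printf("submit bio ending at block %lu\n", next - 1);
            open = 0;
        }
        if (!open) {                          /* start a fresh bio */
            printf("start bio at block %lu\n", blocks[i]);
            open = 1;
        }
        next = blocks[i] + 1;                 /* io_next_block++ */
    }
    if (open)
        printf("submit final bio ending at block %lu\n", next - 1);
    return 0;
}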
- */ - put_io_page(io_page); - return ret; -} diff --git a/ANDROID_3.4.5/fs/ext4/resize.c b/ANDROID_3.4.5/fs/ext4/resize.c deleted file mode 100644 index 53589ff8..00000000 --- a/ANDROID_3.4.5/fs/ext4/resize.c +++ /dev/null @@ -1,1689 +0,0 @@ -/* - * linux/fs/ext4/resize.c - * - * Support for resizing an ext4 filesystem while it is mounted. - * - * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com> - * - * This could probably be made into a module, because it is not often in use. - */ - - -#define EXT4FS_DEBUG - -#include <linux/errno.h> -#include <linux/slab.h> - -#include "ext4_jbd2.h" - -int ext4_resize_begin(struct super_block *sb) -{ - int ret = 0; - - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - - /* - * We are not allowed to do online-resizing on a filesystem mounted - * with error, because it can destroy the filesystem easily. - */ - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - ext4_warning(sb, "There are errors in the filesystem, " - "so online resizing is not allowed\n"); - return -EPERM; - } - - if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) - ret = -EBUSY; - - return ret; -} - -void ext4_resize_end(struct super_block *sb) -{ - clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); - smp_mb__after_clear_bit(); -} - -#define outside(b, first, last) ((b) < (first) || (b) >= (last)) -#define inside(b, first, last) ((b) >= (first) && (b) < (last)) - -static int verify_group_input(struct super_block *sb, - struct ext4_new_group_data *input) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - ext4_fsblk_t start = ext4_blocks_count(es); - ext4_fsblk_t end = start + input->blocks_count; - ext4_group_t group = input->group; - ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; - unsigned overhead = ext4_bg_has_super(sb, group) ? - (1 + ext4_bg_num_gdb(sb, group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; - ext4_fsblk_t metaend = start + overhead; - struct buffer_head *bh = NULL; - ext4_grpblk_t free_blocks_count, offset; - int err = -EINVAL; - - input->free_blocks_count = free_blocks_count = - input->blocks_count - 2 - overhead - sbi->s_itb_per_group; - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " - "(%d free, %u reserved)\n", - ext4_bg_has_super(sb, input->group) ? 
"normal" : - "no-super", input->group, input->blocks_count, - free_blocks_count, input->reserved_blocks); - - ext4_get_group_no_and_offset(sb, start, NULL, &offset); - if (group != sbi->s_groups_count) - ext4_warning(sb, "Cannot add at group %u (only %u groups)", - input->group, sbi->s_groups_count); - else if (offset != 0) - ext4_warning(sb, "Last group not full"); - else if (input->reserved_blocks > input->blocks_count / 5) - ext4_warning(sb, "Reserved blocks too high (%u)", - input->reserved_blocks); - else if (free_blocks_count < 0) - ext4_warning(sb, "Bad blocks count %u", - input->blocks_count); - else if (!(bh = sb_bread(sb, end - 1))) - ext4_warning(sb, "Cannot read last block (%llu)", - end - 1); - else if (outside(input->block_bitmap, start, end)) - ext4_warning(sb, "Block bitmap not in group (block %llu)", - (unsigned long long)input->block_bitmap); - else if (outside(input->inode_bitmap, start, end)) - ext4_warning(sb, "Inode bitmap not in group (block %llu)", - (unsigned long long)input->inode_bitmap); - else if (outside(input->inode_table, start, end) || - outside(itend - 1, start, end)) - ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", - (unsigned long long)input->inode_table, itend - 1); - else if (input->inode_bitmap == input->block_bitmap) - ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", - (unsigned long long)input->block_bitmap); - else if (inside(input->block_bitmap, input->inode_table, itend)) - ext4_warning(sb, "Block bitmap (%llu) in inode table " - "(%llu-%llu)", - (unsigned long long)input->block_bitmap, - (unsigned long long)input->inode_table, itend - 1); - else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext4_warning(sb, "Inode bitmap (%llu) in inode table " - "(%llu-%llu)", - (unsigned long long)input->inode_bitmap, - (unsigned long long)input->inode_table, itend - 1); - else if (inside(input->block_bitmap, start, metaend)) - ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", - (unsigned long long)input->block_bitmap, - start, metaend - 1); - else if (inside(input->inode_bitmap, start, metaend)) - ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", - (unsigned long long)input->inode_bitmap, - start, metaend - 1); - else if (inside(input->inode_table, start, metaend) || - inside(itend - 1, start, metaend)) - ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " - "(%llu-%llu)", - (unsigned long long)input->inode_table, - itend - 1, start, metaend - 1); - else - err = 0; - brelse(bh); - - return err; -} - -/* - * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex - * group each time. - */ -struct ext4_new_flex_group_data { - struct ext4_new_group_data *groups; /* new_group_data for groups - in the flex group */ - __u16 *bg_flags; /* block group flags of groups - in @groups */ - ext4_group_t count; /* number of groups in @groups - */ -}; - -/* - * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of - * @flexbg_size. - * - * Returns NULL on failure otherwise address of the allocated structure. 
- */ -static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) -{ - struct ext4_new_flex_group_data *flex_gd; - - flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); - if (flex_gd == NULL) - goto out3; - - if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data)) - goto out2; - flex_gd->count = flexbg_size; - - flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * - flexbg_size, GFP_NOFS); - if (flex_gd->groups == NULL) - goto out2; - - flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS); - if (flex_gd->bg_flags == NULL) - goto out1; - - return flex_gd; - -out1: - kfree(flex_gd->groups); -out2: - kfree(flex_gd); -out3: - return NULL; -} - -static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) -{ - kfree(flex_gd->bg_flags); - kfree(flex_gd->groups); - kfree(flex_gd); -} - -/* - * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps - * and inode tables for a flex group. - * - * This function is used by 64bit-resize. Note that this function allocates - * group tables from the 1st group of groups contained by @flexgd, which may - * be a partial of a flex group. - * - * @sb: super block of fs to which the groups belongs - */ -static void ext4_alloc_group_tables(struct super_block *sb, - struct ext4_new_flex_group_data *flex_gd, - int flexbg_size) -{ - struct ext4_new_group_data *group_data = flex_gd->groups; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - ext4_fsblk_t start_blk; - ext4_fsblk_t last_blk; - ext4_group_t src_group; - ext4_group_t bb_index = 0; - ext4_group_t ib_index = 0; - ext4_group_t it_index = 0; - ext4_group_t group; - ext4_group_t last_group; - unsigned overhead; - - BUG_ON(flex_gd->count == 0 || group_data == NULL); - - src_group = group_data[0].group; - last_group = src_group + flex_gd->count - 1; - - BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != - (last_group & ~(flexbg_size - 1)))); -next_group: - group = group_data[0].group; - start_blk = ext4_group_first_block_no(sb, src_group); - last_blk = start_blk + group_data[src_group - group].blocks_count; - - overhead = ext4_bg_has_super(sb, src_group) ? - (1 + ext4_bg_num_gdb(sb, src_group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; - - start_blk += overhead; - - BUG_ON(src_group >= group_data[0].group + flex_gd->count); - /* We collect contiguous blocks as much as possible. 
*/ - src_group++; - for (; src_group <= last_group; src_group++) - if (!ext4_bg_has_super(sb, src_group)) - last_blk += group_data[src_group - group].blocks_count; - else - break; - - /* Allocate block bitmaps */ - for (; bb_index < flex_gd->count; bb_index++) { - if (start_blk >= last_blk) - goto next_group; - group_data[bb_index].block_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); - group -= group_data[0].group; - group_data[group].free_blocks_count--; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; - } - - /* Allocate inode bitmaps */ - for (; ib_index < flex_gd->count; ib_index++) { - if (start_blk >= last_blk) - goto next_group; - group_data[ib_index].inode_bitmap = start_blk++; - ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL); - group -= group_data[0].group; - group_data[group].free_blocks_count--; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; - } - - /* Allocate inode tables */ - for (; it_index < flex_gd->count; it_index++) { - if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) - goto next_group; - group_data[it_index].inode_table = start_blk; - ext4_get_group_no_and_offset(sb, start_blk, &group, NULL); - group -= group_data[0].group; - group_data[group].free_blocks_count -= - EXT4_SB(sb)->s_itb_per_group; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; - - start_blk += EXT4_SB(sb)->s_itb_per_group; - } - - if (test_opt(sb, DEBUG)) { - int i; - group = group_data[0].group; - - printk(KERN_DEBUG "EXT4-fs: adding a flex group with " - "%d groups, flexbg size is %d:\n", flex_gd->count, - flexbg_size); - - for (i = 0; i < flex_gd->count; i++) { - printk(KERN_DEBUG "adding %s group %u: %u " - "blocks (%d free)\n", - ext4_bg_has_super(sb, group + i) ? "normal" : - "no-super", group + i, - group_data[i].blocks_count, - group_data[i].free_blocks_count); - } - } -} - -static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, - ext4_fsblk_t blk) -{ - struct buffer_head *bh; - int err; - - bh = sb_getblk(sb, blk); - if (!bh) - return ERR_PTR(-EIO); - if ((err = ext4_journal_get_write_access(handle, bh))) { - brelse(bh); - bh = ERR_PTR(err); - } else { - memset(bh->b_data, 0, sb->s_blocksize); - set_buffer_uptodate(bh); - } - - return bh; -} - -/* - * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA. - * If that fails, restart the transaction & regain write access for the - * buffer head which is used for block_bitmap modifications. - */ -static int extend_or_restart_transaction(handle_t *handle, int thresh) -{ - int err; - - if (ext4_handle_has_enough_credits(handle, thresh)) - return 0; - - err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); - if (err < 0) - return err; - if (err) { - err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); - if (err) - return err; - } - - return 0; -} - -/* - * set_flexbg_block_bitmap() mark @count blocks starting from @block used. - * - * Helper function for ext4_setup_new_group_blocks() which set . 
- * - * @sb: super block - * @handle: journal handle - * @flex_gd: flex group data - */ -static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, - struct ext4_new_flex_group_data *flex_gd, - ext4_fsblk_t block, ext4_group_t count) -{ - ext4_group_t count2; - - ext4_debug("mark blocks [%llu/%u] used\n", block, count); - for (count2 = count; count > 0; count -= count2, block += count2) { - ext4_fsblk_t start; - struct buffer_head *bh; - ext4_group_t group; - int err; - - ext4_get_group_no_and_offset(sb, block, &group, NULL); - start = ext4_group_first_block_no(sb, group); - group -= flex_gd->groups[0].group; - - count2 = sb->s_blocksize * 8 - (block - start); - if (count2 > count) - count2 = count; - - if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { - BUG_ON(flex_gd->count > 1); - continue; - } - - err = extend_or_restart_transaction(handle, 1); - if (err) - return err; - - bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); - if (!bh) - return -EIO; - - err = ext4_journal_get_write_access(handle, bh); - if (err) - return err; - ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, - block - start, count2); - ext4_set_bits(bh->b_data, block - start, count2); - - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (unlikely(err)) - return err; - brelse(bh); - } - - return 0; -} - -/* - * Set up the block and inode bitmaps, and the inode table for the new groups. - * This doesn't need to be part of the main transaction, since we are only - * changing blocks outside the actual filesystem. We still do journaling to - * ensure the recovery is correct in case of a failure just after resize. - * If any part of this fails, we simply abort the resize. - * - * setup_new_flex_group_blocks handles a flex group as follow: - * 1. copy super block and GDT, and initialize group tables if necessary. - * In this step, we only set bits in blocks bitmaps for blocks taken by - * super block and GDT. - * 2. allocate group tables in block bitmaps, that is, set bits in block - * bitmap for blocks taken by group tables. 
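/*
 * set_flexbg_block_bitmap (below) cannot mark an arbitrary run of
 * blocks in one pass: each group's block bitmap covers at most
 * blocksize * 8 blocks, so the run is consumed one group-sized chunk
 * at a time (the count2 variable). A toy version assuming uniform
 * groups in place of ext4_get_group_no_and_offset():
 */
#include <stdio.h>

int main(void)
{
    const unsigned long per_group = 4096UL * 8;    /* blocks per 4K bitmap */
    unsigned long block = 130000, count = 100000;

    while (count > 0) {
        unsigned long offset = block % per_group;  /* block - start */
        unsigned long chunk = per_group - offset;  /* room left in group */
        if (chunk > count)
            chunk = count;
        printf("group %lu: set %lu bits from offset %lu\n",
               block / per_group, chunk, offset);
        block += chunk;
        count -= chunk;
    }
    return 0;
}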
- */ -static int setup_new_flex_group_blocks(struct super_block *sb, - struct ext4_new_flex_group_data *flex_gd) -{ - int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; - ext4_fsblk_t start; - ext4_fsblk_t block; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - struct ext4_new_group_data *group_data = flex_gd->groups; - __u16 *bg_flags = flex_gd->bg_flags; - handle_t *handle; - ext4_group_t group, count; - struct buffer_head *bh = NULL; - int reserved_gdb, i, j, err = 0, err2; - - BUG_ON(!flex_gd->count || !group_data || - group_data[0].group != sbi->s_groups_count); - - reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); - - /* This transaction may be extended/restarted along the way */ - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - group = group_data[0].group; - for (i = 0; i < flex_gd->count; i++, group++) { - unsigned long gdblocks; - - gdblocks = ext4_bg_num_gdb(sb, group); - start = ext4_group_first_block_no(sb, group); - - /* Copy all of the GDT blocks into the backup in this group */ - for (j = 0, block = start + 1; j < gdblocks; j++, block++) { - struct buffer_head *gdb; - - ext4_debug("update backup group %#04llx\n", block); - err = extend_or_restart_transaction(handle, 1); - if (err) - goto out; - - gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; - goto out; - } - - err = ext4_journal_get_write_access(handle, gdb); - if (err) { - brelse(gdb); - goto out; - } - memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, - gdb->b_size); - set_buffer_uptodate(gdb); - - err = ext4_handle_dirty_metadata(handle, NULL, gdb); - if (unlikely(err)) { - brelse(gdb); - goto out; - } - brelse(gdb); - } - - /* Zero out all of the reserved backup group descriptor - * table blocks - */ - if (ext4_bg_has_super(sb, group)) { - err = sb_issue_zeroout(sb, gdblocks + start + 1, - reserved_gdb, GFP_NOFS); - if (err) - goto out; - } - - /* Initialize group tables of the grop @group */ - if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) - goto handle_bb; - - /* Zero out all of the inode table blocks */ - block = group_data[i].inode_table; - ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", - block, sbi->s_itb_per_group); - err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, - GFP_NOFS); - if (err) - goto out; - -handle_bb: - if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) - goto handle_ib; - - /* Initialize block bitmap of the @group */ - block = group_data[i].block_bitmap; - err = extend_or_restart_transaction(handle, 1); - if (err) - goto out; - - bh = bclean(handle, sb, block); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); - goto out; - } - if (ext4_bg_has_super(sb, group)) { - ext4_debug("mark backup superblock %#04llx (+0)\n", - start); - ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + - 1); - } - ext4_mark_bitmap_end(group_data[i].blocks_count, - sb->s_blocksize * 8, bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (err) - goto out; - brelse(bh); - -handle_ib: - if (bg_flags[i] & EXT4_BG_INODE_UNINIT) - continue; - - /* Initialize inode bitmap of the @group */ - block = group_data[i].inode_bitmap; - err = extend_or_restart_transaction(handle, 1); - if (err) - goto out; - /* Mark unused entries in inode bitmap used */ - bh = bclean(handle, sb, block); - if (IS_ERR(bh)) { - err = PTR_ERR(bh); - goto out; - } - - ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), - sb->s_blocksize * 8, bh->b_data); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - if (err) - 
goto out; - brelse(bh); - } - bh = NULL; - - /* Mark group tables in block bitmap */ - for (j = 0; j < GROUP_TABLE_COUNT; j++) { - count = group_table_count[j]; - start = (&group_data[0].block_bitmap)[j]; - block = start; - for (i = 1; i < flex_gd->count; i++) { - block += group_table_count[j]; - if (block == (&group_data[i].block_bitmap)[j]) { - count += group_table_count[j]; - continue; - } - err = set_flexbg_block_bitmap(sb, handle, - flex_gd, start, count); - if (err) - goto out; - count = group_table_count[j]; - start = group_data[i].block_bitmap; - block = start; - } - - if (count) { - err = set_flexbg_block_bitmap(sb, handle, - flex_gd, start, count); - if (err) - goto out; - } - } - -out: - brelse(bh); - err2 = ext4_journal_stop(handle); - if (err2 && !err) - err = err2; - - return err; -} - -/* - * Iterate through the groups which hold BACKUP superblock/GDT copies in an - * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before - * calling this for the first time. In a sparse filesystem it will be the - * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... - * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... - */ -static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, - unsigned *five, unsigned *seven) -{ - unsigned *min = three; - int mult = 3; - unsigned ret; - - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ret = *min; - *min += 1; - return ret; - } - - if (*five < *min) { - min = five; - mult = 5; - } - if (*seven < *min) { - min = seven; - mult = 7; - } - - ret = *min; - *min *= mult; - - return ret; -} - -/* - * Check that all of the backup GDT blocks are held in the primary GDT block. - * It is assumed that they are stored in group order. Returns the number of - * groups in current filesystem that have BACKUPS, or -ve error code. - */ -static int verify_reserved_gdb(struct super_block *sb, - ext4_group_t end, - struct buffer_head *primary) -{ - const ext4_fsblk_t blk = primary->b_blocknr; - unsigned three = 1; - unsigned five = 5; - unsigned seven = 7; - unsigned grp; - __le32 *p = (__le32 *)primary->b_data; - int gdbackups = 0; - - while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { - if (le32_to_cpu(*p++) != - grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ - ext4_warning(sb, "reserved GDT %llu" - " missing grp %d (%llu)", - blk, grp, - grp * - (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + - blk); - return -EINVAL; - } - if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb)) - return -EFBIG; - } - - return gdbackups; -} - -/* - * Called when we need to bring a reserved group descriptor table block into - * use from the resize inode. The primary copy of the new GDT block currently - * is an indirect block (under the double indirect block in the resize inode). - * The new backup GDT blocks will be stored as leaf blocks in this indirect - * block, in group order. Even though we know all the block numbers we need, - * we check to ensure that the resize inode has actually reserved these blocks. - * - * Don't need to update the block bitmaps because the blocks are still in use. - * - * We get all of the error cases out of the way, so that we are sure to not - * fail once we start modifying the data on disk, because JBD has no rollback. 
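
The three/five/seven counters used by ext4_list_backups() above can be hard to visualize. This stand-alone sketch (sparse_super assumed enabled) reproduces the merge of the three power sequences and prints 1, 3, 5, 7, 9, 25, 27, 49, 81; it is an illustration, not kernel code.

    #include <stdio.h>

    /* Mirror of the sparse-filesystem branch of ext4_list_backups(). */
    static unsigned list_backups(unsigned *three, unsigned *five, unsigned *seven)
    {
        unsigned *min = three;
        int mult = 3;
        unsigned ret;

        if (*five < *min) {
            min = five;
            mult = 5;
        }
        if (*seven < *min) {
            min = seven;
            mult = 7;
        }

        /* Return the smallest counter and advance it to its next power. */
        ret = *min;
        *min *= mult;
        return ret;
    }

    int main(void)
    {
        unsigned three = 1, five = 5, seven = 7, grp;

        /* Print the groups holding backup superblock/GDT copies below 100. */
        while ((grp = list_backups(&three, &five, &seven)) < 100)
            printf("%u ", grp);
        printf("\n");   /* 1 3 5 7 9 25 27 49 81 */
        return 0;
    }
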
- */ -static int add_new_gdb(handle_t *handle, struct inode *inode, - ext4_group_t group) -{ - struct super_block *sb = inode->i_sb; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); - ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; - struct buffer_head **o_group_desc, **n_group_desc; - struct buffer_head *dind; - struct buffer_head *gdb_bh; - int gdbackups; - struct ext4_iloc iloc; - __le32 *data; - int err; - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG - "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", - gdb_num); - - /* - * If we are not using the primary superblock/GDT copy don't resize, - * because the user tools have no way of handling this. Probably a - * bad time to do it anyways. - */ - if (EXT4_SB(sb)->s_sbh->b_blocknr != - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { - ext4_warning(sb, "won't resize using backup superblock at %llu", - (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); - return -EPERM; - } - - gdb_bh = sb_bread(sb, gdblock); - if (!gdb_bh) - return -EIO; - - gdbackups = verify_reserved_gdb(sb, group, gdb_bh); - if (gdbackups < 0) { - err = gdbackups; - goto exit_bh; - } - - data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; - goto exit_bh; - } - - data = (__le32 *)dind->b_data; - if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext4_warning(sb, "new group %u GDT block %llu not reserved", - group, gdblock); - err = -EINVAL; - goto exit_dind; - } - - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); - if (unlikely(err)) - goto exit_dind; - - err = ext4_journal_get_write_access(handle, gdb_bh); - if (unlikely(err)) - goto exit_sbh; - - err = ext4_journal_get_write_access(handle, dind); - if (unlikely(err)) - ext4_std_error(sb, err); - - /* ext4_reserve_inode_write() gets a reference on the iloc */ - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (unlikely(err)) - goto exit_dindj; - - n_group_desc = ext4_kvmalloc((gdb_num + 1) * - sizeof(struct buffer_head *), - GFP_NOFS); - if (!n_group_desc) { - err = -ENOMEM; - ext4_warning(sb, "not enough memory for %lu groups", - gdb_num + 1); - goto exit_inode; - } - - /* - * Finally, we have all of the possible failures behind us... - * - * Remove new GDT block from inode double-indirect block and clear out - * the new GDT block for use (which also "frees" the backup GDT blocks - * from the reserved inode). We don't need to change the bitmaps for - * these blocks, because they are marked as in-use from being in the - * reserved inode, and will become GDT blocks (primary and backup). 
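
As a rough illustration of where add_new_gdb() above expects the new primary GDT block: the descriptor for group N lives in GDT block N / EXT4_DESC_PER_BLOCK(sb), and the primary GDT immediately follows the superblock. A sketch with an assumed 4 KiB block size, 32-byte descriptors, and an invented group number:

    #include <stdio.h>

    int main(void)
    {
        unsigned blocksize = 4096;          /* assumed block size          */
        unsigned desc_size = 32;            /* assumed descriptor size     */
        unsigned desc_per_block = blocksize / desc_size;  /* 128 per block */
        unsigned long long sb_block = 0;    /* block holding the superblock */
        unsigned group = 1000;              /* hypothetical new group      */

        unsigned long gdb_num = group / desc_per_block;
        /* Same layout rule as the gdblock computation in add_new_gdb(). */
        unsigned long long gdblock = sb_block + 1 + gdb_num;

        printf("group %u -> GDT block #%lu at fs block %llu\n",
               group, gdb_num, gdblock);
        return 0;
    }
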
- */ - data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; - err = ext4_handle_dirty_metadata(handle, NULL, dind); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_inode; - } - inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; - ext4_mark_iloc_dirty(handle, inode, &iloc); - memset(gdb_bh->b_data, 0, sb->s_blocksize); - err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); - if (unlikely(err)) { - ext4_std_error(sb, err); - goto exit_inode; - } - brelse(dind); - - o_group_desc = EXT4_SB(sb)->s_group_desc; - memcpy(n_group_desc, o_group_desc, - EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); - n_group_desc[gdb_num] = gdb_bh; - EXT4_SB(sb)->s_group_desc = n_group_desc; - EXT4_SB(sb)->s_gdb_count++; - ext4_kvfree(o_group_desc); - - le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); - if (err) - ext4_std_error(sb, err); - - return err; - -exit_inode: - ext4_kvfree(n_group_desc); - /* ext4_handle_release_buffer(handle, iloc.bh); */ - brelse(iloc.bh); -exit_dindj: - /* ext4_handle_release_buffer(handle, dind); */ -exit_sbh: - /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ -exit_dind: - brelse(dind); -exit_bh: - brelse(gdb_bh); - - ext4_debug("leaving with error %d\n", err); - return err; -} - -/* - * Called when we are adding a new group which has a backup copy of each of - * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. - * We need to add these reserved backup GDT blocks to the resize inode, so - * that they are kept for future resizing and not allocated to files. - * - * Each reserved backup GDT block will go into a different indirect block. - * The indirect blocks are actually the primary reserved GDT blocks, - * so we know in advance what their block numbers are. We only get the - * double-indirect block to verify it is pointing to the primary reserved - * GDT blocks so we don't overwrite a data block by accident. The reserved - * backup GDT blocks are stored in their reserved primary GDT block. 
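
reserve_backup_gdb() below records, in each reserved primary GDT block, the block number of that block's backup inside the new group; the backup sits at the same offset from the group's start. A toy calculation under assumed geometry (8192 blocks per group, made-up block numbers):

    #include <stdio.h>

    int main(void)
    {
        unsigned long long blocks_per_group = 8192; /* assumed geometry       */
        unsigned group = 81;                        /* hypothetical new group */
        unsigned long long primary_blk = 42;        /* a reserved primary GDT block */

        /* Matches the data[gdbackups] assignment in reserve_backup_gdb()
         * below: backup = group start + the primary's own block number. */
        unsigned long long backup = group * blocks_per_group + primary_blk;

        printf("backup of GDT block %llu for group %u -> block %llu\n",
               primary_blk, group, backup);
        return 0;
    }
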
- */ -static int reserve_backup_gdb(handle_t *handle, struct inode *inode, - ext4_group_t group) -{ - struct super_block *sb = inode->i_sb; - int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); - struct buffer_head **primary; - struct buffer_head *dind; - struct ext4_iloc iloc; - ext4_fsblk_t blk; - __le32 *data, *end; - int gdbackups = 0; - int res, i; - int err; - - primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); - if (!primary) - return -ENOMEM; - - data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; - goto exit_free; - } - - blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; - data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % - EXT4_ADDR_PER_BLOCK(sb)); - end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); - - /* Get each reserved primary GDT block and verify it holds backups */ - for (res = 0; res < reserved_gdb; res++, blk++) { - if (le32_to_cpu(*data) != blk) { - ext4_warning(sb, "reserved block %llu" - " not at offset %ld", - blk, - (long)(data - (__le32 *)dind->b_data)); - err = -EINVAL; - goto exit_bh; - } - primary[res] = sb_bread(sb, blk); - if (!primary[res]) { - err = -EIO; - goto exit_bh; - } - gdbackups = verify_reserved_gdb(sb, group, primary[res]); - if (gdbackups < 0) { - brelse(primary[res]); - err = gdbackups; - goto exit_bh; - } - if (++data >= end) - data = (__le32 *)dind->b_data; - } - - for (i = 0; i < reserved_gdb; i++) { - if ((err = ext4_journal_get_write_access(handle, primary[i]))) { - /* - int j; - for (j = 0; j < i; j++) - ext4_handle_release_buffer(handle, primary[j]); - */ - goto exit_bh; - } - } - - if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) - goto exit_bh; - - /* - * Finally we can add each of the reserved backup GDT blocks from - * the new group to its reserved primary GDT block. - */ - blk = group * EXT4_BLOCKS_PER_GROUP(sb); - for (i = 0; i < reserved_gdb; i++) { - int err2; - data = (__le32 *)primary[i]->b_data; - /* printk("reserving backup %lu[%u] = %lu\n", - primary[i]->b_blocknr, gdbackups, - blk + primary[i]->b_blocknr); */ - data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); - err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); - if (!err) - err = err2; - } - inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; - ext4_mark_iloc_dirty(handle, inode, &iloc); - -exit_bh: - while (--res >= 0) - brelse(primary[res]); - brelse(dind); - -exit_free: - kfree(primary); - - return err; -} - -/* - * Update the backup copies of the ext4 metadata. These don't need to be part - * of the main resize transaction, because e2fsck will re-write them if there - * is a problem (basically only OOM will cause a problem). However, we - * _should_ update the backups if possible, in case the primary gets trashed - * for some reason and we need to run e2fsck from a backup superblock. The - * important part is that the new block and inode counts are in the backup - * superblocks, and the location of the new group metadata in the GDT backups. - * - * We do not need take the s_resize_lock for this, because these - * blocks are not otherwise touched by the filesystem code when it is - * mounted. We don't need to worry about last changing from - * sbi->s_groups_count, because the worst that can happen is that we - * do not copy the full number of backups at this time. The resize - * which changed s_groups_count will backup again. 
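
update_backups() below copies size bytes of the metadata image into each backup block and zero-fills the remainder. The copy-and-pad step in isolation looks like this (buffer sizes invented):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char block[4096];                   /* stands in for bh->b_data    */
        char data[1024] = "superblock...";  /* metadata image to replicate */
        int size = sizeof(data);
        int rest = sizeof(block) - size;    /* same "rest" as below        */

        memcpy(block, data, size);
        if (rest)
            memset(block + size, 0, rest);  /* pad the backup with zeros */

        printf("wrote %d bytes, zeroed %d\n", size, rest);
        return 0;
    }
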
- */
-static void update_backups(struct super_block *sb,
-			   int blk_off, char *data, int size)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	const ext4_group_t last = sbi->s_groups_count;
-	const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
-	unsigned three = 1;
-	unsigned five = 5;
-	unsigned seven = 7;
-	ext4_group_t group;
-	int rest = sb->s_blocksize - size;
-	handle_t *handle;
-	int err = 0, err2;
-
-	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
-	if (IS_ERR(handle)) {
-		group = 1;
-		err = PTR_ERR(handle);
-		goto exit_err;
-	}
-
-	while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
-		struct buffer_head *bh;
-
-		/* Out of journal space, and can't get more - abort - so sad */
-		if (ext4_handle_valid(handle) &&
-		    handle->h_buffer_credits == 0 &&
-		    ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
-		    (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
-			break;
-
-		bh = sb_getblk(sb, group * bpg + blk_off);
-		if (!bh) {
-			err = -EIO;
-			break;
-		}
-		ext4_debug("update metadata backup %#04lx\n",
-			   (unsigned long)bh->b_blocknr);
-		if ((err = ext4_journal_get_write_access(handle, bh)))
-			break;
-		lock_buffer(bh);
-		memcpy(bh->b_data, data, size);
-		if (rest)
-			memset(bh->b_data + size, 0, rest);
-		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
-		err = ext4_handle_dirty_metadata(handle, NULL, bh);
-		if (unlikely(err))
-			ext4_std_error(sb, err);
-		brelse(bh);
-	}
-	if ((err2 = ext4_journal_stop(handle)) && !err)
-		err = err2;
-
-	/*
-	 * Ugh! Need to have e2fsck write the backup copies.  It is too
-	 * late to revert the resize, we shouldn't fail just because of
-	 * the backup copies (they are only needed in case of corruption).
-	 *
-	 * However, if we got here we have a journal problem too, so we
-	 * can't really start a transaction to mark the superblock.
-	 * Chicken out and just set the flag on the hope it will be written
-	 * to disk, and if not - we will simply wait until next fsck.
-	 */
-exit_err:
-	if (err) {
-		ext4_warning(sb, "can't update backup for group %u (err %d), "
-			     "forcing fsck on next reboot", group, err);
-		sbi->s_mount_state &= ~EXT4_VALID_FS;
-		sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
-		mark_buffer_dirty(sbi->s_sbh);
-	}
-}
-
-/*
- * ext4_add_new_descs() adds @count group descriptors of groups
- * starting at @group
- *
- * @handle: journal handle
- * @sb: super block
- * @group: the group no. of the first group desc to be added
- * @resize_inode: the resize inode
- * @count: number of group descriptors to be added
- */
-static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
-			      ext4_group_t group, struct inode *resize_inode,
-			      ext4_group_t count)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_super_block *es = sbi->s_es;
-	struct buffer_head *gdb_bh;
-	int i, gdb_off, gdb_num, err = 0;
-
-	for (i = 0; i < count; i++, group++) {
-		int reserved_gdb = ext4_bg_has_super(sb, group) ?
-			le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
-
-		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
-		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
-
-		/*
-		 * We will only either add reserved group blocks to a backup group
-		 * or remove reserved blocks for the first group in a new group block.
-		 * Doing both would mean more complex code, and sane people don't
-		 * use non-sparse filesystems anymore.  This is already checked above.
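
The gdb_off test that follows decides between the two cases named above: the group number modulo EXT4_DESC_PER_BLOCK(sb) is zero exactly when the group opens a brand-new GDT block. A sketch with an assumed 128 descriptors per GDT block:

    #include <stdio.h>

    int main(void)
    {
        unsigned desc_per_block = 128;  /* assumed EXT4_DESC_PER_BLOCK(sb) */
        unsigned groups[] = { 127, 128, 129, 256 };
        unsigned i;

        for (i = 0; i < sizeof(groups) / sizeof(groups[0]); i++) {
            unsigned gdb_off = groups[i] % desc_per_block;
            unsigned gdb_num = groups[i] / desc_per_block;

            /* gdb_off == 0: pull a reserved GDT block into service
             * (add_new_gdb); otherwise reuse GDT block gdb_num. */
            printf("group %u -> GDT block %u, %s\n", groups[i], gdb_num,
                   gdb_off ? "existing block" : "new GDT block needed");
        }
        return 0;
    }
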
- */
-		if (gdb_off) {
-			gdb_bh = sbi->s_group_desc[gdb_num];
-			err = ext4_journal_get_write_access(handle, gdb_bh);
-
-			if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
-				err = reserve_backup_gdb(handle, resize_inode, group);
-		} else
-			err = add_new_gdb(handle, resize_inode, group);
-		if (err)
-			break;
-	}
-	return err;
-}
-
-/*
- * ext4_setup_new_descs() will set up the group descriptors of a flex bg
- */
-static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
-				struct ext4_new_flex_group_data *flex_gd)
-{
-	struct ext4_new_group_data *group_data = flex_gd->groups;
-	struct ext4_group_desc *gdp;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct buffer_head *gdb_bh;
-	ext4_group_t group;
-	__u16 *bg_flags = flex_gd->bg_flags;
-	int i, gdb_off, gdb_num, err = 0;
-
-
-	for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
-		group = group_data->group;
-
-		gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
-		gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
-
-		/*
-		 * get_write_access() has been called on gdb_bh by ext4_add_new_descs().
-		 */
-		gdb_bh = sbi->s_group_desc[gdb_num];
-		/* Update group descriptor block for new group */
-		gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
-						 gdb_off * EXT4_DESC_SIZE(sb));
-
-		memset(gdp, 0, EXT4_DESC_SIZE(sb));
-		ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
-		ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
-		ext4_inode_table_set(sb, gdp, group_data->inode_table);
-		ext4_free_group_clusters_set(sb, gdp,
-			EXT4_B2C(sbi, group_data->free_blocks_count));
-		ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
-		gdp->bg_flags = cpu_to_le16(*bg_flags);
-		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-
-		err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
-		if (unlikely(err)) {
-			ext4_std_error(sb, err);
-			break;
-		}
-
-		/*
-		 * We can allocate memory for mb_alloc based on the new group
-		 * descriptor
-		 */
-		err = ext4_mb_add_groupinfo(sb, group, gdp);
-		if (err)
-			break;
-	}
-	return err;
-}
-
-/*
- * ext4_update_super() updates the super block so that the newly added
- * groups can be seen by the filesystem.
- *
- * @sb: super block
- * @flex_gd: newly added groups
- */
-static void ext4_update_super(struct super_block *sb,
-			      struct ext4_new_flex_group_data *flex_gd)
-{
-	ext4_fsblk_t blocks_count = 0;
-	ext4_fsblk_t free_blocks = 0;
-	ext4_fsblk_t reserved_blocks = 0;
-	struct ext4_new_group_data *group_data = flex_gd->groups;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_super_block *es = sbi->s_es;
-	int i;
-
-	BUG_ON(flex_gd->count == 0 || group_data == NULL);
-	/*
-	 * Make the new blocks and inodes valid next.  We do this before
-	 * increasing the group count so that once the group is enabled,
-	 * all of its blocks and inodes are already valid.
-	 *
-	 * We always allocate group-by-group, then block-by-block or
-	 * inode-by-inode within a group, so enabling these
-	 * blocks/inodes before the group is live won't actually let us
-	 * allocate the new space yet.
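
The reserved-blocks arithmetic just below keeps the reserved percentage constant across the resize: it derives the current percentage with integer division and applies it to the added blocks. In isolation, with made-up figures, the same shape as the do_div() sequence is:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long fs_blocks = 1000000;   /* current fs size (made up) */
        unsigned long long r_blocks = 50000;      /* currently reserved (5%)   */
        unsigned long long added = 200000;        /* blocks being added        */

        /* Percentage first, then apply it to the new blocks. */
        unsigned long long reserved = r_blocks * 100 / fs_blocks * added / 100;

        printf("reserve %llu of the %llu new blocks (~%llu%%)\n",
               reserved, added, r_blocks * 100 / fs_blocks);
        return 0;
    }
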
- */ - for (i = 0; i < flex_gd->count; i++) { - blocks_count += group_data[i].blocks_count; - free_blocks += group_data[i].free_blocks_count; - } - - reserved_blocks = ext4_r_blocks_count(es) * 100; - do_div(reserved_blocks, ext4_blocks_count(es)); - reserved_blocks *= blocks_count; - do_div(reserved_blocks, 100); - - ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); - ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); - le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * - flex_gd->count); - le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * - flex_gd->count); - - /* - * We need to protect s_groups_count against other CPUs seeing - * inconsistent state in the superblock. - * - * The precise rules we use are: - * - * * Writers must perform a smp_wmb() after updating all - * dependent data and before modifying the groups count - * - * * Readers must perform an smp_rmb() after reading the groups - * count and before reading any dependent data. - * - * NB. These rules can be relaxed when checking the group count - * while freeing data, as we can only allocate from a block - * group after serialising against the group count, and we can - * only then free after serialising in turn against that - * allocation. - */ - smp_wmb(); - - /* Update the global fs size fields */ - sbi->s_groups_count += flex_gd->count; - - /* Update the reserved block counts only once the new group is - * active. */ - ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + - reserved_blocks); - - /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeclusters_counter, - EXT4_B2C(sbi, free_blocks)); - percpu_counter_add(&sbi->s_freeinodes_counter, - EXT4_INODES_PER_GROUP(sb) * flex_gd->count); - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_FLEX_BG) && - sbi->s_log_groups_per_flex) { - ext4_group_t flex_group; - flex_group = ext4_flex_group(sbi, group_data[0].group); - atomic_add(EXT4_B2C(sbi, free_blocks), - &sbi->s_flex_groups[flex_group].free_clusters); - atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, - &sbi->s_flex_groups[flex_group].free_inodes); - } - - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: added group %u:" - "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, - blocks_count, free_blocks, reserved_blocks); -} - -/* Add a flex group to an fs. Ensure we handle all possible error conditions - * _before_ we start modifying the filesystem, because we cannot abort the - * transaction and not have it write the data to disk. - */ -static int ext4_flex_group_add(struct super_block *sb, - struct inode *resize_inode, - struct ext4_new_flex_group_data *flex_gd) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - ext4_fsblk_t o_blocks_count; - ext4_grpblk_t last; - ext4_group_t group; - handle_t *handle; - unsigned reserved_gdb; - int err = 0, err2 = 0, credit; - - BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); - - reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); - o_blocks_count = ext4_blocks_count(es); - ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); - BUG_ON(last); - - err = setup_new_flex_group_blocks(sb, flex_gd); - if (err) - goto exit; - /* - * We will always be modifying at least the superblock and GDT - * block. If we are adding a group past the last current GDT block, - * we will also modify the inode and the dindirect block. 
If we
-	 * are adding a group with superblock/GDT backups we will also
-	 * modify each of the reserved GDT dindirect blocks.
-	 */
-	credit = flex_gd->count * 4 + reserved_gdb;
-	handle = ext4_journal_start_sb(sb, credit);
-	if (IS_ERR(handle)) {
-		err = PTR_ERR(handle);
-		goto exit;
-	}
-
-	err = ext4_journal_get_write_access(handle, sbi->s_sbh);
-	if (err)
-		goto exit_journal;
-
-	group = flex_gd->groups[0].group;
-	BUG_ON(group != EXT4_SB(sb)->s_groups_count);
-	err = ext4_add_new_descs(handle, sb, group,
-				 resize_inode, flex_gd->count);
-	if (err)
-		goto exit_journal;
-
-	err = ext4_setup_new_descs(handle, sb, flex_gd);
-	if (err)
-		goto exit_journal;
-
-	ext4_update_super(sb, flex_gd);
-
-	err = ext4_handle_dirty_super(handle, sb);
-
-exit_journal:
-	err2 = ext4_journal_stop(handle);
-	if (!err)
-		err = err2;
-
-	if (!err) {
-		int i;
-		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
-			       sizeof(struct ext4_super_block));
-		for (i = 0; i < flex_gd->count; i++, group++) {
-			struct buffer_head *gdb_bh;
-			int gdb_num;
-			gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
-			gdb_bh = sbi->s_group_desc[gdb_num];
-			update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
-				       gdb_bh->b_size);
-		}
-	}
-exit:
-	return err;
-}
-
-static int ext4_setup_next_flex_gd(struct super_block *sb,
-				   struct ext4_new_flex_group_data *flex_gd,
-				   ext4_fsblk_t n_blocks_count,
-				   unsigned long flexbg_size)
-{
-	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-	struct ext4_new_group_data *group_data = flex_gd->groups;
-	ext4_fsblk_t o_blocks_count;
-	ext4_group_t n_group;
-	ext4_group_t group;
-	ext4_group_t last_group;
-	ext4_grpblk_t last;
-	ext4_grpblk_t blocks_per_group;
-	unsigned long i;
-
-	blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
-
-	o_blocks_count = ext4_blocks_count(es);
-
-	if (o_blocks_count == n_blocks_count)
-		return 0;
-
-	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
-	BUG_ON(last);
-	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
-
-	last_group = group | (flexbg_size - 1);
-	if (last_group > n_group)
-		last_group = n_group;
-
-	flex_gd->count = last_group - group + 1;
-
-	for (i = 0; i < flex_gd->count; i++) {
-		int overhead;
-
-		group_data[i].group = group + i;
-		group_data[i].blocks_count = blocks_per_group;
-		overhead = ext4_bg_has_super(sb, group + i) ?
-			   (1 + ext4_bg_num_gdb(sb, group + i) +
-			    le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
-		group_data[i].free_blocks_count = blocks_per_group - overhead;
-		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
-			flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
-					       EXT4_BG_INODE_UNINIT;
-		else
-			flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
-	}
-
-	if (last_group == n_group &&
-	    EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
-		/* We need to initialize block bitmap of last group. */
-		flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
-
-	if ((last_group == n_group) && (last != blocks_per_group - 1)) {
-		group_data[i - 1].blocks_count = last + 1;
-		group_data[i - 1].free_blocks_count -= blocks_per_group -
-						       last - 1;
-	}
-
-	return 1;
-}
-
-/* Add group descriptor data to an existing or new group descriptor block.
- * Ensure we handle all possible error conditions _before_ we start modifying
- * the filesystem, because we cannot abort the transaction and not have it
- * write the data to disk.
- *
- * If we are on a GDT block boundary, we need to get the reserved GDT block.
- * Otherwise, we may need to add backup GDT blocks for a sparse group.
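
The last_group = group | (flexbg_size - 1) trick in ext4_setup_next_flex_gd() above rounds each chunk up to the end of its flex group (flexbg_size is a power of two). A sketch of how a resize is carved into chunks, with invented sizes:

    #include <stdio.h>

    int main(void)
    {
        unsigned flexbg_size = 16;  /* power of two, as in ext4      */
        unsigned group = 35;        /* first group past the old size */
        unsigned n_group = 100;     /* last group of the new size    */

        while (group <= n_group) {
            /* End of this group's flex group, capped at n_group. */
            unsigned last_group = group | (flexbg_size - 1);

            if (last_group > n_group)
                last_group = n_group;
            printf("flex chunk: groups %u..%u (%u groups)\n",
                   group, last_group, last_group - group + 1);
            group = last_group + 1;
        }
        return 0;
    }
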
- * - * We only need to hold the superblock lock while we are actually adding - * in the new group's counts to the superblock. Prior to that we have - * not really "added" the group at all. We re-check that we are still - * adding in the last group in case things have changed since verifying. - */ -int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) -{ - struct ext4_new_flex_group_data flex_gd; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - int reserved_gdb = ext4_bg_has_super(sb, input->group) ? - le16_to_cpu(es->s_reserved_gdt_blocks) : 0; - struct inode *inode = NULL; - int gdb_off, gdb_num; - int err; - __u16 bg_flags = 0; - - gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); - gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); - - if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext4_warning(sb, "Can't resize non-sparse filesystem further"); - return -EPERM; - } - - if (ext4_blocks_count(es) + input->blocks_count < - ext4_blocks_count(es)) { - ext4_warning(sb, "blocks_count overflow"); - return -EINVAL; - } - - if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < - le32_to_cpu(es->s_inodes_count)) { - ext4_warning(sb, "inodes_count overflow"); - return -EINVAL; - } - - if (reserved_gdb || gdb_off == 0) { - if (!EXT4_HAS_COMPAT_FEATURE(sb, - EXT4_FEATURE_COMPAT_RESIZE_INODE) - || !le16_to_cpu(es->s_reserved_gdt_blocks)) { - ext4_warning(sb, - "No reserved GDT blocks, can't resize"); - return -EPERM; - } - inode = ext4_iget(sb, EXT4_RESIZE_INO); - if (IS_ERR(inode)) { - ext4_warning(sb, "Error opening resize inode"); - return PTR_ERR(inode); - } - } - - - err = verify_group_input(sb, input); - if (err) - goto out; - - flex_gd.count = 1; - flex_gd.groups = input; - flex_gd.bg_flags = &bg_flags; - err = ext4_flex_group_add(sb, inode, &flex_gd); -out: - iput(inode); - return err; -} /* ext4_group_add */ - -/* - * extend a group without checking assuming that checking has been done. - */ -static int ext4_group_extend_no_check(struct super_block *sb, - ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) -{ - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - handle_t *handle; - int err = 0, err2; - - /* We will update the superblock, one block bitmap, and - * one group descriptor via ext4_group_add_blocks(). 
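
ext4_group_add() above rejects wrap-around with the classic unsigned-overflow test (a + b < a), which is well defined because unsigned arithmetic wraps in C. Stand-alone, the idiom is:

    #include <stdio.h>

    /* Returns 1 if a + b would wrap around the unsigned long long range;
     * the same test ext4_group_add() applies to the block and inode counts. */
    static int would_overflow(unsigned long long a, unsigned long long b)
    {
        return a + b < a;
    }

    int main(void)
    {
        printf("%d\n", would_overflow(~0ULL - 5, 10));  /* 1: wraps */
        printf("%d\n", would_overflow(1000, 10));       /* 0: safe  */
        return 0;
    }
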
- */ - handle = ext4_journal_start_sb(sb, 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - ext4_warning(sb, "error %d on journal start", err); - return err; - } - - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); - if (err) { - ext4_warning(sb, "error %d on journal write access", err); - goto errout; - } - - ext4_blocks_count_set(es, o_blocks_count + add); - ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); - ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - /* We add the blocks to the bitmap and set the group need init bit */ - err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); - if (err) - goto errout; - ext4_handle_dirty_super(handle, sb); - ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); -errout: - err2 = ext4_journal_stop(handle); - if (err2 && !err) - err = err2; - - if (!err) { - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extended group to %llu " - "blocks\n", ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); - } - return err; -} - -/* - * Extend the filesystem to the new number of blocks specified. This entry - * point is only used to extend the current filesystem to the end of the last - * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" - * for emergencies (because it has no dependencies on reserved blocks). - * - * If we _really_ wanted, we could use default values to call ext4_group_add() - * allow the "remount" trick to work for arbitrary resizing, assuming enough - * GDT blocks are reserved to grow to the desired size. - */ -int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, - ext4_fsblk_t n_blocks_count) -{ - ext4_fsblk_t o_blocks_count; - ext4_grpblk_t last; - ext4_grpblk_t add; - struct buffer_head *bh; - int err; - ext4_group_t group; - - o_blocks_count = ext4_blocks_count(es); - - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, - "extending last group from %llu to %llu blocks", - o_blocks_count, n_blocks_count); - - if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) - return 0; - - if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - ext4_msg(sb, KERN_ERR, - "filesystem too large to resize to %llu blocks safely", - n_blocks_count); - if (sizeof(sector_t) < 8) - ext4_warning(sb, "CONFIG_LBDAF not enabled"); - return -EINVAL; - } - - if (n_blocks_count < o_blocks_count) { - ext4_warning(sb, "can't shrink FS - resize aborted"); - return -EINVAL; - } - - /* Handle the remaining blocks in the last group only. 
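
The code just below works out how many blocks are missing from the tail of the last group and caps the figure at the requested size. A compact model of that computation, with invented geometry and a first data block of 0 assumed:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long blocks_per_group = 32768;         /* assumed geometry */
        unsigned long long o_blocks_count = 163840 + 1000;   /* current size     */
        unsigned long long n_blocks_count = 163840 + 30000;  /* requested size   */

        /* Offset of the next block to add within its group. */
        unsigned long long last = o_blocks_count % blocks_per_group;
        unsigned long long add = blocks_per_group - last;

        if (o_blocks_count + add > n_blocks_count)
            add = n_blocks_count - o_blocks_count;  /* don't overshoot */
        printf("extend last group by %llu blocks\n", add);
        return 0;
    }
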
 */
-	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
-
-	if (last == 0) {
-		ext4_warning(sb, "need to use ext2online to resize further");
-		return -EPERM;
-	}
-
-	add = EXT4_BLOCKS_PER_GROUP(sb) - last;
-
-	if (o_blocks_count + add < o_blocks_count) {
-		ext4_warning(sb, "blocks_count overflow");
-		return -EINVAL;
-	}
-
-	if (o_blocks_count + add > n_blocks_count)
-		add = n_blocks_count - o_blocks_count;
-
-	if (o_blocks_count + add < n_blocks_count)
-		ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
-			     o_blocks_count + add, add);
-
-	/* See if the device is actually as big as what was requested */
-	bh = sb_bread(sb, o_blocks_count + add - 1);
-	if (!bh) {
-		ext4_warning(sb, "can't read last block, resize aborted");
-		return -ENOSPC;
-	}
-	brelse(bh);
-
-	err = ext4_group_extend_no_check(sb, o_blocks_count, add);
-	return err;
-} /* ext4_group_extend */
-
-/*
- * ext4_resize_fs() resizes a fs to a new size specified by @n_blocks_count
- *
- * @sb: super block of the fs to be resized
- * @n_blocks_count: the number of blocks in the resized fs
- */
-int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
-{
-	struct ext4_new_flex_group_data *flex_gd = NULL;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_super_block *es = sbi->s_es;
-	struct buffer_head *bh;
-	struct inode *resize_inode;
-	ext4_fsblk_t o_blocks_count;
-	ext4_group_t o_group;
-	ext4_group_t n_group;
-	ext4_grpblk_t offset, add;
-	unsigned long n_desc_blocks;
-	unsigned long o_desc_blocks;
-	unsigned long desc_blocks;
-	int err = 0, flexbg_size = 1;
-
-	o_blocks_count = ext4_blocks_count(es);
-
-	if (test_opt(sb, DEBUG))
-		ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
-			 "to %llu blocks", o_blocks_count, n_blocks_count);
-
-	if (n_blocks_count < o_blocks_count) {
-		/* On-line shrinking not supported */
-		ext4_warning(sb, "can't shrink FS - resize aborted");
-		return -EINVAL;
-	}
-
-	if (n_blocks_count == o_blocks_count)
-		/* Nothing to do */
-		return 0;
-
-	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
-	ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
-
-	n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
-			EXT4_DESC_PER_BLOCK(sb);
-	o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
-			EXT4_DESC_PER_BLOCK(sb);
-	desc_blocks = n_desc_blocks - o_desc_blocks;
-
-	if (desc_blocks &&
-	    (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
-	     le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
-		ext4_warning(sb, "No reserved GDT blocks, can't resize");
-		return -EPERM;
-	}
-
-	resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
-	if (IS_ERR(resize_inode)) {
-		ext4_warning(sb, "Error opening resize inode");
-		return PTR_ERR(resize_inode);
-	}
-
-	/* See if the device is actually as big as what was requested */
-	bh = sb_bread(sb, n_blocks_count - 1);
-	if (!bh) {
-		ext4_warning(sb, "can't read last block, resize aborted");
-		return -ENOSPC;
-	}
-	brelse(bh);
-
-	/* extend the last group */
-	if (n_group == o_group)
-		add = n_blocks_count - o_blocks_count;
-	else
-		add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
-	if (add > 0) {
-		err = ext4_group_extend_no_check(sb, o_blocks_count, add);
-		if (err)
-			goto out;
-	}
-
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
-	    es->s_log_groups_per_flex)
-		flexbg_size = 1 << es->s_log_groups_per_flex;
-
-	o_blocks_count = ext4_blocks_count(es);
-	if (o_blocks_count == n_blocks_count)
-		goto out;
-
-	flex_gd =
alloc_flex_gd(flexbg_size); - if (flex_gd == NULL) { - err = -ENOMEM; - goto out; - } - - /* Add flex groups. Note that a regular group is a - * flex group with 1 group. - */ - while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, - flexbg_size)) { - ext4_alloc_group_tables(sb, flex_gd, flexbg_size); - err = ext4_flex_group_add(sb, resize_inode, flex_gd); - if (unlikely(err)) - break; - } - -out: - if (flex_gd) - free_flex_gd(flex_gd); - - iput(resize_inode); - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu " - "upto %llu blocks", o_blocks_count, n_blocks_count); - return err; -} diff --git a/ANDROID_3.4.5/fs/ext4/super.c b/ANDROID_3.4.5/fs/ext4/super.c deleted file mode 100644 index a68703a5..00000000 --- a/ANDROID_3.4.5/fs/ext4/super.c +++ /dev/null @@ -1,4980 +0,0 @@ -/* - * linux/fs/ext4/super.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/inode.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - */ - -#include <linux/module.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/vmalloc.h> -#include <linux/jbd2.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/blkdev.h> -#include <linux/parser.h> -#include <linux/buffer_head.h> -#include <linux/exportfs.h> -#include <linux/vfs.h> -#include <linux/random.h> -#include <linux/mount.h> -#include <linux/namei.h> -#include <linux/quotaops.h> -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/ctype.h> -#include <linux/log2.h> -#include <linux/crc16.h> -#include <linux/cleancache.h> -#include <asm/uaccess.h> - -#include <linux/kthread.h> -#include <linux/freezer.h> - -#include "ext4.h" -#include "ext4_extents.h" -#include "ext4_jbd2.h" -#include "xattr.h" -#include "acl.h" -#include "mballoc.h" - -#define CREATE_TRACE_POINTS -#include <trace/events/ext4.h> - -static struct proc_dir_entry *ext4_proc_root; -static struct kset *ext4_kset; -static struct ext4_lazy_init *ext4_li_info; -static struct mutex ext4_li_mtx; -static struct ext4_features *ext4_feat; - -static int ext4_load_journal(struct super_block *, struct ext4_super_block *, - unsigned long journal_devnum); -static int ext4_show_options(struct seq_file *seq, struct dentry *root); -static int ext4_commit_super(struct super_block *sb, int sync); -static void ext4_mark_recovery_complete(struct super_block *sb, - struct ext4_super_block *es); -static void ext4_clear_journal_err(struct super_block *sb, - struct ext4_super_block *es); -static int ext4_sync_fs(struct super_block *sb, int wait); -static const char *ext4_decode_error(struct super_block *sb, int errno, - char nbuf[16]); -static int ext4_remount(struct super_block *sb, int *flags, char *data); -static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); -static int ext4_unfreeze(struct super_block *sb); -static void ext4_write_super(struct super_block *sb); -static int ext4_freeze(struct super_block *sb); -static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data); -static inline int ext2_feature_set_ok(struct super_block *sb); -static inline int ext3_feature_set_ok(struct super_block *sb); -static int ext4_feature_set_ok(struct super_block *sb, int readonly); -static void 
ext4_destroy_lazyinit_thread(void); -static void ext4_unregister_li_request(struct super_block *sb); -static void ext4_clear_request_list(void); - -#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) -static struct file_system_type ext2_fs_type = { - .owner = THIS_MODULE, - .name = "ext2", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) -#else -#define IS_EXT2_SB(sb) (0) -#endif - - -#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) -static struct file_system_type ext3_fs_type = { - .owner = THIS_MODULE, - .name = "ext3", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) -#else -#define IS_EXT3_SB(sb) (0) -#endif - -void *ext4_kvmalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kmalloc(size, flags); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; -} - -void *ext4_kvzalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kzalloc(size, flags); - if (!ret) - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); - return ret; -} - -void ext4_kvfree(void *ptr) -{ - if (is_vmalloc_addr(ptr)) - vfree(ptr); - else - kfree(ptr); - -} - -ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, - struct ext4_group_desc *bg) -{ - return le32_to_cpu(bg->bg_block_bitmap_lo) | - (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); -} - -ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, - struct ext4_group_desc *bg) -{ - return le32_to_cpu(bg->bg_inode_bitmap_lo) | - (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); -} - -ext4_fsblk_t ext4_inode_table(struct super_block *sb, - struct ext4_group_desc *bg) -{ - return le32_to_cpu(bg->bg_inode_table_lo) | - (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); -} - -__u32 ext4_free_group_clusters(struct super_block *sb, - struct ext4_group_desc *bg) -{ - return le16_to_cpu(bg->bg_free_blocks_count_lo) | - (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); -} - -__u32 ext4_free_inodes_count(struct super_block *sb, - struct ext4_group_desc *bg) -{ - return le16_to_cpu(bg->bg_free_inodes_count_lo) | - (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); -} - -__u32 ext4_used_dirs_count(struct super_block *sb, - struct ext4_group_desc *bg) -{ - return le16_to_cpu(bg->bg_used_dirs_count_lo) | - (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); -} - -__u32 ext4_itable_unused_count(struct super_block *sb, - struct ext4_group_desc *bg) -{ - return le16_to_cpu(bg->bg_itable_unused_lo) | - (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
- (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); -} - -void ext4_block_bitmap_set(struct super_block *sb, - struct ext4_group_desc *bg, ext4_fsblk_t blk) -{ - bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); - if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); -} - -void ext4_inode_bitmap_set(struct super_block *sb, - struct ext4_group_desc *bg, ext4_fsblk_t blk) -{ - bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); - if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); -} - -void ext4_inode_table_set(struct super_block *sb, - struct ext4_group_desc *bg, ext4_fsblk_t blk) -{ - bg->bg_inode_table_lo = cpu_to_le32((u32)blk); - if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); -} - -void ext4_free_group_clusters_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count) -{ - bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); - if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); -} - -void ext4_free_inodes_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count) -{ - bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); - if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); -} - -void ext4_used_dirs_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count) -{ - bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); - if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); -} - -void ext4_itable_unused_set(struct super_block *sb, - struct ext4_group_desc *bg, __u32 count) -{ - bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); - if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); -} - - -/* Just increment the non-pointer handle value */ -static handle_t *ext4_get_nojournal(void) -{ - handle_t *handle = current->journal_info; - unsigned long ref_cnt = (unsigned long)handle; - - BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); - - ref_cnt++; - handle = (handle_t *)ref_cnt; - - current->journal_info = handle; - return handle; -} - - -/* Decrement the non-pointer handle value */ -static void ext4_put_nojournal(handle_t *handle) -{ - unsigned long ref_cnt = (unsigned long)handle; - - BUG_ON(ref_cnt == 0); - - ref_cnt--; - handle = (handle_t *)ref_cnt; - - current->journal_info = handle; -} - -/* - * Wrappers for jbd2_journal_start/end. - * - * The only special thing we need to do here is to make sure that all - * journal_end calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. - * - * To avoid j_barrier hold in userspace when a user calls freeze(), - * ext4 prevents a new handle from being started by s_frozen, which - * is in an upper layer. - */ -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) -{ - journal_t *journal; - handle_t *handle; - - trace_ext4_journal_start(sb, nblocks, _RET_IP_); - if (sb->s_flags & MS_RDONLY) - return ERR_PTR(-EROFS); - - journal = EXT4_SB(sb)->s_journal; - handle = ext4_journal_current_handle(); - - /* - * If a handle has been started, it should be allowed to - * finish, otherwise deadlock could happen between freeze - * and others(e.g. 
truncate) due to the restart of the
-	 * journal handle if the filesystem is frozen and active
-	 * handles are not stopped.
-	 */
-	if (!handle)
-		vfs_check_frozen(sb, SB_FREEZE_TRANS);
-
-	if (!journal)
-		return ext4_get_nojournal();
-	/*
-	 * Special case here: if the journal has aborted behind our
-	 * backs (eg. EIO in the commit thread), then we still need to
-	 * take the FS itself readonly cleanly.
-	 */
-	if (is_journal_aborted(journal)) {
-		ext4_abort(sb, "Detected aborted journal");
-		return ERR_PTR(-EROFS);
-	}
-	return jbd2_journal_start(journal, nblocks);
-}
-
-/*
- * The only special thing we need to do here is to make sure that all
- * jbd2_journal_stop calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
-int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
-{
-	struct super_block *sb;
-	int err;
-	int rc;
-
-	if (!ext4_handle_valid(handle)) {
-		ext4_put_nojournal(handle);
-		return 0;
-	}
-	sb = handle->h_transaction->t_journal->j_private;
-	err = handle->h_err;
-	rc = jbd2_journal_stop(handle);
-
-	if (!err)
-		err = rc;
-	if (err)
-		__ext4_std_error(sb, where, line, err);
-	return err;
-}
-
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
-			       const char *err_fn, struct buffer_head *bh,
-			       handle_t *handle, int err)
-{
-	char nbuf[16];
-	const char *errstr = ext4_decode_error(NULL, err, nbuf);
-
-	BUG_ON(!ext4_handle_valid(handle));
-
-	if (bh)
-		BUFFER_TRACE(bh, "abort");
-
-	if (!handle->h_err)
-		handle->h_err = err;
-
-	if (is_handle_aborted(handle))
-		return;
-
-	printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
-	       caller, line, errstr, err_fn);
-
-	jbd2_journal_abort_handle(handle);
-}
-
-static void __save_error_info(struct super_block *sb, const char *func,
-			      unsigned int line)
-{
-	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
-	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
-	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-	es->s_last_error_time = cpu_to_le32(get_seconds());
-	strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
-	es->s_last_error_line = cpu_to_le32(line);
-	if (!es->s_first_error_time) {
-		es->s_first_error_time = es->s_last_error_time;
-		strncpy(es->s_first_error_func, func,
-			sizeof(es->s_first_error_func));
-		es->s_first_error_line = cpu_to_le32(line);
-		es->s_first_error_ino = es->s_last_error_ino;
-		es->s_first_error_block = es->s_last_error_block;
-	}
-	/*
-	 * Start the daily error reporting function if it hasn't been
-	 * started already
-	 */
-	if (!es->s_error_count)
-		mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
-	es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
-}
-
-static void save_error_info(struct super_block *sb, const char *func,
-			    unsigned int line)
-{
-	__save_error_info(sb, func, line);
-	ext4_commit_super(sb, 1);
-}
-
-/*
- * The del_gendisk() function uninitializes the disk-specific data
- * structures, including the bdi structure, without telling anyone
- * else.  Once this happens, any attempt to call mark_buffer_dirty()
- * (for example, by ext4_commit_super), will cause a kernel OOPS.
- * This is a kludge to prevent these oops until we can put in a proper
- * hook in del_gendisk() to inform the VFS and file system layers.
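
The no-journal handles earlier in this file (ext4_get_nojournal()/ext4_put_nojournal()) stash a plain nesting count in the pointer-sized current->journal_info slot instead of allocating anything. The trick in isolation, as a user-space sketch (the integer-to-pointer round trip is implementation-defined but works on the platforms the kernel targets):

    #include <stdio.h>

    typedef struct handle_s handle_t;   /* opaque, never dereferenced */

    static handle_t *journal_info;      /* stands in for current->journal_info */

    static handle_t *get_nojournal(void)
    {
        unsigned long ref_cnt = (unsigned long)journal_info;

        /* The "pointer" is only a counter in disguise; bump and store it. */
        journal_info = (handle_t *)(ref_cnt + 1);
        return journal_info;
    }

    static void put_nojournal(void)
    {
        unsigned long ref_cnt = (unsigned long)journal_info;

        journal_info = (handle_t *)(ref_cnt - 1);
    }

    int main(void)
    {
        get_nojournal();
        get_nojournal();
        put_nojournal();
        printf("nesting depth now %lu\n", (unsigned long)journal_info); /* 1 */
        return 0;
    }
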
- */ -static int block_device_ejected(struct super_block *sb) -{ - struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; - - return bdi->dev == NULL; -} - -static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) -{ - struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - int error = is_journal_aborted(journal); - struct ext4_journal_cb_entry *jce, *tmp; - - spin_lock(&sbi->s_md_lock); - list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) { - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - jce->jce_func(sb, jce, error); - spin_lock(&sbi->s_md_lock); - } - spin_unlock(&sbi->s_md_lock); -} - -/* Deal with the reporting of failure conditions on a filesystem such as - * inconsistencies detected or read IO failures. - * - * On ext2, we can store the error state of the filesystem in the - * superblock. That is not possible on ext4, because we may have other - * write ordering constraints on the superblock which prevent us from - * writing it out straight away; and given that the journal is about to - * be aborted, we can't rely on the current, or future, transactions to - * write out the superblock safely. - * - * We'll just use the jbd2_journal_abort() error code to record an error in - * the journal instead. On recovery, the journal will complain about - * that error until we've noted it down and cleared it. - */ - -static void ext4_handle_error(struct super_block *sb) -{ - if (sb->s_flags & MS_RDONLY) - return; - - if (!test_opt(sb, ERRORS_CONT)) { - journal_t *journal = EXT4_SB(sb)->s_journal; - - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; - if (journal) - jbd2_journal_abort(journal, -EIO); - } - if (test_opt(sb, ERRORS_RO)) { - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; - } - if (test_opt(sb, ERRORS_PANIC)) - panic("EXT4-fs (device %s): panic forced after error\n", - sb->s_id); -} - -void __ext4_error(struct super_block *sb, const char *function, - unsigned int line, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", - sb->s_id, function, line, current->comm, &vaf); - va_end(args); - save_error_info(sb, function, line); - - ext4_handle_error(sb); -} - -void ext4_error_inode(struct inode *inode, const char *function, - unsigned int line, ext4_fsblk_t block, - const char *fmt, ...) -{ - va_list args; - struct va_format vaf; - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - - es->s_last_error_ino = cpu_to_le32(inode->i_ino); - es->s_last_error_block = cpu_to_le64(block); - save_error_info(inode->i_sb, function, line); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - if (block) - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " - "inode #%lu: block %llu: comm %s: %pV\n", - inode->i_sb->s_id, function, line, inode->i_ino, - block, current->comm, &vaf); - else - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " - "inode #%lu: comm %s: %pV\n", - inode->i_sb->s_id, function, line, inode->i_ino, - current->comm, &vaf); - va_end(args); - - ext4_handle_error(inode->i_sb); -} - -void ext4_error_file(struct file *file, const char *function, - unsigned int line, ext4_fsblk_t block, - const char *fmt, ...) 
-{ - va_list args; - struct va_format vaf; - struct ext4_super_block *es; - struct inode *inode = file->f_dentry->d_inode; - char pathname[80], *path; - - es = EXT4_SB(inode->i_sb)->s_es; - es->s_last_error_ino = cpu_to_le32(inode->i_ino); - save_error_info(inode->i_sb, function, line); - path = d_path(&(file->f_path), pathname, sizeof(pathname)); - if (IS_ERR(path)) - path = "(unknown)"; - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - if (block) - printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: " - "block %llu: comm %s: path %s: %pV\n", - inode->i_sb->s_id, function, line, inode->i_ino, - block, current->comm, path, &vaf); - else - printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: " - "comm %s: path %s: %pV\n", - inode->i_sb->s_id, function, line, inode->i_ino, - current->comm, path, &vaf); - va_end(args); - - ext4_handle_error(inode->i_sb); -} - -static const char *ext4_decode_error(struct super_block *sb, int errno, - char nbuf[16]) -{ - char *errstr = NULL; - - switch (errno) { - case -EIO: - errstr = "IO failure"; - break; - case -ENOMEM: - errstr = "Out of memory"; - break; - case -EROFS: - if (!sb || (EXT4_SB(sb)->s_journal && - EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) - errstr = "Journal has aborted"; - else - errstr = "Readonly filesystem"; - break; - default: - /* If the caller passed in an extra buffer for unknown - * errors, textualise them now. Else we just return - * NULL. */ - if (nbuf) { - /* Check for truncated error codes... */ - if (snprintf(nbuf, 16, "error %d", -errno) >= 0) - errstr = nbuf; - } - break; - } - - return errstr; -} - -/* __ext4_std_error decodes expected errors from journaling functions - * automatically and invokes the appropriate error response. */ - -void __ext4_std_error(struct super_block *sb, const char *function, - unsigned int line, int errno) -{ - char nbuf[16]; - const char *errstr; - - /* Special case: if the error is EROFS, and we're not already - * inside a transaction, then there's really no point in logging - * an error. */ - if (errno == -EROFS && journal_current_handle() == NULL && - (sb->s_flags & MS_RDONLY)) - return; - - errstr = ext4_decode_error(sb, errno, nbuf); - printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", - sb->s_id, function, line, errstr); - save_error_info(sb, function, line); - - ext4_handle_error(sb); -} - -/* - * ext4_abort is a much stronger failure handler than ext4_error. The - * abort function may be used to deal with unrecoverable failures such - * as journal IO errors or ENOMEM at a critical moment in log management. - * - * We unconditionally force the filesystem into an ABORT|READONLY state, - * unless the error response on the fs has been set to panic in which - * case we take the easy way out and panic immediately. - */ - -void __ext4_abort(struct super_block *sb, const char *function, - unsigned int line, const char *fmt, ...) 
-{
-	va_list args;
-
-	save_error_info(sb, function, line);
-	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
-	       function, line);
-	vprintk(fmt, args);
-	printk("\n");
-	va_end(args);
-
-	if ((sb->s_flags & MS_RDONLY) == 0) {
-		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
-		sb->s_flags |= MS_RDONLY;
-		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
-		if (EXT4_SB(sb)->s_journal)
-			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
-		save_error_info(sb, function, line);
-	}
-	if (test_opt(sb, ERRORS_PANIC))
-		panic("EXT4-fs panic from previous error\n");
-}
-
-void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
-	va_end(args);
-}
-
-void __ext4_warning(struct super_block *sb, const char *function,
-		    unsigned int line, const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
-	       sb->s_id, function, line, &vaf);
-	va_end(args);
-}
-
-void __ext4_grp_locked_error(const char *function, unsigned int line,
-			     struct super_block *sb, ext4_group_t grp,
-			     unsigned long ino, ext4_fsblk_t block,
-			     const char *fmt, ...)
-__releases(bitlock)
-__acquires(bitlock)
-{
-	struct va_format vaf;
-	va_list args;
-	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
-	es->s_last_error_ino = cpu_to_le32(ino);
-	es->s_last_error_block = cpu_to_le64(block);
-	__save_error_info(sb, function, line);
-
-	va_start(args, fmt);
-
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
-	       sb->s_id, function, line, grp);
-	if (ino)
-		printk(KERN_CONT "inode %lu: ", ino);
-	if (block)
-		printk(KERN_CONT "block %llu:", (unsigned long long) block);
-	printk(KERN_CONT "%pV\n", &vaf);
-	va_end(args);
-
-	if (test_opt(sb, ERRORS_CONT)) {
-		ext4_commit_super(sb, 0);
-		return;
-	}
-
-	ext4_unlock_group(sb, grp);
-	ext4_handle_error(sb);
-	/*
-	 * We only get here in the ERRORS_RO case; relocking the group
-	 * may be dangerous, but nothing bad will happen since the
-	 * filesystem will have already been marked read/only and the
-	 * journal has been aborted.  We return 1 as a hint to callers
-	 * who might want to use the return value from
-	 * ext4_grp_locked_error() to distinguish between the
-	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
-	 * aggressively from the ext4 function in question, with a
-	 * more appropriate error code.
-	 */
-	ext4_lock_group(sb, grp);
-	return;
-}
-
-void ext4_update_dynamic_rev(struct super_block *sb)
-{
-	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
-	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
-		return;
-
-	ext4_warning(sb,
-		     "updating to rev %d because of new feature flag, "
-		     "running e2fsck is recommended",
-		     EXT4_DYNAMIC_REV);
-
-	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
-	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
-	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
-	/* leave es->s_feature_*compat flags alone */
-	/* es->s_uuid will be set by e2fsck if empty */
-
-	/*
-	 * The rest of the superblock fields should be zero, and if not it
-	 * means they are likely already in use, so leave them alone.  We
-	 * can leave it up to e2fsck to clean up any inconsistencies there.
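
The *_lo/*_hi accessors earlier in the file widen on-disk fields only when the descriptor is the 64-bit size. The same pattern, reduced to plain integers with hypothetical field names:

    #include <stdio.h>

    struct toy_desc {
        unsigned int bitmap_lo;   /* low 32 bits, always present       */
        unsigned int bitmap_hi;   /* high 32 bits, 64-bit layout only  */
    };

    /* Combine halves as ext4_block_bitmap() does, gated on whether the
     * descriptor size admits the _hi field. */
    static unsigned long long toy_bitmap(const struct toy_desc *bg, int has_64bit)
    {
        return bg->bitmap_lo |
               (has_64bit ? (unsigned long long)bg->bitmap_hi << 32 : 0);
    }

    int main(void)
    {
        struct toy_desc d = { 0x12345678u, 0x1u };

        printf("32-bit view: %#llx\n", toy_bitmap(&d, 0)); /* 0x12345678  */
        printf("64-bit view: %#llx\n", toy_bitmap(&d, 1)); /* 0x112345678 */
        return 0;
    }
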
- */ -} - -/* - * Open the external journal device - */ -static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) -{ - struct block_device *bdev; - char b[BDEVNAME_SIZE]; - - bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); - if (IS_ERR(bdev)) - goto fail; - return bdev; - -fail: - ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", - __bdevname(dev, b), PTR_ERR(bdev)); - return NULL; -} - -/* - * Release the journal device - */ -static int ext4_blkdev_put(struct block_device *bdev) -{ - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} - -static int ext4_blkdev_remove(struct ext4_sb_info *sbi) -{ - struct block_device *bdev; - int ret = -ENODEV; - - bdev = sbi->journal_bdev; - if (bdev) { - ret = ext4_blkdev_put(bdev); - sbi->journal_bdev = NULL; - } - return ret; -} - -static inline struct inode *orphan_list_entry(struct list_head *l) -{ - return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; -} - -static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) -{ - struct list_head *l; - - ext4_msg(sb, KERN_ERR, "sb orphan head is %d", - le32_to_cpu(sbi->s_es->s_last_orphan)); - - printk(KERN_ERR "sb_info orphan list:\n"); - list_for_each(l, &sbi->s_orphan) { - struct inode *inode = orphan_list_entry(l); - printk(KERN_ERR " " - "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", - inode->i_sb->s_id, inode->i_ino, inode, - inode->i_mode, inode->i_nlink, - NEXT_ORPHAN(inode)); - } -} - -static void ext4_put_super(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - int i, err; - - ext4_unregister_li_request(sb); - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - - flush_workqueue(sbi->dio_unwritten_wq); - destroy_workqueue(sbi->dio_unwritten_wq); - - lock_super(sb); - if (sbi->s_journal) { - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; - if (err < 0) - ext4_abort(sb, "Couldn't clean up the journal"); - } - - del_timer(&sbi->s_err_report); - ext4_release_system_zone(sb); - ext4_mb_release(sb); - ext4_ext_release(sb); - ext4_xattr_put_super(sb); - - if (!(sb->s_flags & MS_RDONLY)) { - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - es->s_state = cpu_to_le16(sbi->s_mount_state); - } - if (sb->s_dirt || !(sb->s_flags & MS_RDONLY)) - ext4_commit_super(sb, 1); - - if (sbi->s_proc) { - remove_proc_entry("options", sbi->s_proc); - remove_proc_entry(sb->s_id, ext4_proc_root); - } - kobject_del(&sbi->s_kobj); - - for (i = 0; i < sbi->s_gdb_count; i++) - brelse(sbi->s_group_desc[i]); - ext4_kvfree(sbi->s_group_desc); - ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - brelse(sbi->s_sbh); -#ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); -#endif - - /* Debugging code just in case the in-memory inode orphan list - * isn't empty. The on-disk one can be non-empty if we've - * detected an error and taken the fs readonly, but the - * in-memory list had better be clean by this point. */ - if (!list_empty(&sbi->s_orphan)) - dump_orphan_list(sb, sbi); - J_ASSERT(list_empty(&sbi->s_orphan)); - - invalidate_bdev(sb->s_bdev); - if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { - /* - * Invalidate the journal device's buffers. 
We don't want them - * floating about in memory - the physical journal device may - * hotswapped, and it breaks the `ro-after' testing code. - */ - sync_blockdev(sbi->journal_bdev); - invalidate_bdev(sbi->journal_bdev); - ext4_blkdev_remove(sbi); - } - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); - sb->s_fs_info = NULL; - /* - * Now that we are completely done shutting down the - * superblock, we need to actually destroy the kobject. - */ - unlock_super(sb); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - kfree(sbi->s_blockgroup_lock); - kfree(sbi); -} - -static struct kmem_cache *ext4_inode_cachep; - -/* - * Called inside transaction, so use GFP_NOFS - */ -static struct inode *ext4_alloc_inode(struct super_block *sb) -{ - struct ext4_inode_info *ei; - - ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); - if (!ei) - return NULL; - - ei->vfs_inode.i_version = 1; - ei->vfs_inode.i_data.writeback_index = 0; - memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); - INIT_LIST_HEAD(&ei->i_prealloc_list); - spin_lock_init(&ei->i_prealloc_lock); - ei->i_reserved_data_blocks = 0; - ei->i_reserved_meta_blocks = 0; - ei->i_allocated_meta_blocks = 0; - ei->i_da_metadata_calc_len = 0; - spin_lock_init(&(ei->i_block_reservation_lock)); -#ifdef CONFIG_QUOTA - ei->i_reserved_quota = 0; -#endif - ei->jinode = NULL; - INIT_LIST_HEAD(&ei->i_completed_io_list); - spin_lock_init(&ei->i_completed_io_lock); - ei->cur_aio_dio = NULL; - ei->i_sync_tid = 0; - ei->i_datasync_tid = 0; - atomic_set(&ei->i_ioend_count, 0); - atomic_set(&ei->i_aiodio_unwritten, 0); - - return &ei->vfs_inode; -} - -static int ext4_drop_inode(struct inode *inode) -{ - int drop = generic_drop_inode(inode); - - trace_ext4_drop_inode(inode, drop); - return drop; -} - -static void ext4_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); -} - -static void ext4_destroy_inode(struct inode *inode) -{ - if (!list_empty(&(EXT4_I(inode)->i_orphan))) { - ext4_msg(inode->i_sb, KERN_ERR, - "Inode %lu (%p): orphan list check failed!", - inode->i_ino, EXT4_I(inode)); - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, - EXT4_I(inode), sizeof(struct ext4_inode_info), - true); - dump_stack(); - } - call_rcu(&inode->i_rcu, ext4_i_callback); -} - -static void init_once(void *foo) -{ - struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; - - INIT_LIST_HEAD(&ei->i_orphan); -#ifdef CONFIG_EXT4_FS_XATTR - init_rwsem(&ei->xattr_sem); -#endif - init_rwsem(&ei->i_data_sem); - inode_init_once(&ei->vfs_inode); -} - -static int init_inodecache(void) -{ - ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", - sizeof(struct ext4_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (ext4_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - kmem_cache_destroy(ext4_inode_cachep); -} - -void ext4_clear_inode(struct inode *inode) -{ - invalidate_inode_buffers(inode); - end_writeback(inode); - dquot_drop(inode); - ext4_discard_preallocations(inode); - if (EXT4_I(inode)->jinode) { - jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), - EXT4_I(inode)->jinode); - jbd2_free_inode(EXT4_I(inode)->jinode); - EXT4_I(inode)->jinode = NULL; - } -} - -static struct inode *ext4_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) -{ - struct inode *inode; - - if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return 
ERR_PTR(-ESTALE); - if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) - return ERR_PTR(-ESTALE); - - /* iget isn't really right if the inode is currently unallocated!! - * - * ext4_read_inode will return a bad_inode if the inode had been - * deleted, so we should be safe. - * - * Currently we don't know the generation for parent directory, so - * a generation of 0 means "accept any" - */ - inode = ext4_iget(sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { - iput(inode); - return ERR_PTR(-ESTALE); - } - - return inode; -} - -static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - ext4_nfs_get_inode); -} - -static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - ext4_nfs_get_inode); -} - -/* - * Try to release metadata pages (indirect blocks, directories) which are - * mapped via the block device. Since these pages could have journal heads - * which would prevent try_to_free_buffers() from freeing them, we must use - * jbd2 layer's try_to_free_buffers() function to release them. - */ -static int bdev_try_to_free_page(struct super_block *sb, struct page *page, - gfp_t wait) -{ - journal_t *journal = EXT4_SB(sb)->s_journal; - - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) - return 0; - if (journal) - return jbd2_journal_try_to_free_buffers(journal, page, - wait & ~__GFP_WAIT); - return try_to_free_buffers(page); -} - -#ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") -#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) - -static int ext4_write_dquot(struct dquot *dquot); -static int ext4_acquire_dquot(struct dquot *dquot); -static int ext4_release_dquot(struct dquot *dquot); -static int ext4_mark_dquot_dirty(struct dquot *dquot); -static int ext4_write_info(struct super_block *sb, int type); -static int ext4_quota_on(struct super_block *sb, int type, int format_id, - struct path *path); -static int ext4_quota_off(struct super_block *sb, int type); -static int ext4_quota_on_mount(struct super_block *sb, int type); -static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, - size_t len, loff_t off); -static ssize_t ext4_quota_write(struct super_block *sb, int type, - const char *data, size_t len, loff_t off); - -static const struct dquot_operations ext4_quota_operations = { - .get_reserved_space = ext4_get_reserved_space, - .write_dquot = ext4_write_dquot, - .acquire_dquot = ext4_acquire_dquot, - .release_dquot = ext4_release_dquot, - .mark_dirty = ext4_mark_dquot_dirty, - .write_info = ext4_write_info, - .alloc_dquot = dquot_alloc, - .destroy_dquot = dquot_destroy, -}; - -static const struct quotactl_ops ext4_qctl_operations = { - .quota_on = ext4_quota_on, - .quota_off = ext4_quota_off, - .quota_sync = dquot_quota_sync, - .get_info = dquot_get_dqinfo, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .set_dqblk = dquot_set_dqblk -}; -#endif - -static const struct super_operations ext4_sops = { - .alloc_inode = ext4_alloc_inode, - .destroy_inode = ext4_destroy_inode, - .write_inode = ext4_write_inode, - .dirty_inode = ext4_dirty_inode, - .drop_inode = ext4_drop_inode, - .evict_inode = ext4_evict_inode, - .put_super = ext4_put_super, - .sync_fs = ext4_sync_fs, - .freeze_fs = ext4_freeze, - 
.unfreeze_fs = ext4_unfreeze, - .statfs = ext4_statfs, - .remount_fs = ext4_remount, - .show_options = ext4_show_options, -#ifdef CONFIG_QUOTA - .quota_read = ext4_quota_read, - .quota_write = ext4_quota_write, -#endif - .bdev_try_to_free_page = bdev_try_to_free_page, -}; - -static const struct super_operations ext4_nojournal_sops = { - .alloc_inode = ext4_alloc_inode, - .destroy_inode = ext4_destroy_inode, - .write_inode = ext4_write_inode, - .dirty_inode = ext4_dirty_inode, - .drop_inode = ext4_drop_inode, - .evict_inode = ext4_evict_inode, - .write_super = ext4_write_super, - .put_super = ext4_put_super, - .statfs = ext4_statfs, - .remount_fs = ext4_remount, - .show_options = ext4_show_options, -#ifdef CONFIG_QUOTA - .quota_read = ext4_quota_read, - .quota_write = ext4_quota_write, -#endif - .bdev_try_to_free_page = bdev_try_to_free_page, -}; - -static const struct export_operations ext4_export_ops = { - .fh_to_dentry = ext4_fh_to_dentry, - .fh_to_parent = ext4_fh_to_parent, - .get_parent = ext4_get_parent, -}; - -enum { - Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, - Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_debug, Opt_removed, - Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, - Opt_commit, Opt_min_batch_time, Opt_max_batch_time, - Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, - Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, - Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, - Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, - Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, - Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, - Opt_inode_readahead_blks, Opt_journal_ioprio, - Opt_dioread_nolock, Opt_dioread_lock, - Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, -}; - -static const match_table_t tokens = { - {Opt_bsd_df, "bsddf"}, - {Opt_minix_df, "minixdf"}, - {Opt_grpid, "grpid"}, - {Opt_grpid, "bsdgroups"}, - {Opt_nogrpid, "nogrpid"}, - {Opt_nogrpid, "sysvgroups"}, - {Opt_resgid, "resgid=%u"}, - {Opt_resuid, "resuid=%u"}, - {Opt_sb, "sb=%u"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_nouid32, "nouid32"}, - {Opt_debug, "debug"}, - {Opt_removed, "oldalloc"}, - {Opt_removed, "orlov"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_noload, "norecovery"}, - {Opt_noload, "noload"}, - {Opt_removed, "nobh"}, - {Opt_removed, "bh"}, - {Opt_commit, "commit=%u"}, - {Opt_min_batch_time, "min_batch_time=%u"}, - {Opt_max_batch_time, "max_batch_time=%u"}, - {Opt_journal_dev, "journal_dev=%u"}, - {Opt_journal_checksum, "journal_checksum"}, - {Opt_journal_async_commit, "journal_async_commit"}, - {Opt_abort, "abort"}, - {Opt_data_journal, "data=journal"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_err_abort, "data_err=abort"}, - {Opt_data_err_ignore, "data_err=ignore"}, - {Opt_offusrjquota, "usrjquota="}, - {Opt_usrjquota, "usrjquota=%s"}, - {Opt_offgrpjquota, "grpjquota="}, - {Opt_grpjquota, "grpjquota=%s"}, - {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, - {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, - {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, - {Opt_grpquota, 
"grpquota"}, - {Opt_noquota, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, - {Opt_barrier, "barrier=%u"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_i_version, "i_version"}, - {Opt_stripe, "stripe=%u"}, - {Opt_delalloc, "delalloc"}, - {Opt_nodelalloc, "nodelalloc"}, - {Opt_mblk_io_submit, "mblk_io_submit"}, - {Opt_nomblk_io_submit, "nomblk_io_submit"}, - {Opt_block_validity, "block_validity"}, - {Opt_noblock_validity, "noblock_validity"}, - {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, - {Opt_journal_ioprio, "journal_ioprio=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc=%u"}, - {Opt_auto_da_alloc, "auto_da_alloc"}, - {Opt_noauto_da_alloc, "noauto_da_alloc"}, - {Opt_dioread_nolock, "dioread_nolock"}, - {Opt_dioread_lock, "dioread_lock"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_init_itable, "init_itable=%u"}, - {Opt_init_itable, "init_itable"}, - {Opt_noinit_itable, "noinit_itable"}, - {Opt_removed, "check=none"}, /* mount option from ext2/3 */ - {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ - {Opt_removed, "reservation"}, /* mount option from ext2/3 */ - {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ - {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ - {Opt_err, NULL}, -}; - -static ext4_fsblk_t get_sb_block(void **data) -{ - ext4_fsblk_t sb_block; - char *options = (char *) *data; - - if (!options || strncmp(options, "sb=", 3) != 0) - return 1; /* Default location */ - - options += 3; - /* TODO: use simple_strtoll with >32bit ext4 */ - sb_block = simple_strtoul(options, &options, 0); - if (*options && *options != ',') { - printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", - (char *) *data); - return 1; - } - if (*options == ',') - options++; - *data = (void *) options; - - return sb_block; -} - -#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" - "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; - -#ifdef CONFIG_QUOTA -static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - char *qname; - - if (sb_any_quota_loaded(sb) && - !sbi->s_qf_names[qtype]) { - ext4_msg(sb, KERN_ERR, - "Cannot change journaled " - "quota options when quota turned on"); - return -1; - } - qname = match_strdup(args); - if (!qname) { - ext4_msg(sb, KERN_ERR, - "Not enough memory for storing quotafile name"); - return -1; - } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - ext4_msg(sb, KERN_ERR, - "%s quota file already specified", QTYPE2NAME(qtype)); - kfree(qname); - return -1; - } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { - ext4_msg(sb, KERN_ERR, - "quotafile must be on filesystem root"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; - return -1; - } - set_opt(sb, QUOTA); - return 1; -} - -static int clear_qf_name(struct super_block *sb, int qtype) -{ - - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sb_any_quota_loaded(sb) && - sbi->s_qf_names[qtype]) { - ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" - " when quota turned on"); - return -1; - } - /* - * The space will be released later when all options are confirmed - * to be correct - */ - sbi->s_qf_names[qtype] = NULL; - return 1; -} -#endif - -#define MOPT_SET 0x0001 -#define MOPT_CLEAR 0x0002 -#define MOPT_NOSUPPORT 0x0004 -#define MOPT_EXPLICIT 0x0008 
-#define MOPT_CLEAR_ERR 0x0010 -#define MOPT_GTE0 0x0020 -#ifdef CONFIG_QUOTA -#define MOPT_Q 0 -#define MOPT_QFMT 0x0040 -#else -#define MOPT_Q MOPT_NOSUPPORT -#define MOPT_QFMT MOPT_NOSUPPORT -#endif -#define MOPT_DATAJ 0x0080 - -static const struct mount_opts { - int token; - int mount_opt; - int flags; -} ext4_mount_opts[] = { - {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, - {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, - {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, - {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, - {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET}, - {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR}, - {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, - {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, - {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET}, - {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR}, - {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, - {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, - {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT}, - {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT}, - {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET}, - {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | - EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET}, - {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET}, - {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, - {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET}, - {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR}, - {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, - {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, - {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, - {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, - {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, - {Opt_commit, 0, MOPT_GTE0}, - {Opt_max_batch_time, 0, MOPT_GTE0}, - {Opt_min_batch_time, 0, MOPT_GTE0}, - {Opt_inode_readahead_blks, 0, MOPT_GTE0}, - {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_stripe, 0, MOPT_GTE0}, - {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, - {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, - {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, -#ifdef CONFIG_EXT4_FS_XATTR - {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, - {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, -#else - {Opt_user_xattr, 0, MOPT_NOSUPPORT}, - {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, - {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, -#else - {Opt_acl, 0, MOPT_NOSUPPORT}, - {Opt_noacl, 0, MOPT_NOSUPPORT}, -#endif - {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, - {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, - {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, - {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, - MOPT_SET | MOPT_Q}, - {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, - MOPT_SET | MOPT_Q}, - {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | - EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, - {Opt_usrjquota, 0, MOPT_Q}, - {Opt_grpjquota, 0, MOPT_Q}, - {Opt_offusrjquota, 0, MOPT_Q}, - {Opt_offgrpjquota, 0, MOPT_Q}, - {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, - {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, - {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, - {Opt_err, 0, 0} -}; - -static int handle_mount_opt(struct 
super_block *sb, char *opt, int token, - substring_t *args, unsigned long *journal_devnum, - unsigned int *journal_ioprio, int is_remount) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - const struct mount_opts *m; - int arg = 0; - -#ifdef CONFIG_QUOTA - if (token == Opt_usrjquota) - return set_qf_name(sb, USRQUOTA, &args[0]); - else if (token == Opt_grpjquota) - return set_qf_name(sb, GRPQUOTA, &args[0]); - else if (token == Opt_offusrjquota) - return clear_qf_name(sb, USRQUOTA); - else if (token == Opt_offgrpjquota) - return clear_qf_name(sb, GRPQUOTA); -#endif - if (args->from && match_int(args, &arg)) - return -1; - switch (token) { - case Opt_noacl: - case Opt_nouser_xattr: - ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); - break; - case Opt_sb: - return 1; /* handled by get_sb_block() */ - case Opt_removed: - ext4_msg(sb, KERN_WARNING, - "Ignoring removed %s option", opt); - return 1; - case Opt_resuid: - sbi->s_resuid = arg; - return 1; - case Opt_resgid: - sbi->s_resgid = arg; - return 1; - case Opt_abort: - sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; - return 1; - case Opt_i_version: - sb->s_flags |= MS_I_VERSION; - return 1; - case Opt_journal_dev: - if (is_remount) { - ext4_msg(sb, KERN_ERR, - "Cannot specify journal on remount"); - return -1; - } - *journal_devnum = arg; - return 1; - case Opt_journal_ioprio: - if (arg < 0 || arg > 7) - return -1; - *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); - return 1; - } - - for (m = ext4_mount_opts; m->token != Opt_err; m++) { - if (token != m->token) - continue; - if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) - return -1; - if (m->flags & MOPT_EXPLICIT) - set_opt2(sb, EXPLICIT_DELALLOC); - if (m->flags & MOPT_CLEAR_ERR) - clear_opt(sb, ERRORS_MASK); - if (token == Opt_noquota && sb_any_quota_loaded(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot change quota " - "options when quota turned on"); - return -1; - } - - if (m->flags & MOPT_NOSUPPORT) { - ext4_msg(sb, KERN_ERR, "%s option not supported", opt); - } else if (token == Opt_commit) { - if (arg == 0) - arg = JBD2_DEFAULT_MAX_COMMIT_AGE; - sbi->s_commit_interval = HZ * arg; - } else if (token == Opt_max_batch_time) { - if (arg == 0) - arg = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_max_batch_time = arg; - } else if (token == Opt_min_batch_time) { - sbi->s_min_batch_time = arg; - } else if (token == Opt_inode_readahead_blks) { - if (arg > (1 << 30)) - return -1; - if (arg && !is_power_of_2(arg)) { - ext4_msg(sb, KERN_ERR, - "EXT4-fs: inode_readahead_blks" - " must be a power of 2"); - return -1; - } - sbi->s_inode_readahead_blks = arg; - } else if (token == Opt_init_itable) { - set_opt(sb, INIT_INODE_TABLE); - if (!args->from) - arg = EXT4_DEF_LI_WAIT_MULT; - sbi->s_li_wait_mult = arg; - } else if (token == Opt_stripe) { - sbi->s_stripe = arg; - } else if (m->flags & MOPT_DATAJ) { - if (is_remount) { - if (!sbi->s_journal) - ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); - else if (test_opt(sb, DATA_FLAGS) != - m->mount_opt) { - ext4_msg(sb, KERN_ERR, - "Cannot change data mode on remount"); - return -1; - } - } else { - clear_opt(sb, DATA_FLAGS); - sbi->s_mount_opt |= m->mount_opt; - } -#ifdef CONFIG_QUOTA - } else if (m->flags & MOPT_QFMT) { - if (sb_any_quota_loaded(sb) && - sbi->s_jquota_fmt != m->mount_opt) { - ext4_msg(sb, KERN_ERR, "Cannot " - "change journaled quota options " - "when quota turned on"); - return -1; - } - sbi->s_jquota_fmt = m->mount_opt; -#endif - } else { - if (!args->from) - arg = 1; - if 
(m->flags & MOPT_CLEAR) - arg = !arg; - else if (unlikely(!(m->flags & MOPT_SET))) { - ext4_msg(sb, KERN_WARNING, - "buggy handling of option %s", opt); - WARN_ON(1); - return -1; - } - if (arg != 0) - sbi->s_mount_opt |= m->mount_opt; - else - sbi->s_mount_opt &= ~m->mount_opt; - } - return 1; - } - ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " - "or missing value", opt); - return -1; -} - -static int parse_options(char *options, struct super_block *sb, - unsigned long *journal_devnum, - unsigned int *journal_ioprio, - int is_remount) -{ -#ifdef CONFIG_QUOTA - struct ext4_sb_info *sbi = EXT4_SB(sb); -#endif - char *p; - substring_t args[MAX_OPT_ARGS]; - int token; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - if (!*p) - continue; - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = 0; - token = match_token(p, tokens, args); - if (handle_mount_opt(sb, p, token, args, journal_devnum, - journal_ioprio, is_remount) < 0) - return 0; - } -#ifdef CONFIG_QUOTA - if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) - clear_opt(sb, USRQUOTA); - - if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) - clear_opt(sb, GRPQUOTA); - - if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { - ext4_msg(sb, KERN_ERR, "old and new quota " - "format mixing"); - return 0; - } - - if (!sbi->s_jquota_fmt) { - ext4_msg(sb, KERN_ERR, "journaled quota format " - "not specified"); - return 0; - } - } else { - if (sbi->s_jquota_fmt) { - ext4_msg(sb, KERN_ERR, "journaled quota format " - "specified with no journaling " - "enabled"); - return 0; - } - } -#endif - return 1; -} - -static inline void ext4_show_quota_options(struct seq_file *seq, - struct super_block *sb) -{ -#if defined(CONFIG_QUOTA) - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_jquota_fmt) { - char *fmtname = ""; - - switch (sbi->s_jquota_fmt) { - case QFMT_VFS_OLD: - fmtname = "vfsold"; - break; - case QFMT_VFS_V0: - fmtname = "vfsv0"; - break; - case QFMT_VFS_V1: - fmtname = "vfsv1"; - break; - } - seq_printf(seq, ",jqfmt=%s", fmtname); - } - - if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - - if (test_opt(sb, USRQUOTA)) - seq_puts(seq, ",usrquota"); - - if (test_opt(sb, GRPQUOTA)) - seq_puts(seq, ",grpquota"); -#endif -} - -static const char *token2str(int token) -{ - static const struct match_token *t; - - for (t = tokens; t->token != Opt_err; t++) - if (t->token == token && !strchr(t->pattern, '=')) - break; - return t->pattern; -} - -/* - * Show an option if - * - it's set to a non-default value OR - * - if the per-sb default is different from the global default - */ -static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, - int nodefs) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; - const struct mount_opts *m; - char sep = nodefs ? 
'\n' : ','; - -#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) -#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) - - if (sbi->s_sb_block != 1) - SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); - - for (m = ext4_mount_opts; m->token != Opt_err; m++) { - int want_set = m->flags & MOPT_SET; - if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || - (m->flags & MOPT_CLEAR_ERR)) - continue; - if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) - continue; /* skip if same as the default */ - if ((want_set && - (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || - (!want_set && (sbi->s_mount_opt & m->mount_opt))) - continue; /* select Opt_noFoo vs Opt_Foo */ - SEQ_OPTS_PRINT("%s", token2str(m->token)); - } - - if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID || - le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) - SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid); - if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID || - le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) - SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid); - def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); - if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) - SEQ_OPTS_PUTS("errors=remount-ro"); - if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) - SEQ_OPTS_PUTS("errors=continue"); - if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) - SEQ_OPTS_PUTS("errors=panic"); - if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) - SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); - if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) - SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); - if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) - SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); - if (sb->s_flags & MS_I_VERSION) - SEQ_OPTS_PUTS("i_version"); - if (nodefs || sbi->s_stripe) - SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); - if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - SEQ_OPTS_PUTS("data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - SEQ_OPTS_PUTS("data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - SEQ_OPTS_PUTS("data=writeback"); - } - if (nodefs || - sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) - SEQ_OPTS_PRINT("inode_readahead_blks=%u", - sbi->s_inode_readahead_blks); - - if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && - (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) - SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); - - ext4_show_quota_options(seq, sb); - return 0; -} - -static int ext4_show_options(struct seq_file *seq, struct dentry *root) -{ - return _ext4_show_options(seq, root->d_sb, 0); -} - -static int options_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - int rc; - - seq_puts(seq, (sb->s_flags & MS_RDONLY) ? 
"ro" : "rw"); - rc = _ext4_show_options(seq, sb, 1); - seq_puts(seq, "\n"); - return rc; -} - -static int options_open_fs(struct inode *inode, struct file *file) -{ - return single_open(file, options_seq_show, PDE(inode)->data); -} - -static const struct file_operations ext4_seq_options_fops = { - .owner = THIS_MODULE, - .open = options_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, - int read_only) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - int res = 0; - - if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { - ext4_msg(sb, KERN_ERR, "revision level too high, " - "forcing read-only mode"); - res = MS_RDONLY; - } - if (read_only) - goto done; - if (!(sbi->s_mount_state & EXT4_VALID_FS)) - ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " - "running e2fsck is recommended"); - else if ((sbi->s_mount_state & EXT4_ERROR_FS)) - ext4_msg(sb, KERN_WARNING, - "warning: mounting fs with errors, " - "running e2fsck is recommended"); - else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && - le16_to_cpu(es->s_mnt_count) >= - (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - ext4_msg(sb, KERN_WARNING, - "warning: maximal mount count reached, " - "running e2fsck is recommended"); - else if (le32_to_cpu(es->s_checkinterval) && - (le32_to_cpu(es->s_lastcheck) + - le32_to_cpu(es->s_checkinterval) <= get_seconds())) - ext4_msg(sb, KERN_WARNING, - "warning: checktime reached, " - "running e2fsck is recommended"); - if (!sbi->s_journal) - es->s_state &= cpu_to_le16(~EXT4_VALID_FS); - if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) - es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); - le16_add_cpu(&es->s_mnt_count, 1); - es->s_mtime = cpu_to_le32(get_seconds()); - ext4_update_dynamic_rev(sb); - if (sbi->s_journal) - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - - ext4_commit_super(sb, 1); -done: - if (test_opt(sb, DEBUG)) - printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " - "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", - sb->s_blocksize, - sbi->s_groups_count, - EXT4_BLOCKS_PER_GROUP(sb), - EXT4_INODES_PER_GROUP(sb), - sbi->s_mount_opt, sbi->s_mount_opt2); - - cleancache_init_fs(sb); - return res; -} - -static int ext4_fill_flex_info(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_desc *gdp = NULL; - ext4_group_t flex_group_count; - ext4_group_t flex_group; - unsigned int groups_per_flex = 0; - size_t size; - int i; - - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; - if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { - sbi->s_log_groups_per_flex = 0; - return 1; - } - groups_per_flex = 1 << sbi->s_log_groups_per_flex; - - /* We allocate both existing and potentially added groups */ - flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + - ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << - EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; - size = flex_group_count * sizeof(struct flex_groups); - sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", - flex_group_count); - goto failed; - } - - for (i = 0; i < sbi->s_groups_count; i++) { - gdp = ext4_get_group_desc(sb, i, NULL); - - flex_group = ext4_flex_group(sbi, i); - atomic_add(ext4_free_inodes_count(sb, gdp), - &sbi->s_flex_groups[flex_group].free_inodes); - 
atomic_add(ext4_free_group_clusters(sb, gdp), - &sbi->s_flex_groups[flex_group].free_clusters); - atomic_add(ext4_used_dirs_count(sb, gdp), - &sbi->s_flex_groups[flex_group].used_dirs); - } - - return 1; -failed: - return 0; -} - -__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, - struct ext4_group_desc *gdp) -{ - __u16 crc = 0; - - if (sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - int offset = offsetof(struct ext4_group_desc, bg_checksum); - __le32 le_group = cpu_to_le32(block_group); - - crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); - crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); - crc = crc16(crc, (__u8 *)gdp, offset); - offset += sizeof(gdp->bg_checksum); /* skip checksum */ - /* for checksum of struct ext4_group_desc do the rest...*/ - if ((sbi->s_es->s_feature_incompat & - cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && - offset < le16_to_cpu(sbi->s_es->s_desc_size)) - crc = crc16(crc, (__u8 *)gdp + offset, - le16_to_cpu(sbi->s_es->s_desc_size) - - offset); - } - - return cpu_to_le16(crc); -} - -int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, - struct ext4_group_desc *gdp) -{ - if ((sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) && - (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp))) - return 0; - - return 1; -} - -/* Called at mount-time, super-block is locked */ -static int ext4_check_descriptors(struct super_block *sb, - ext4_group_t *first_not_zeroed) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); - ext4_fsblk_t last_block; - ext4_fsblk_t block_bitmap; - ext4_fsblk_t inode_bitmap; - ext4_fsblk_t inode_table; - int flexbg_flag = 0; - ext4_group_t i, grp = sbi->s_groups_count; - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) - flexbg_flag = 1; - - ext4_debug("Checking group descriptors"); - - for (i = 0; i < sbi->s_groups_count; i++) { - struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); - - if (i == sbi->s_groups_count - 1 || flexbg_flag) - last_block = ext4_blocks_count(sbi->s_es) - 1; - else - last_block = first_block + - (EXT4_BLOCKS_PER_GROUP(sb) - 1); - - if ((grp == sbi->s_groups_count) && - !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) - grp = i; - - block_bitmap = ext4_block_bitmap(sb, gdp); - if (block_bitmap < first_block || block_bitmap > last_block) { - ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " - "Block bitmap for group %u not in group " - "(block %llu)!", i, block_bitmap); - return 0; - } - inode_bitmap = ext4_inode_bitmap(sb, gdp); - if (inode_bitmap < first_block || inode_bitmap > last_block) { - ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " - "Inode bitmap for group %u not in group " - "(block %llu)!", i, inode_bitmap); - return 0; - } - inode_table = ext4_inode_table(sb, gdp); - if (inode_table < first_block || - inode_table + sbi->s_itb_per_group - 1 > last_block) { - ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " - "Inode table for group %u not in group " - "(block %llu)!", i, inode_table); - return 0; - } - ext4_lock_group(sb, i); - if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { - ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " - "Checksum for group %u failed (%u!=%u)", - i, le16_to_cpu(ext4_group_desc_csum(sbi, i, - gdp)), le16_to_cpu(gdp->bg_checksum)); - if (!(sb->s_flags & MS_RDONLY)) { - ext4_unlock_group(sb, i); - return 0; - } - } - ext4_unlock_group(sb, i); - if 
(!flexbg_flag) - first_block += EXT4_BLOCKS_PER_GROUP(sb); - } - if (NULL != first_not_zeroed) - *first_not_zeroed = grp; - - ext4_free_blocks_count_set(sbi->s_es, - EXT4_C2B(sbi, ext4_count_free_clusters(sb))); - sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); - return 1; -} - -/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at - * the superblock) which were deleted from all directories, but held open by - * a process at the time of a crash. We walk the list and try to delete these - * inodes at recovery time (only with a read-write filesystem). - * - * In order to keep the orphan inode chain consistent during traversal (in - * case of crash during recovery), we link each inode into the superblock - * orphan list_head and handle it the same way as an inode deletion during - * normal operation (which journals the operations for us). - * - * We only do an iget() and an iput() on each inode, which is very safe if we - * accidentally point at an in-use or already deleted inode. The worst that - * can happen in this case is that we get a "bit already cleared" message from - * ext4_free_inode(). The only reason we would point at a wrong inode is if - * e2fsck was run on this filesystem, and it must have already done the orphan - * inode cleanup for us, so we can safely abort without any further action. - */ -static void ext4_orphan_cleanup(struct super_block *sb, - struct ext4_super_block *es) -{ - unsigned int s_flags = sb->s_flags; - int nr_orphans = 0, nr_truncates = 0; -#ifdef CONFIG_QUOTA - int i; -#endif - if (!es->s_last_orphan) { - jbd_debug(4, "no orphan inodes to clean up\n"); - return; - } - - if (bdev_read_only(sb->s_bdev)) { - ext4_msg(sb, KERN_ERR, "write access " - "unavailable, skipping orphan cleanup"); - return; - } - - /* Check if feature set would not allow a r/w mount */ - if (!ext4_feature_set_ok(sb, 0)) { - ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " - "unknown ROCOMPAT features"); - return; - } - - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - if (es->s_last_orphan) - jbd_debug(1, "Errors on filesystem, " - "clearing orphan list.\n"); - es->s_last_orphan = 0; - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); - return; - } - - if (s_flags & MS_RDONLY) { - ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); - sb->s_flags &= ~MS_RDONLY; - } -#ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < MAXQUOTAS; i++) { - if (EXT4_SB(sb)->s_qf_names[i]) { - int ret = ext4_quota_on_mount(sb, i); - if (ret < 0) - ext4_msg(sb, KERN_ERR, - "Cannot turn on journaled " - "quota: error %d", ret); - } - } -#endif - - while (es->s_last_orphan) { - struct inode *inode; - - inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); - if (IS_ERR(inode)) { - es->s_last_orphan = 0; - break; - } - - list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - dquot_initialize(inode); - if (inode->i_nlink) { - ext4_msg(sb, KERN_DEBUG, - "%s: truncating inode %lu to %lld bytes", - __func__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %lld bytes\n", - inode->i_ino, inode->i_size); - ext4_truncate(inode); - nr_truncates++; - } else { - ext4_msg(sb, KERN_DEBUG, - "%s: deleting unreferenced inode %lu", - __func__, inode->i_ino); - jbd_debug(2, "deleting unreferenced inode %lu\n", - inode->i_ino); - nr_orphans++; - } - iput(inode); /* The delete magic 
happens here! */
-	}
-
-#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
-
-	if (nr_orphans)
-		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
-		       PLURAL(nr_orphans));
-	if (nr_truncates)
-		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
-		       PLURAL(nr_truncates));
-#ifdef CONFIG_QUOTA
-	/* Turn quotas off */
-	for (i = 0; i < MAXQUOTAS; i++) {
-		if (sb_dqopt(sb)->files[i])
-			dquot_quota_off(sb, i);
-	}
-#endif
-	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
-}
-
-/*
- * Maximal extent format file size.
- * Resulting logical blkno at s_maxbytes must fit in our on-disk
- * extent format containers, within a sector_t, and within i_blocks
- * in the vfs. ext4 inode has 48 bits of i_block in fsblock units,
- * so that won't be a limiting factor.
- *
- * However, there is another limiting factor. We do store extents in the form
- * of starting block and length, hence the resulting length of the extent
- * covering maximum file size must fit into on-disk format containers as
- * well. Given that length is always by 1 unit bigger than max unit (because
- * we count 0 as well) we have to lower the s_maxbytes by one fs block.
- *
- * Note, this does *not* consider any metadata overhead for vfs i_blocks.
- */
-static loff_t ext4_max_size(int blkbits, int has_huge_files)
-{
-	loff_t res;
-	loff_t upper_limit = MAX_LFS_FILESIZE;
-
-	/* small i_blocks in vfs inode? */
-	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
-		/*
-		 * CONFIG_LBDAF is not enabled implies the inode
-		 * i_block represent total blocks in 512 bytes
-		 * 32 == size of vfs inode i_blocks * 8
-		 */
-		upper_limit = (1LL << 32) - 1;
-
-		/* total blocks in file system block size */
-		upper_limit >>= (blkbits - 9);
-		upper_limit <<= blkbits;
-	}
-
-	/*
-	 * 32-bit extent-start container, ee_block. We lower the maxbytes
-	 * by one fs block, so ee_len can cover the extent of maximum file
-	 * size
-	 */
-	res = (1LL << 32) - 1;
-	res <<= blkbits;
-
-	/* Sanity check against vm- & vfs- imposed limits */
-	if (res > upper_limit)
-		res = upper_limit;
-
-	return res;
-}
-
-/*
- * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect
- * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
- * We need to be 1 filesystem block less than the 2^48 sector limit.
- */
-static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
-{
-	loff_t res = EXT4_NDIR_BLOCKS;
-	int meta_blocks;
-	loff_t upper_limit;
-	/* This is calculated to be the largest file size for a dense, block
-	 * mapped file such that the file's total number of 512-byte sectors,
-	 * including data and all indirect blocks, does not exceed (2^48 - 1).
-	 *
-	 * __u32 i_blocks_lo and __u16 i_blocks_high represent the total
-	 * number of 512-byte sectors of the file.
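To make the arithmetic in ext4_max_size() above concrete, here is a stand-alone version, assuming 64-bit loff_t semantics (MAX_LFS_FILESIZE == LLONG_MAX) and folding the sizeof(blkcnt_t) test into the has_huge_files flag:

    #include <limits.h>
    #include <stdio.h>

    static long long max_extent_size(int blkbits, int has_huge_files)
    {
    	long long upper_limit = LLONG_MAX;
    	long long res;

    	if (!has_huge_files) {
    		/* i_blocks is a 32-bit count of 512-byte sectors */
    		upper_limit = (1LL << 32) - 1;
    		upper_limit >>= (blkbits - 9);	/* sectors -> fs blocks */
    		upper_limit <<= blkbits;	/* fs blocks -> bytes */
    	}

    	/* 32-bit logical block container, less one block for ee_len */
    	res = (1LL << 32) - 1;
    	res <<= blkbits;

    	return res > upper_limit ? upper_limit : res;
    }

    int main(void)
    {
    	printf("%lld\n", max_extent_size(12, 1));	/* ~16 TiB with 4 KiB blocks */
    	printf("%lld\n", max_extent_size(12, 0));	/* ~2 TiB without huge files */
    	return 0;
    }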
- */
-
-	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
-		/*
-		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
-		 * the inode i_block field represents total file blocks in
-		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
-		 */
-		upper_limit = (1LL << 32) - 1;
-
-		/* total blocks in file system block size */
-		upper_limit >>= (bits - 9);
-
-	} else {
-		/*
-		 * We use 48 bit ext4_inode i_blocks
-		 * With EXT4_HUGE_FILE_FL set the i_blocks
-		 * represent total number of blocks in
-		 * file system block size
-		 */
-		upper_limit = (1LL << 48) - 1;
-
-	}
-
-	/* indirect blocks */
-	meta_blocks = 1;
-	/* double indirect blocks */
-	meta_blocks += 1 + (1LL << (bits-2));
-	/* triple indirect blocks */
-	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
-	upper_limit -= meta_blocks;
-	upper_limit <<= bits;
-
-	res += 1LL << (bits-2);
-	res += 1LL << (2*(bits-2));
-	res += 1LL << (3*(bits-2));
-	res <<= bits;
-	if (res > upper_limit)
-		res = upper_limit;
-
-	if (res > MAX_LFS_FILESIZE)
-		res = MAX_LFS_FILESIZE;
-
-	return res;
-}
-
-static ext4_fsblk_t descriptor_loc(struct super_block *sb,
-				   ext4_fsblk_t logical_sb_block, int nr)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	ext4_group_t bg, first_meta_bg;
-	int has_super = 0;
-
-	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
-
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
-	    nr < first_meta_bg)
-		return logical_sb_block + nr + 1;
-	bg = sbi->s_desc_per_block * nr;
-	if (ext4_bg_has_super(sb, bg))
-		has_super = 1;
-
-	return (has_super + ext4_group_first_block_no(sb, bg));
-}
-
-/**
- * ext4_get_stripe_size: Get the stripe size.
- * @sbi: In memory super block info
- *
- * If we have specified it via mount option, then
- * use the mount option value. If the value specified at mount time is
- * greater than the blocks per group use the super block value.
- * If the super block value is greater than blocks per group return 0.
- * The allocator needs it to be less than blocks per group.
- *
- */
-static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
-{
-	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
-	unsigned long stripe_width =
-			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
-	int ret;
-
-	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
-		ret = sbi->s_stripe;
-	else if (stripe_width <= sbi->s_blocks_per_group)
-		ret = stripe_width;
-	else if (stride <= sbi->s_blocks_per_group)
-		ret = stride;
-	else
-		ret = 0;
-
-	/*
-	 * If the stripe width is 1, this makes no sense and
-	 * we set it to 0 to turn off stripe handling code.
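The precedence just described (explicit mount option first, then s_raid_stripe_width, then s_raid_stride, each candidate usable only if it fits within a block group) can be exercised on its own; a sketch with sample numbers:

    #include <stdio.h>

    static unsigned long pick_stripe(unsigned long opt, unsigned long width,
    				 unsigned long stride, unsigned long bpg)
    {
    	unsigned long ret;

    	if (opt && opt <= bpg)
    		ret = opt;
    	else if (width <= bpg)
    		ret = width;
    	else if (stride <= bpg)
    		ret = stride;
    	else
    		ret = 0;

    	return ret <= 1 ? 0 : ret;	/* a stripe of 1 is meaningless */
    }

    int main(void)
    {
    	/* no mount option; width 256 fits in a 32768-block group */
    	printf("%lu\n", pick_stripe(0, 256, 64, 32768));	/* 256 */
    	return 0;
    }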
- */ - if (ret <= 1) - ret = 0; - - return ret; -} - -/* sysfs supprt */ - -struct ext4_attr { - struct attribute attr; - ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); - ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, - const char *, size_t); - int offset; -}; - -static int parse_strtoul(const char *buf, - unsigned long max, unsigned long *value) -{ - char *endp; - - *value = simple_strtoul(skip_spaces(buf), &endp, 0); - endp = skip_spaces(endp); - if (*endp || *value > max) - return -EINVAL; - - return 0; -} - -static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, - char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) EXT4_C2B(sbi, - percpu_counter_sum(&sbi->s_dirtyclusters_counter))); -} - -static ssize_t session_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - return snprintf(buf, PAGE_SIZE, "%lu\n", - (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - sbi->s_sectors_written_start) >> 1); -} - -static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1))); -} - -static ssize_t inode_readahead_blks_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned long t; - - if (parse_strtoul(buf, 0x40000000, &t)) - return -EINVAL; - - if (t && !is_power_of_2(t)) - return -EINVAL; - - sbi->s_inode_readahead_blks = t; - return count; -} - -static ssize_t sbi_ui_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t sbi_ui_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); - unsigned long t; - - if (parse_strtoul(buf, 0xffffffff, &t)) - return -EINVAL; - *ui = t; - return count; -} - -#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ -static struct ext4_attr ext4_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .offset = offsetof(struct ext4_sb_info, _elname), \ -} -#define EXT4_ATTR(name, mode, show, store) \ -static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) - -#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) -#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) -#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) -#define EXT4_RW_ATTR_SBI_UI(name, elname) \ - EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) -#define ATTR_LIST(name) &ext4_attr_##name.attr - -EXT4_RO_ATTR(delayed_allocation_blocks); -EXT4_RO_ATTR(session_write_kbytes); -EXT4_RO_ATTR(lifetime_write_kbytes); -EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, - inode_readahead_blks_store, s_inode_readahead_blks); -EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); -EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); -EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); 
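All of the EXT4_RW_ATTR_SBI_UI() attributes in this table share sbi_ui_show()/sbi_ui_store(), which locate the tunable by adding a recorded offsetof() to the ext4_sb_info pointer. A hypothetical user-space reduction of that trick:

    #include <stddef.h>
    #include <stdio.h>

    struct sb_info {
    	unsigned int mb_stats;
    	unsigned int mb_max_to_scan;
    };

    struct attr {
    	const char *name;
    	size_t offset;	/* offsetof() the field inside sb_info */
    };

    /* Generic getter: one function serves every unsigned-int field. */
    static unsigned int attr_read(const struct sb_info *sbi, const struct attr *a)
    {
    	return *(const unsigned int *)((const char *)sbi + a->offset);
    }

    int main(void)
    {
    	struct sb_info sbi = { .mb_stats = 0, .mb_max_to_scan = 200 };
    	struct attr a = { "mb_max_to_scan",
    			  offsetof(struct sb_info, mb_max_to_scan) };

    	printf("%s = %u\n", a.name, attr_read(&sbi, &a));
    	return 0;
    }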
-EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); -EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); -EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); - -static struct attribute *ext4_attrs[] = { - ATTR_LIST(delayed_allocation_blocks), - ATTR_LIST(session_write_kbytes), - ATTR_LIST(lifetime_write_kbytes), - ATTR_LIST(inode_readahead_blks), - ATTR_LIST(inode_goal), - ATTR_LIST(mb_stats), - ATTR_LIST(mb_max_to_scan), - ATTR_LIST(mb_min_to_scan), - ATTR_LIST(mb_order2_req), - ATTR_LIST(mb_stream_req), - ATTR_LIST(mb_group_prealloc), - ATTR_LIST(max_writeback_mb_bump), - NULL, -}; - -/* Features this copy of ext4 supports */ -EXT4_INFO_ATTR(lazy_itable_init); -EXT4_INFO_ATTR(batched_discard); - -static struct attribute *ext4_feat_attrs[] = { - ATTR_LIST(lazy_itable_init), - ATTR_LIST(batched_discard), - NULL, -}; - -static ssize_t ext4_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t ext4_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - - return a->store ? a->store(a, sbi, buf, len) : 0; -} - -static void ext4_sb_release(struct kobject *kobj) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - -static const struct sysfs_ops ext4_attr_ops = { - .show = ext4_attr_show, - .store = ext4_attr_store, -}; - -static struct kobj_type ext4_ktype = { - .default_attrs = ext4_attrs, - .sysfs_ops = &ext4_attr_ops, - .release = ext4_sb_release, -}; - -static void ext4_feat_release(struct kobject *kobj) -{ - complete(&ext4_feat->f_kobj_unregister); -} - -static struct kobj_type ext4_feat_ktype = { - .default_attrs = ext4_feat_attrs, - .sysfs_ops = &ext4_attr_ops, - .release = ext4_feat_release, -}; - -/* - * Check whether this filesystem can be mounted based on - * the features present and the RDONLY/RDWR mount requested. - * Returns 1 if this filesystem can be mounted as requested, - * 0 if it cannot be. 
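At its core the check described here is a mask test: any incompat bit outside the kernel's supported set vetoes the mount entirely, and unknown ro_compat bits veto only read-write mounts. A sketch with placeholder masks (the real values live in ext4.h):

    #include <stdio.h>

    #define INCOMPAT_SUPP	0x0003	/* hypothetical supported set */

    static int feature_set_ok(unsigned int incompat)
    {
    	unsigned int unsupported = incompat & ~INCOMPAT_SUPP;

    	if (unsupported) {
    		fprintf(stderr, "unsupported optional features (%x)\n",
    			unsupported);
    		return 0;
    	}
    	return 1;
    }

    int main(void)
    {
    	printf("%d\n", feature_set_ok(0x0002));	/* 1: all bits known */
    	printf("%d\n", feature_set_ok(0x0004));	/* 0: unknown bit */
    	return 0;
    }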
- */ -static int ext4_feature_set_ok(struct super_block *sb, int readonly) -{ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { - ext4_msg(sb, KERN_ERR, - "Couldn't mount because of " - "unsupported optional features (%x)", - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & - ~EXT4_FEATURE_INCOMPAT_SUPP)); - return 0; - } - - if (readonly) - return 1; - - /* Check that feature set is OK for a read-write mount */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { - ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " - "unsupported optional features (%x)", - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & - ~EXT4_FEATURE_RO_COMPAT_SUPP)); - return 0; - } - /* - * Large file size enabled file system can only be mounted - * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF - */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { - if (sizeof(blkcnt_t) < sizeof(u64)) { - ext4_msg(sb, KERN_ERR, "Filesystem with huge files " - "cannot be mounted RDWR without " - "CONFIG_LBDAF"); - return 0; - } - } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && - !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { - ext4_msg(sb, KERN_ERR, - "Can't support bigalloc feature without " - "extents feature\n"); - return 0; - } - return 1; -} - -/* - * This function is called once a day if we have errors logged - * on the file system - */ -static void print_daily_error_info(unsigned long arg) -{ - struct super_block *sb = (struct super_block *) arg; - struct ext4_sb_info *sbi; - struct ext4_super_block *es; - - sbi = EXT4_SB(sb); - es = sbi->s_es; - - if (es->s_error_count) - ext4_msg(sb, KERN_NOTICE, "error count: %u", - le32_to_cpu(es->s_error_count)); - if (es->s_first_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", - sb->s_id, le32_to_cpu(es->s_first_error_time), - (int) sizeof(es->s_first_error_func), - es->s_first_error_func, - le32_to_cpu(es->s_first_error_line)); - if (es->s_first_error_ino) - printk(": inode %u", - le32_to_cpu(es->s_first_error_ino)); - if (es->s_first_error_block) - printk(": block %llu", (unsigned long long) - le64_to_cpu(es->s_first_error_block)); - printk("\n"); - } - if (es->s_last_error_time) { - printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", - sb->s_id, le32_to_cpu(es->s_last_error_time), - (int) sizeof(es->s_last_error_func), - es->s_last_error_func, - le32_to_cpu(es->s_last_error_line)); - if (es->s_last_error_ino) - printk(": inode %u", - le32_to_cpu(es->s_last_error_ino)); - if (es->s_last_error_block) - printk(": block %llu", (unsigned long long) - le64_to_cpu(es->s_last_error_block)); - printk("\n"); - } - mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ -} - -/* Find next suitable group and run ext4_init_inode_table */ -static int ext4_run_li_request(struct ext4_li_request *elr) -{ - struct ext4_group_desc *gdp = NULL; - ext4_group_t group, ngroups; - struct super_block *sb; - unsigned long timeout = 0; - int ret = 0; - - sb = elr->lr_super; - ngroups = EXT4_SB(sb)->s_groups_count; - - for (group = elr->lr_next_group; group < ngroups; group++) { - gdp = ext4_get_group_desc(sb, group, NULL); - if (!gdp) { - ret = 1; - break; - } - - if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) - break; - } - - if (group == ngroups) - ret = 1; - - if (!ret) { - timeout = jiffies; - ret = ext4_init_inode_table(sb, group, - elr->lr_timeout ? 
0 : 1);
-		if (elr->lr_timeout == 0) {
-			timeout = (jiffies - timeout) *
-				  elr->lr_sbi->s_li_wait_mult;
-			elr->lr_timeout = timeout;
-		}
-		elr->lr_next_sched = jiffies + elr->lr_timeout;
-		elr->lr_next_group = group + 1;
-	}
-
-	return ret;
-}
-
-/*
- * Remove lr_request from the list_request and free the
- * request structure. Should be called with li_list_mtx held
- */
-static void ext4_remove_li_request(struct ext4_li_request *elr)
-{
-	struct ext4_sb_info *sbi;
-
-	if (!elr)
-		return;
-
-	sbi = elr->lr_sbi;
-
-	list_del(&elr->lr_request);
-	sbi->s_li_request = NULL;
-	kfree(elr);
-}
-
-static void ext4_unregister_li_request(struct super_block *sb)
-{
-	mutex_lock(&ext4_li_mtx);
-	if (!ext4_li_info) {
-		mutex_unlock(&ext4_li_mtx);
-		return;
-	}
-
-	mutex_lock(&ext4_li_info->li_list_mtx);
-	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
-	mutex_unlock(&ext4_li_info->li_list_mtx);
-	mutex_unlock(&ext4_li_mtx);
-}
-
-static struct task_struct *ext4_lazyinit_task;
-
-/*
- * This is the function where the ext4lazyinit thread lives. It walks
- * through the request list searching for the next scheduled filesystem.
- * When such a fs is found, run the lazy initialization request
- * (ext4_run_li_request) and keep track of the time spent in this
- * function. Based on that time we compute the next schedule time of
- * the request. When the walk through the list is complete, compute
- * the next waking time and put itself to sleep.
- */
-static int ext4_lazyinit_thread(void *arg)
-{
-	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
-	struct list_head *pos, *n;
-	struct ext4_li_request *elr;
-	unsigned long next_wakeup, cur;
-
-	BUG_ON(NULL == eli);
-
-cont_thread:
-	while (true) {
-		next_wakeup = MAX_JIFFY_OFFSET;
-
-		mutex_lock(&eli->li_list_mtx);
-		if (list_empty(&eli->li_request_list)) {
-			mutex_unlock(&eli->li_list_mtx);
-			goto exit_thread;
-		}
-
-		list_for_each_safe(pos, n, &eli->li_request_list) {
-			elr = list_entry(pos, struct ext4_li_request,
-					 lr_request);
-
-			if (time_after_eq(jiffies, elr->lr_next_sched)) {
-				if (ext4_run_li_request(elr) != 0) {
-					/* error, remove the lazy_init job */
-					ext4_remove_li_request(elr);
-					continue;
-				}
-			}
-
-			if (time_before(elr->lr_next_sched, next_wakeup))
-				next_wakeup = elr->lr_next_sched;
-		}
-		mutex_unlock(&eli->li_list_mtx);
-
-		try_to_freeze();
-
-		cur = jiffies;
-		if ((time_after_eq(cur, next_wakeup)) ||
-		    (MAX_JIFFY_OFFSET == next_wakeup)) {
-			cond_resched();
-			continue;
-		}
-
-		schedule_timeout_interruptible(next_wakeup - cur);
-
-		if (kthread_should_stop()) {
-			ext4_clear_request_list();
-			goto exit_thread;
-		}
-	}
-
-exit_thread:
-	/*
-	 * It looks like the request list is empty, but we need
-	 * to check it under the li_list_mtx lock, to prevent any
-	 * additions into it, and of course we should lock ext4_li_mtx
-	 * to atomically free the list and ext4_li_info, because at
-	 * this point another ext4 filesystem could be registering
-	 * a new one.
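A reduction of the scheduling done by ext4_lazyinit_thread() above: every request keeps an absolute next-run time, due requests are serviced, and the thread then sleeps until the earliest remaining deadline (plain counters stand in for jiffies):

    #include <stdio.h>

    struct li_request {
    	unsigned long next_sched;	/* absolute deadline, in ticks */
    };

    /* Returns how long to sleep; 0 means some request is already due. */
    static unsigned long next_wakeup(const struct li_request *reqs, int n,
    				 unsigned long now)
    {
    	unsigned long earliest = (unsigned long)-1;	/* MAX_JIFFY_OFFSET */
    	int i;

    	for (i = 0; i < n; i++)
    		if (reqs[i].next_sched < earliest)
    			earliest = reqs[i].next_sched;

    	return earliest <= now ? 0 : earliest - now;
    }

    int main(void)
    {
    	struct li_request reqs[] = { { 150 }, { 400 } };

    	printf("sleep %lu ticks\n", next_wakeup(reqs, 2, 100));	/* 50 */
    	return 0;
    }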
- */
-	mutex_lock(&ext4_li_mtx);
-	mutex_lock(&eli->li_list_mtx);
-	if (!list_empty(&eli->li_request_list)) {
-		mutex_unlock(&eli->li_list_mtx);
-		mutex_unlock(&ext4_li_mtx);
-		goto cont_thread;
-	}
-	mutex_unlock(&eli->li_list_mtx);
-	kfree(ext4_li_info);
-	ext4_li_info = NULL;
-	mutex_unlock(&ext4_li_mtx);
-
-	return 0;
-}
-
-static void ext4_clear_request_list(void)
-{
-	struct list_head *pos, *n;
-	struct ext4_li_request *elr;
-
-	mutex_lock(&ext4_li_info->li_list_mtx);
-	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
-		elr = list_entry(pos, struct ext4_li_request,
-				 lr_request);
-		ext4_remove_li_request(elr);
-	}
-	mutex_unlock(&ext4_li_info->li_list_mtx);
-}
-
-static int ext4_run_lazyinit_thread(void)
-{
-	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
-					 ext4_li_info, "ext4lazyinit");
-	if (IS_ERR(ext4_lazyinit_task)) {
-		int err = PTR_ERR(ext4_lazyinit_task);
-		ext4_clear_request_list();
-		kfree(ext4_li_info);
-		ext4_li_info = NULL;
-		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
-				 "initialization thread\n",
-				 err);
-		return err;
-	}
-	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
-	return 0;
-}
-
-/*
- * Check whether it makes sense to run the itable init thread or not.
- * If there is at least one uninitialized inode table, return the
- * corresponding group number; otherwise the loop goes through all
- * groups and returns the total number of groups.
- */
-static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
-{
-	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
-	struct ext4_group_desc *gdp = NULL;
-
-	for (group = 0; group < ngroups; group++) {
-		gdp = ext4_get_group_desc(sb, group, NULL);
-		if (!gdp)
-			continue;
-
-		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
-			break;
-	}
-
-	return group;
-}
-
-static int ext4_li_info_new(void)
-{
-	struct ext4_lazy_init *eli = NULL;
-
-	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
-	if (!eli)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&eli->li_request_list);
-	mutex_init(&eli->li_list_mtx);
-
-	eli->li_state |= EXT4_LAZYINIT_QUIT;
-
-	ext4_li_info = eli;
-
-	return 0;
-}
-
-static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
-					    ext4_group_t start)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_li_request *elr;
-	unsigned long rnd;
-
-	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
-	if (!elr)
-		return NULL;
-
-	elr->lr_super = sb;
-	elr->lr_sbi = sbi;
-	elr->lr_next_group = start;
-
-	/*
-	 * Randomize first schedule time of the request to
-	 * spread the inode table initialization requests
-	 * better.
-	 */
-	get_random_bytes(&rnd, sizeof(rnd));
-	elr->lr_next_sched = jiffies + (unsigned long)rnd %
-			     (EXT4_DEF_LI_MAX_START_DELAY * HZ);
-
-	return elr;
-}
-
-static int ext4_register_li_request(struct super_block *sb,
-				    ext4_group_t first_not_zeroed)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_li_request *elr;
-	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
-	int ret = 0;
-
-	if (sbi->s_li_request != NULL) {
-		/*
-		 * Reset timeout so it can be computed again, because
-		 * s_li_wait_mult might have changed.
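The start-time jitter applied by ext4_li_request_new() above keeps many filesystems mounted at boot from starting inode-table zeroing in lock-step. The same idea with rand() standing in for get_random_bytes() and made-up constants:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define LI_MAX_START_DELAY	5	/* seconds, cf. EXT4_DEF_LI_MAX_START_DELAY */
    #define HZ			100	/* ticks per second, hypothetical */

    static unsigned long first_schedule(unsigned long now)
    {
    	/* first run lands somewhere in [now, now + max delay) */
    	return now + (unsigned long)rand() % (LI_MAX_START_DELAY * HZ);
    }

    int main(void)
    {
    	srand((unsigned)time(NULL));
    	printf("first run at tick %lu\n", first_schedule(1000));
    	return 0;
    }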
- */ - sbi->s_li_request->lr_timeout = 0; - return 0; - } - - if (first_not_zeroed == ngroups || - (sb->s_flags & MS_RDONLY) || - !test_opt(sb, INIT_INODE_TABLE)) - return 0; - - elr = ext4_li_request_new(sb, first_not_zeroed); - if (!elr) - return -ENOMEM; - - mutex_lock(&ext4_li_mtx); - - if (NULL == ext4_li_info) { - ret = ext4_li_info_new(); - if (ret) - goto out; - } - - mutex_lock(&ext4_li_info->li_list_mtx); - list_add(&elr->lr_request, &ext4_li_info->li_request_list); - mutex_unlock(&ext4_li_info->li_list_mtx); - - sbi->s_li_request = elr; - /* - * Set elr to NULL here since it has been inserted into - * the request_list, and its removal and freeing are - * handled by ext4_clear_request_list from now on. - */ - elr = NULL; - - if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { - ret = ext4_run_lazyinit_thread(); - if (ret) - goto out; - } -out: - mutex_unlock(&ext4_li_mtx); - if (ret) - kfree(elr); - return ret; -} - -/* - * We do not need to lock anything since this is called on - * module unload. - */ -static void ext4_destroy_lazyinit_thread(void) -{ - /* - * If the thread exited earlier, - * there's nothing to be done. - */ - if (!ext4_li_info || !ext4_lazyinit_task) - return; - - kthread_stop(ext4_lazyinit_task); -} - -static int ext4_fill_super(struct super_block *sb, void *data, int silent) -{ - char *orig_data = kstrdup(data, GFP_KERNEL); - struct buffer_head *bh; - struct ext4_super_block *es = NULL; - struct ext4_sb_info *sbi; - ext4_fsblk_t block; - ext4_fsblk_t sb_block = get_sb_block(&data); - ext4_fsblk_t logical_sb_block; - unsigned long offset = 0; - unsigned long journal_devnum = 0; - unsigned long def_mount_opts; - struct inode *root; - char *cp; - const char *descr; - int ret = -ENOMEM; - int blocksize, clustersize; - unsigned int db_count; - unsigned int i; - int needs_recovery, has_huge_files, has_bigalloc; - __u64 blocks_count; - int err; - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - ext4_group_t first_not_zeroed; - - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto out_free_orig; - - sbi->s_blockgroup_lock = - kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - goto out_free_orig; - } - sb->s_fs_info = sbi; - sbi->s_mount_opt = 0; - sbi->s_resuid = EXT4_DEF_RESUID; - sbi->s_resgid = EXT4_DEF_RESGID; - sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; - sbi->s_sb_block = sb_block; - if (sb->s_bdev->bd_part) - sbi->s_sectors_written_start = - part_stat_read(sb->s_bdev->bd_part, sectors[1]); - - /* Cleanup superblock name */ - for (cp = sb->s_id; (cp = strchr(cp, '/'));) - *cp = '!'; - - ret = -EINVAL; - blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); - if (!blocksize) { - ext4_msg(sb, KERN_ERR, "unable to set blocksize"); - goto out_fail; - } - - /* - * The ext4 superblock will not be buffer-aligned for block sizes other - * than 1kB. We need to calculate the offset from the buffer start.
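- * (Worked example for the common case: with the default sb_block of 1 and - * a 4096-byte blocksize, logical_sb_block = 1 * 1024 / 4096 = 0 with a - * do_div() remainder of 1024, i.e. the superblock sits 1024 bytes into - * device block 0.)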
- */ - if (blocksize != EXT4_MIN_BLOCK_SIZE) { - logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; - offset = do_div(logical_sb_block, blocksize); - } else { - logical_sb_block = sb_block; - } - - if (!(bh = sb_bread(sb, logical_sb_block))) { - ext4_msg(sb, KERN_ERR, "unable to read superblock"); - goto out_fail; - } - /* - * Note: s_es must be initialized as soon as possible because - * some ext4 macro-instructions depend on its value - */ - es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); - sbi->s_es = es; - sb->s_magic = le16_to_cpu(es->s_magic); - if (sb->s_magic != EXT4_SUPER_MAGIC) - goto cantfind_ext4; - sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); - - /* Set defaults before we parse the mount options */ - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - set_opt(sb, INIT_INODE_TABLE); - if (def_mount_opts & EXT4_DEFM_DEBUG) - set_opt(sb, DEBUG); - if (def_mount_opts & EXT4_DEFM_BSDGROUPS) - set_opt(sb, GRPID); - if (def_mount_opts & EXT4_DEFM_UID16) - set_opt(sb, NO_UID32); - /* xattr user namespace & acls are now defaulted on */ -#ifdef CONFIG_EXT4_FS_XATTR - set_opt(sb, XATTR_USER); -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - set_opt(sb, POSIX_ACL); -#endif - set_opt(sb, MBLK_IO_SUBMIT); - if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) - set_opt(sb, JOURNAL_DATA); - else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) - set_opt(sb, ORDERED_DATA); - else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) - set_opt(sb, WRITEBACK_DATA); - - if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) - set_opt(sb, ERRORS_PANIC); - else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) - set_opt(sb, ERRORS_CONT); - else - set_opt(sb, ERRORS_RO); - if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) - set_opt(sb, BLOCK_VALIDITY); - if (def_mount_opts & EXT4_DEFM_DISCARD) - set_opt(sb, DISCARD); - - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); - sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; - sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; - sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - - if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) - set_opt(sb, BARRIER); - - /* - * enable delayed allocation by default - * Use -o nodelalloc to turn it off - */ - if (!IS_EXT3_SB(sb) && - ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) - set_opt(sb, DELALLOC); - - /* - * set default s_li_wait_mult for lazyinit, for the case there is - * no mount option specified. 
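- * (EXT4_DEF_LI_WAIT_MULT is 10, so by default the lazyinit thread sleeps - * roughly ten times as long as each zeroing pass took, keeping background - * inode table initialization from crowding out foreground I/O.)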
- */ - sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - - if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, 0)) { - ext4_msg(sb, KERN_WARNING, - "failed to parse options in superblock: %s", - sbi->s_es->s_mount_opts); - } - sbi->s_def_mount_opt = sbi->s_mount_opt; - if (!parse_options((char *) data, sb, &journal_devnum, - &journal_ioprio, 0)) - goto failed_mount; - - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " - "with data=journal disables delayed " - "allocation and O_DIRECT support!\n"); - if (test_opt2(sb, EXPLICIT_DELALLOC)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and delalloc"); - goto failed_mount; - } - if (test_opt(sb, DIOREAD_NOLOCK)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dioread_nolock"); - goto failed_mount; - } - if (test_opt(sb, DELALLOC)) - clear_opt(sb, DELALLOC); - } - - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (test_opt(sb, DIOREAD_NOLOCK)) { - if (blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "dioread_nolock if block size != PAGE_SIZE"); - goto failed_mount; - } - } - - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); - - if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && - (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || - EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || - EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) - ext4_msg(sb, KERN_WARNING, - "feature flags set on rev 0 fs, " - "running e2fsck is recommended"); - - if (IS_EXT2_SB(sb)) { - if (ext2_feature_set_ok(sb)) - ext4_msg(sb, KERN_INFO, "mounting ext2 file system " - "using the ext4 subsystem"); - else { - ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " - "to feature incompatibilities"); - goto failed_mount; - } - } - - if (IS_EXT3_SB(sb)) { - if (ext3_feature_set_ok(sb)) - ext4_msg(sb, KERN_INFO, "mounting ext3 file system " - "using the ext4 subsystem"); - else { - ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " - "to feature incompatibilities"); - goto failed_mount; - } - } - - /* - * Check feature flags regardless of the revision level, since we - * previously didn't change the revision level when setting the flags, - * so there is a chance incompat flags are set on a rev 0 filesystem.
- */ - if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) - goto failed_mount; - - if (blocksize < EXT4_MIN_BLOCK_SIZE || - blocksize > EXT4_MAX_BLOCK_SIZE) { - ext4_msg(sb, KERN_ERR, - "Unsupported filesystem blocksize %d", blocksize); - goto failed_mount; - } - - if (sb->s_blocksize != blocksize) { - /* Validate the filesystem blocksize */ - if (!sb_set_blocksize(sb, blocksize)) { - ext4_msg(sb, KERN_ERR, "bad block size %d", - blocksize); - goto failed_mount; - } - - brelse(bh); - logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; - offset = do_div(logical_sb_block, blocksize); - bh = sb_bread(sb, logical_sb_block); - if (!bh) { - ext4_msg(sb, KERN_ERR, - "Can't read superblock on 2nd try"); - goto failed_mount; - } - es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); - sbi->s_es = es; - if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { - ext4_msg(sb, KERN_ERR, - "Magic mismatch, very weird!"); - goto failed_mount; - } - } - - has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, - has_huge_files); - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); - - if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { - sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; - sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; - } else { - sbi->s_inode_size = le16_to_cpu(es->s_inode_size); - sbi->s_first_ino = le32_to_cpu(es->s_first_ino); - if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || - (!is_power_of_2(sbi->s_inode_size)) || - (sbi->s_inode_size > blocksize)) { - ext4_msg(sb, KERN_ERR, - "unsupported inode size: %d", - sbi->s_inode_size); - goto failed_mount; - } - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) - sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); - } - - sbi->s_desc_size = le16_to_cpu(es->s_desc_size); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { - if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || - sbi->s_desc_size > EXT4_MAX_DESC_SIZE || - !is_power_of_2(sbi->s_desc_size)) { - ext4_msg(sb, KERN_ERR, - "unsupported descriptor size %lu", - sbi->s_desc_size); - goto failed_mount; - } - } else - sbi->s_desc_size = EXT4_MIN_DESC_SIZE; - - sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); - sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) - goto cantfind_ext4; - - sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0) - goto cantfind_ext4; - sbi->s_itb_per_group = sbi->s_inodes_per_group / - sbi->s_inodes_per_block; - sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); - sbi->s_sbh = bh; - sbi->s_mount_state = le16_to_cpu(es->s_state); - sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); - - for (i = 0; i < 4; i++) - sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); - sbi->s_def_hash_version = es->s_def_hash_version; - i = le32_to_cpu(es->s_flags); - if (i & EXT2_FLAGS_UNSIGNED_HASH) - sbi->s_hash_unsigned = 3; - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { -#ifdef __CHAR_UNSIGNED__ - es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); - sbi->s_hash_unsigned = 3; -#else - es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); -#endif - } - - /* Handle clustersize */ - clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); - has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC); - 
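/* - * Illustrative numbers, not from the source: with 4k blocks and 64k - * clusters, s_log_cluster_size - s_log_block_size = log2(64k / 4k) = 4, - * so the bigalloc branch below computes s_cluster_bits = 4 and - * s_cluster_ratio = 64k / 4k = 16 blocks per cluster. - */ -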
if (has_bigalloc) { - if (clustersize < blocksize) { - ext4_msg(sb, KERN_ERR, - "cluster size (%d) smaller than " - "block size (%d)", clustersize, blocksize); - goto failed_mount; - } - sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - - le32_to_cpu(es->s_log_block_size); - sbi->s_clusters_per_group = - le32_to_cpu(es->s_clusters_per_group); - if (sbi->s_clusters_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#clusters per group too big: %lu", - sbi->s_clusters_per_group); - goto failed_mount; - } - if (sbi->s_blocks_per_group != - (sbi->s_clusters_per_group * (clustersize / blocksize))) { - ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " - "clusters per group (%lu) inconsistent", - sbi->s_blocks_per_group, - sbi->s_clusters_per_group); - goto failed_mount; - } - } else { - if (clustersize != blocksize) { - ext4_warning(sb, "fragment/cluster size (%d) != " - "block size (%d)", clustersize, - blocksize); - clustersize = blocksize; - } - if (sbi->s_blocks_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#blocks per group too big: %lu", - sbi->s_blocks_per_group); - goto failed_mount; - } - sbi->s_clusters_per_group = sbi->s_blocks_per_group; - sbi->s_cluster_bits = 0; - } - sbi->s_cluster_ratio = clustersize / blocksize; - - if (sbi->s_inodes_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#inodes per group too big: %lu", - sbi->s_inodes_per_group); - goto failed_mount; - } - - /* - * Test whether we have more sectors than will fit in sector_t, - * and whether the max offset is addressable by the page cache. - */ - err = generic_check_addressable(sb->s_blocksize_bits, - ext4_blocks_count(es)); - if (err) { - ext4_msg(sb, KERN_ERR, "filesystem" - " too large to mount safely on this system"); - if (sizeof(sector_t) < 8) - ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); - ret = err; - goto failed_mount; - } - - if (EXT4_BLOCKS_PER_GROUP(sb) == 0) - goto cantfind_ext4; - - /* check blocks count against device size */ - blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; - if (blocks_count && ext4_blocks_count(es) > blocks_count) { - ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " - "exceeds size of device (%llu blocks)", - ext4_blocks_count(es), blocks_count); - goto failed_mount; - } - - /* - * It makes no sense for the first data block to be beyond the end - * of the filesystem. 
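- * (Concretely: s_first_data_block is 1 on 1k-block filesystems, where - * block 0 is reserved for the boot record, and 0 for larger block sizes, - * so a value at or beyond ext4_blocks_count(es) can only come from a - * corrupted superblock.)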
- */ - if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - ext4_msg(sb, KERN_WARNING, "bad geometry: first data " - "block %u is beyond end of filesystem (%llu)", - le32_to_cpu(es->s_first_data_block), - ext4_blocks_count(es)); - goto failed_mount; - } - blocks_count = (ext4_blocks_count(es) - - le32_to_cpu(es->s_first_data_block) + - EXT4_BLOCKS_PER_GROUP(sb) - 1); - do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); - if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { - ext4_msg(sb, KERN_WARNING, "groups count too large: %u " - "(block count %llu, first data block %u, " - "blocks per group %lu)", sbi->s_groups_count, - ext4_blocks_count(es), - le32_to_cpu(es->s_first_data_block), - EXT4_BLOCKS_PER_GROUP(sb)); - goto failed_mount; - } - sbi->s_groups_count = blocks_count; - sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, - (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); - db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / - EXT4_DESC_PER_BLOCK(sb); - sbi->s_group_desc = ext4_kvmalloc(db_count * - sizeof(struct buffer_head *), - GFP_KERNEL); - if (sbi->s_group_desc == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory"); - goto failed_mount; - } - - if (ext4_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); - - if (sbi->s_proc) - proc_create_data("options", S_IRUGO, sbi->s_proc, - &ext4_seq_options_fops, sb); - - bgl_lock_init(sbi->s_blockgroup_lock); - - for (i = 0; i < db_count; i++) { - block = descriptor_loc(sb, logical_sb_block, i); - sbi->s_group_desc[i] = sb_bread(sb, block); - if (!sbi->s_group_desc[i]) { - ext4_msg(sb, KERN_ERR, - "can't read group descriptor %d", i); - db_count = i; - goto failed_mount2; - } - } - if (!ext4_check_descriptors(sb, &first_not_zeroed)) { - ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2; - } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) - if (!ext4_fill_flex_info(sb)) { - ext4_msg(sb, KERN_ERR, - "unable to initialize " - "flex_bg meta info!"); - goto failed_mount2; - } - - sbi->s_gdb_count = db_count; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); - spin_lock_init(&sbi->s_next_gen_lock); - - init_timer(&sbi->s_err_report); - sbi->s_err_report.function = print_daily_error_info; - sbi->s_err_report.data = (unsigned long) sb; - - err = percpu_counter_init(&sbi->s_freeclusters_counter, - ext4_count_free_clusters(sb)); - if (!err) { - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - } - if (!err) { - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); - } - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); - goto failed_mount3; - } - - sbi->s_stripe = ext4_get_stripe_size(sbi); - sbi->s_max_writeback_mb_bump = 128; - - /* - * set up enough so that it can read an inode - */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; - sb->s_export_op = &ext4_export_ops; - sb->s_xattr = ext4_xattr_handlers; -#ifdef CONFIG_QUOTA - sb->s_qcop = &ext4_qctl_operations; - sb->dq_op = &ext4_quota_operations; -#endif - memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); - - INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ - mutex_init(&sbi->s_orphan_lock); - sbi->s_resize_flags = 0; - - sb->s_root = NULL; - - needs_recovery = (es->s_last_orphan != 0 || - 
EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_RECOVER)); - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && - !(sb->s_flags & MS_RDONLY)) - if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) - goto failed_mount3; - - /* - * The first inode we look at is the journal inode. Don't try - * root first: it may be modified in the journal! - */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { - if (ext4_load_journal(sb, es, journal_devnum)) - goto failed_mount3; - } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && - EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { - ext4_msg(sb, KERN_ERR, "required journal recovery " - "suppressed and not mounted read-only"); - goto failed_mount_wq; - } else { - clear_opt(sb, DATA_FLAGS); - sbi->s_journal = NULL; - needs_recovery = 0; - goto no_journal; - } - - if (ext4_blocks_count(es) > 0xffffffffULL && - !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, - JBD2_FEATURE_INCOMPAT_64BIT)) { - ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); - goto failed_mount_wq; - } - - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else if (test_opt(sb, JOURNAL_CHECKSUM)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); - jbd2_journal_clear_features(sbi->s_journal, 0, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else { - jbd2_journal_clear_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } - - /* We have now updated the journal if required, so we can - * validate the data journaling mode. */ - switch (test_opt(sb, DATA_FLAGS)) { - case 0: - /* No mode set, assume a default based on the journal - * capabilities: ORDERED_DATA if the journal can - * cope, else JOURNAL_DATA - */ - if (jbd2_journal_check_available_features - (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) - set_opt(sb, ORDERED_DATA); - else - set_opt(sb, JOURNAL_DATA); - break; - - case EXT4_MOUNT_ORDERED_DATA: - case EXT4_MOUNT_WRITEBACK_DATA: - if (!jbd2_journal_check_available_features - (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { - ext4_msg(sb, KERN_ERR, "Journal does not support " - "requested data journaling mode"); - goto failed_mount_wq; - } - default: - break; - } - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); - - sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; - - /* - * The journal may have updated the bg summary counts, so we - * need to update the global counters. - */ - percpu_counter_set(&sbi->s_freeclusters_counter, - ext4_count_free_clusters(sb)); - percpu_counter_set(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - percpu_counter_set(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); - -no_journal: - /* - * The maximum number of concurrent works can be high and - * concurrency isn't really necessary. Limit it to 1. - */ - EXT4_SB(sb)->dio_unwritten_wq = - alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!EXT4_SB(sb)->dio_unwritten_wq) { - printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); - goto failed_mount_wq; - } - - /* - * The jbd2_journal_load will have done any necessary log recovery, - * so we can safely mount the rest of the filesystem now. 
- */ - - root = ext4_iget(sb, EXT4_ROOT_INO); - if (IS_ERR(root)) { - ext4_msg(sb, KERN_ERR, "get root inode failed"); - ret = PTR_ERR(root); - root = NULL; - goto failed_mount4; - } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { - ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); - iput(root); - goto failed_mount4; - } - sb->s_root = d_make_root(root); - if (!sb->s_root) { - ext4_msg(sb, KERN_ERR, "get root dentry failed"); - ret = -ENOMEM; - goto failed_mount4; - } - - if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) - sb->s_flags |= MS_RDONLY; - - /* determine the minimum size of new large inodes, if present */ - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_want_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_want_extra_isize); - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_min_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_min_extra_isize); - } - } - /* Check if enough inode space is available */ - if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - ext4_msg(sb, KERN_INFO, "required extra inode space not " - "available"); - } - - err = ext4_setup_system_zone(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize system " - "zone (%d)", err); - goto failed_mount4a; - } - - ext4_ext_init(sb); - err = ext4_mb_init(sb, needs_recovery); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); - goto failed_mount5; - } - - err = ext4_register_li_request(sb, first_not_zeroed); - if (err) - goto failed_mount6; - - sbi->s_kobj.kset = ext4_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, - "%s", sb->s_id); - if (err) - goto failed_mount7; - - EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; - ext4_orphan_cleanup(sb, es); - EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; - if (needs_recovery) { - ext4_msg(sb, KERN_INFO, "recovery complete"); - ext4_mark_recovery_complete(sb, es); - } - if (EXT4_SB(sb)->s_journal) { - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - descr = " journalled data mode"; - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - descr = " ordered data mode"; - else - descr = " writeback data mode"; - } else - descr = "out journal"; - - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, - *sbi->s_es->s_mount_opts ?
"; " : "", orig_data); - - if (es->s_error_count) - mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ - - kfree(orig_data); - return 0; - -cantfind_ext4: - if (!silent) - ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); - goto failed_mount; - -failed_mount7: - ext4_unregister_li_request(sb); -failed_mount6: - ext4_mb_release(sb); -failed_mount5: - ext4_ext_release(sb); - ext4_release_system_zone(sb); -failed_mount4a: - dput(sb->s_root); - sb->s_root = NULL; -failed_mount4: - ext4_msg(sb, KERN_ERR, "mount failed"); - destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); -failed_mount_wq: - if (sbi->s_journal) { - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; - } -failed_mount3: - del_timer(&sbi->s_err_report); - if (sbi->s_flex_groups) - ext4_kvfree(sbi->s_flex_groups); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); -failed_mount2: - for (i = 0; i < db_count; i++) - brelse(sbi->s_group_desc[i]); - ext4_kvfree(sbi->s_group_desc); -failed_mount: - if (sbi->s_proc) { - remove_proc_entry("options", sbi->s_proc); - remove_proc_entry(sb->s_id, ext4_proc_root); - } -#ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) - kfree(sbi->s_qf_names[i]); -#endif - ext4_blkdev_remove(sbi); - brelse(bh); -out_fail: - sb->s_fs_info = NULL; - kfree(sbi->s_blockgroup_lock); - kfree(sbi); -out_free_orig: - kfree(orig_data); - return ret; -} - -/* - * Setup any per-fs journal parameters now. We'll do this both on - * initial mount, once the journal has been initialised but before we've - * done any recovery; and again on any subsequent remount. - */ -static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - journal->j_commit_interval = sbi->s_commit_interval; - journal->j_min_batch_time = sbi->s_min_batch_time; - journal->j_max_batch_time = sbi->s_max_batch_time; - - write_lock(&journal->j_state_lock); - if (test_opt(sb, BARRIER)) - journal->j_flags |= JBD2_BARRIER; - else - journal->j_flags &= ~JBD2_BARRIER; - if (test_opt(sb, DATA_ERR_ABORT)) - journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; - else - journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; - write_unlock(&journal->j_state_lock); -} - -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) -{ - struct inode *journal_inode; - journal_t *journal; - - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); - - /* First, test for the existence of a valid inode on disk. Bad - * things happen if we iget() an unused inode, as the subsequent - * iput() will try to delete it. 
*/ - - journal_inode = ext4_iget(sb, journal_inum); - if (IS_ERR(journal_inode)) { - ext4_msg(sb, KERN_ERR, "no journal found"); - return NULL; - } - if (!journal_inode->i_nlink) { - make_bad_inode(journal_inode); - iput(journal_inode); - ext4_msg(sb, KERN_ERR, "journal inode is deleted"); - return NULL; - } - - jbd_debug(2, "Journal inode found at %p: %lld bytes\n", - journal_inode, journal_inode->i_size); - if (!S_ISREG(journal_inode->i_mode)) { - ext4_msg(sb, KERN_ERR, "invalid journal inode"); - iput(journal_inode); - return NULL; - } - - journal = jbd2_journal_init_inode(journal_inode); - if (!journal) { - ext4_msg(sb, KERN_ERR, "Could not load journal inode"); - iput(journal_inode); - return NULL; - } - journal->j_private = sb; - ext4_init_journal_params(sb, journal); - return journal; -} - -static journal_t *ext4_get_dev_journal(struct super_block *sb, - dev_t j_dev) -{ - struct buffer_head *bh; - journal_t *journal; - ext4_fsblk_t start; - ext4_fsblk_t len; - int hblock, blocksize; - ext4_fsblk_t sb_block; - unsigned long offset; - struct ext4_super_block *es; - struct block_device *bdev; - - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); - - bdev = ext4_blkdev_get(j_dev, sb); - if (bdev == NULL) - return NULL; - - blocksize = sb->s_blocksize; - hblock = bdev_logical_block_size(bdev); - if (blocksize < hblock) { - ext4_msg(sb, KERN_ERR, - "blocksize too small for journal device"); - goto out_bdev; - } - - sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; - offset = EXT4_MIN_BLOCK_SIZE % blocksize; - set_blocksize(bdev, blocksize); - if (!(bh = __bread(bdev, sb_block, blocksize))) { - ext4_msg(sb, KERN_ERR, "couldn't read superblock of " - "external journal"); - goto out_bdev; - } - - es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); - if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || - !(le32_to_cpu(es->s_feature_incompat) & - EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "bad superblock"); - brelse(bh); - goto out_bdev; - } - - if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { - ext4_msg(sb, KERN_ERR, "journal UUID does not match"); - brelse(bh); - goto out_bdev; - } - - len = ext4_blocks_count(es); - start = sb_block + 1; - brelse(bh); /* we're done with the superblock */ - - journal = jbd2_journal_init_dev(bdev, sb->s_bdev, - start, len, blocksize); - if (!journal) { - ext4_msg(sb, KERN_ERR, "failed to create device journal"); - goto out_bdev; - } - journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); - wait_on_buffer(journal->j_sb_buffer); - if (!buffer_uptodate(journal->j_sb_buffer)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; - } - if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { - ext4_msg(sb, KERN_ERR, "External journal has more than one " - "user (unsupported) - %d", - be32_to_cpu(journal->j_superblock->s_nr_users)); - goto out_journal; - } - EXT4_SB(sb)->journal_bdev = bdev; - ext4_init_journal_params(sb, journal); - return journal; - -out_journal: - jbd2_journal_destroy(journal); -out_bdev: - ext4_blkdev_put(bdev); - return NULL; -} - -static int ext4_load_journal(struct super_block *sb, - struct ext4_super_block *es, - unsigned long journal_devnum) -{ - journal_t *journal; - unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); - dev_t journal_dev; - int err = 0; - int really_read_only; - - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); - - if (journal_devnum && - journal_devnum != 
le32_to_cpu(es->s_journal_dev)) { - ext4_msg(sb, KERN_INFO, "external journal device major/minor " - "numbers have changed"); - journal_dev = new_decode_dev(journal_devnum); - } else - journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); - - really_read_only = bdev_read_only(sb->s_bdev); - - /* - * Are we loading a blank journal or performing recovery after a - * crash? For recovery, we need to check in advance whether we - * can get read-write access to the device. - */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { - if (sb->s_flags & MS_RDONLY) { - ext4_msg(sb, KERN_INFO, "INFO: recovery " - "required on readonly filesystem"); - if (really_read_only) { - ext4_msg(sb, KERN_ERR, "write access " - "unavailable, cannot proceed"); - return -EROFS; - } - ext4_msg(sb, KERN_INFO, "write access will " - "be enabled during recovery"); - } - } - - if (journal_inum && journal_dev) { - ext4_msg(sb, KERN_ERR, "filesystem has both journal " - "and inode journals!"); - return -EINVAL; - } - - if (journal_inum) { - if (!(journal = ext4_get_journal(sb, journal_inum))) - return -EINVAL; - } else { - if (!(journal = ext4_get_dev_journal(sb, journal_dev))) - return -EINVAL; - } - - if (!(journal->j_flags & JBD2_BARRIER)) - ext4_msg(sb, KERN_INFO, "barriers disabled"); - - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) - err = jbd2_journal_wipe(journal, !really_read_only); - if (!err) { - char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); - if (save) - memcpy(save, ((char *) es) + - EXT4_S_ERR_START, EXT4_S_ERR_LEN); - err = jbd2_journal_load(journal); - if (save) - memcpy(((char *) es) + EXT4_S_ERR_START, - save, EXT4_S_ERR_LEN); - kfree(save); - } - - if (err) { - ext4_msg(sb, KERN_ERR, "error loading journal"); - jbd2_journal_destroy(journal); - return err; - } - - EXT4_SB(sb)->s_journal = journal; - ext4_clear_journal_err(sb, es); - - if (!really_read_only && journal_devnum && - journal_devnum != le32_to_cpu(es->s_journal_dev)) { - es->s_journal_dev = cpu_to_le32(journal_devnum); - - /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, 1); - } - - return 0; -} - -static int ext4_commit_super(struct super_block *sb, int sync) -{ - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; - int error = 0; - - if (!sbh || block_device_ejected(sb)) - return error; - if (buffer_write_io_error(sbh)) { - /* - * Oh, dear. A previous attempt to write the - * superblock failed. This could happen because the - * USB device was yanked out. Or it could happen to - * be a transient write error and maybe the block will - * be remapped. Nothing we can do but to retry the - * write and hope for the best. - */ - ext4_msg(sb, KERN_ERR, "previous I/O error to " - "superblock detected"); - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - } - /* - * If the file system is mounted read-only, don't update the - * superblock write time. This avoids updating the superblock - * write time when we are mounting the root file system - * read/only but we need to replay the journal; at that point, - * for people who are east of GMT and who make their clock - * tick in localtime for Windows bug-for-bug compatibility, - * the clock is set in the future, and this will cause e2fsck - * to complain and force a full file system check. 
- */ - if (!(sb->s_flags & MS_RDONLY)) - es->s_wtime = cpu_to_le32(get_seconds()); - if (sb->s_bdev->bd_part) - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); - else - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - ext4_free_blocks_count_set(es, - EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeclusters_counter))); - es->s_free_inodes_count = - cpu_to_le32(percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeinodes_counter)); - sb->s_dirt = 0; - BUFFER_TRACE(sbh, "marking dirty"); - mark_buffer_dirty(sbh); - if (sync) { - error = sync_dirty_buffer(sbh); - if (error) - return error; - - error = buffer_write_io_error(sbh); - if (error) { - ext4_msg(sb, KERN_ERR, "I/O error while writing " - "superblock"); - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - } - } - return error; -} - -/* - * Have we just finished recovery? If so, and if we are mounting (or - * remounting) the filesystem readonly, then we will end up with a - * consistent fs on disk. Record that fact. - */ -static void ext4_mark_recovery_complete(struct super_block *sb, - struct ext4_super_block *es) -{ - journal_t *journal = EXT4_SB(sb)->s_journal; - - if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { - BUG_ON(journal != NULL); - return; - } - jbd2_journal_lock_updates(journal); - if (jbd2_journal_flush(journal) < 0) - goto out; - - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && - sb->s_flags & MS_RDONLY) { - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, 1); - } - -out: - jbd2_journal_unlock_updates(journal); -} - -/* - * If we are mounting (or read-write remounting) a filesystem whose journal - * has recorded an error from a previous lifetime, move that error to the - * main filesystem now. - */ -static void ext4_clear_journal_err(struct super_block *sb, - struct ext4_super_block *es) -{ - journal_t *journal; - int j_errno; - const char *errstr; - - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); - - journal = EXT4_SB(sb)->s_journal; - - /* - * Now check for any error status which may have been recorded in the - * journal by a prior ext4_error() or ext4_abort() - */ - - j_errno = jbd2_journal_errno(journal); - if (j_errno) { - char nbuf[16]; - - errstr = ext4_decode_error(sb, j_errno, nbuf); - ext4_warning(sb, "Filesystem error recorded " - "from previous mount: %s", errstr); - ext4_warning(sb, "Marking fs in need of filesystem check."); - - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, 1); - - jbd2_journal_clear_err(journal); - } -} - -/* - * Force the running and committing transactions to commit, - * and wait on the commit. 
- */ -int ext4_force_commit(struct super_block *sb) -{ - journal_t *journal; - int ret = 0; - - if (sb->s_flags & MS_RDONLY) - return 0; - - journal = EXT4_SB(sb)->s_journal; - if (journal) { - vfs_check_frozen(sb, SB_FREEZE_TRANS); - ret = ext4_journal_force_commit(journal); - } - - return ret; -} - -static void ext4_write_super(struct super_block *sb) -{ - lock_super(sb); - ext4_commit_super(sb, 1); - unlock_super(sb); -} - -static int ext4_sync_fs(struct super_block *sb, int wait) -{ - int ret = 0; - tid_t target; - struct ext4_sb_info *sbi = EXT4_SB(sb); - - trace_ext4_sync_fs(sb, wait); - flush_workqueue(sbi->dio_unwritten_wq); - if (jbd2_journal_start_commit(sbi->s_journal, &target)) { - if (wait) - jbd2_log_wait_commit(sbi->s_journal, target); - } - return ret; -} - -/* - * LVM calls this function before a (read-only) snapshot is created. This - * gives us a chance to flush the journal completely and mark the fs clean. - * - * Note that this function alone cannot bring the filesystem into a clean - * state independently: ext4 prevents new handles from being started via - * @sb->s_frozen, which is maintained by an upper layer, so it needs help - * from that upper layer. - */ -static int ext4_freeze(struct super_block *sb) -{ - int error = 0; - journal_t *journal; - - if (sb->s_flags & MS_RDONLY) - return 0; - - journal = EXT4_SB(sb)->s_journal; - - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); - - /* - * Don't clear the needs_recovery flag if we failed to flush - * the journal. - */ - error = jbd2_journal_flush(journal); - if (error < 0) - goto out; - - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - error = ext4_commit_super(sb, 1); -out: - /* we rely on s_frozen to stop further updates */ - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - return error; -} - -/* - * Called by LVM after the snapshot is done. We need to reset the RECOVER - * flag here, even though the filesystem is not technically dirty yet. - */ -static int ext4_unfreeze(struct super_block *sb) -{ - if (sb->s_flags & MS_RDONLY) - return 0; - - lock_super(sb); - /* Reset the needs_recovery flag before the fs is unlocked.
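- * (ext4_freeze() cleared RECOVER above so that the frozen snapshot image - * is self-consistent; once writes resume, the journal may again contain - * live transactions, hence the flag is set back before the fs is exposed.)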
*/ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, 1); - unlock_super(sb); - return 0; -} - -/* - * Structure to save mount options for ext4_remount's benefit - */ -struct ext4_mount_options { - unsigned long s_mount_opt; - unsigned long s_mount_opt2; - uid_t s_resuid; - gid_t s_resgid; - unsigned long s_commit_interval; - u32 s_min_batch_time, s_max_batch_time; -#ifdef CONFIG_QUOTA - int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; -#endif -}; - -static int ext4_remount(struct super_block *sb, int *flags, char *data) -{ - struct ext4_super_block *es; - struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned long old_sb_flags; - struct ext4_mount_options old_opts; - int enable_quota = 0; - ext4_group_t g; - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; - int err = 0; -#ifdef CONFIG_QUOTA - int i; -#endif - char *orig_data = kstrdup(data, GFP_KERNEL); - - /* Store the original options */ - lock_super(sb); - old_sb_flags = sb->s_flags; - old_opts.s_mount_opt = sbi->s_mount_opt; - old_opts.s_mount_opt2 = sbi->s_mount_opt2; - old_opts.s_resuid = sbi->s_resuid; - old_opts.s_resgid = sbi->s_resgid; - old_opts.s_commit_interval = sbi->s_commit_interval; - old_opts.s_min_batch_time = sbi->s_min_batch_time; - old_opts.s_max_batch_time = sbi->s_max_batch_time; -#ifdef CONFIG_QUOTA - old_opts.s_jquota_fmt = sbi->s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) - old_opts.s_qf_names[i] = sbi->s_qf_names[i]; -#endif - if (sbi->s_journal && sbi->s_journal->j_task->io_context) - journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; - - /* - * Allow the "check" option to be passed as a remount option. - */ - if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { - err = -EINVAL; - goto restore_opts; - } - - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) - ext4_abort(sb, "Abort forced by user"); - - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); - - es = sbi->s_es; - - if (sbi->s_journal) { - ext4_init_journal_params(sb, sbi->s_journal); - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); - } - - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { - err = -EROFS; - goto restore_opts; - } - - if (*flags & MS_RDONLY) { - err = dquot_suspend(sb, -1); - if (err < 0) - goto restore_opts; - - /* - * First of all, the unconditional stuff we have to do - * to disable replay of the journal when we next remount - */ - sb->s_flags |= MS_RDONLY; - - /* - * OK, test if we are remounting a valid rw partition - * readonly, and if so set the rdonly flag and then - * mark the partition as valid again. - */ - if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && - (sbi->s_mount_state & EXT4_VALID_FS)) - es->s_state = cpu_to_le16(sbi->s_mount_state); - - if (sbi->s_journal) - ext4_mark_recovery_complete(sb, es); - } else { - /* Make sure we can mount this feature set readwrite */ - if (!ext4_feature_set_ok(sb, 0)) { - err = -EROFS; - goto restore_opts; - } - /* - * Make sure the group descriptor checksums - * are sane. If they aren't, refuse to remount r/w. 
- */ - for (g = 0; g < sbi->s_groups_count; g++) { - struct ext4_group_desc *gdp = - ext4_get_group_desc(sb, g, NULL); - - if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { - ext4_msg(sb, KERN_ERR, - "ext4_remount: Checksum for group %u failed (%u!=%u)", - g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), - le16_to_cpu(gdp->bg_checksum)); - err = -EINVAL; - goto restore_opts; - } - } - - /* - * If we have an unprocessed orphan list hanging - * around from a previously readonly bdev mount, - * require a full umount/remount for now. - */ - if (es->s_last_orphan) { - ext4_msg(sb, KERN_WARNING, "Couldn't " - "remount RDWR because of unprocessed " - "orphan inode list. Please " - "umount/remount instead"); - err = -EINVAL; - goto restore_opts; - } - - /* - * Mounting a RDONLY partition read-write, so reread - * and store the current valid flag. (It may have - * been changed by e2fsck since we originally mounted - * the partition.) - */ - if (sbi->s_journal) - ext4_clear_journal_err(sb, es); - sbi->s_mount_state = le16_to_cpu(es->s_state); - if (!ext4_setup_super(sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_MMP)) - if (ext4_multi_mount_protect(sb, - le64_to_cpu(es->s_mmp_block))) { - err = -EROFS; - goto restore_opts; - } - enable_quota = 1; - } - } - - /* - * Reinitialize lazy itable initialization thread based on - * current settings - */ - if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) - ext4_unregister_li_request(sb); - else { - ext4_group_t first_not_zeroed; - first_not_zeroed = ext4_has_uninit_itable(sb); - ext4_register_li_request(sb, first_not_zeroed); - } - - ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL) - ext4_commit_super(sb, 1); - -#ifdef CONFIG_QUOTA - /* Release old quota file names */ - for (i = 0; i < MAXQUOTAS; i++) - if (old_opts.s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(old_opts.s_qf_names[i]); -#endif - unlock_super(sb); - if (enable_quota) - dquot_resume(sb, -1); - - ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); - kfree(orig_data); - return 0; - -restore_opts: - sb->s_flags = old_sb_flags; - sbi->s_mount_opt = old_opts.s_mount_opt; - sbi->s_mount_opt2 = old_opts.s_mount_opt2; - sbi->s_resuid = old_opts.s_resuid; - sbi->s_resgid = old_opts.s_resgid; - sbi->s_commit_interval = old_opts.s_commit_interval; - sbi->s_min_batch_time = old_opts.s_min_batch_time; - sbi->s_max_batch_time = old_opts.s_max_batch_time; -#ifdef CONFIG_QUOTA - sbi->s_jquota_fmt = old_opts.s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(sbi->s_qf_names[i]); - sbi->s_qf_names[i] = old_opts.s_qf_names[i]; - } -#endif - unlock_super(sb); - kfree(orig_data); - return err; -} - -/* - * Note: calculating the overhead so we can be compatible with - * historical BSD practice is quite difficult in the face of - * clusters/bigalloc. This is because multiple metadata blocks from - * different block group can end up in the same allocation cluster. - * Calculating the exact overhead in the face of clustered allocation - * requires either O(all block bitmaps) in memory or O(number of block - * groups**2) in time. 
We will still calculate the overhead for - * older file systems --- and if we come across a bigalloc file - * system with zero in s_overhead_clusters the estimate will be close to - * correct, especially for very large cluster sizes --- but for newer - * file systems, it's better to calculate this figure once at mkfs - * time, and store it in the superblock. If the superblock value is - * present (even for non-bigalloc file systems), we will use it. - */ -static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - struct ext4_group_desc *gdp; - u64 fsid; - s64 bfree; - - if (test_opt(sb, MINIX_DF)) { - sbi->s_overhead_last = 0; - } else if (es->s_overhead_clusters) { - sbi->s_overhead_last = le32_to_cpu(es->s_overhead_clusters); - } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { - ext4_group_t i, ngroups = ext4_get_groups_count(sb); - ext4_fsblk_t overhead = 0; - - /* - * Compute the overhead (FS structures). This is constant - * for a given filesystem unless the number of block groups - * changes, so we cache the previous value until it does. - */ - - /* - * All of the blocks before first_data_block are - * overhead - */ - overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); - - /* - * Add the overhead found in each block group - */ - for (i = 0; i < ngroups; i++) { - gdp = ext4_get_group_desc(sb, i, NULL); - overhead += ext4_num_overhead_clusters(sb, i, gdp); - cond_resched(); - } - sbi->s_overhead_last = overhead; - smp_wmb(); - sbi->s_blocks_last = ext4_blocks_count(es); - } - - buf->f_type = EXT4_SUPER_MAGIC; - buf->f_bsize = sb->s_blocksize; - buf->f_blocks = (ext4_blocks_count(es) - - EXT4_C2B(sbi, sbi->s_overhead_last)); - bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); - /* prevent underflow in case little free space is available */ - buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); - buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); - if (buf->f_bfree < ext4_r_blocks_count(es)) - buf->f_bavail = 0; - buf->f_files = le32_to_cpu(es->s_inodes_count); - buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - buf->f_namelen = EXT4_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; - - return 0; -} - -/* Helper function for writing quotas on sync - we need to start a transaction - * before the quota file is locked for write.
Otherwise there are possible deadlocks: - * Process 1 Process 2 - * ext4_create() quota_sync() - * jbd2_journal_start() write_dquot() - * dquot_initialize() down(dqio_mutex) - * down(dqio_mutex) jbd2_journal_start() - * - */ - -#ifdef CONFIG_QUOTA - -static inline struct inode *dquot_to_inode(struct dquot *dquot) -{ - return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; -} - -static int ext4_write_dquot(struct dquot *dquot) -{ - int ret, err; - handle_t *handle; - struct inode *inode; - - inode = dquot_to_inode(dquot); - handle = ext4_journal_start(inode, - EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_commit(dquot); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext4_acquire_dquot(struct dquot *dquot) -{ - int ret, err; - handle_t *handle; - - handle = ext4_journal_start(dquot_to_inode(dquot), - EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_acquire(dquot); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext4_release_dquot(struct dquot *dquot) -{ - int ret, err; - handle_t *handle; - - handle = ext4_journal_start(dquot_to_inode(dquot), - EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); - if (IS_ERR(handle)) { - /* Release dquot anyway to avoid endless cycle in dqput() */ - dquot_release(dquot); - return PTR_ERR(handle); - } - ret = dquot_release(dquot); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext4_mark_dquot_dirty(struct dquot *dquot) -{ - /* Are we journaling quotas? */ - if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || - EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { - dquot_mark_dquot_dirty(dquot); - return ext4_write_dquot(dquot); - } else { - return dquot_mark_dquot_dirty(dquot); - } -} - -static int ext4_write_info(struct super_block *sb, int type) -{ - int ret, err; - handle_t *handle; - - /* Data block + inode block */ - handle = ext4_journal_start(sb->s_root->d_inode, 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_commit_info(sb, type); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -/* - * Turn on quotas during mount time - we need to find - * the quota file and such... - */ -static int ext4_quota_on_mount(struct super_block *sb, int type) -{ - return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], - EXT4_SB(sb)->s_jquota_fmt, type); -} - -/* - * Standard function to be called on quota_on - */ -static int ext4_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) -{ - int err; - - if (!test_opt(sb, QUOTA)) - return -EINVAL; - - /* Quotafile not on the same filesystem? */ - if (path->dentry->d_sb != sb) - return -EXDEV; - /* Journaling quota? */ - if (EXT4_SB(sb)->s_qf_names[type]) { - /* Quotafile not in fs root? */ - if (path->dentry->d_parent != sb->s_root) - ext4_msg(sb, KERN_WARNING, - "Quota file not on filesystem root. " - "Journaled quota will not work"); - } - - /* - * When we journal data on the quota file, we have to flush the journal - * to see all updates to the file when we bypass the pagecache... - */ - if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(path->dentry->d_inode)) { - /* - * We don't need to lock updates but journal_flush() could - * otherwise be livelocked...
- */ - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - if (err) - return err; - } - - return dquot_quota_on(sb, type, format_id, path); -} - -static int ext4_quota_off(struct super_block *sb, int type) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - handle_t *handle; - - /* Force all delayed allocation blocks to be allocated. - * Caller already holds s_umount sem */ - if (test_opt(sb, DELALLOC)) - sync_filesystem(sb); - - if (!inode) - goto out; - - /* Update modification times of quota files when userspace can - * start looking at them */ - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) - goto out; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); - -out: - return dquot_quota_off(sb, type); -} - -/* Read data from the quota file - avoid the pagecache and such because we - * cannot afford acquiring the locks... As quota files are never truncated and - * the quota code itself serializes the operations (and no one else should - * touch the files), we don't have to be afraid of races */ -static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, - size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err = 0; - int offset = off & (sb->s_blocksize - 1); - int tocopy; - size_t toread; - struct buffer_head *bh; - loff_t i_size = i_size_read(inode); - - if (off > i_size) - return 0; - if (off+len > i_size) - len = i_size-off; - toread = len; - while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; - bh = ext4_bread(NULL, inode, blk, 0, &err); - if (err) - return err; - if (!bh) /* A hole? */ - memset(data, 0, tocopy); - else - memcpy(data, bh->b_data+offset, tocopy); - brelse(bh); - offset = 0; - toread -= tocopy; - data += tocopy; - blk++; - } - return len; -} - -/* Write to the quota file (we know the transaction is already started and - * has enough credits) */ -static ssize_t ext4_quota_write(struct super_block *sb, int type, - const char *data, size_t len, loff_t off) -{ - struct inode *inode = sb_dqopt(sb)->files[type]; - ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err = 0; - int offset = off & (sb->s_blocksize - 1); - struct buffer_head *bh; - handle_t *handle = journal_current_handle(); - - if (EXT4_SB(sb)->s_journal && !handle) { - ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" - " cancelled because transaction is not started", - (unsigned long long)off, (unsigned long long)len); - return -EIO; - } - /* - * Since we account for only one data block in the transaction credits, - * it is impossible to cross a block boundary.
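- * (Arithmetic sketch with made-up numbers: on a 4k-block filesystem a - * write of len = 48 at off = 4090 would leave sb->s_blocksize - offset = - * 6 < 48 and is rejected below; the journaled quota formats issue small, - * block-aligned record writes, so in practice this limit should not be - * hit.)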
- */ - if (sb->s_blocksize - offset < len) { - ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" - " cancelled because not block aligned", - (unsigned long long)off, (unsigned long long)len); - return -EIO; - } - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - bh = ext4_bread(handle, inode, blk, 1, &err); - if (!bh) - goto out; - err = ext4_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, len); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - err = ext4_handle_dirty_metadata(handle, NULL, bh); - brelse(bh); -out: - if (err) { - mutex_unlock(&inode->i_mutex); - return err; - } - if (inode->i_size < off + len) { - i_size_write(inode, off + len); - EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); - } - mutex_unlock(&inode->i_mutex); - return len; -} - -#endif - -static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); -} - -#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) -static inline void register_as_ext2(void) -{ - int err = register_filesystem(&ext2_fs_type); - if (err) - printk(KERN_WARNING - "EXT4-fs: Unable to register as ext2 (%d)\n", err); -} - -static inline void unregister_as_ext2(void) -{ - unregister_filesystem(&ext2_fs_type); -} - -static inline int ext2_feature_set_ok(struct super_block *sb) -{ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) - return 0; - if (sb->s_flags & MS_RDONLY) - return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) - return 0; - return 1; -} -MODULE_ALIAS("ext2"); -#else -static inline void register_as_ext2(void) { } -static inline void unregister_as_ext2(void) { } -static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } -#endif - -#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) -static inline void register_as_ext3(void) -{ - int err = register_filesystem(&ext3_fs_type); - if (err) - printk(KERN_WARNING - "EXT4-fs: Unable to register as ext3 (%d)\n", err); -} - -static inline void unregister_as_ext3(void) -{ - unregister_filesystem(&ext3_fs_type); -} - -static inline int ext3_feature_set_ok(struct super_block *sb) -{ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) - return 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - return 0; - if (sb->s_flags & MS_RDONLY) - return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) - return 0; - return 1; -} -MODULE_ALIAS("ext3"); -#else -static inline void register_as_ext3(void) { } -static inline void unregister_as_ext3(void) { } -static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } -#endif - -static struct file_system_type ext4_fs_type = { - .owner = THIS_MODULE, - .name = "ext4", - .mount = ext4_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; - -static int __init ext4_init_feat_adverts(void) -{ - struct ext4_features *ef; - int ret = -ENOMEM; - - ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); - if (!ef) - goto out; - - ef->f_kobj.kset = ext4_kset; - init_completion(&ef->f_kobj_unregister); - ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, - "features"); - if (ret) { - kfree(ef); - goto out; - } - - ext4_feat = ef; - ret = 0; -out: - return 
ret; -} - -static void ext4_exit_feat_adverts(void) -{ - kobject_put(&ext4_feat->f_kobj); - wait_for_completion(&ext4_feat->f_kobj_unregister); - kfree(ext4_feat); -} - -/* Shared across all ext4 file systems */ -wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; -struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; - -static int __init ext4_init_fs(void) -{ - int i, err; - - ext4_li_info = NULL; - mutex_init(&ext4_li_mtx); - - ext4_check_flag_values(); - - for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { - mutex_init(&ext4__aio_mutex[i]); - init_waitqueue_head(&ext4__ioend_wq[i]); - } - - err = ext4_init_pageio(); - if (err) - return err; - err = ext4_init_system_zone(); - if (err) - goto out6; - ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); - if (!ext4_kset) - goto out5; - ext4_proc_root = proc_mkdir("fs/ext4", NULL); - - err = ext4_init_feat_adverts(); - if (err) - goto out4; - - err = ext4_init_mballoc(); - if (err) - goto out3; - - err = ext4_init_xattr(); - if (err) - goto out2; - err = init_inodecache(); - if (err) - goto out1; - register_as_ext3(); - register_as_ext2(); - err = register_filesystem(&ext4_fs_type); - if (err) - goto out; - - return 0; -out: - unregister_as_ext2(); - unregister_as_ext3(); - destroy_inodecache(); -out1: - ext4_exit_xattr(); -out2: - ext4_exit_mballoc(); -out3: - ext4_exit_feat_adverts(); -out4: - if (ext4_proc_root) - remove_proc_entry("fs/ext4", NULL); - kset_unregister(ext4_kset); -out5: - ext4_exit_system_zone(); -out6: - ext4_exit_pageio(); - return err; -} - -static void __exit ext4_exit_fs(void) -{ - ext4_destroy_lazyinit_thread(); - unregister_as_ext2(); - unregister_as_ext3(); - unregister_filesystem(&ext4_fs_type); - destroy_inodecache(); - ext4_exit_xattr(); - ext4_exit_mballoc(); - ext4_exit_feat_adverts(); - remove_proc_entry("fs/ext4", NULL); - kset_unregister(ext4_kset); - ext4_exit_system_zone(); - ext4_exit_pageio(); -} - -MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -MODULE_DESCRIPTION("Fourth Extended Filesystem"); -MODULE_LICENSE("GPL"); -module_init(ext4_init_fs) -module_exit(ext4_exit_fs) diff --git a/ANDROID_3.4.5/fs/ext4/symlink.c b/ANDROID_3.4.5/fs/ext4/symlink.c deleted file mode 100644 index ed9354af..00000000 --- a/ANDROID_3.4.5/fs/ext4/symlink.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * linux/fs/ext4/symlink.c - * - * Only fast symlinks left here - the rest is done by generic code. 
AV, 1999 - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/symlink.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext4 symlink handling code - */ - -#include <linux/fs.h> -#include <linux/jbd2.h> -#include <linux/namei.h> -#include "ext4.h" -#include "xattr.h" - -static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); - nd_set_link(nd, (char *) ei->i_data); - return NULL; -} - -const struct inode_operations ext4_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, - .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, -#endif -}; - -const struct inode_operations ext4_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = ext4_follow_link, - .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR - .setxattr = generic_setxattr, - .getxattr = generic_getxattr, - .listxattr = ext4_listxattr, - .removexattr = generic_removexattr, -#endif -}; diff --git a/ANDROID_3.4.5/fs/ext4/truncate.h b/ANDROID_3.4.5/fs/ext4/truncate.h deleted file mode 100644 index 011ba667..00000000 --- a/ANDROID_3.4.5/fs/ext4/truncate.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * linux/fs/ext4/truncate.h - * - * Common inline functions needed for truncate support - */ - -/* - * Truncate blocks that were not used by write. We have to truncate the - * pagecache as well so that corresponding buffers get properly unmapped. - */ -static inline void ext4_truncate_failed_write(struct inode *inode) -{ - truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); -} - -/* - * Work out how many blocks we need to proceed with the next chunk of a - * truncate transaction. - */ -static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) -{ - ext4_lblk_t needed; - - needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); - - /* Give ourselves just enough room to cope with inodes in which - * i_blocks is corrupt: we've seen disk corruptions in the past - * which resulted in random data in an inode which looked enough - * like a regular file for ext4 to try to delete it. Things - * will go a bit crazy if that happens, but at least we should - * try not to panic the whole kernel. */ - if (needed < 2) - needed = 2; - - /* But we need to bound the transaction so we don't overflow the - * journal. */ - if (needed > EXT4_MAX_TRANS_DATA) - needed = EXT4_MAX_TRANS_DATA; - - return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; -} - diff --git a/ANDROID_3.4.5/fs/ext4/xattr.c b/ANDROID_3.4.5/fs/ext4/xattr.c deleted file mode 100644 index e88748e5..00000000 --- a/ANDROID_3.4.5/fs/ext4/xattr.c +++ /dev/null @@ -1,1608 +0,0 @@ -/* - * linux/fs/ext4/xattr.c - * - * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> - * - * Fix by Harrison Xing <harrison@mountainviewdata.com>. - * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>. - * Extended attributes for symlinks and special files added per - * suggestion of Luka Renko <luka.renko@hermes.si>. - * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, - * Red Hat Inc. 
- * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz - * and Andreas Gruenbacher <agruen@suse.de>. - */ - -/* - * Extended attributes are stored directly in inodes (on file systems with - * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl - * field contains the block number if an inode uses an additional block. All - * attributes must fit in the inode and one additional block. Blocks that - * contain the identical set of attributes may be shared among several inodes. - * Identical blocks are detected by keeping a cache of blocks that have - * recently been accessed. - * - * The attributes in inodes and on blocks have a different header; the entries - * are stored in the same format: - * - * +------------------+ - * | header | - * | entry 1 | | - * | entry 2 | | growing downwards - * | entry 3 | v - * | four null bytes | - * | . . . | - * | value 1 | ^ - * | value 3 | | growing upwards - * | value 2 | | - * +------------------+ - * - * The header is followed by multiple entry descriptors. In disk blocks, the - * entry descriptors are kept sorted. In inodes, they are unsorted. The - * attribute values are aligned to the end of the block in no specific order. - * - * Locking strategy - * ---------------- - * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. - * EA blocks are only changed if they are exclusive to an inode, so - * holding xattr_sem also means that nothing but the EA block's reference - * count can change. Multiple writers to the same block are synchronized - * by the buffer lock. - */ - -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/mbcache.h> -#include <linux/quotaops.h> -#include <linux/rwsem.h> -#include "ext4_jbd2.h" -#include "ext4.h" -#include "xattr.h" -#include "acl.h" - -#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) -#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) -#define BFIRST(bh) ENTRY(BHDR(bh)+1) -#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) - -#ifdef EXT4_XATTR_DEBUG -# define ea_idebug(inode, f...) do { \ - printk(KERN_DEBUG "inode %s:%lu: ", \ - inode->i_sb->s_id, inode->i_ino); \ - printk(f); \ - printk("\n"); \ - } while (0) -# define ea_bdebug(bh, f...) do { \ - char b[BDEVNAME_SIZE]; \ - printk(KERN_DEBUG "block %s:%lu: ", \ - bdevname(bh->b_bdev, b), \ - (unsigned long) bh->b_blocknr); \ - printk(f); \ - printk("\n"); \ - } while (0) -#else -# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) -# define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) -#endif - -static void ext4_xattr_cache_insert(struct buffer_head *); -static struct buffer_head *ext4_xattr_cache_find(struct inode *, - struct ext4_xattr_header *, - struct mb_cache_entry **); -static void ext4_xattr_rehash(struct ext4_xattr_header *, - struct ext4_xattr_entry *); -static int ext4_xattr_list(struct dentry *dentry, char *buffer, - size_t buffer_size); - -static struct mb_cache *ext4_xattr_cache; - -static const struct xattr_handler *ext4_xattr_handler_map[] = { - [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, -#ifdef CONFIG_EXT4_FS_POSIX_ACL - [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, - [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, -#endif - [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4_FS_SECURITY - [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, -#endif -}; - -const struct xattr_handler *ext4_xattr_handlers[] = { - &ext4_xattr_user_handler, - &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4_FS_POSIX_ACL - &ext4_xattr_acl_access_handler, - &ext4_xattr_acl_default_handler, -#endif -#ifdef CONFIG_EXT4_FS_SECURITY - &ext4_xattr_security_handler, -#endif - NULL -}; - -static inline const struct xattr_handler * -ext4_xattr_handler(int name_index) -{ - const struct xattr_handler *handler = NULL; - - if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) - handler = ext4_xattr_handler_map[name_index]; - return handler; -} - -/* - * Inode operation listxattr() - * - * dentry->d_inode->i_mutex: don't care - */ -ssize_t -ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - return ext4_xattr_list(dentry, buffer, size); -} - -static int -ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) -{ - while (!IS_LAST_ENTRY(entry)) { - struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); - if ((void *)next >= end) - return -EIO; - entry = next; - } - return 0; -} - -static inline int -ext4_xattr_check_block(struct buffer_head *bh) -{ - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) - return -EIO; - return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); -} - -static inline int -ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) -{ - size_t value_size = le32_to_cpu(entry->e_value_size); - - if (entry->e_value_block != 0 || value_size > size || - le16_to_cpu(entry->e_value_offs) + value_size > size) - return -EIO; - return 0; -} - -static int -ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, - const char *name, size_t size, int sorted) -{ - struct ext4_xattr_entry *entry; - size_t name_len; - int cmp = 1; - - if (name == NULL) - return -EINVAL; - name_len = strlen(name); - entry = *pentry; - for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - cmp = name_index - entry->e_name_index; - if (!cmp) - cmp = name_len - entry->e_name_len; - if (!cmp) - cmp = memcmp(name, entry->e_name, name_len); - if (cmp <= 0 && (sorted || cmp == 0)) - break; - } - *pentry = entry; - if (!cmp && ext4_xattr_check_entry(entry, size)) - return -EIO; - return cmp ? 
-ENODATA : 0; -} - -static int -ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) -{ - struct buffer_head *bh = NULL; - struct ext4_xattr_entry *entry; - size_t size; - int error; - - ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", - name_index, name, buffer, (long)buffer_size); - - error = -ENODATA; - if (!EXT4_I(inode)->i_file_acl) - goto cleanup; - ea_idebug(inode, "reading block %llu", - (unsigned long long)EXT4_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) - goto cleanup; - ea_bdebug(bh, "b_count=%d, refcount=%d", - atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(bh)) { -bad_block: - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - ext4_xattr_cache_insert(bh); - entry = BFIRST(bh); - error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); - if (error == -EIO) - goto bad_block; - if (error) - goto cleanup; - size = le32_to_cpu(entry->e_value_size); - if (buffer) { - error = -ERANGE; - if (size > buffer_size) - goto cleanup; - memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), - size); - } - error = size; - -cleanup: - brelse(bh); - return error; -} - -static int -ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) -{ - struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry; - struct ext4_inode *raw_inode; - struct ext4_iloc iloc; - size_t size; - void *end; - int error; - - if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) - return -ENODATA; - error = ext4_get_inode_loc(inode, &iloc); - if (error) - return error; - raw_inode = ext4_raw_inode(&iloc); - header = IHDR(inode, raw_inode); - entry = IFIRST(header); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(entry, end); - if (error) - goto cleanup; - error = ext4_xattr_find_entry(&entry, name_index, name, - end - (void *)entry, 0); - if (error) - goto cleanup; - size = le32_to_cpu(entry->e_value_size); - if (buffer) { - error = -ERANGE; - if (size > buffer_size) - goto cleanup; - memcpy(buffer, (void *)IFIRST(header) + - le16_to_cpu(entry->e_value_offs), size); - } - error = size; - -cleanup: - brelse(iloc.bh); - return error; -} - -/* - * ext4_xattr_get() - * - * Copy an extended attribute into the buffer - * provided, or compute the buffer size required. - * Buffer is NULL to compute the size of the buffer required. - * - * Returns a negative error number on failure, or the number of bytes - * used / required on success. 
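[Aside, not from the original file: userspace reaches this routine through getxattr(2), which follows the same convention, returning the value length when called with a zero-sized buffer. A small sketch; the path and attribute name are made up.]

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
    const char *path = "/tmp/example";                       // hypothetical file
    ssize_t size = getxattr(path, "user.comment", NULL, 0);  // sizing call
    if (size < 0) { perror("getxattr"); return 1; }

    char *buf = malloc(size);                                // then fetch the value
    if (!buf) return 1;
    if (getxattr(path, "user.comment", buf, size) < 0) {
        perror("getxattr");
        free(buf);
        return 1;
    }
    printf("user.comment holds %zd bytes\n", size);
    free(buf);
    return 0;
}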
- */ -int -ext4_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) -{ - int error; - - down_read(&EXT4_I(inode)->xattr_sem); - error = ext4_xattr_ibody_get(inode, name_index, name, buffer, - buffer_size); - if (error == -ENODATA) - error = ext4_xattr_block_get(inode, name_index, name, buffer, - buffer_size); - up_read(&EXT4_I(inode)->xattr_sem); - return error; -} - -static int -ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, - char *buffer, size_t buffer_size) -{ - size_t rest = buffer_size; - - for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - const struct xattr_handler *handler = - ext4_xattr_handler(entry->e_name_index); - - if (handler) { - size_t size = handler->list(dentry, buffer, rest, - entry->e_name, - entry->e_name_len, - handler->flags); - if (buffer) { - if (size > rest) - return -ERANGE; - buffer += size; - } - rest -= size; - } - } - return buffer_size - rest; -} - -static int -ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - struct inode *inode = dentry->d_inode; - struct buffer_head *bh = NULL; - int error; - - ea_idebug(inode, "buffer=%p, buffer_size=%ld", - buffer, (long)buffer_size); - - error = 0; - if (!EXT4_I(inode)->i_file_acl) - goto cleanup; - ea_idebug(inode, "reading block %llu", - (unsigned long long)EXT4_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bh) - goto cleanup; - ea_bdebug(bh, "b_count=%d, refcount=%d", - atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - ext4_xattr_cache_insert(bh); - error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); - -cleanup: - brelse(bh); - - return error; -} - -static int -ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - struct inode *inode = dentry->d_inode; - struct ext4_xattr_ibody_header *header; - struct ext4_inode *raw_inode; - struct ext4_iloc iloc; - void *end; - int error; - - if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) - return 0; - error = ext4_get_inode_loc(inode, &iloc); - if (error) - return error; - raw_inode = ext4_raw_inode(&iloc); - header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(IFIRST(header), end); - if (error) - goto cleanup; - error = ext4_xattr_list_entries(dentry, IFIRST(header), - buffer, buffer_size); - -cleanup: - brelse(iloc.bh); - return error; -} - -/* - * ext4_xattr_list() - * - * Copy a list of attribute names into the buffer - * provided, or compute the buffer size required. - * Buffer is NULL to compute the size of the buffer required. - * - * Returns a negative error number on failure, or the number of bytes - * used / required on success. 
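[Aside: the buffer this routine fills is what listxattr(2) hands to userspace, names packed back to back and each NUL-terminated. A hypothetical walk over it, using the same size-then-fill convention; the path is invented.]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
    const char *path = "/tmp/example";       // hypothetical file
    ssize_t len = listxattr(path, NULL, 0);  // how big a buffer is needed?
    if (len <= 0) return 1;
    char *names = malloc(len);
    if (!names) return 1;
    len = listxattr(path, names, len);       // fill pass
    if (len < 0) { free(names); return 1; }
    // NUL-terminated names stored back to back, as built above.
    for (char *p = names; p < names + len; p += strlen(p) + 1)
        puts(p);
    free(names);
    return 0;
}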
- */ -static int -ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -{ - int ret, ret2; - - down_read(&EXT4_I(dentry->d_inode)->xattr_sem); - ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); - if (ret < 0) - goto errout; - if (buffer) { - buffer += ret; - buffer_size -= ret; - } - ret = ext4_xattr_block_list(dentry, buffer, buffer_size); - if (ret < 0) - goto errout; - ret += ret2; -errout: - up_read(&EXT4_I(dentry->d_inode)->xattr_sem); - return ret; -} - -/* - * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is - * not set, set it. - */ -static void ext4_xattr_update_super_block(handle_t *handle, - struct super_block *sb) -{ - if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) - return; - - if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { - EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); - ext4_handle_dirty_super(handle, sb); - } -} - -/* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. - */ -static void -ext4_xattr_release_block(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - struct mb_cache_entry *ce = NULL; - int error = 0; - - ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); - error = ext4_journal_get_write_access(handle, bh); - if (error) - goto out; - - lock_buffer(bh); - if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { - ea_bdebug(bh, "refcount now=0; freeing"); - if (ce) - mb_cache_entry_free(ce); - get_bh(bh); - ext4_free_blocks(handle, inode, bh, 0, 1, - EXT4_FREE_BLOCKS_METADATA | - EXT4_FREE_BLOCKS_FORGET); - unlock_buffer(bh); - } else { - le32_add_cpu(&BHDR(bh)->h_refcount, -1); - if (ce) - mb_cache_entry_release(ce); - unlock_buffer(bh); - error = ext4_handle_dirty_metadata(handle, inode, bh); - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - dquot_free_block(inode, 1); - ea_bdebug(bh, "refcount now=%d; releasing", - le32_to_cpu(BHDR(bh)->h_refcount)); - } -out: - ext4_std_error(inode->i_sb, error); - return; -} - -/* - * Find the available free space for EAs. This also returns the total number of - * bytes used by EA entries. - */ -static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, - size_t *min_offs, void *base, int *total) -{ - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - *total += EXT4_XATTR_LEN(last->e_name_len); - if (!last->e_value_block && last->e_value_size) { - size_t offs = le16_to_cpu(last->e_value_offs); - if (offs < *min_offs) - *min_offs = offs; - } - } - return (*min_offs - ((void *)last - base) - sizeof(__u32)); -} - -struct ext4_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; -}; - -struct ext4_xattr_search { - struct ext4_xattr_entry *first; - void *base; - void *end; - struct ext4_xattr_entry *here; - int not_found; -}; - -static int -ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) -{ - struct ext4_xattr_entry *last; - size_t free, min_offs = s->end - s->base, name_len = strlen(i->name); - - /* Compute min_offs and last. 
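[Aside with invented numbers: the free-space formula used below, in miniature. Entry descriptors grow down from the start of the region, values grow up from the end; min_offs is the lowest value offset seen so far, and four bytes stay reserved for the terminating zero entry.]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    size_t entries_end = 160;  // byte offset just past the last descriptor
    size_t min_offs    = 3900; // start of the lowest packed value
    size_t free_bytes  = min_offs - entries_end - sizeof(uint32_t);
    printf("%zu bytes free\n", free_bytes);  // prints: 3736 bytes free
    return 0;
}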
*/ - last = s->first; - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { - size_t offs = le16_to_cpu(last->e_value_offs); - if (offs < min_offs) - min_offs = offs; - } - } - free = min_offs - ((void *)last - s->base) - sizeof(__u32); - if (!s->not_found) { - if (!s->here->e_value_block && s->here->e_value_size) { - size_t size = le32_to_cpu(s->here->e_value_size); - free += EXT4_XATTR_SIZE(size); - } - free += EXT4_XATTR_LEN(name_len); - } - if (i->value) { - if (free < EXT4_XATTR_SIZE(i->value_len) || - free < EXT4_XATTR_LEN(name_len) + - EXT4_XATTR_SIZE(i->value_len)) - return -ENOSPC; - } - - if (i->value && s->not_found) { - /* Insert the new name. */ - size_t size = EXT4_XATTR_LEN(name_len); - size_t rest = (void *)last - (void *)s->here + sizeof(__u32); - memmove((void *)s->here + size, s->here, rest); - memset(s->here, 0, size); - s->here->e_name_index = i->name_index; - s->here->e_name_len = name_len; - memcpy(s->here->e_name, i->name, name_len); - } else { - if (!s->here->e_value_block && s->here->e_value_size) { - void *first_val = s->base + min_offs; - size_t offs = le16_to_cpu(s->here->e_value_offs); - void *val = s->base + offs; - size_t size = EXT4_XATTR_SIZE( - le32_to_cpu(s->here->e_value_size)); - - if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) { - /* The old and the new value have the same - size. Just replace. */ - s->here->e_value_size = - cpu_to_le32(i->value_len); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear pad bytes. */ - memcpy(val, i->value, i->value_len); - return 0; - } - - /* Remove the old value. */ - memmove(first_val + size, first_val, val - first_val); - memset(first_val, 0, size); - s->here->e_value_size = 0; - s->here->e_value_offs = 0; - min_offs += size; - - /* Adjust all value offsets. */ - last = s->first; - while (!IS_LAST_ENTRY(last)) { - size_t o = le16_to_cpu(last->e_value_offs); - if (!last->e_value_block && - last->e_value_size && o < offs) - last->e_value_offs = - cpu_to_le16(o + size); - last = EXT4_XATTR_NEXT(last); - } - } - if (!i->value) { - /* Remove the old name. */ - size_t size = EXT4_XATTR_LEN(name_len); - last = ENTRY((void *)last - size); - memmove(s->here, (void *)s->here + size, - (void *)last - (void *)s->here + sizeof(__u32)); - memset(last, 0, size); - } - } - - if (i->value) { - /* Insert the new value. */ - s->here->e_value_size = cpu_to_le32(i->value_len); - if (i->value_len) { - size_t size = EXT4_XATTR_SIZE(i->value_len); - void *val = s->base + min_offs - size; - s->here->e_value_offs = cpu_to_le16(min_offs - size); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear the pad bytes. */ - memcpy(val, i->value, i->value_len); - } - } - return 0; -} - -struct ext4_xattr_block_find { - struct ext4_xattr_search s; - struct buffer_head *bh; -}; - -static int -ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_block_find *bs) -{ - struct super_block *sb = inode->i_sb; - int error; - - ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", - i->name_index, i->name, i->value, (long)i->value_len); - - if (EXT4_I(inode)->i_file_acl) { - /* The inode already has an extended attribute block. 
*/ - bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bs->bh) - goto cleanup; - ea_bdebug(bs->bh, "b_count=%d, refcount=%d", - atomic_read(&(bs->bh->b_count)), - le32_to_cpu(BHDR(bs->bh)->h_refcount)); - if (ext4_xattr_check_block(bs->bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - /* Find the named attribute. */ - bs->s.base = BHDR(bs->bh); - bs->s.first = BFIRST(bs->bh); - bs->s.end = bs->bh->b_data + bs->bh->b_size; - bs->s.here = bs->s.first; - error = ext4_xattr_find_entry(&bs->s.here, i->name_index, - i->name, bs->bh->b_size, 1); - if (error && error != -ENODATA) - goto cleanup; - bs->s.not_found = error; - } - error = 0; - -cleanup: - return error; -} - -static int -ext4_xattr_block_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_block_find *bs) -{ - struct super_block *sb = inode->i_sb; - struct buffer_head *new_bh = NULL; - struct ext4_xattr_search *s = &bs->s; - struct mb_cache_entry *ce = NULL; - int error = 0; - -#define header(x) ((struct ext4_xattr_header *)(x)) - - if (i->value && i->value_len > sb->s_blocksize) - return -ENOSPC; - if (s->base) { - ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, - bs->bh->b_blocknr); - error = ext4_journal_get_write_access(handle, bs->bh); - if (error) - goto cleanup; - lock_buffer(bs->bh); - - if (header(s->base)->h_refcount == cpu_to_le32(1)) { - if (ce) { - mb_cache_entry_free(ce); - ce = NULL; - } - ea_bdebug(bs->bh, "modifying in-place"); - error = ext4_xattr_set_entry(i, s); - if (!error) { - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), - s->here); - ext4_xattr_cache_insert(bs->bh); - } - unlock_buffer(bs->bh); - if (error == -EIO) - goto bad_block; - if (!error) - error = ext4_handle_dirty_metadata(handle, - inode, - bs->bh); - if (error) - goto cleanup; - goto inserted; - } else { - int offset = (char *)s->here - bs->bh->b_data; - - unlock_buffer(bs->bh); - ext4_handle_release_buffer(handle, bs->bh); - if (ce) { - mb_cache_entry_release(ce); - ce = NULL; - } - ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_NOFS); - error = -ENOMEM; - if (s->base == NULL) - goto cleanup; - memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); - s->first = ENTRY(header(s->base)+1); - header(s->base)->h_refcount = cpu_to_le32(1); - s->here = ENTRY(s->base + offset); - s->end = s->base + bs->bh->b_size; - } - } else { - /* Allocate a buffer where we construct the new block. */ - s->base = kzalloc(sb->s_blocksize, GFP_NOFS); - /* assert(header == s->base) */ - error = -ENOMEM; - if (s->base == NULL) - goto cleanup; - header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); - header(s->base)->h_blocks = cpu_to_le32(1); - header(s->base)->h_refcount = cpu_to_le32(1); - s->first = ENTRY(header(s->base)+1); - s->here = ENTRY(header(s->base)+1); - s->end = s->base + sb->s_blocksize; - } - - error = ext4_xattr_set_entry(i, s); - if (error == -EIO) - goto bad_block; - if (error) - goto cleanup; - if (!IS_LAST_ENTRY(s->first)) - ext4_xattr_rehash(header(s->base), s->here); - -inserted: - if (!IS_LAST_ENTRY(s->first)) { - new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce); - if (new_bh) { - /* We found an identical block in the cache. */ - if (new_bh == bs->bh) - ea_bdebug(new_bh, "keeping"); - else { - /* The old block is released after updating - the inode. 
*/ - error = dquot_alloc_block(inode, 1); - if (error) - goto cleanup; - error = ext4_journal_get_write_access(handle, - new_bh); - if (error) - goto cleanup_dquot; - lock_buffer(new_bh); - le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); - ea_bdebug(new_bh, "reusing; refcount now=%d", - le32_to_cpu(BHDR(new_bh)->h_refcount)); - unlock_buffer(new_bh); - error = ext4_handle_dirty_metadata(handle, - inode, - new_bh); - if (error) - goto cleanup_dquot; - } - mb_cache_entry_release(ce); - ce = NULL; - } else if (bs->bh && s->base == bs->bh->b_data) { - /* We were modifying this block in-place. */ - ea_bdebug(bs->bh, "keeping this block"); - new_bh = bs->bh; - get_bh(new_bh); - } else { - /* We need to allocate a new block */ - ext4_fsblk_t goal, block; - - goal = ext4_group_first_block_no(sb, - EXT4_I(inode)->i_block_group); - - /* non-extent files can't have physical blocks past 2^32 */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - - /* - * take i_data_sem because we will test - * i_delalloc_reserved_flag in ext4_mb_new_blocks - */ - down_read((&EXT4_I(inode)->i_data_sem)); - block = ext4_new_meta_blocks(handle, inode, goal, 0, - NULL, &error); - up_read((&EXT4_I(inode)->i_data_sem)); - if (error) - goto cleanup; - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); - - ea_idebug(inode, "creating block %llu", - (unsigned long long)block); - - new_bh = sb_getblk(sb, block); - if (!new_bh) { -getblk_failed: - ext4_free_blocks(handle, inode, NULL, block, 1, - EXT4_FREE_BLOCKS_METADATA); - error = -EIO; - goto cleanup; - } - lock_buffer(new_bh); - error = ext4_journal_get_create_access(handle, new_bh); - if (error) { - unlock_buffer(new_bh); - goto getblk_failed; - } - memcpy(new_bh->b_data, s->base, new_bh->b_size); - set_buffer_uptodate(new_bh); - unlock_buffer(new_bh); - ext4_xattr_cache_insert(new_bh); - error = ext4_handle_dirty_metadata(handle, - inode, new_bh); - if (error) - goto cleanup; - } - } - - /* Update the inode. */ - EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; - - /* Drop the previous xattr block. */ - if (bs->bh && bs->bh != new_bh) - ext4_xattr_release_block(handle, inode, bs->bh); - error = 0; - -cleanup: - if (ce) - mb_cache_entry_release(ce); - brelse(new_bh); - if (!(bs->bh && s->base == bs->bh->b_data)) - kfree(s->base); - - return error; - -cleanup_dquot: - dquot_free_block(inode, 1); - goto cleanup; - -bad_block: - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - goto cleanup; - -#undef header -} - -struct ext4_xattr_ibody_find { - struct ext4_xattr_search s; - struct ext4_iloc iloc; -}; - -static int -ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) -{ - struct ext4_xattr_ibody_header *header; - struct ext4_inode *raw_inode; - int error; - - if (EXT4_I(inode)->i_extra_isize == 0) - return 0; - raw_inode = ext4_raw_inode(&is->iloc); - header = IHDR(inode, raw_inode); - is->s.base = is->s.first = IFIRST(header); - is->s.here = is->s.first; - is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = ext4_xattr_check_names(IFIRST(header), is->s.end); - if (error) - return error; - /* Find the named attribute. 
*/ - error = ext4_xattr_find_entry(&is->s.here, i->name_index, - i->name, is->s.end - - (void *)is->s.base, 0); - if (error && error != -ENODATA) - return error; - is->s.not_found = error; - } - return 0; -} - -static int -ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) -{ - struct ext4_xattr_ibody_header *header; - struct ext4_xattr_search *s = &is->s; - int error; - - if (EXT4_I(inode)->i_extra_isize == 0) - return -ENOSPC; - error = ext4_xattr_set_entry(i, s); - if (error) - return error; - header = IHDR(inode, ext4_raw_inode(&is->iloc)); - if (!IS_LAST_ENTRY(s->first)) { - header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); - ext4_set_inode_state(inode, EXT4_STATE_XATTR); - } else { - header->h_magic = cpu_to_le32(0); - ext4_clear_inode_state(inode, EXT4_STATE_XATTR); - } - return 0; -} - -/* - * ext4_xattr_set_handle() - * - * Create, replace or remove an extended attribute for this inode. Value - * is NULL to remove an existing extended attribute, and non-NULL to - * either replace an existing extended attribute, or create a new extended - * attribute. The flags XATTR_REPLACE and XATTR_CREATE - * specify that an extended attribute must exist and must not exist - * previous to the call, respectively. - * - * Returns 0, or a negative error number on failure. - */ -int -ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len, - int flags) -{ - struct ext4_xattr_info i = { - .name_index = name_index, - .name = name, - .value = value, - .value_len = value_len, - - }; - struct ext4_xattr_ibody_find is = { - .s = { .not_found = -ENODATA, }, - }; - struct ext4_xattr_block_find bs = { - .s = { .not_found = -ENODATA, }, - }; - unsigned long no_expand; - int error; - - if (!name) - return -EINVAL; - if (strlen(name) > 255) - return -ERANGE; - down_write(&EXT4_I(inode)->xattr_sem); - no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); - ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); - - error = ext4_reserve_inode_write(handle, inode, &is.iloc); - if (error) - goto cleanup; - - if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { - struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); - memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - ext4_clear_inode_state(inode, EXT4_STATE_NEW); - } - - error = ext4_xattr_ibody_find(inode, &i, &is); - if (error) - goto cleanup; - if (is.s.not_found) - error = ext4_xattr_block_find(inode, &i, &bs); - if (error) - goto cleanup; - if (is.s.not_found && bs.s.not_found) { - error = -ENODATA; - if (flags & XATTR_REPLACE) - goto cleanup; - error = 0; - if (!value) - goto cleanup; - } else { - error = -EEXIST; - if (flags & XATTR_CREATE) - goto cleanup; - } - if (!value) { - if (!is.s.not_found) - error = ext4_xattr_ibody_set(handle, inode, &i, &is); - else if (!bs.s.not_found) - error = ext4_xattr_block_set(handle, inode, &i, &bs); - } else { - error = ext4_xattr_ibody_set(handle, inode, &i, &is); - if (!error && !bs.s.not_found) { - i.value = NULL; - error = ext4_xattr_block_set(handle, inode, &i, &bs); - } else if (error == -ENOSPC) { - if (EXT4_I(inode)->i_file_acl && !bs.s.base) { - error = ext4_xattr_block_find(inode, &i, &bs); - if (error) - goto cleanup; - } - error = ext4_xattr_block_set(handle, inode, &i, &bs); - if (error) - goto cleanup; - if (!is.s.not_found) { - i.value = NULL; - error = ext4_xattr_ibody_set(handle, inode, &i, - &is); - } - } - } - if (!error) { - 
ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = ext4_current_time(inode); - if (!value) - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); - error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); - /* - * The bh is consumed by ext4_mark_iloc_dirty, even with - * error != 0. - */ - is.iloc.bh = NULL; - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - } - -cleanup: - brelse(is.iloc.bh); - brelse(bs.bh); - if (no_expand == 0) - ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); - up_write(&EXT4_I(inode)->xattr_sem); - return error; -} - -/* - * ext4_xattr_set() - * - * Like ext4_xattr_set_handle, but start from an inode. This extended - * attribute modification is a filesystem transaction by itself. - * - * Returns 0, or a negative error number on failure. - */ -int -ext4_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len, int flags) -{ - handle_t *handle; - int error, retries = 0; - -retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - } else { - int error2; - - error = ext4_xattr_set_handle(handle, inode, name_index, name, - value, value_len, flags); - error2 = ext4_journal_stop(handle); - if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - if (error == 0) - error = error2; - } - - return error; -} - -/* - * Shift the EA entries in the inode to create space for the increased - * i_extra_isize. - */ -static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, - int value_offs_shift, void *to, - void *from, size_t n, int blocksize) -{ - struct ext4_xattr_entry *last = entry; - int new_offs; - - /* Adjust the value offsets of the entries */ - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - if (!last->e_value_block && last->e_value_size) { - new_offs = le16_to_cpu(last->e_value_offs) + - value_offs_shift; - BUG_ON(new_offs + le32_to_cpu(last->e_value_size) - > blocksize); - last->e_value_offs = cpu_to_le16(new_offs); - } - } - /* Shift the entries by n bytes */ - memmove(to, from, n); -} - -/* - * Expand an inode by new_extra_isize bytes when EAs are present. - * Returns 0 on success or negative error number on failure. - */ -int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, - struct ext4_inode *raw_inode, handle_t *handle) -{ - struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry, *last, *first; - struct buffer_head *bh = NULL; - struct ext4_xattr_ibody_find *is = NULL; - struct ext4_xattr_block_find *bs = NULL; - char *buffer = NULL, *b_entry_name = NULL; - size_t min_offs, free; - int total_ino, total_blk; - void *base, *start, *end; - int extra_isize = 0, error = 0, tried_min_extra_isize = 0; - int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); - - down_write(&EXT4_I(inode)->xattr_sem); -retry: - if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { - up_write(&EXT4_I(inode)->xattr_sem); - return 0; - } - - header = IHDR(inode, raw_inode); - entry = IFIRST(header); - - /* - * Check if enough free space is available in the inode to shift the - * entries ahead by new_extra_isize. 
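[Aside, illustrative values only: per the IHDR() layout, the in-inode EA area starts at EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize, so with 256-byte on-disk inodes, growing i_extra_isize from, say, 28 to 32 bytes shifts every entry forward by 4 bytes and needs at least that much slack between the entry table and the packed values.]

#include <stdio.h>

int main(void)
{
    int inode_size = 256, good_old = 128;  // EXT4_GOOD_OLD_INODE_SIZE
    int old_extra = 28, new_extra = 32;    // hypothetical i_extra_isize growth
    printf("EA area [%d,%d) -> [%d,%d): entries shift by %d bytes\n",
           good_old + old_extra, inode_size,
           good_old + new_extra, inode_size,
           new_extra - old_extra);
    return 0;
}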
- */ - - base = start = entry; - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - min_offs = end - base; - last = entry; - total_ino = sizeof(struct ext4_xattr_ibody_header); - - free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); - if (free >= new_extra_isize) { - entry = IFIRST(header); - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - new_extra_isize, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, - (void *)header, total_ino, - inode->i_sb->s_blocksize); - EXT4_I(inode)->i_extra_isize = new_extra_isize; - error = 0; - goto cleanup; - } - - /* - * Enough free space isn't available in the inode, check if - * EA block can hold new_extra_isize bytes. - */ - if (EXT4_I(inode)->i_file_acl) { - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bh) - goto cleanup; - if (ext4_xattr_check_block(bh)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - error = -EIO; - goto cleanup; - } - base = BHDR(bh); - first = BFIRST(bh); - end = bh->b_data + bh->b_size; - min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, - &total_blk); - if (free < new_extra_isize) { - if (!tried_min_extra_isize && s_min_extra_isize) { - tried_min_extra_isize++; - new_extra_isize = s_min_extra_isize; - brelse(bh); - goto retry; - } - error = -1; - goto cleanup; - } - } else { - free = inode->i_sb->s_blocksize; - } - - while (new_extra_isize > 0) { - size_t offs, size, entry_size; - struct ext4_xattr_entry *small_entry = NULL; - struct ext4_xattr_info i = { - .value = NULL, - .value_len = 0, - }; - unsigned int total_size; /* EA entry size + value size */ - unsigned int shift_bytes; /* No. of bytes to shift EAs by? */ - unsigned int min_total_size = ~0U; - - is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); - bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); - if (!is || !bs) { - error = -ENOMEM; - goto cleanup; - } - - is->s.not_found = -ENODATA; - bs->s.not_found = -ENODATA; - is->iloc.bh = NULL; - bs->bh = NULL; - - last = IFIRST(header); - /* Find the entry best suited to be pushed into EA block */ - entry = NULL; - for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - total_size = - EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + - EXT4_XATTR_LEN(last->e_name_len); - if (total_size <= free && total_size < min_total_size) { - if (total_size < new_extra_isize) { - small_entry = last; - } else { - entry = last; - min_total_size = total_size; - } - } - } - - if (entry == NULL) { - if (small_entry) { - entry = small_entry; - } else { - if (!tried_min_extra_isize && - s_min_extra_isize) { - tried_min_extra_isize++; - new_extra_isize = s_min_extra_isize; - goto retry; - } - error = -1; - goto cleanup; - } - } - offs = le16_to_cpu(entry->e_value_offs); - size = le32_to_cpu(entry->e_value_size); - entry_size = EXT4_XATTR_LEN(entry->e_name_len); - i.name_index = entry->e_name_index, - buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS); - b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); - if (!buffer || !b_entry_name) { - error = -ENOMEM; - goto cleanup; - } - /* Save the entry name and the entry value */ - memcpy(buffer, (void *)IFIRST(header) + offs, - EXT4_XATTR_SIZE(size)); - memcpy(b_entry_name, entry->e_name, entry->e_name_len); - b_entry_name[entry->e_name_len] = '\0'; - i.name = b_entry_name; - - error = ext4_get_inode_loc(inode, &is->iloc); - if (error) - goto cleanup; - - error = ext4_xattr_ibody_find(inode, &i, is); - if (error) - goto 
cleanup; - - /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); - if (error) - goto cleanup; - - entry = IFIRST(header); - if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) - shift_bytes = new_extra_isize; - else - shift_bytes = entry_size + size; - /* Adjust the offsets and shift the remaining entries ahead */ - ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - - shift_bytes, (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, - (void *)header, total_ino - entry_size, - inode->i_sb->s_blocksize); - - extra_isize += shift_bytes; - new_extra_isize -= shift_bytes; - EXT4_I(inode)->i_extra_isize = extra_isize; - - i.name = b_entry_name; - i.value = buffer; - i.value_len = size; - error = ext4_xattr_block_find(inode, &i, bs); - if (error) - goto cleanup; - - /* Add entry which was removed from the inode into the block */ - error = ext4_xattr_block_set(handle, inode, &i, bs); - if (error) - goto cleanup; - kfree(b_entry_name); - kfree(buffer); - b_entry_name = NULL; - buffer = NULL; - brelse(is->iloc.bh); - kfree(is); - kfree(bs); - } - brelse(bh); - up_write(&EXT4_I(inode)->xattr_sem); - return 0; - -cleanup: - kfree(b_entry_name); - kfree(buffer); - if (is) - brelse(is->iloc.bh); - kfree(is); - kfree(bs); - brelse(bh); - up_write(&EXT4_I(inode)->xattr_sem); - return error; -} - - - -/* - * ext4_xattr_delete_inode() - * - * Free extended attribute resources associated with this inode. This - * is called immediately before an inode is freed. We have exclusive - * access to the inode. - */ -void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ - struct buffer_head *bh = NULL; - - if (!EXT4_I(inode)->i_file_acl) - goto cleanup; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); - goto cleanup; - } - if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || - BHDR(bh)->h_blocks != cpu_to_le32(1)) { - EXT4_ERROR_INODE(inode, "bad block %llu", - EXT4_I(inode)->i_file_acl); - goto cleanup; - } - ext4_xattr_release_block(handle, inode, bh); - EXT4_I(inode)->i_file_acl = 0; - -cleanup: - brelse(bh); -} - -/* - * ext4_xattr_put_super() - * - * This is called when a file system is unmounted. - */ -void -ext4_xattr_put_super(struct super_block *sb) -{ - mb_cache_shrink(sb->s_bdev); -} - -/* - * ext4_xattr_cache_insert() - * - * Create a new entry in the extended attribute cache, and insert - * it unless such an entry is already in the cache. - * - * Returns 0, or a negative error number on failure. - */ -static void -ext4_xattr_cache_insert(struct buffer_head *bh) -{ - __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); - struct mb_cache_entry *ce; - int error; - - ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); - if (!ce) { - ea_bdebug(bh, "out of memory"); - return; - } - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); - if (error) { - mb_cache_entry_free(ce); - if (error == -EBUSY) { - ea_bdebug(bh, "already in cache"); - error = 0; - } - } else { - ea_bdebug(bh, "inserting [%x]", (int)hash); - mb_cache_entry_release(ce); - } -} - -/* - * ext4_xattr_cmp() - * - * Compare two extended attribute blocks for equality. - * - * Returns 0 if the blocks are equal, 1 if they differ, and - * a negative error number on errors. 
- */ -static int -ext4_xattr_cmp(struct ext4_xattr_header *header1, - struct ext4_xattr_header *header2) -{ - struct ext4_xattr_entry *entry1, *entry2; - - entry1 = ENTRY(header1+1); - entry2 = ENTRY(header2+1); - while (!IS_LAST_ENTRY(entry1)) { - if (IS_LAST_ENTRY(entry2)) - return 1; - if (entry1->e_hash != entry2->e_hash || - entry1->e_name_index != entry2->e_name_index || - entry1->e_name_len != entry2->e_name_len || - entry1->e_value_size != entry2->e_value_size || - memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) - return 1; - if (entry1->e_value_block != 0 || entry2->e_value_block != 0) - return -EIO; - if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), - (char *)header2 + le16_to_cpu(entry2->e_value_offs), - le32_to_cpu(entry1->e_value_size))) - return 1; - - entry1 = EXT4_XATTR_NEXT(entry1); - entry2 = EXT4_XATTR_NEXT(entry2); - } - if (!IS_LAST_ENTRY(entry2)) - return 1; - return 0; -} - -/* - * ext4_xattr_cache_find() - * - * Find an identical extended attribute block. - * - * Returns a pointer to the block found, or NULL if such a block was - * not found or an error occurred. - */ -static struct buffer_head * -ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, - struct mb_cache_entry **pce) -{ - __u32 hash = le32_to_cpu(header->h_hash); - struct mb_cache_entry *ce; - - if (!header->h_hash) - return NULL; /* never share */ - ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -again: - ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, - hash); - while (ce) { - struct buffer_head *bh; - - if (IS_ERR(ce)) { - if (PTR_ERR(ce) == -EAGAIN) - goto again; - break; - } - bh = sb_bread(inode->i_sb, ce->e_block); - if (!bh) { - EXT4_ERROR_INODE(inode, "block %lu read error", - (unsigned long) ce->e_block); - } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= - EXT4_XATTR_REFCOUNT_MAX) { - ea_idebug(inode, "block %lu refcount %d>=%d", - (unsigned long) ce->e_block, - le32_to_cpu(BHDR(bh)->h_refcount), - EXT4_XATTR_REFCOUNT_MAX); - } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { - *pce = ce; - return bh; - } - brelse(bh); - ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); - } - return NULL; -} - -#define NAME_HASH_SHIFT 5 -#define VALUE_HASH_SHIFT 16 - -/* - * ext4_xattr_hash_entry() - * - * Compute the hash of an extended attribute. - */ -static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, - struct ext4_xattr_entry *entry) -{ - __u32 hash = 0; - char *name = entry->e_name; - int n; - - for (n = 0; n < entry->e_name_len; n++) { - hash = (hash << NAME_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ - *name++; - } - - if (entry->e_value_block == 0 && entry->e_value_size != 0) { - __le32 *value = (__le32 *)((char *)header + - le16_to_cpu(entry->e_value_offs)); - for (n = (le32_to_cpu(entry->e_value_size) + - EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) { - hash = (hash << VALUE_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ - le32_to_cpu(*value++); - } - } - entry->e_hash = cpu_to_le32(hash); -} - -#undef NAME_HASH_SHIFT -#undef VALUE_HASH_SHIFT - -#define BLOCK_HASH_SHIFT 16 - -/* - * ext4_xattr_rehash() - * - * Re-compute the extended attribute hash value after an entry has changed. 
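[Aside: a standalone copy of the rolling name hash computed above (shift-xor with NAME_HASH_SHIFT == 5). For the ASCII names used in practice it matches the kernel loop, which reads through a plain char pointer; bytes are treated as unsigned here.]

#include <stdio.h>

static unsigned int xattr_name_hash(const unsigned char *name, int len)
{
    unsigned int hash = 0;
    int n;

    for (n = 0; n < len; n++)
        hash = (hash << 5) ^
               (hash >> (8 * sizeof(hash) - 5)) ^  // rotate-style mixing
               name[n];
    return hash;
}

int main(void)
{
    printf("%08x\n", xattr_name_hash((const unsigned char *)"selinux", 7));
    return 0;
}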
- */ -static void ext4_xattr_rehash(struct ext4_xattr_header *header, - struct ext4_xattr_entry *entry) -{ - struct ext4_xattr_entry *here; - __u32 hash = 0; - - ext4_xattr_hash_entry(header, entry); - here = ENTRY(header+1); - while (!IS_LAST_ENTRY(here)) { - if (!here->e_hash) { - /* Block is not shared if an entry's hash value == 0 */ - hash = 0; - break; - } - hash = (hash << BLOCK_HASH_SHIFT) ^ - (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ - le32_to_cpu(here->e_hash); - here = EXT4_XATTR_NEXT(here); - } - header->h_hash = cpu_to_le32(hash); -} - -#undef BLOCK_HASH_SHIFT - -int __init -ext4_init_xattr(void) -{ - ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); - if (!ext4_xattr_cache) - return -ENOMEM; - return 0; -} - -void -ext4_exit_xattr(void) -{ - if (ext4_xattr_cache) - mb_cache_destroy(ext4_xattr_cache); - ext4_xattr_cache = NULL; -} diff --git a/ANDROID_3.4.5/fs/ext4/xattr.h b/ANDROID_3.4.5/fs/ext4/xattr.h deleted file mode 100644 index 25b7387f..00000000 --- a/ANDROID_3.4.5/fs/ext4/xattr.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - File: fs/ext4/xattr.h - - On-disk format of extended attributes for the ext4 filesystem. - - (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> -*/ - -#include <linux/xattr.h> - -/* Magic value in attribute blocks */ -#define EXT4_XATTR_MAGIC 0xEA020000 - -/* Maximum number of references to one attribute block */ -#define EXT4_XATTR_REFCOUNT_MAX 1024 - -/* Name indexes */ -#define EXT4_XATTR_INDEX_USER 1 -#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2 -#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3 -#define EXT4_XATTR_INDEX_TRUSTED 4 -#define EXT4_XATTR_INDEX_LUSTRE 5 -#define EXT4_XATTR_INDEX_SECURITY 6 - -struct ext4_xattr_header { - __le32 h_magic; /* magic number for identification */ - __le32 h_refcount; /* reference count */ - __le32 h_blocks; /* number of disk blocks used */ - __le32 h_hash; /* hash value of all attributes */ - __u32 h_reserved[4]; /* zero right now */ -}; - -struct ext4_xattr_ibody_header { - __le32 h_magic; /* magic number for identification */ -}; - -struct ext4_xattr_entry { - __u8 e_name_len; /* length of name */ - __u8 e_name_index; /* attribute name index */ - __le16 e_value_offs; /* offset in disk block of value */ - __le32 e_value_block; /* disk block attribute is stored on (n/i) */ - __le32 e_value_size; /* size of attribute value */ - __le32 e_hash; /* hash value of name and value */ - char e_name[0]; /* attribute name */ -}; - -#define EXT4_XATTR_PAD_BITS 2 -#define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS) -#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1) -#define EXT4_XATTR_LEN(name_len) \ - (((name_len) + EXT4_XATTR_ROUND + \ - sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) -#define EXT4_XATTR_NEXT(entry) \ - ((struct ext4_xattr_entry *)( \ - (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len))) -#define EXT4_XATTR_SIZE(size) \ - (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) - -#define IHDR(inode, raw_inode) \ - ((struct ext4_xattr_ibody_header *) \ - ((void *)raw_inode + \ - EXT4_GOOD_OLD_INODE_SIZE + \ - EXT4_I(inode)->i_extra_isize)) -#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) - -# ifdef CONFIG_EXT4_FS_XATTR - -extern const struct xattr_handler ext4_xattr_user_handler; -extern const struct xattr_handler ext4_xattr_trusted_handler; -extern const struct xattr_handler ext4_xattr_acl_access_handler; -extern const struct xattr_handler ext4_xattr_acl_default_handler; -extern const struct xattr_handler ext4_xattr_security_handler; - -extern ssize_t ext4_listxattr(struct dentry *, char *, 
size_t); - -extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); -extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); -extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); - -extern void ext4_xattr_delete_inode(handle_t *, struct inode *); -extern void ext4_xattr_put_super(struct super_block *); - -extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, - struct ext4_inode *raw_inode, handle_t *handle); - -extern int __init ext4_init_xattr(void); -extern void ext4_exit_xattr(void); - -extern const struct xattr_handler *ext4_xattr_handlers[]; - -# else /* CONFIG_EXT4_FS_XATTR */ - -static inline int -ext4_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ -} - -static inline void -ext4_xattr_put_super(struct super_block *sb) -{ -} - -static __init inline int -ext4_init_xattr(void) -{ - return 0; -} - -static inline void -ext4_exit_xattr(void) -{ -} - -static inline int -ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, - struct ext4_inode *raw_inode, handle_t *handle) -{ - return -EOPNOTSUPP; -} - -#define ext4_xattr_handlers NULL - -# endif /* CONFIG_EXT4_FS_XATTR */ - -#ifdef CONFIG_EXT4_FS_SECURITY -extern int ext4_init_security(handle_t *handle, struct inode *inode, - struct inode *dir, const struct qstr *qstr); -#else -static inline int ext4_init_security(handle_t *handle, struct inode *inode, - struct inode *dir, const struct qstr *qstr) -{ - return 0; -} -#endif diff --git a/ANDROID_3.4.5/fs/ext4/xattr_security.c b/ANDROID_3.4.5/fs/ext4/xattr_security.c deleted file mode 100644 index d2a20062..00000000 --- a/ANDROID_3.4.5/fs/ext4/xattr_security.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * linux/fs/ext4/xattr_security.c - * Handler for storing security labels as extended attributes. 
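[Aside, not from the original file: the list handler just below reports "security." plus the name plus a NUL, and only copies the string when the caller's buffer is large enough, the same report-the-size-regardless convention seen earlier. A toy version:]

#include <stdio.h>
#include <string.h>

static size_t list_security_name(char *list, size_t list_size, const char *name)
{
    static const char prefix[] = "security.";
    const size_t prefix_len = sizeof(prefix) - 1;
    const size_t name_len = strlen(name);
    const size_t total_len = prefix_len + name_len + 1;

    if (list && total_len <= list_size) {
        memcpy(list, prefix, prefix_len);
        memcpy(list + prefix_len, name, name_len);
        list[prefix_len + name_len] = '\0';
    }
    return total_len;  // size needed, whether or not anything was copied
}

int main(void)
{
    char buf[32];
    size_t need = list_security_name(buf, sizeof(buf), "selinux");
    printf("%s (%zu bytes)\n", buf, need);  // security.selinux (17 bytes)
    return 0;
}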
- */ - -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/security.h> -#include <linux/slab.h> -#include "ext4_jbd2.h" -#include "ext4.h" -#include "xattr.h" - -static size_t -ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; - const size_t total_len = prefix_len + name_len + 1; - - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int -ext4_xattr_security_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, - name, buffer, size); -} - -static int -ext4_xattr_security_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, - name, value, size, flags); -} - -static int -ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) -{ - const struct xattr *xattr; - handle_t *handle = fs_info; - int err = 0; - - for (xattr = xattr_array; xattr->name != NULL; xattr++) { - err = ext4_xattr_set_handle(handle, inode, - EXT4_XATTR_INDEX_SECURITY, - xattr->name, xattr->value, - xattr->value_len, 0); - if (err < 0) - break; - } - return err; -} - -int -ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, - const struct qstr *qstr) -{ - return security_inode_init_security(inode, dir, qstr, - &ext4_initxattrs, handle); -} - -const struct xattr_handler ext4_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .list = ext4_xattr_security_list, - .get = ext4_xattr_security_get, - .set = ext4_xattr_security_set, -}; diff --git a/ANDROID_3.4.5/fs/ext4/xattr_trusted.c b/ANDROID_3.4.5/fs/ext4/xattr_trusted.c deleted file mode 100644 index 95f1f4ab..00000000 --- a/ANDROID_3.4.5/fs/ext4/xattr_trusted.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * linux/fs/ext4/xattr_trusted.c - * Handler for trusted extended attributes. 
- * - * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> - */ - -#include <linux/string.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include "ext4_jbd2.h" -#include "ext4.h" -#include "xattr.h" - -static size_t -ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int -ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, - name, buffer, size); -} - -static int -ext4_xattr_trusted_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, - name, value, size, flags); -} - -const struct xattr_handler ext4_xattr_trusted_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .list = ext4_xattr_trusted_list, - .get = ext4_xattr_trusted_get, - .set = ext4_xattr_trusted_set, -}; diff --git a/ANDROID_3.4.5/fs/ext4/xattr_user.c b/ANDROID_3.4.5/fs/ext4/xattr_user.c deleted file mode 100644 index 0edb7611..00000000 --- a/ANDROID_3.4.5/fs/ext4/xattr_user.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * linux/fs/ext4/xattr_user.c - * Handler for extended user attributes. - * - * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> - */ - -#include <linux/string.h> -#include <linux/fs.h> -#include "ext4_jbd2.h" -#include "ext4.h" -#include "xattr.h" - -static size_t -ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!test_opt(dentry->d_sb, XATTR_USER)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - -static int -ext4_xattr_user_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, XATTR_USER)) - return -EOPNOTSUPP; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER, - name, buffer, size); -} - -static int -ext4_xattr_user_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - if (strcmp(name, "") == 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, XATTR_USER)) - return -EOPNOTSUPP; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER, - name, value, size, flags); -} - -const struct xattr_handler ext4_xattr_user_handler = { - .prefix = XATTR_USER_PREFIX, - .list = ext4_xattr_user_list, - .get = ext4_xattr_user_get, - .set = ext4_xattr_user_set, -};
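[Aside: a userspace round trip over the handlers collected in this section. The "user." prefix is routed to ext4_xattr_user_handler above; "trusted." and "security." names dispatch the same way by prefix. The file path and attribute name are invented; this assumes an ext4 mount with user_xattr enabled.]

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
    const char *path = "/tmp/example";  // hypothetical file on ext4
    char buf[16];
    ssize_t n;

    if (setxattr(path, "user.comment", "hello", 5, 0) != 0) {
        perror("setxattr");
        return 1;
    }
    n = getxattr(path, "user.comment", buf, sizeof(buf));
    if (n < 0) { perror("getxattr"); return 1; }
    printf("read back %zd bytes: %.*s\n", n, (int)n, buf);
    return 0;
}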