diff options
Diffstat (limited to 'fs/ext4/move_extent.c')
-rw-r--r-- | fs/ext4/move_extent.c | 1423 |
1 files changed, 1423 insertions, 0 deletions
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c new file mode 100644 index 00000000..c5826c62 --- /dev/null +++ b/fs/ext4/move_extent.c @@ -0,0 +1,1423 @@ +/* + * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. + * Written by Takashi Sato <t-sato@yk.jp.nec.com> + * Akira Fujita <a-fujita@rs.jp.nec.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/quotaops.h> +#include <linux/slab.h> +#include "ext4_jbd2.h" +#include "ext4.h" + +/** + * get_ext_path - Find an extent path for designated logical block number. + * + * @inode: an inode which is searched + * @lblock: logical block number to find an extent path + * @path: pointer to an extent path pointer (for output) + * + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value + * on failure. + */ +static inline int +get_ext_path(struct inode *inode, ext4_lblk_t lblock, + struct ext4_ext_path **path) +{ + int ret = 0; + + *path = ext4_ext_find_extent(inode, lblock, *path); + if (IS_ERR(*path)) { + ret = PTR_ERR(*path); + *path = NULL; + } else if ((*path)[ext_depth(inode)].p_ext == NULL) + ret = -ENODATA; + + return ret; +} + +/** + * copy_extent_status - Copy the extent's initialization status + * + * @src: an extent for getting initialize status + * @dest: an extent to be set the status + */ +static void +copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) +{ + if (ext4_ext_is_uninitialized(src)) + ext4_ext_mark_uninitialized(dest); + else + dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); +} + +/** + * mext_next_extent - Search for the next extent and set it to "extent" + * + * @inode: inode which is searched + * @path: this will obtain data for the next extent + * @extent: pointer to the next extent we have just gotten + * + * Search the next extent in the array of ext4_ext_path structure (@path) + * and set it to ext4_extent structure (@extent). In addition, the member of + * @path (->p_ext) also points the next extent. Return 0 on success, 1 if + * ext4_ext_path structure refers to the last extent, or a negative error + * value on failure. + */ +static int +mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent) +{ + struct ext4_extent_header *eh; + int ppos, leaf_ppos = path->p_depth; + + ppos = leaf_ppos; + if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { + /* leaf block */ + *extent = ++path[ppos].p_ext; + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); + return 0; + } + + while (--ppos >= 0) { + if (EXT_LAST_INDEX(path[ppos].p_hdr) > + path[ppos].p_idx) { + int cur_ppos = ppos; + + /* index block */ + path[ppos].p_idx++; + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); + if (path[ppos+1].p_bh) + brelse(path[ppos+1].p_bh); + path[ppos+1].p_bh = + sb_bread(inode->i_sb, path[ppos].p_block); + if (!path[ppos+1].p_bh) + return -EIO; + path[ppos+1].p_hdr = + ext_block_hdr(path[ppos+1].p_bh); + + /* Halfway index block */ + while (++cur_ppos < leaf_ppos) { + path[cur_ppos].p_idx = + EXT_FIRST_INDEX(path[cur_ppos].p_hdr); + path[cur_ppos].p_block = + ext4_idx_pblock(path[cur_ppos].p_idx); + if (path[cur_ppos+1].p_bh) + brelse(path[cur_ppos+1].p_bh); + path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, + path[cur_ppos].p_block); + if (!path[cur_ppos+1].p_bh) + return -EIO; + path[cur_ppos+1].p_hdr = + ext_block_hdr(path[cur_ppos+1].p_bh); + } + + path[leaf_ppos].p_ext = *extent = NULL; + + eh = path[leaf_ppos].p_hdr; + if (le16_to_cpu(eh->eh_entries) == 0) + /* empty leaf is found */ + return -ENODATA; + + /* leaf block */ + path[leaf_ppos].p_ext = *extent = + EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + path[leaf_ppos].p_block = + ext4_ext_pblock(path[leaf_ppos].p_ext); + return 0; + } + } + /* We found the last extent */ + return 1; +} + +/** + * mext_check_null_inode - NULL check for two inodes + * + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + */ +static int +mext_check_null_inode(struct inode *inode1, struct inode *inode2, + const char *function, unsigned int line) +{ + int ret = 0; + + if (inode1 == NULL) { + __ext4_error(inode2->i_sb, function, line, + "Both inodes should not be NULL: " + "inode1 NULL inode2 %lu", inode2->i_ino); + ret = -EIO; + } else if (inode2 == NULL) { + __ext4_error(inode1->i_sb, function, line, + "Both inodes should not be NULL: " + "inode1 %lu inode2 NULL", inode1->i_ino); + ret = -EIO; + } + return ret; +} + +/** + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure + * @donor_inode: donor inode structure + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by + * i_ino order. + */ +static void +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +{ + struct inode *first = orig_inode, *second = donor_inode; + + /* + * Use the inode number to provide the stable locking order instead + * of its address, because the C language doesn't guarantee you can + * compare pointers that don't come from the same array. + */ + if (donor_inode->i_ino < orig_inode->i_ino) { + first = donor_inode; + second = orig_inode; + } + + down_write(&EXT4_I(first)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); +} + +/** + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release write lock of i_data_sem of two inodes (orig and donor). + */ +static void +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) +{ + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_insert_across_blocks - Insert extents across leaf block + * + * @handle: journal handle + * @orig_inode: original inode + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Allocate a new leaf block and insert extents into it. Return 0 on success, + * or a negative error value on failure. + */ +static int +mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_ext_path *orig_path = NULL; + ext4_lblk_t eblock = 0; + int new_flag = 0; + int end_flag = 0; + int err = 0; + + if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { + if (o_start == o_end) { + + /* start_ext new_ext end_ext + * donor |---------|-----------|--------| + * orig |------------------------------| + */ + end_flag = 1; + } else { + + /* start_ext new_ext end_ext + * donor |---------|----------|---------| + * orig |---------------|--------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); + } + + o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (start_ext->ee_len && new_ext->ee_len && + !end_ext->ee_len && o_start == o_end) { + + /* start_ext new_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (!start_ext->ee_len && new_ext->ee_len && + end_ext->ee_len && o_start == o_end) { + + /* new_ext end_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); + + /* + * Set 0 to the extent block if new_ext was + * the first block. + */ + if (new_ext->ee_block) + eblock = le32_to_cpu(new_ext->ee_block); + + new_flag = 1; + } else { + ext4_debug("ext4 move extent: Unexpected insert case\n"); + return -EIO; + } + + if (new_flag) { + err = get_ext_path(orig_inode, eblock, &orig_path); + if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, new_ext, 0)) + goto out; + } + + if (end_flag) { + err = get_ext_path(orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); + if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, end_ext, 0)) + goto out; + } +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + + return err; + +} + +/** + * mext_insert_inside_block - Insert new extent to the extent block + * + * @o_start: first original extent to be moved + * @o_end: last original extent to be moved + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * @eh: extent header of target leaf block + * @range_to_move: used to decide how to insert extent + * + * Insert extents into the leaf block. The extent (@o_start) is overwritten + * by inserted extents. + */ +static void +mext_insert_inside_block(struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext, + struct ext4_extent_header *eh, + int range_to_move) +{ + int i = 0; + unsigned long len; + + /* Move the existing extents */ + if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { + len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - + (unsigned long)(o_end + 1); + memmove(o_end + 1 + range_to_move, o_end + 1, len); + } + + /* Insert start entry */ + if (start_ext->ee_len) + o_start[i++].ee_len = start_ext->ee_len; + + /* Insert new entry */ + if (new_ext->ee_len) { + o_start[i] = *new_ext; + ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); + } + + /* Insert end entry */ + if (end_ext->ee_len) + o_start[i] = *end_ext; + + /* Increment the total entries counter on the extent block */ + le16_add_cpu(&eh->eh_entries, range_to_move); +} + +/** + * mext_insert_extents - Insert new extent + * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Call the function to insert extents. If we cannot add more extents into + * the leaf block, we call mext_insert_across_blocks() to create a + * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 + * on success, or a negative error value on failure. + */ +static int +mext_insert_extents(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, + struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_extent_header *eh; + unsigned long need_slots, slots_range; + int range_to_move, depth, ret; + + /* + * The extents need to be inserted + * start_extent + new_extent + end_extent. + */ + need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + + (new_ext->ee_len ? 1 : 0); + + /* The number of slots between start and end */ + slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) + / sizeof(struct ext4_extent); + + /* Range to move the end of extent */ + range_to_move = need_slots - slots_range; + depth = orig_path->p_depth; + orig_path += depth; + eh = orig_path->p_hdr; + + if (depth) { + /* Register to journal */ + ret = ext4_journal_get_write_access(handle, orig_path->p_bh); + if (ret) + return ret; + } + + /* Expansion */ + if (range_to_move > 0 && + (range_to_move > le16_to_cpu(eh->eh_max) + - le16_to_cpu(eh->eh_entries))) { + + ret = mext_insert_across_blocks(handle, orig_inode, o_start, + o_end, start_ext, new_ext, end_ext); + if (ret < 0) + return ret; + } else + mext_insert_inside_block(o_start, o_end, start_ext, new_ext, + end_ext, eh, range_to_move); + + if (depth) { + ret = ext4_handle_dirty_metadata(handle, orig_inode, + orig_path->p_bh); + if (ret) + return ret; + } else { + ret = ext4_mark_inode_dirty(handle, orig_inode); + if (ret < 0) + return ret; + } + + return 0; +} + +/** + * mext_leaf_block - Move one leaf extent block into the inode. + * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @dext: donor extent + * @from: start offset on the target file + * + * In order to insert extents into the leaf block, we must divide the extent + * in the leaf block into three extents. The one is located to be inserted + * extents, and the others are located around it. + * + * Therefore, this function creates structures to save extents of the leaf + * block, and inserts extents by calling mext_insert_extents() with + * created extents. Return 0 on success, or a negative error value on failure. + */ +static int +mext_leaf_block(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, struct ext4_extent *dext, + ext4_lblk_t *from) +{ + struct ext4_extent *oext, *o_start, *o_end, *prev_ext; + struct ext4_extent new_ext, start_ext, end_ext; + ext4_lblk_t new_ext_end; + int oext_alen, new_ext_alen, end_ext_alen; + int depth = ext_depth(orig_inode); + int ret; + + start_ext.ee_block = end_ext.ee_block = 0; + o_start = o_end = oext = orig_path[depth].p_ext; + oext_alen = ext4_ext_get_actual_len(oext); + start_ext.ee_len = end_ext.ee_len = 0; + + new_ext.ee_block = cpu_to_le32(*from); + ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); + new_ext.ee_len = dext->ee_len; + new_ext_alen = ext4_ext_get_actual_len(&new_ext); + new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; + + /* + * Case: original extent is first + * oext |--------| + * new_ext |--| + * start_ext |--| + */ + if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && + le32_to_cpu(new_ext.ee_block) < + le32_to_cpu(oext->ee_block) + oext_alen) { + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - + le32_to_cpu(oext->ee_block)); + start_ext.ee_block = oext->ee_block; + copy_extent_status(oext, &start_ext); + } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { + prev_ext = oext - 1; + /* + * We can merge new_ext into previous extent, + * if these are contiguous and same extent type. + */ + if (ext4_can_extents_be_merged(orig_inode, prev_ext, + &new_ext)) { + o_start = prev_ext; + start_ext.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(prev_ext) + + new_ext_alen); + start_ext.ee_block = oext->ee_block; + copy_extent_status(prev_ext, &start_ext); + new_ext.ee_len = 0; + } + } + + /* + * Case: new_ext_end must be less than oext + * oext |-----------| + * new_ext |-------| + */ + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { + EXT4_ERROR_INODE(orig_inode, + "new_ext_end(%u) should be less than or equal to " + "oext->ee_block(%u) + oext_alen(%d) - 1", + new_ext_end, le32_to_cpu(oext->ee_block), + oext_alen); + ret = -EIO; + goto out; + } + + /* + * Case: new_ext is smaller than original extent + * oext |---------------| + * new_ext |-----------| + * end_ext |---| + */ + if (le32_to_cpu(oext->ee_block) <= new_ext_end && + new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { + end_ext.ee_len = + cpu_to_le16(le32_to_cpu(oext->ee_block) + + oext_alen - 1 - new_ext_end); + copy_extent_status(oext, &end_ext); + end_ext_alen = ext4_ext_get_actual_len(&end_ext); + ext4_ext_store_pblock(&end_ext, + (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); + end_ext.ee_block = + cpu_to_le32(le32_to_cpu(o_end->ee_block) + + oext_alen - end_ext_alen); + } + + ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, + o_end, &start_ext, &new_ext, &end_ext); +out: + return ret; +} + +/** + * mext_calc_swap_extents - Calculate extents for extent swapping. + * + * @tmp_dext: the extent that will belong to the original inode + * @tmp_oext: the extent that will belong to the donor inode + * @orig_off: block offset of original inode + * @donor_off: block offset of donor inode + * @max_count: the maximum length of extents + * + * Return 0 on success, or a negative error value on failure. + */ +static int +mext_calc_swap_extents(struct ext4_extent *tmp_dext, + struct ext4_extent *tmp_oext, + ext4_lblk_t orig_off, ext4_lblk_t donor_off, + ext4_lblk_t max_count) +{ + ext4_lblk_t diff, orig_diff; + struct ext4_extent dext_old, oext_old; + + BUG_ON(orig_off != donor_off); + + /* original and donor extents have to cover the same block offset */ + if (orig_off < le32_to_cpu(tmp_oext->ee_block) || + le32_to_cpu(tmp_oext->ee_block) + + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) + return -ENODATA; + + if (orig_off < le32_to_cpu(tmp_dext->ee_block) || + le32_to_cpu(tmp_dext->ee_block) + + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) + return -ENODATA; + + dext_old = *tmp_dext; + oext_old = *tmp_oext; + + /* When tmp_dext is too large, pick up the target range. */ + diff = donor_off - le32_to_cpu(tmp_dext->ee_block); + + ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); + tmp_dext->ee_block = + cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); + + if (max_count < ext4_ext_get_actual_len(tmp_dext)) + tmp_dext->ee_len = cpu_to_le16(max_count); + + orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); + ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); + + /* Adjust extent length if donor extent is larger than orig */ + if (ext4_ext_get_actual_len(tmp_dext) > + ext4_ext_get_actual_len(tmp_oext) - orig_diff) + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - + orig_diff); + + tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); + + copy_extent_status(&oext_old, tmp_dext); + copy_extent_status(&dext_old, tmp_oext); + + return 0; +} + +/** + * mext_replace_branches - Replace original extents with new extents + * + * @handle: journal handle + * @orig_inode: original inode + * @donor_inode: donor inode + * @from: block offset of orig_inode + * @count: block count to be replaced + * @err: pointer to save return value + * + * Replace original inode extents and donor inode extents page by page. + * We implement this replacement in the following three steps: + * 1. Save the block information of original and donor inodes into + * dummy extents. + * 2. Change the block information of original inode to point at the + * donor inode blocks. + * 3. Change the block information of donor inode to point at the saved + * original inode blocks in the dummy extents. + * + * Return replaced block count. + */ +static int +mext_replace_branches(handle_t *handle, struct inode *orig_inode, + struct inode *donor_inode, ext4_lblk_t from, + ext4_lblk_t count, int *err) +{ + struct ext4_ext_path *orig_path = NULL; + struct ext4_ext_path *donor_path = NULL; + struct ext4_extent *oext, *dext; + struct ext4_extent tmp_dext, tmp_oext; + ext4_lblk_t orig_off = from, donor_off = from; + int depth; + int replaced_count = 0; + int dext_alen; + + /* Protect extent trees against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); + + /* Get the original extent for the block "orig_off" */ + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) + goto out; + + /* Get the donor extent for the head */ + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + tmp_oext = *oext; + + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count); + if (*err) + goto out; + + /* Loop for the donor extents */ + while (1) { + /* The extent for donor must be found. */ + if (!dext) { + EXT4_ERROR_INODE(donor_inode, + "The extent for donor must be found"); + *err = -EIO; + goto out; + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { + EXT4_ERROR_INODE(donor_inode, + "Donor offset(%u) and the first block of donor " + "extent(%u) should be equal", + donor_off, + le32_to_cpu(tmp_dext.ee_block)); + *err = -EIO; + goto out; + } + + /* Set donor extent to orig extent */ + *err = mext_leaf_block(handle, orig_inode, + orig_path, &tmp_dext, &orig_off); + if (*err) + goto out; + + /* Set orig extent to donor extent */ + *err = mext_leaf_block(handle, donor_inode, + donor_path, &tmp_oext, &donor_off); + if (*err) + goto out; + + dext_alen = ext4_ext_get_actual_len(&tmp_dext); + replaced_count += dext_alen; + donor_off += dext_alen; + orig_off += dext_alen; + + /* Already moved the expected blocks */ + if (replaced_count >= count) + break; + + if (orig_path) + ext4_ext_drop_refs(orig_path); + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + tmp_oext = *oext; + + if (donor_path) + ext4_ext_drop_refs(donor_path); + *err = get_ext_path(donor_inode, donor_off, &donor_path); + if (*err) + goto out; + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count - replaced_count); + if (*err) + goto out; + } + +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (donor_path) { + ext4_ext_drop_refs(donor_path); + kfree(donor_path); + } + + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + double_up_write_data_sem(orig_inode, donor_inode); + + return replaced_count; +} + +/** + * move_extent_per_page - Move extent data per page + * + * @o_filp: file structure of original file + * @donor_inode: donor inode + * @orig_page_offset: page index on original file + * @data_offset_in_page: block index where data swapping starts + * @block_len_in_page: the number of blocks to be swapped + * @uninit: orig extent is uninitialized or not + * @err: pointer to save return value + * + * Save the data in original inode blocks and replace original inode extents + * with donor inode extents by calling mext_replace_branches(). + * Finally, write out the saved data in new original inode blocks. Return + * replaced block count. + */ +static int +move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + pgoff_t orig_page_offset, int data_offset_in_page, + int block_len_in_page, int uninit, int *err) +{ + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct address_space *mapping = orig_inode->i_mapping; + struct buffer_head *bh; + struct page *page = NULL; + const struct address_space_operations *a_ops = mapping->a_ops; + handle_t *handle; + ext4_lblk_t orig_blk_offset; + long long offs = orig_page_offset << PAGE_CACHE_SHIFT; + unsigned long blocksize = orig_inode->i_sb->s_blocksize; + unsigned int w_flags = 0; + unsigned int tmp_data_size, data_size, replaced_size; + void *fsdata; + int i, jblocks; + int err2 = 0; + int replaced_count = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + + /* + * It needs twice the amount of ordinary journal buffers because + * inode and donor_inode may change each different metadata blocks. + */ + jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; + handle = ext4_journal_start(orig_inode, jblocks); + if (IS_ERR(handle)) { + *err = PTR_ERR(handle); + return 0; + } + + if (segment_eq(get_fs(), KERNEL_DS)) + w_flags |= AOP_FLAG_UNINTERRUPTIBLE; + + orig_blk_offset = orig_page_offset * blocks_per_page + + data_offset_in_page; + + /* + * If orig extent is uninitialized one, + * it's not necessary force the page into memory + * and then force it to be written out again. + * Just swap data blocks between orig and donor. + */ + if (uninit) { + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); + goto out2; + } + + offs = (long long)orig_blk_offset << orig_inode->i_blkbits; + + /* Calculate data_size */ + if ((orig_blk_offset + block_len_in_page - 1) == + ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { + /* Replace the last block */ + tmp_data_size = orig_inode->i_size & (blocksize - 1); + /* + * If data_size equal zero, it shows data_size is multiples of + * blocksize. So we set appropriate value. + */ + if (tmp_data_size == 0) + tmp_data_size = blocksize; + + data_size = tmp_data_size + + ((block_len_in_page - 1) << orig_inode->i_blkbits); + } else + data_size = block_len_in_page << orig_inode->i_blkbits; + + replaced_size = data_size; + + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, + &page, &fsdata); + if (unlikely(*err < 0)) + goto out; + + if (!PageUptodate(page)) { + mapping->a_ops->readpage(o_filp, page); + lock_page(page); + } + + /* + * try_to_release_page() doesn't call releasepage in writeback mode. + * We should care about the order of writing to the same file + * by multiple move extent processes. + * It needs to call wait_on_page_writeback() to wait for the + * writeback of the page. + */ + wait_on_page_writeback(page); + + /* Release old bh and drop refs */ + try_to_release_page(page, 0); + + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page, + &err2); + if (err2) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto out; + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); + + bh = page_buffers(page); + for (i = 0; i < data_offset_in_page; i++) + bh = bh->b_this_page; + + for (i = 0; i < block_len_in_page; i++) { + *err = ext4_get_block(orig_inode, + (sector_t)(orig_blk_offset + i), bh, 0); + if (*err < 0) + goto out; + + if (bh->b_this_page != NULL) + bh = bh->b_this_page; + } + + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, + page, fsdata); + page = NULL; + +out: + if (unlikely(page)) { + if (PageLocked(page)) + unlock_page(page); + page_cache_release(page); + ext4_journal_stop(handle); + } +out2: + ext4_journal_stop(handle); + + if (err2) + *err = err2; + + return replaced_count; +} + +/** + * mext_check_arguments - Check whether move extent can be done + * + * @orig_inode: original inode + * @donor_inode: donor inode + * @orig_start: logical start offset in block for orig + * @donor_start: logical start offset in block for donor + * @len: the number of blocks to be moved + * + * Check the arguments of ext4_move_extents() whether the files can be + * exchanged with each other. + * Return 0 on success, or a negative error value on failure. + */ +static int +mext_check_arguments(struct inode *orig_inode, + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len) +{ + ext4_lblk_t orig_blocks, donor_blocks; + unsigned int blkbits = orig_inode->i_blkbits; + unsigned int blocksize = 1 << blkbits; + + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) + return -EPERM; + + /* Ext4 move extent does not support swapfile */ + if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " + "not be swapfile [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Files should be in the same ext4 FS */ + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files " + "should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Ext4 move extent supports only extent based file */ + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: orig file is not extents " + "based file [ino:orig %lu]\n", orig_inode->i_ino); + return -EOPNOTSUPP; + } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: donor file is not extents " + "based file [ino:donor %lu]\n", donor_inode->i_ino); + return -EOPNOTSUPP; + } + + if ((!orig_inode->i_size) || (!donor_inode->i_size)) { + ext4_debug("ext4 move extent: File size is 0 byte\n"); + return -EINVAL; + } + + /* Start offset should be same */ + if (orig_start != donor_start) { + ext4_debug("ext4 move extent: orig and donor's start " + "offset are not same [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || + (*len > EXT_MAX_BLOCKS) || + (orig_start + *len >= EXT_MAX_BLOCKS)) { + ext4_debug("ext4 move extent: Can't handle over [%u] blocks " + "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_inode->i_size > donor_inode->i_size) { + donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; + /* TODO: eliminate this artificial restriction */ + if (orig_start >= donor_blocks) { + ext4_debug("ext4 move extent: orig start offset " + "[%llu] should be less than donor file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, donor_blocks, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* TODO: eliminate this artificial restriction */ + if (orig_start + *len > donor_blocks) { + ext4_debug("ext4 move extent: End offset [%llu] should " + "be less than donor file blocks [%u]." + "So adjust length from %llu to %llu " + "[ino:orig %lu, donor %lu]\n", + orig_start + *len, donor_blocks, + *len, donor_blocks - orig_start, + orig_inode->i_ino, donor_inode->i_ino); + *len = donor_blocks - orig_start; + } + } else { + orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; + if (orig_start >= orig_blocks) { + ext4_debug("ext4 move extent: start offset [%llu] " + "should be less than original file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, orig_blocks, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_start + *len > orig_blocks) { + ext4_debug("ext4 move extent: Adjust length " + "from %llu to %llu. Because it should be " + "less than original file blocks " + "[ino:orig %lu, donor %lu]\n", + *len, orig_blocks - orig_start, + orig_inode->i_ino, donor_inode->i_ino); + *len = orig_blocks - orig_start; + } + } + + if (!*len) { + ext4_debug("ext4 move extent: len should not be 0 " + "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, + donor_inode->i_ino); + return -EINVAL; + } + + return 0; +} + +/** + * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 + * + * @inode1: the inode structure + * @inode2: the inode structure + * + * Lock two inodes' i_mutex by i_ino order. + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + */ +static int +mext_inode_double_lock(struct inode *inode1, struct inode *inode2) +{ + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); + if (ret < 0) + goto out; + + if (inode1 == inode2) { + mutex_lock(&inode1->i_mutex); + goto out; + } + + if (inode1->i_ino < inode2->i_ino) { + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); + } + +out: + return ret; +} + +/** + * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 + * + * @inode1: the inode that is released first + * @inode2: the inode that is released second + * + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + */ + +static int +mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) +{ + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); + if (ret < 0) + goto out; + + if (inode1) + mutex_unlock(&inode1->i_mutex); + + if (inode2 && inode2 != inode1) + mutex_unlock(&inode2->i_mutex); + +out: + return ret; +} + +/** + * ext4_move_extents - Exchange the specified range of a file + * + * @o_filp: file structure of the original file + * @d_filp: file structure of the donor file + * @orig_start: start offset in block for orig + * @donor_start: start offset in block for donor + * @len: the number of blocks to be moved + * @moved_len: moved block length + * + * This function returns 0 and moved block length is set in moved_len + * if succeed, otherwise returns error value. + * + * Note: ext4_move_extents() proceeds the following order. + * 1:ext4_move_extents() calculates the last block number of moving extent + * function by the start block number (orig_start) and the number of blocks + * to be moved (len) specified as arguments. + * If the {orig, donor}_start points a hole, the extent's start offset + * pointed by ext_cur (current extent), holecheck_path, orig_path are set + * after hole behind. + * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent + * or the ext_cur exceeds the block_end which is last logical block number. + * 3:To get the length of continues area, call mext_next_extent() + * specified with the ext_cur (initial value is holecheck_path) re-cursive, + * until find un-continuous extent, the start logical block number exceeds + * the block_end or the extent points to the last extent. + * 4:Exchange the original inode data with donor inode data + * from orig_page_offset to seq_end_page. + * The start indexes of data are specified as arguments. + * That of the original inode is orig_page_offset, + * and the donor inode is also orig_page_offset + * (To easily handle blocksize != pagesize case, the offset for the + * donor inode is block unit). + * 5:Update holecheck_path and orig_path to points a next proceeding extent, + * then returns to step 2. + * 6:Release holecheck_path, orig_path and set the len to moved_len + * which shows the number of moved blocks. + * The moved_len is useful for the command to calculate the file offset + * for starting next move extent ioctl. + * 7:Return 0 on success, or a negative error value on failure. + */ +int +ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 orig_start, __u64 donor_start, __u64 len, + __u64 *moved_len) +{ + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct inode *donor_inode = d_filp->f_dentry->d_inode; + struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; + struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; + ext4_lblk_t block_start = orig_start; + ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; + ext4_lblk_t rest_blocks; + pgoff_t orig_page_offset = 0, seq_end_page; + int ret1, ret2, depth, last_extent = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + int data_offset_in_page; + int block_len_in_page; + int uninit; + + /* orig and donor should be different file */ + if (orig_inode->i_ino == donor_inode->i_ino) { + ext4_debug("ext4 move extent: The argument files should not " + "be same file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " + "regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Protect orig and donor inodes against a truncate */ + ret1 = mext_inode_double_lock(orig_inode, donor_inode); + if (ret1 < 0) + return ret1; + + /* Protect extent tree against block allocations via delalloc */ + double_down_write_data_sem(orig_inode, donor_inode); + /* Check the filesystem environment whether move_extent can be done */ + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, + donor_start, &len); + if (ret1) + goto out; + + file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; + block_end = block_start + len - 1; + if (file_end < block_end) + len -= block_end - file_end; + + ret1 = get_ext_path(orig_inode, block_start, &orig_path); + if (ret1) + goto out; + + /* Get path structure to check the hole */ + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); + if (ret1) + goto out; + + depth = ext_depth(orig_inode); + ext_cur = holecheck_path[depth].p_ext; + + /* + * Get proper starting location of block replacement if block_start was + * within the hole. + */ + if (le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { + /* + * The hole exists between extents or the tail of + * original file. + */ + last_extent = mext_next_extent(orig_inode, + holecheck_path, &ext_cur); + if (last_extent < 0) { + ret1 = last_extent; + goto out; + } + last_extent = mext_next_extent(orig_inode, orig_path, + &ext_dummy); + if (last_extent < 0) { + ret1 = last_extent; + goto out; + } + seq_start = le32_to_cpu(ext_cur->ee_block); + } else if (le32_to_cpu(ext_cur->ee_block) > block_start) + /* The hole exists at the beginning of original file. */ + seq_start = le32_to_cpu(ext_cur->ee_block); + else + seq_start = block_start; + + /* No blocks within the specified range. */ + if (le32_to_cpu(ext_cur->ee_block) > block_end) { + ext4_debug("ext4 move extent: The specified range of file " + "may be the hole\n"); + ret1 = -EINVAL; + goto out; + } + + /* Adjust start blocks */ + add_blocks = min(le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur), block_end + 1) - + max(le32_to_cpu(ext_cur->ee_block), block_start); + + while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { + seq_blocks += add_blocks; + + /* Adjust tail blocks */ + if (seq_start + seq_blocks - 1 > block_end) + seq_blocks = block_end - seq_start + 1; + + ext_prev = ext_cur; + last_extent = mext_next_extent(orig_inode, holecheck_path, + &ext_cur); + if (last_extent < 0) { + ret1 = last_extent; + break; + } + add_blocks = ext4_ext_get_actual_len(ext_cur); + + /* + * Extend the length of contiguous block (seq_blocks) + * if extents are contiguous. + */ + if (ext4_can_extents_be_merged(orig_inode, + ext_prev, ext_cur) && + block_end >= le32_to_cpu(ext_cur->ee_block) && + !last_extent) + continue; + + /* Is original extent is uninitialized */ + uninit = ext4_ext_is_uninitialized(ext_prev); + + data_offset_in_page = seq_start % blocks_per_page; + + /* + * Calculate data blocks count that should be swapped + * at the first page. + */ + if (data_offset_in_page + seq_blocks > blocks_per_page) { + /* Swapped blocks are across pages */ + block_len_in_page = + blocks_per_page - data_offset_in_page; + } else { + /* Swapped blocks are in a page */ + block_len_in_page = seq_blocks; + } + + orig_page_offset = seq_start >> + (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); + seq_end_page = (seq_start + seq_blocks - 1) >> + (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); + seq_start = le32_to_cpu(ext_cur->ee_block); + rest_blocks = seq_blocks; + + /* + * Up semaphore to avoid following problems: + * a. transaction deadlock among ext4_journal_start, + * ->write_begin via pagefault, and jbd2_journal_commit + * b. racing with ->readpage, ->write_begin, and ext4_get_block + * in move_extent_per_page + */ + double_up_write_data_sem(orig_inode, donor_inode); + + while (orig_page_offset <= seq_end_page) { + + /* Swap original branches with new branches */ + block_len_in_page = move_extent_per_page( + o_filp, donor_inode, + orig_page_offset, + data_offset_in_page, + block_len_in_page, uninit, + &ret1); + + /* Count how many blocks we have exchanged */ + *moved_len += block_len_in_page; + if (ret1 < 0) + break; + if (*moved_len > len) { + EXT4_ERROR_INODE(orig_inode, + "We replaced blocks too much! " + "sum of replaced: %llu requested: %llu", + *moved_len, len); + ret1 = -EIO; + break; + } + + orig_page_offset++; + data_offset_in_page = 0; + rest_blocks -= block_len_in_page; + if (rest_blocks > blocks_per_page) + block_len_in_page = blocks_per_page; + else + block_len_in_page = rest_blocks; + } + + double_down_write_data_sem(orig_inode, donor_inode); + if (ret1 < 0) + break; + + /* Decrease buffer counter */ + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (ret1) + break; + depth = holecheck_path->p_depth; + + /* Decrease buffer counter */ + if (orig_path) + ext4_ext_drop_refs(orig_path); + ret1 = get_ext_path(orig_inode, seq_start, &orig_path); + if (ret1) + break; + + ext_cur = holecheck_path[depth].p_ext; + add_blocks = ext4_ext_get_actual_len(ext_cur); + seq_blocks = 0; + + } +out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (holecheck_path) { + ext4_ext_drop_refs(holecheck_path); + kfree(holecheck_path); + } + double_up_write_data_sem(orig_inode, donor_inode); + ret2 = mext_inode_double_unlock(orig_inode, donor_inode); + + if (ret1) + return ret1; + else if (ret2) + return ret2; + + return 0; +} |