* [RFC][PATCH 3/9]ext4: Exchange the extents between two inodes
@ 2008-10-24 10:09 Akira Fujita
0 siblings, 0 replies; only message in thread
From: Akira Fujita @ 2008-10-24 10:09 UTC (permalink / raw)
To: linux-ext4, Theodore Tso, Mingming Cao; +Cc: linux-fsdevel
ext4: online defrag -- Exchange the extents between two inodes
From: Akira Fujita <a-fujita@rs.jp.nec.com>
For each page, exchange the extents between the temporary inode
and the original inode, and then write them.
Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
---
fs/ext4/defrag.c | 477 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 476 insertions(+), 1 deletions(-)
diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
index 729f001..0b90d4d 100644
--- a/fs/ext4/defrag.c
+++ b/fs/ext4/defrag.c
@@ -91,6 +91,361 @@ err:
}
/**
+ * ext4_defrag_merge_extents - Merge new extent
+ *
+ * @handle: journal handle
+ * @org_inode: original inode
+ * @org_path: path indicating the first extent to be defragmented
+ * @o_start: first original extent to be defragmented
+ * @o_end: last original extent to be defragmented
+ * @start_ext: first new extent to be merged
+ * @new_ext: middle of new extent to be merged
+ * @end_ext: last new extent to be merged
+ * @replaced: the number of blocks which will be replaced with new_ext
+ *
+ * Intended to return 0 on success and a negative error value on failure.
+ * In this patch the function is only a stub: it ignores every argument,
+ * merges nothing and always returns 0.
+ */
+static int
+ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode,
+ struct ext4_ext_path *org_path,
+ struct ext4_extent *o_start, struct ext4_extent *o_end,
+ struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+ struct ext4_extent *end_ext, ext4_fsblk_t replaced)
+{
+ /* Stub: no merging is performed; report success unconditionally. */
+ return 0;
+}
+
+/**
+ * ext4_defrag_leaf_block - Defragmentation for one leaf extent block
+ *
+ * @handle:	journal handle
+ * @org_inode:	original inode
+ * @org_path:	path indicating the first extent to be defragmented
+ * @dext:	destination extent
+ * @from:	start offset on the target file (only read, never updated)
+ *
+ * Builds a new extent that has the physical start and length of @dext and
+ * the logical start *@from, then walks the extents of @org_inode beginning
+ * with the one @org_path points at.  For every run of original extents it
+ * works out the untouched head (start_ext), the untouched tail (end_ext)
+ * and the number of blocks covered by the new extent, and hands them to
+ * ext4_defrag_merge_extents().
+ *
+ * @org_path is reused for the follow-up lookups, so the caller keeps
+ * ownership of it and must release it.
+ *
+ * Returns 0 on success.  Returns the negative error of
+ * ext4_defrag_merge_extents() or ext4_ext_find_extent() on failure, and
+ * -ENOENT when a follow-up lookup finds no extent covering the next block.
+ */
+static int
+ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
+		struct ext4_ext_path *org_path, struct ext4_extent *dext,
+		ext4_lblk_t *from)
+{
+	struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
+	struct ext4_extent new_ext, start_ext, end_ext;
+	ext4_fsblk_t replaced = 0;
+	ext4_lblk_t new_end, lblock;
+	unsigned long depth;
+	int ret;
+
+	depth = ext_depth(org_inode);
+	start_ext.ee_len = end_ext.ee_len = 0;
+	o_start = o_end = oext = org_path[depth].p_ext;
+	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+	new_ext.ee_len = dext->ee_len;
+	new_ext.ee_block = cpu_to_le32(*from);
+	lblock = le32_to_cpu(oext->ee_block);
+	/* Last logical block covered by the new extent. */
+	new_end = le32_to_cpu(new_ext.ee_block)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+
+	/*
+	 * First original extent
+	 * dest	        |---------------|
+	 * org	|---------------|
+	 */
+	if (le32_to_cpu(new_ext.ee_block) >
+		le32_to_cpu(oext->ee_block) &&
+	    le32_to_cpu(new_ext.ee_block) <
+		le32_to_cpu(oext->ee_block)
+		+ le16_to_cpu(oext->ee_len)) {
+		/* Head of the original extent that stays untouched. */
+		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block)
+					- le32_to_cpu(oext->ee_block));
+		replaced += le16_to_cpu(oext->ee_len)
+			- le16_to_cpu(start_ext.ee_len);
+	} else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
+		/* We can merge previous extent. */
+		prev_ext = oext - 1;
+		if (((ext_pblock(prev_ext) + le16_to_cpu(prev_ext->ee_len))
+				 == ext_pblock(&new_ext))
+		 && (le32_to_cpu(prev_ext->ee_block)
+			+ le16_to_cpu(prev_ext->ee_len)
+				 == le32_to_cpu(new_ext.ee_block))) {
+			/*
+			 * The new extent is physically and logically
+			 * contiguous with the previous one: fold it into
+			 * start_ext and leave new_ext empty.
+			 */
+			o_start = prev_ext;
+			start_ext.ee_len = cpu_to_le16(
+					le16_to_cpu(prev_ext->ee_len)
+					+ le16_to_cpu(new_ext.ee_len));
+			new_ext.ee_len = 0;
+		}
+	}
+
+	for (;;) {
+		/* The extent for destination must be found. */
+		BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
+		lblock += le16_to_cpu(oext->ee_len);
+
+		/*
+		 * Middle of original extent
+		 * dest |-------------------|
+		 * org   |-----------------|
+		 */
+		if (le32_to_cpu(new_ext.ee_block) <=
+			le32_to_cpu(oext->ee_block) &&
+		    new_end >= le32_to_cpu(oext->ee_block)
+			+ le16_to_cpu(oext->ee_len) - 1)
+			replaced += le16_to_cpu(oext->ee_len);
+
+		/*
+		 * Last original extent
+		 * dest |----------------|
+		 * org	          |---------------|
+		 */
+		if (new_end >= le32_to_cpu(oext->ee_block) &&
+		    new_end < le32_to_cpu(oext->ee_block)
+			+ le16_to_cpu(oext->ee_len) - 1) {
+			/* Tail of the original extent that stays untouched. */
+			end_ext.ee_len
+				= cpu_to_le16(le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1 - new_end);
+			ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
+				+ le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len)));
+			end_ext.ee_block
+				= cpu_to_le32(le32_to_cpu(o_end->ee_block)
+				+ le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len));
+			replaced += le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len);
+		}
+
+		/*
+		 * Detected the block end, reached the number of replaced
+		 * blocks to dext->ee_len.  Then merge the extent.
+		 */
+		if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
+		    new_end <= le32_to_cpu(oext->ee_block)
+			+ le16_to_cpu(oext->ee_len) - 1) {
+			ret = ext4_defrag_merge_extents(handle, org_inode,
+					org_path, o_start, o_end, &start_ext,
+					&new_ext, &end_ext, replaced);
+			if (ret < 0)
+				return ret;
+
+			/*
+			 * All expected blocks are replaced.  This must be
+			 * tested before the re-calculation below: new_ext is
+			 * already empty when it was folded into start_ext.
+			 */
+			if (le16_to_cpu(new_ext.ee_len) <= 0)
+				return 0;
+
+			/* Re-calculate new_ext */
+			le16_add_cpu(&new_ext.ee_len, -replaced);
+			le32_add_cpu(&new_ext.ee_block, replaced);
+			ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
+					+ replaced);
+			replaced = 0;
+			start_ext.ee_len = end_ext.ee_len = 0;
+			o_start = NULL;
+
+			/* All expected blocks are replaced. */
+			if (le16_to_cpu(new_ext.ee_len) <= 0)
+				return 0;
+		}
+
+		/* Get the next extent for original. */
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+		org_path = ext4_ext_find_extent(org_inode, lblock, org_path);
+		if (IS_ERR(org_path))
+			return PTR_ERR(org_path);
+
+		depth = ext_depth(org_inode);
+		oext = org_path[depth].p_ext;
+		/*
+		 * Fail when the lookup yields no extent at all or one that
+		 * ends before lblock.  The NULL test has to happen here:
+		 * the BUG_ON() at the top of the loop runs only after oext
+		 * has already been dereferenced.
+		 */
+		if (!oext ||
+		    le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
+			<= lblock)
+			return -ENOENT;
+
+		o_end = oext;
+		if (!o_start)
+			o_start = oext;
+	}
+}
+
+/**
+ * ext4_defrag_reload_path - Release an extent path and look it up again
+ *
+ * @inode:	inode whose extent tree is searched
+ * @lblock:	logical block to look up
+ * @path:	in: old path or NULL, out: new path or NULL
+ *
+ * Drops the buffer references of *@path and frees it before asking
+ * ext4_ext_find_extent() for a freshly allocated path, so the old array is
+ * not leaked.
+ *
+ * Returns 0 on success.  Returns the lookup error with *@path set to NULL,
+ * or -ENOENT with *@path still valid (and owned by the caller) when the
+ * leaf holds no extent.
+ */
+static int
+ext4_defrag_reload_path(struct inode *inode, ext4_lblk_t lblock,
+			struct ext4_ext_path **path)
+{
+	int err;
+
+	if (*path) {
+		ext4_ext_drop_refs(*path);
+		kfree(*path);
+	}
+
+	*path = ext4_ext_find_extent(inode, lblock, NULL);
+	if (IS_ERR(*path)) {
+		err = PTR_ERR(*path);
+		*path = NULL;
+		return err;
+	}
+
+	/* The callers dereference p_ext right away. */
+	if (!(*path)[ext_depth(inode)].p_ext)
+		return -ENOENT;
+
+	return 0;
+}
+
+/**
+ * ext4_defrag_clip_extents - Cut both extents down to the range to swap
+ *
+ * @oext:	original extent containing @from
+ * @dext:	destination extent containing @dest_off
+ * @from:	block offset of the original inode
+ * @dest_off:	block offset of the temporary inode
+ * @remaining:	number of blocks still to be replaced
+ * @d_clip:	out: part of @dext starting at @dest_off
+ * @o_clip:	out: part of @oext starting at @from, placed at the logical
+ *		block of @d_clip
+ *
+ * Both results get the same length: the smallest of @remaining, the rest
+ * of @dext after @dest_off and the rest of @oext after @from.
+ */
+static void
+ext4_defrag_clip_extents(struct ext4_extent *oext, struct ext4_extent *dext,
+			 ext4_lblk_t from, ext4_lblk_t dest_off,
+			 ext4_lblk_t remaining, struct ext4_extent *d_clip,
+			 struct ext4_extent *o_clip)
+{
+	ext4_lblk_t diff, org_diff;
+
+	/* When dext is too large, pick up the target range. */
+	diff = dest_off - le32_to_cpu(dext->ee_block);
+	ext4_ext_store_pblock(d_clip, ext_pblock(dext) + diff);
+	d_clip->ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+	d_clip->ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+	if (remaining < le16_to_cpu(d_clip->ee_len))
+		d_clip->ee_len = cpu_to_le16(remaining);
+
+	org_diff = from - le32_to_cpu(oext->ee_block);
+	ext4_ext_store_pblock(o_clip, ext_pblock(oext) + org_diff);
+	o_clip->ee_block = d_clip->ee_block;
+
+	/* Adjust extent length when blocksize != pagesize */
+	if (le16_to_cpu(d_clip->ee_len) <=
+	    le16_to_cpu(oext->ee_len) - org_diff) {
+		o_clip->ee_len = d_clip->ee_len;
+	} else {
+		o_clip->ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
+					     - org_diff);
+		d_clip->ee_len = o_clip->ee_len;
+	}
+}
+
+/**
+ * ext4_defrag_replace_branches - Replace original extents with new extents
+ *
+ * @handle:	journal handle
+ * @org_inode:	original inode
+ * @dest_inode:	temporary inode
+ * @from:	block offset of org_inode
+ * @dest_off:	block offset of dest_inode
+ * @count:	block count to be replaced
+ *
+ * Replace extents for blocks from "from" to "from + count - 1".
+ * Each pass clips one destination extent and the matching original extent
+ * to a common length, puts the destination blocks into @org_inode and the
+ * original blocks into @dest_inode via ext4_defrag_leaf_block(), and then
+ * advances both offsets by that length.
+ *
+ * Returns 0 on success, otherwise a negative error value.  Also returns 0,
+ * with fewer than @count blocks replaced, when a later lookup finds an
+ * extent that ends before the requested offset.
+ */
+static int
+ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode,
+		struct inode *dest_inode, ext4_lblk_t from,
+		ext4_lblk_t dest_off, ext4_lblk_t count)
+{
+	struct ext4_ext_path *org_path = NULL;
+	struct ext4_ext_path *dest_path = NULL;
+	struct ext4_extent *oext, *dext, *swap_ext;
+	struct ext4_extent tmp_ext, tmp_ext2;
+	ext4_lblk_t replaced_count = 0;
+	int err;
+
+	/* Get the original extent for the block "from" */
+	err = ext4_defrag_reload_path(org_inode, from, &org_path);
+	if (err)
+		goto out;
+
+	/* Get the destination extent for the head */
+	err = ext4_defrag_reload_path(dest_inode, dest_off, &dest_path);
+	if (err)
+		goto out;
+
+	oext = org_path[ext_depth(org_inode)].p_ext;
+	dext = dest_path[ext_depth(dest_inode)].p_ext;
+	ext4_defrag_clip_extents(oext, dext, from, dest_off, count,
+				 &tmp_ext, &tmp_ext2);
+	dext = &tmp_ext;
+	swap_ext = &tmp_ext2;
+
+	/* Loop for the destination extents */
+	while (1) {
+		/* The extent for destination must be found. */
+		BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));
+
+		/* Loop for the original extent blocks */
+		err = ext4_defrag_leaf_block(handle, org_inode,
+						org_path, dext, &from);
+		if (err < 0)
+			goto out;
+
+		/*
+		 * We need the function which fixes extent information for
+		 * inserting.
+		 * e.g. ext4_defrag_merge_extents()
+		 */
+		err = ext4_defrag_leaf_block(handle, dest_inode,
+					dest_path, swap_ext, &dest_off);
+		if (err < 0)
+			goto out;
+
+		replaced_count += le16_to_cpu(dext->ee_len);
+		dest_off += le16_to_cpu(dext->ee_len);
+		from += le16_to_cpu(dext->ee_len);
+
+		/* Already moved the expected blocks */
+		if (replaced_count >= count)
+			break;
+
+		/*
+		 * Look both paths up again at the advanced offsets.  The
+		 * helper frees the previous path array first; dropping only
+		 * its buffer references would leak one array per pass.
+		 */
+		err = ext4_defrag_reload_path(org_inode, from, &org_path);
+		if (err)
+			goto out;
+		oext = org_path[ext_depth(org_inode)].p_ext;
+		/* No original extent covers "from": stop, err is 0. */
+		if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
+			<= from)
+			goto out;
+
+		err = ext4_defrag_reload_path(dest_inode, dest_off,
+					      &dest_path);
+		if (err)
+			goto out;
+		dext = dest_path[ext_depth(dest_inode)].p_ext;
+		/* No destination extent covers "dest_off": stop, err is 0. */
+		if (le32_to_cpu(dext->ee_block) + le16_to_cpu(dext->ee_len)
+			<= dest_off)
+			goto out;
+
+		ext4_defrag_clip_extents(oext, dext, from, dest_off,
+					 count - replaced_count,
+					 &tmp_ext, &tmp_ext2);
+		dext = &tmp_ext;
+		swap_ext = &tmp_ext2;
+	}
+
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (dest_path) {
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return err;
+}
+
+/**
* ext4_defrag_fill_ar - Prepare to multiple block allocate for tmp inode
*
* @org_inode: original inode
@@ -185,7 +540,127 @@ ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
pgoff_t org_page_offset, ext4_lblk_t dest_blk_offset,
int data_offset_in_page, int block_len_in_page)
{
- return 0;
+ /*
+ * Body of ext4_defrag_partial(): for block_len_in_page blocks of one
+ * page of the file behind filp, exchange extents with tmp_inode via
+ * ext4_defrag_replace_branches(), re-map the page's buffers with
+ * ext4_get_block() and finish with the mapping's write_end().
+ * Returns 0 on success or a negative error value.
+ */
+ struct inode *org_inode = filp->f_dentry->d_inode;
+ struct address_space *mapping = org_inode->i_mapping;
+ struct buffer_head *bh;
+ struct page *page = NULL;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ handle_t *handle;
+ ext4_lblk_t org_blk_offset;
+ /* This initial value is overwritten below before offs is used. */
+ long long offs = org_page_offset << PAGE_CACHE_SHIFT;
+ unsigned long blocksize = org_inode->i_sb->s_blocksize;
+ unsigned int w_flags = 0;
+ unsigned int tmp_data_len;
+ unsigned data_len;
+ void *fsdata;
+ int ret, i, jblocks;
+ int blocks_per_page = PAGE_CACHE_SIZE >> org_inode->i_blkbits;
+
+ /*
+ * Reserve twice the usual number of journal blocks because
+ * org_inode and tmp_inode may each modify different metadata blocks.
+ */
+ jblocks = ext4_writepage_trans_blocks(org_inode) * 2;
+ handle = ext4_journal_start(org_inode, jblocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+
+ /* Request an uninterruptible write_begin() when get_fs() is KERNEL_DS */
+ if (segment_eq(get_fs(), KERNEL_DS))
+ w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ /* First block to replace, as a block number and as a byte offset */
+ org_blk_offset = org_page_offset * blocks_per_page +
+ data_offset_in_page;
+ offs = (long long)org_blk_offset << org_inode->i_blkbits;
+
+ /* Calculate data_len */
+ if ((org_blk_offset + block_len_in_page - 1) ==
+ ((org_inode->i_size - 1) >> org_inode->i_blkbits)) {
+ /* The range ends in the last block of the file */
+ tmp_data_len = org_inode->i_size & (blocksize - 1);
+ /*
+ * If tmp_data_len is zero, i_size is a multiple of
+ * blocksize, so the last block is completely used.
+ */
+ if (tmp_data_len == 0)
+ tmp_data_len = blocksize;
+
+ data_len = tmp_data_len +
+ ((block_len_in_page - 1) << org_inode->i_blkbits);
+ } else {
+ data_len = block_len_in_page << org_inode->i_blkbits;
+ }
+
+ /*
+ * i_data_sem is released around write_begin() and re-taken afterwards.
+ * NOTE(review): presumably the caller holds i_data_sem for writing on
+ * entry - confirm against the caller.
+ */
+ up_write(&EXT4_I(org_inode)->i_data_sem);
+ ret = a_ops->write_begin(filp, mapping, offs, data_len, w_flags, &page,
+ &fsdata);
+ down_write(&EXT4_I(org_inode)->i_data_sem);
+
+ if (unlikely(ret < 0))
+ goto out;
+
+ /*
+ * Read the page in if it is not up to date, again without i_data_sem.
+ * NOTE(review): the return value of readpage() is ignored and
+ * PageUptodate() is not checked again after lock_page(), so a failed
+ * read would go unnoticed here - confirm whether this needs handling.
+ */
+ if (!PageUptodate(page)) {
+ up_write(&EXT4_I(org_inode)->i_data_sem);
+ mapping->a_ops->readpage(filp, page);
+ down_write(&EXT4_I(org_inode)->i_data_sem);
+ lock_page(page);
+ }
+
+ /*
+ * try_to_release_page() doesn't call releasepage in writeback mode.
+ * We should care about the order of writing to the same file
+ * by multiple defrag processes.
+ * It needs to call wait_on_page_writeback() to wait for the
+ * writeback of the page.
+ */
+ if (PageWriteback(page))
+ wait_on_page_writeback(page);
+
+ /* Release old bh and drop refs */
+ try_to_release_page(page, 0);
+ ret = ext4_defrag_replace_branches(handle, org_inode, tmp_inode,
+ org_blk_offset, dest_blk_offset,
+ block_len_in_page);
+ if (ret < 0)
+ goto out;
+
+ /* Clear the inode cache not to refer to the old data */
+ ext4_ext_invalidate_cache(org_inode);
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << org_inode->i_blkbits, 0);
+
+ /* Advance to the buffer head of the first block to be replaced */
+ bh = page_buffers(page);
+ for (i = 0; i < data_offset_in_page; i++)
+ bh = bh->b_this_page;
+
+ /* Map each replaced block again, dropping i_data_sem around the call */
+ for (i = 0; i < block_len_in_page; i++) {
+ up_write(&EXT4_I(org_inode)->i_data_sem);
+ ret = ext4_get_block(org_inode, (sector_t)(org_blk_offset + i),
+ bh, 0);
+ down_write(&EXT4_I(org_inode)->i_data_sem);
+
+ if (ret < 0)
+ goto out;
+
+ if (bh->b_this_page != NULL)
+ bh = bh->b_this_page;
+ }
+
+ ret = a_ops->write_end(filp, mapping, offs, data_len, data_len, page,
+ fsdata);
+ /* Skip the page cleanup below once write_end() has been called */
+ page = NULL;
+
+out:
+ /*
+ * Reached with page != NULL only when write_end() was not called.
+ * NOTE(review): these error paths release the page without calling
+ * write_end(); verify that nothing started by write_begin() is left
+ * pending.
+ */
+ if (unlikely(page)) {
+ if (PageLocked(page))
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ ext4_journal_stop(handle);
+
+ /* A non-negative result of write_end() is reported as 0 */
+ return ret < 0 ? ret : 0;
}
/**
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2008-10-24 10:10 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-10-24 10:09 [RFC][PATCH 3/9]ext4: Exchange the extents between two inodes Akira Fujita
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.