[RFC][PATCH 3/8]ext4: read and write file data with memory page

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Akira Fujita <a-fujita@rs.jp.nec.com>
To: linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	Theodore Tso <tytso@mit.edu>, Mingming Cao <cmm@us.ibm.com>
Cc: Akira Fujita <a-fujita@rs.jp.nec.com>
Subject: [RFC][PATCH 3/8]ext4: read and write file data with memory page
Date: Fri, 30 May 2008 20:18:05 +0900	[thread overview]
Message-ID: <483FE26D.8070803@rs.jp.nec.com> (raw)

ext4: online defrag-- Read and write file data with memory page

From: Akira Fujita <a-fujita@rs.jp.nec.com>

Read the file data from the old blocks to the page and
write the file data on the page into the new blocks.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
---
 fs/ext4/defrag.c |  464 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/ext4.h   |    2 +
 fs/ext4/inode.c  |    3 +-
 3 files changed, 466 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
index 621276b..f5d75c2 100644
--- a/fs/ext4/defrag.c
+++ b/fs/ext4/defrag.c
@@ -134,6 +134,368 @@ int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 }

 /**
+ * ext4_defrag_merge_extents - Merge new extent
+ *
+ * @handle:	journal handle
+ * @org_inode:	original inode
+ * @org_path:	path indicates first extent to be defraged
+ * @o_start:	first original extent to be defraged
+ * @o_end:	last original extent to be defraged
+ * @start_ext:	first new extent to be merged
+ * @new_ext:	middle of new extent to be merged
+ * @end_ext:	last new extent to be merged
+ * @replaced:	the number of blocks which will be replaced with new_ext
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode,
+		struct ext4_ext_path *org_path,
+		struct ext4_extent *o_start, struct ext4_extent *o_end,
+		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+		struct ext4_extent *end_ext, ext4_fsblk_t replaced)
+{
+	return 0;
+}
+
+/**
+ * ext4_defrag_leaf_block - Defragmentation for one leaf extent block
+ *
+ * @handle:		journal handle
+ * @org_inode:		original inode
+ * @org_path:		path indicates first extent to be defraged
+ * @dext:		destination extent
+ * @from:		start offset on the target file
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
+		struct ext4_ext_path *org_path, struct ext4_extent *dext,
+		ext4_lblk_t *from)
+{
+	struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
+	struct ext4_extent new_ext, start_ext, end_ext;
+	ext4_fsblk_t replaced = 0;
+	ext4_lblk_t new_end, lblock;
+	unsigned long depth;
+	unsigned short len;
+	ext4_fsblk_t new_phys_end;
+	int	ret;
+
+	depth = ext_depth(org_inode);
+	start_ext.ee_len = end_ext.ee_len = 0;
+	o_start = o_end = oext = org_path[depth].p_ext;
+	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+	new_ext.ee_len = dext->ee_len;
+	len = le16_to_cpu(new_ext.ee_len);
+	new_ext.ee_block = cpu_to_le32(*from);
+	lblock = le32_to_cpu(oext->ee_block);
+	new_end = le32_to_cpu(new_ext.ee_block)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+	new_phys_end = ext_pblock(&new_ext)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+
+	/*
+	 * First original extent
+	 * dest	 |---------------|
+	 * org  |---------------|
+	 */
+	if (le32_to_cpu(new_ext.ee_block) >
+		le32_to_cpu(oext->ee_block) &&
+		le32_to_cpu(new_ext.ee_block) <
+		le32_to_cpu(oext->ee_block)
+		+ le16_to_cpu(oext->ee_len)) {
+		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block)
+					- le32_to_cpu(oext->ee_block));
+		replaced += le16_to_cpu(oext->ee_len)
+					- le16_to_cpu(start_ext.ee_len);
+	} else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
+		/* We can merge previous extent. */
+		prev_ext = oext - 1;
+		if (((ext_pblock(prev_ext) + le16_to_cpu(prev_ext->ee_len))
+				 == ext_pblock(&new_ext))
+		 && (le32_to_cpu(prev_ext->ee_block)
+			+ le16_to_cpu(prev_ext->ee_len)
+				 == le32_to_cpu(new_ext.ee_block))) {
+			o_start = prev_ext;
+			start_ext.ee_len = cpu_to_le16(
+					le16_to_cpu(prev_ext->ee_len)
+					+ le16_to_cpu(new_ext.ee_len));
+			new_ext.ee_len = 0;
+		}
+	}
+
+	for (;;) {
+		/* The extent for destination must be found. */
+		BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
+		lblock += le16_to_cpu(oext->ee_len);
+
+		/*
+		 * Middle of original extent
+		 * dest |-------------------|
+		 * org   |-----------------|
+		 */
+		if (le32_to_cpu(new_ext.ee_block) <=
+			le32_to_cpu(oext->ee_block) &&
+			new_end >= le32_to_cpu(oext->ee_block)
+			+ le16_to_cpu(oext->ee_len) - 1)
+			replaced += le16_to_cpu(oext->ee_len);
+
+		/*
+		 * Last original extent
+		 * dest |----------------|
+		 * org	  |---------------|
+		 */
+		if (new_end >= le32_to_cpu(oext->ee_block) &&
+			new_end < le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1) {
+			end_ext.ee_len
+				= cpu_to_le16(le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1 - new_end);
+			ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
+				+ le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len)));
+			end_ext.ee_block
+				= cpu_to_le32(le32_to_cpu(o_end->ee_block)
+				+ le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len));
+			replaced += le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len);
+		}
+
+		/*
+		 * Detected the block end, reached the number of replaced
+		 * blocks to dext->ee_len. Then merge the extent.
+		 */
+		if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
+			new_end <= le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1) {
+			ret = ext4_defrag_merge_extents(handle, org_inode,
+					org_path, o_start, o_end, &start_ext,
+					&new_ext, &end_ext, replaced);
+			if (ret < 0)
+				return ret;
+
+			/* All expected blocks are replaced */
+			if (le16_to_cpu(new_ext.ee_len) <= 0)
+				return 0;
+
+			/* Re-calculate new_ext */
+			le16_add_cpu(&new_ext.ee_len, -replaced);
+			le32_add_cpu(&new_ext.ee_block, replaced);
+			ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
+					+ replaced);
+			replaced = 0;
+			start_ext.ee_len = end_ext.ee_len = 0;
+			o_start = NULL;
+
+			/* All expected blocks are replaced. */
+			if (le16_to_cpu(new_ext.ee_len) <= 0)
+				return 0;
+		}
+
+		/* Get the next extent for original. */
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+		org_path = ext4_ext_find_extent(org_inode, lblock, org_path);
+		if (IS_ERR(org_path)) {
+			ret = PTR_ERR(org_path);
+			org_path = NULL;
+			return ret;
+		}
+		depth = ext_depth(org_inode);
+		oext = org_path[depth].p_ext;
+		if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
+			<= lblock)
+			return -ENOENT;
+
+		o_end = oext;
+		if (!o_start)
+			o_start = oext;
+	}
+}
+
+/**
+ * ext4_defrag_replace_branches - Replace original extents with new extents
+ *
+ * @handle:		journal handle
+ * @org_inode:		original inode
+ * @dest_inode:		temporary inode
+ * @from_page:		page offset of org_inode
+ * @dest_from_page:	page offset of dest_inode
+ * @count_page:		page count to be replaced
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ * Replace extents for blocks from "from" to "from + count - 1".
+ */
+static int
+ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode,
+			struct inode *dest_inode, pgoff_t from_page,
+			pgoff_t dest_from_page, pgoff_t count_page)
+{
+	struct ext4_ext_path *org_path = NULL;
+	struct ext4_ext_path *dest_path = NULL;
+	struct ext4_extent *oext, *dext, *swap_ext;
+	struct ext4_extent tmp_ext, tmp_ext2;
+	ext4_lblk_t from, count, dest_off, diff, org_diff;
+	int err = 0;
+	int depth;
+	int replaced_count = 0;
+
+	from = (ext4_lblk_t)from_page <<
+			(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	count = (ext4_lblk_t)count_page <<
+			(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	dest_off = (ext4_lblk_t)dest_from_page <<
+			(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+
+	/* Get the original extent for the block "from" */
+	org_path = ext4_ext_find_extent(org_inode, from, NULL);
+	if (IS_ERR(org_path)) {
+		err = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get the destination extent for the head */
+	dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
+	if (IS_ERR(dest_path)) {
+		err = PTR_ERR(dest_path);
+		dest_path = NULL;
+		goto out;
+	}
+	depth = ext_depth(dest_inode);
+	dext = dest_path[depth].p_ext;
+	/* When dext is too large, pick up the target range. */
+	diff = dest_off - le32_to_cpu(dext->ee_block);
+	ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+	tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+	tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+	if (count < le16_to_cpu(tmp_ext.ee_len))
+		tmp_ext.ee_len = cpu_to_le16(count);
+	dext = &tmp_ext;
+
+	depth = ext_depth(org_inode);
+	oext = org_path[depth].p_ext;
+	org_diff = from - le32_to_cpu(oext->ee_block);
+	ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
+	tmp_ext2.ee_block = tmp_ext.ee_block;
+
+	/* Adjust extent length when blocksize != pagesize */
+	if (le16_to_cpu(tmp_ext.ee_len) <=
+		le16_to_cpu(oext->ee_len) - org_diff) {
+		tmp_ext2.ee_len = tmp_ext.ee_len;
+	} else {
+		tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
+						- org_diff);
+		tmp_ext.ee_len = tmp_ext2.ee_len;
+	}
+	swap_ext = &tmp_ext2;
+
+	/* Loop for the destination extents */
+	while (1) {
+		/* The extent for destination must be found. */
+		BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));
+
+		/* Loop for the original extent blocks */
+		err = ext4_defrag_leaf_block(handle, org_inode,
+						org_path, dext, &from);
+		if (err < 0)
+			goto out;
+
+		/*
+		 * We need the function which fixes extent information for
+		 * inserting.
+		 * e.g. ext4_defrag_merge_extents()
+		 */
+		err = ext4_defrag_leaf_block(handle, dest_inode,
+					dest_path, swap_ext, &dest_off);
+		if (err < 0)
+			goto out;
+
+		replaced_count += le16_to_cpu(dext->ee_len);
+		dest_off += le16_to_cpu(dext->ee_len);
+		from += le16_to_cpu(dext->ee_len);
+
+		/* Already moved the expected blocks */
+		if (replaced_count >= count)
+			break;
+
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+		org_path = ext4_ext_find_extent(org_inode, from, NULL);
+		if (IS_ERR(org_path)) {
+			err = PTR_ERR(org_path);
+			org_path = NULL;
+			goto out;
+		}
+		depth = ext_depth(org_inode);
+		oext = org_path[depth].p_ext;
+		if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
+			<= from) {
+			err = 0;
+			goto out;
+		}
+
+		if (dest_path)
+			ext4_ext_drop_refs(dest_path);
+		dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
+		if (IS_ERR(dest_path)) {
+			err = PTR_ERR(dest_path);
+			dest_path = NULL;
+			goto out;
+		}
+		depth = ext_depth(dest_inode);
+		dext = dest_path[depth].p_ext;
+		if (le32_to_cpu(dext->ee_block) + le16_to_cpu(dext->ee_len)
+			<= dest_off) {
+			err = 0;
+			goto out;
+		}
+
+		/* When dext is too large, pick up the target range. */
+		diff = dest_off - le32_to_cpu(dext->ee_block);
+		ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+		tmp_ext.ee_block =
+			cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+		tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+
+		if (count - replaced_count < le16_to_cpu(tmp_ext.ee_len))
+			tmp_ext.ee_len = cpu_to_le16(count - replaced_count);
+
+		dext = &tmp_ext;
+
+		org_diff = from - le32_to_cpu(oext->ee_block);
+		ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
+		tmp_ext2.ee_block = tmp_ext.ee_block;
+
+		/* Adjust extent length when blocksize != pagesize */
+		if (le16_to_cpu(tmp_ext.ee_len) <=
+			le16_to_cpu(oext->ee_len) - org_diff) {
+			tmp_ext2.ee_len = tmp_ext.ee_len;
+		} else {
+			tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
+							- org_diff);
+			tmp_ext.ee_len = tmp_ext2.ee_len;
+		}
+		swap_ext = &tmp_ext2;
+	}
+
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (dest_path) {
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return err;
+}
+
+/**
  * ext4_defrag_fill_ar - Prepare to multiple block allocate for tmp inode
  *
  * @org_inode:		original inode
@@ -228,7 +590,107 @@ static int
 ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
 			pgoff_t org_offset, pgoff_t dest_offset)
 {
-	return 0;
+	struct inode *org_inode = filp->f_dentry->d_inode;
+	struct address_space *mapping = org_inode->i_mapping;
+	struct buffer_head *bh;
+	struct page *page;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	handle_t *handle;
+	pgoff_t offset_in_page = PAGE_SIZE;
+	int ret, i, jblocks, blocks_per_page;
+	int blocksize = org_inode->i_sb->s_blocksize;
+	long long offs = org_offset << PAGE_CACHE_SHIFT;
+	unsigned long blk_off = 0;
+	unsigned int w_flags = 0;
+	void *fsdata;
+
+	/*
+	 * It needs twice the amount of ordinary journal buffers because
+	 * inode and tmp_inode may change each different metadata blocks.
+	 */
+	jblocks = ext4_writepage_trans_blocks(org_inode) * 2;
+	handle = ext4_journal_start(org_inode, jblocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		return ret;
+	}
+
+	if (segment_eq(get_fs(), KERNEL_DS))
+		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	if (org_offset == ((org_inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
+		offset_in_page = (org_inode->i_size & (PAGE_CACHE_SIZE - 1));
+		/*
+		 * Set PAGE_CACHE_SIZE to offset_in_page not be 0
+		 * if org_offset is the last page and i_size is
+		 * multiples of PAGE_CACHE_SIZE.
+		 */
+		if (offset_in_page == 0)
+			offset_in_page = PAGE_CACHE_SIZE;
+	}
+
+	up_write(&EXT4_I(org_inode)->i_data_sem);
+	ret = a_ops->write_begin(filp, mapping, offs,
+				offset_in_page, w_flags, &page, &fsdata);
+	down_write(&EXT4_I(org_inode)->i_data_sem);
+
+	if (unlikely(ret < 0))
+		goto out;
+
+	if (!PageUptodate(page)) {
+		mapping->a_ops->readpage(filp, page);
+		lock_page(page);
+	}
+
+	/*
+	 * try_to_release_page() doesn't call relasepage in writeback mode.
+	 * We should care about the order of writing to the same file
+	 * by multiple defrag processes.
+	 * It needs to call wait_on_page_writeback() to wait for the
+	 * writeback of the page.
+	 */
+	if (PageWriteback(page))
+		wait_on_page_writeback(page);
+
+	/* Release old bh and drop refs */
+	try_to_release_page(page, 0);
+	ret = ext4_defrag_replace_branches(handle, org_inode, tmp_inode,
+					org_offset, dest_offset, 1);
+
+	if (ret < 0)
+		goto out;
+
+	/* Clear the inode cache not to refer to the old data */
+	ext4_ext_invalidate_cache(org_inode);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << org_inode->i_blkbits, 0);
+
+	blocks_per_page = PAGE_SIZE / blocksize;
+	blk_off = org_offset * blocks_per_page;
+
+	bh = page_buffers(page);
+	for (i = 0; i < blocks_per_page; i++) {
+		up_write(&EXT4_I(org_inode)->i_data_sem);
+		ret = ext4_get_block(org_inode, blk_off++, bh, 0);
+		down_write(&EXT4_I(org_inode)->i_data_sem);
+
+		if (ret < 0)
+			goto out;
+
+		if (bh->b_this_page != NULL)
+			bh = bh->b_this_page;
+	}
+
+	ret = a_ops->write_end(filp, mapping, offs, offset_in_page,
+				offset_in_page, page, fsdata);
+
+	if (unlikely(ret < 0))
+		goto out;
+out:
+	ext4_journal_stop(handle);
+
+	return (ret < 0 ? ret : 0);
 }

 /**
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d64a4ae..1e9ce39 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1079,6 +1079,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+extern int ext4_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_result, int create);

 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71db3d6..a30f56c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1037,8 +1037,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 	up_write((&EXT4_I(inode)->i_data_sem));
 	return retval;
 }

                 reply	other threads:[~2008-05-30 11:18 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:621276b dfblob:f5d75c2 dfblob:d64a4ae dfblob:1e9ce39
dfblob:71db3d6 dfblob:a30f56c )
 OR (
bs:"[RFC][PATCH 3/8]ext4: read and write file data with memory page" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=483FE26D.8070803@rs.jp.nec.com \
    --to=a-fujita@rs.jp.nec.com \
    --cc=cmm@us.ibm.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.