All of lore.kernel.org
 help / color / mirror / Atom feed
From: Akira Fujita <a-fujita@rs.jp.nec.com>
To: linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	Theodore Tso <tytso@mit.edu>, Mingming Cao <cmm@us.ibm.com>
Cc: Akira Fujita <a-fujita@rs.jp.nec.com>
Subject: [RFC][PATCH 1/8]ext4: main function of defrag and ioctl implementation
Date: Fri, 30 May 2008 20:17:48 +0900	[thread overview]
Message-ID: <483FE25C.5000103@rs.jp.nec.com> (raw)

ext4: online defrag-- Main function of defrag and ioctl implementation

From: Akira Fujita <a-fujita@rs.jp.nec.com>

Create the temporary inode and do defrag per
defrag_size (default 64MB).

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
---
 fs/ext4/Makefile       |    2 +-
 fs/ext4/defrag.c       |  448 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ext4.h         |   18 ++
 fs/ext4/ext4_extents.h |    2 +
 fs/ext4/extents.c      |    2 +-
 fs/ext4/ioctl.c        |    3 +
 6 files changed, 473 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8c..8028102 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o

 ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		   ext4_jbd2.o migrate.o mballoc.o
+		   ext4_jbd2.o migrate.o mballoc.o defrag.o

 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)	+= xattr.o xattr_user.o xattr_trusted.o
 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
index e69de29..a591e11 100644
--- a/fs/ext4/defrag.c
+++ b/fs/ext4/defrag.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2008, NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@yk.jp.nec.com>
+ *            Akira Fujita <a-fujita@rs.jp.nec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+/* Online defragmentation for EXT4 */
+
+#include <linux/quotaops.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "group.h"
+
+/**
+ * ext4_defrag_next_extent - Search for the next extent and set it to "extent"
+ *
+ * @inode:	inode which is searched
+ * @path:	this will obtain data for the next extent
+ * @extent:	pointer to the next extent we have just gotten
+ *
+ * Stub in this patch: always returns 0 and leaves @path and @extent alone.
+ * Intended contract: 0, or 1 for the last entry, on success; else -EIO.
+ */
+static int
+ext4_defrag_next_extent(struct inode *inode, struct ext4_ext_path *path,
+			struct ext4_extent **extent)
+{
+	return 0;
+}
+
+int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+			unsigned long arg)
+{
+	int err = 0;	/* returned as is when cmd is not EXT4_IOC_DEFRAG */
+
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+		printk(KERN_ERR "ext4 defrag: ino[%lu] is not extents "
+					"based file\n", inode->i_ino);
+		return -EOPNOTSUPP;
+	}
+
+	if (cmd == EXT4_IOC_DEFRAG) {
+		struct ext4_ext_defrag_data defrag;
+		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+
+		if (!capable(CAP_DAC_OVERRIDE)) {	/* else: owner with S_IRUSR */
+			if ((inode->i_mode & S_IRUSR) != S_IRUSR)
+				return -EACCES;
+			if (current->fsuid != inode->i_uid)
+				return -EACCES;
+		}
+
+		if (copy_from_user(&defrag,
+			(struct ext4_ext_defrag_data __user *)arg,
+						sizeof(defrag)))
+			return -EFAULT;
+
+		/* Range-check goal unless userspace passed -1 (none given) */
+		if (defrag.goal != -1 &&
+				ext4_blocks_count(es) <= defrag.goal) {
+			printk(KERN_ERR "ext4 defrag: Invalid goal offset"
+				" %llu, you can set goal offset up to %llu\n",
+				defrag.goal, ext4_blocks_count(es) - 1);
+			return -EINVAL;
+		}
+
+		err = ext4_defrag(filp, defrag.start_offset,
+				defrag.defrag_size);	/* defrag.goal is not passed */
+	}
+
+	return err;
+}
+
+/**
+ * ext4_defrag_partial - Defrag a file per page
+ *
+ * @tmp_inode:		temporary inode
+ * @filp:		pointer to file
+ * @org_offset:		page index on original file
+ * @dest_offset:	page index on temporary file
+ *
+ * Stub in this patch: no data is moved and 0 is always returned.
+ * The intended contract is 0 on success, otherwise an error value.
+ */
+static int
+ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
+			pgoff_t org_offset, pgoff_t dest_offset)
+{
+	return 0;
+}
+
+/**
+ * ext4_defrag_new_extent_tree - Get contiguous blocks and build an extent tree
+ *
+ * @org_inode:		original inode
+ * @tmp_inode:		temporary inode
+ * @org_path:		indicating the original inode's extent
+ * @tar_start:		starting offset to allocate in blocks
+ * @tar_blocks:		the number of blocks to allocate
+ * @iblock:		file related offset
+ *
+ * Stub in this patch: nothing is allocated and 0 is always returned.
+ * The intended return values are:
+ *	0 (succeed)
+ *	1 (not improved)
+ *	negative value (error case)
+ */
+static int
+ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
+			struct ext4_ext_path *org_path, ext4_lblk_t tar_start,
+			ext4_lblk_t tar_blocks, ext4_lblk_t iblock)
+{
+	return 0;
+}
+
+/**
+ * ext4_defrag_check - Check the environment whether a defrag can be done
+ *
+ * @org_inode:		original inode
+ * @defrag_size:	size of defrag in blocks (not examined here)
+ *
+ * Returns 0 if both checks below pass, otherwise -EOPNOTSUPP.
+ */
+static int
+ext4_defrag_check(struct inode *org_inode, ext4_lblk_t defrag_size)
+{
+
+	/* ext4 online defrag supports only 4KB block size */
+	if (org_inode->i_sb->s_blocksize != DEFRAG_BLOCK_SIZE) {
+		printk(KERN_ERR "ext4 defrag: ext4 online defrag supports "
+				"only 4KB block size for the moment.\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* ext4 online defrag needs mballoc mount option. */
+	if (!test_opt(org_inode->i_sb, MBALLOC)) {
+		printk(KERN_ERR "ext4 defrag: multiblock allocation "
+				"is disabled\n");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+/**
+ * ext4_defrag_init_tmp_inode - Create a temporary inode
+ *
+ * @org_inode:		original inode whose i_size the new inode copies
+ *
+ * Returns the new inode, or the error pointer (test with IS_ERR()) from
+ * ext4_journal_start()/ext4_new_inode().  Later setup results are unchecked.
+ */
+static struct inode *
+ext4_defrag_init_tmp_inode(struct inode *org_inode)
+{
+	handle_t *handle;
+	struct inode *tmp_inode;
+
+	handle = ext4_journal_start(org_inode,
+		EXT4_DATA_TRANS_BLOCKS(org_inode->i_sb) +
+		EXT4_INDEX_EXTRA_TRANS_BLOCKS + 4 +
+		2 * EXT4_QUOTA_INIT_BLOCKS(org_inode->i_sb));
+	if (IS_ERR(handle))
+		/* Pass the journal error pointer through to the caller */
+		return (struct inode *)handle;
+
+	tmp_inode = ext4_new_inode(handle,
+		org_inode->i_sb->s_root->d_inode, S_IFREG);
+	if (IS_ERR(tmp_inode))
+		goto out;
+
+	i_size_write(tmp_inode, i_size_read(org_inode));
+	tmp_inode->i_nlink = 0;
+	ext4_ext_tree_init(handle, tmp_inode);
+	ext4_orphan_add(handle, tmp_inode);
+
+out:
+	ext4_journal_stop(handle);
+
+	return tmp_inode;
+}
+
+/**
+ * ext4_defrag - Defrag the specified range of a file
+ *
+ * If no option is specified, ext4_defrag() proceeds in the following order.
+ * 1.Calculate the last logical block to defrag (block_end) from the start
+ *   block (block_start) and the size (defrag_size) given as arguments;
+ *   defrag_size is shrunk when the range runs past the end of the file.
+ *   Take i_mutex and i_data_sem of the inode; both are held until step 7.
+ *   If block_start points into a hole, ext_cur (current extent),
+ *   holecheck_path and org_path are moved to the extent after the hole.
+ * 2.Repeat steps 3 to 6 until holecheck_path reaches the last extent
+ *   or ext_cur starts beyond block_end, the last logical block number.
+ * 3.To get the length of a contiguous area, call ext4_defrag_next_extent()
+ *   on holecheck_path repeatedly until a non-contiguous extent is found,
+ *   the start logical block number exceeds block_end, or the last extent
+ *   is reached.
+ * 4.After determining the length of the contiguous area,
+ *   allocate contiguous blocks to a temporary inode
+ *   by ext4_defrag_new_extent_tree().
+ * 5.Exchange the original inode data with the temporary inode data
+ *   from page_offset to seq_end_page, one page at a time.
+ *   The start page index is page_offset for the original inode and
+ *   dest_offset for the temporary inode.
+ * 6.Update holecheck_path and org_path to point to the next extent to
+ *   process, and release the temporary inode holding the original
+ *   fragmented data.  Then, return to step 2.
+ * 7.Release holecheck_path, org_path and the temporary inode,
+ *    and return defrag_size, which is the size of the defragmented data.
+ *    The defrag_size is used by the command to calculate the file offset
+ *    where the next defrag processing starts.
+ *    (Since the defrag command calls defrag_ioctl() in 64MB units,
+ *     a file bigger than 64MB calls defrag_ioctl many times.)
+ *
+ * @filp:		pointer to file
+ * @block_start:	starting offset to defrag in blocks
+ * @defrag_size:	size of defrag in blocks
+ *
+ * This function returns the number of blocks if it succeeds, otherwise
+ * it returns an error value.
+ */
+int
+ext4_defrag(struct file *filp, ext4_lblk_t block_start,
+		ext4_lblk_t defrag_size)
+{
+	struct inode *org_inode = filp->f_dentry->d_inode, *tmp_inode = NULL;
+	struct ext4_ext_path *org_path = NULL, *holecheck_path = NULL;
+	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+	pgoff_t page_offset, seq_end_page, dest_offset;
+	int ret, depth, seq_extents, last_extent = 0;
+
+	/* Check the filesystem environment whether defrag can be done */
+	ret = ext4_defrag_check(org_inode, defrag_size);
+	if (ret < 0)
+		return ret;
+
+	file_end = (org_inode->i_size - 1) >> org_inode->i_blkbits;
+	block_end = block_start + defrag_size - 1;
+	if (file_end < block_end)
+		defrag_size -= block_end - file_end;
+
+	mutex_lock(&org_inode->i_mutex);
+	down_write(&EXT4_I(org_inode)->i_data_sem);
+
+	org_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+	if (IS_ERR(org_path)) {
+		ret = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get path structure to check the hole */
+	holecheck_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+	if (IS_ERR(holecheck_path)) {
+		ret = PTR_ERR(holecheck_path);
+		holecheck_path = NULL;
+		goto out;
+	}
+
+	depth = ext_depth(org_inode);
+	ext_cur = holecheck_path[depth].p_ext;
+	if (ext_cur == NULL)
+		goto out;
+
+	/*
+	 * Get proper extent whose ee_block is beyond block_start
+	 * if block_start was within the hole.
+	 */
+	if (le32_to_cpu(ext_cur->ee_block) +
+		le16_to_cpu(ext_cur->ee_len) - 1 < block_start) {
+		last_extent = ext4_defrag_next_extent(org_inode,
+					holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+		last_extent = ext4_defrag_next_extent(org_inode, org_path,
+							&ext_dummy);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+	}
+	seq_extents = 1;
+	seq_start = le32_to_cpu(ext_cur->ee_block);
+
+	/* No blocks within the specified range. */
+	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+		printk(KERN_INFO "ext4 defrag: The specified range of file"
+				" may be the hole\n");
+		goto out;
+	}
+
+	/* Adjust start blocks */
+	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+			 le16_to_cpu(ext_cur->ee_len), block_end + 1) -
+		     max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+		seq_blocks += add_blocks;
+
+		/* Create a temporary inode to be exchanged data block */
+		tmp_inode = ext4_defrag_init_tmp_inode(org_inode);
+		if (IS_ERR(tmp_inode)) {
+			ret = PTR_ERR(tmp_inode);
+			tmp_inode = NULL;
+			goto out;
+		}
+
+		/* Adjust tail blocks */
+		if (seq_start + seq_blocks - 1 > block_end)
+			seq_blocks = block_end - seq_start + 1;
+
+		ext_prev = ext_cur;
+		last_extent = ext4_defrag_next_extent(org_inode,
+					holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			break;
+		}
+		if (!last_extent)
+			seq_extents++;
+		add_blocks = le16_to_cpu(ext_cur->ee_len);
+
+		/*
+		 * Extend the length of contiguous block (seq_blocks)
+		 * if extents are contiguous.
+		 */
+		if (le32_to_cpu(ext_prev->ee_block) +
+				le16_to_cpu(ext_prev->ee_len) ==
+				le32_to_cpu(ext_cur->ee_block) &&
+				block_end >= le32_to_cpu(ext_cur->ee_block) &&
+				!last_extent) {
+			if (tmp_inode) {
+				iput(tmp_inode);
+				tmp_inode = NULL;
+			}
+			continue;
+		}
+
+		/* Found an isolated block */
+		if (seq_extents == 1) {
+			seq_start = le32_to_cpu(ext_cur->ee_block);
+			goto CLEANUP;
+		}
+
+		ret = ext4_defrag_new_extent_tree(org_inode, tmp_inode,
+					org_path, seq_start, seq_blocks,
+					block_start);
+
+		if (ret < 0) {
+			break;
+		} else if (ret == 1) {
+			ret = 0;
+			seq_start = le32_to_cpu(ext_cur->ee_block);
+			goto CLEANUP;
+		}
+
+		page_offset = seq_start >>
+				(PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+		dest_offset = 0;
+		seq_end_page = (seq_start + seq_blocks - 1) >>
+				(PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+
+		/*
+		 * Discard all preallocations.
+		 * This is provisional solution.
+		 * When true ext4_mb_return_to_preallocation() is
+		 * implemented, this will be removed.
+		 */
+		ext4_mb_discard_inode_preallocations(org_inode);
+
+		while (page_offset <= seq_end_page) {
+			/* Swap original branches with new branches */
+			ret = ext4_defrag_partial(tmp_inode, filp,
+					page_offset, dest_offset);
+			if (ret < 0)
+				goto out;
+
+			page_offset++;
+			dest_offset++;
+		}
+
+		/* Decrease buffer counter */
+		if (holecheck_path)
+			ext4_ext_drop_refs(holecheck_path);
+		holecheck_path = ext4_ext_find_extent(org_inode,
+						seq_start, holecheck_path);
+		if (IS_ERR(holecheck_path)) {
+			ret = PTR_ERR(holecheck_path);
+			holecheck_path = NULL;
+			break;
+		}
+		depth = holecheck_path->p_depth;
+
+CLEANUP:
+		/* Decrease buffer counter */
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+		org_path = ext4_ext_find_extent(org_inode, seq_start, org_path);
+		if (IS_ERR(org_path)) {
+			ret = PTR_ERR(org_path);
+			org_path = NULL;
+			break;
+		}
+
+		ext_cur = holecheck_path[depth].p_ext;
+		add_blocks = le16_to_cpu(ext_cur->ee_len);
+		seq_blocks = 0;
+		dest_offset = 0;
+		seq_extents = 1;
+
+		if (tmp_inode) {
+			iput(tmp_inode);
+			tmp_inode = NULL;
+		}
+	}
+
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (holecheck_path) {
+		ext4_ext_drop_refs(holecheck_path);
+		kfree(holecheck_path);
+	}
+
+	up_write(&EXT4_I(org_inode)->i_data_sem);
+	mutex_unlock(&org_inode->i_mutex);
+
+	if (tmp_inode)
+		iput(tmp_inode);
+
+	return (ret ? ret : defrag_size);
+}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a7a541d..67281ad 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -298,6 +298,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC_GETRSVSZ		_IOR('f', 5, long)
 #define EXT4_IOC_SETRSVSZ		_IOW('f', 6, long)
 #define EXT4_IOC_MIGRATE		_IO('f', 7)
+#define EXT4_IOC_DEFRAG		_IOW('f', 10, struct ext4_ext_defrag_data)

 /*
  * ioctl commands in 32 bit emulation
@@ -315,6 +316,18 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION

+/*
+ * Will go away.
+ * ext4 online defrag supports only 4KB block size.
+ */
+#define DEFRAG_BLOCK_SIZE	4096
+
+struct ext4_ext_defrag_data {
+	ext4_lblk_t start_offset;	/* start offset to defrag in blocks */
+	ext4_lblk_t defrag_size;	/* size of defrag in blocks */
+	ext4_fsblk_t goal;		/* allocation goal block, -1 if none */
+};
+

 /*
  *  Mount options
@@ -1113,6 +1126,11 @@ extern void ext4_inode_bitmap_set(struct super_block *sb,
 				  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_table_set(struct super_block *sb,
 				 struct ext4_group_desc *bg, ext4_fsblk_t blk);
+/* defrag.c */
+extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start,
+			ext4_lblk_t defrag_size);
+extern int ext4_defrag_ioctl(struct inode *, struct file *, unsigned int,
+				unsigned long);

 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b5..9868c02 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -228,5 +228,7 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
 extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
 						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *path);
 #endif /* _EXT4_EXTENTS */

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5ba81b3..ffced61 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -48,7 +48,7 @@
  * ext_pblock:
  * combine low and high parts of physical block number into ext4_fsblk_t
  */
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
 {
 	ext4_fsblk_t block;

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1..4c7fca1 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -241,6 +241,9 @@ setversion_out:

 		return err;
 	}
+	case EXT4_IOC_DEFRAG: {
+		return ext4_defrag_ioctl(inode, filp, cmd, arg);
+	}
 	case EXT4_IOC_GROUP_ADD: {
 		struct ext4_new_group_data input;
 		struct super_block *sb = inode->i_sb;

                 reply	other threads:[~2008-05-30 11:17 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=483FE25C.5000103@rs.jp.nec.com \
    --to=a-fujita@rs.jp.nec.com \
    --cc=cmm@us.ibm.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.