linux-ext4.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH 7/9]ext4: Add the EXT4_IOC_FIEMAP_INO ioctl
@ 2008-10-24 10:09 Akira Fujita
  2008-10-26  8:40 ` Andreas Dilger
  0 siblings, 1 reply; 12+ messages in thread
From: Akira Fujita @ 2008-10-24 10:09 UTC (permalink / raw)
  To: linux-ext4, Theodore Tso, Mingming Cao; +Cc: linux-fsdevel

ext4: online defrag -- Add the EXT4_IOC_FIEMAP_INO ioctl.

From: Akira Fujita <a-fujita@rs.jp.nec.com>

The EXT4_IOC_FIEMAP_INO is used to get extents information of
inode which set to ioctl.
The defragger uses this ioctl to check the fragment condition
and to get extents information in the specified block group.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
---
 fs/ext4/defrag.c |  113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ext4.h   |    7 +++
 fs/ext4/ioctl.c  |    7 +++
 3 files changed, 127 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
index 891e599..67030bc 100644
--- a/fs/ext4/defrag.c
+++ b/fs/ext4/defrag.c
@@ -20,6 +20,8 @@
 #include "ext4_extents.h"
 #include "group.h"

+#define FIEMAP_MAX_EXTENTS	(UINT_MAX / sizeof(struct fiemap_extent))
+
 /**
  * ext4_defrag_next_extent - Search for the next extent and set it to "extent"
  *
@@ -91,6 +93,117 @@ err:
 }

 /**
+ * ext4_defrag_fiemap_ino - Get extents information by inode number
+ *
+ * @filp:			pointer to file
+ * @arg:			pointer to fiemap_ino
+ *  @fiemap_ino->ino:		an inode number which is used to get
+ *				extent information
+ *  @fiemap_ino->fiemap:	request for fiemap ioctl
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+int
+ext4_defrag_fiemap_ino(struct file *filp, unsigned long arg)
+{
+	struct fiemap_ino fiemap_ino;
+	struct fiemap_extent_info fieinfo = { 0, };
+	struct inode *inode;
+	struct super_block *sb = filp->f_dentry->d_inode->i_sb;
+	u64 len = 0;
+	int err = 0;
+
+	if (copy_from_user(&fiemap_ino, (struct fiemap_ino __user *)arg,
+			   sizeof(struct fiemap_ino)))
+		return -EFAULT;
+
+	/* Special inodes shouldn't be choiced */
+	if (fiemap_ino.ino < EXT4_GOOD_OLD_FIRST_INO)
+		return -ENOENT;
+
+	inode = ext4_iget(sb, fiemap_ino.ino);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	/*
+	 * If we do not have the permission to access the inode,
+	 * just skip it.
+	 */
+	if (!capable(CAP_DAC_OVERRIDE)) {
+		if ((inode->i_mode & S_IRUSR) != S_IRUSR)
+			return -EACCES;
+		if (current->fsuid != inode->i_uid)
+			return -EACCES;
+	}
+
+	/* Return -ENOENT if a file does not exist */
+	if (!inode->i_nlink || !S_ISREG(inode->i_mode)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	if (!inode->i_op->fiemap) {
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (fiemap_ino.fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (fiemap_ino.fiemap.fm_length == 0) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (fiemap_ino.fiemap.fm_start > sb->s_maxbytes) {
+		err = -EFBIG;
+		goto out;
+	}
+
+	/*
+	 * Check offset and length
+	 * If the specified range exceeds the max file size,
+	 * adjust the length.
+	 */
+	if ((fiemap_ino.fiemap.fm_length > sb->s_maxbytes) ||
+	    (sb->s_maxbytes - fiemap_ino.fiemap.fm_length)
+	     < fiemap_ino.fiemap.fm_start)
+		len = sb->s_maxbytes - fiemap_ino.fiemap.fm_start;
+	else
+		len = fiemap_ino.fiemap.fm_length;
+
+	fieinfo.fi_flags = fiemap_ino.fiemap.fm_flags;
+	fieinfo.fi_extents_max = fiemap_ino.fiemap.fm_extent_count;
+	fieinfo.fi_extents_start =
+		(struct fiemap_extent *)(arg + sizeof(fiemap_ino));
+
+	if (fiemap_ino.fiemap.fm_extent_count != 0 &&
+	    !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
+		       fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
+		filemap_write_and_wait(inode->i_mapping);
+
+	err = inode->i_op->fiemap(inode, &fieinfo,
+				fiemap_ino.fiemap.fm_start, len);
+	if (!err) {
+		fiemap_ino.fiemap.fm_flags = fieinfo.fi_flags;
+		fiemap_ino.fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+		if (copy_to_user((char *)arg, &fiemap_ino, sizeof(fiemap_ino)))
+			err = -EFAULT;
+	}
+
+out:
+	iput(inode);
+	return err;
+}
+
+/**
  * ext4_defrag_merge_across_blocks - Merge extents across leaf block
  *
  * @handle:		journal handle
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f7b092d..c72703f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -305,6 +305,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC_DEFRAG			_IOW('f', 15, struct ext4_ext_defrag_data)
 #define EXT4_IOC_SUPER_INFO		_IOR('f', 16, struct ext4_super_block)
 #define EXT4_IOC_FREE_BLOCKS_INFO	_IOWR('f', 17, struct ext4_extents_info)
+#define EXT4_IOC_FIEMAP_INO		_IOWR('f', 18, struct fiemap_ino)

 /*
  * ioctl commands in 32 bit emulation
@@ -352,6 +353,11 @@ struct ext4_extents_info {
 	struct ext4_extent_data ext[DEFRAG_MAX_ENT];
 };

+struct fiemap_ino {
+	__u64 ino;
+	struct fiemap fiemap;
+};
+
 #define EXT4_TRANS_META_BLOCKS 4 /* bitmap + group desc + sb + inode */

 /*
@@ -1185,6 +1191,7 @@ extern int ext4_ext_journal_restart(handle_t *handle, int needed);
 /* defrag.c */
 extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start,
 			ext4_lblk_t defrag_size, ext4_fsblk_t goal);
+extern int ext4_defrag_fiemap_ino(struct file *filp, unsigned long arg);

 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5709574..b69b54a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -256,6 +256,13 @@ setversion_out:
 			return 0;
 	}

+	case EXT4_IOC_FIEMAP_INO: {
+		int err;
+
+		err = ext4_defrag_fiemap_ino(filp, arg);
+		return err;
+	}
+
 	case EXT4_IOC_FREE_BLOCKS_INFO: {
 		struct ext4_extents_info ext_info;
 		int err;

^ permalink raw reply related	[flat|nested] 12+ messages in thread
* Re: [RFC][PATCH 7/9]ext4: Add the EXT4_IOC_FIEMAP_INO ioctl
@ 2008-11-13 11:34 Akira Fujita
  0 siblings, 0 replies; 12+ messages in thread
From: Akira Fujita @ 2008-11-13 11:34 UTC (permalink / raw)
  To: Theodore Tso; +Cc: Christoph Hellwig, Andreas Dilger, linux-ext4, Mingming Cao

Hi Ted,

Theodore Tso wrote:
> On Fri, Oct 31, 2008 at 06:05:47AM -0400, Christoph Hellwig wrote:
>>> I'll hack up a generic open_by_handle and then we can gather the
>>> reaction - it shouldn't be more than about one or two hundred lines of
>>> code.  Note that you really want an open by handle and not just inum for
>>> a defragmentation tool - without the generation you can easily run into
>>> races.
>
> I think having a generic open_by_handle is a Good Thing, but it
> actually isn't quite enough for defrag, because that brings up the
> question of how defrag can create the handle in the first place.
>
> In the case of Aryan's desire to get the list of files that were read
> during boot, it's pretty obvious how we can define an interface which
> would make available a set of file handles corresponding to the files
> that were opened during the boot process, and then that list of file
> handles can be saved to a file and used on the subsequent boot to do
> the readahead.   Fairly straight forward.
>
> In the case of the defrag situation, though, we need to step back and
> figure out what we are trying to do.  What the userspace program is
> apparently trying to do is to get the block extent maps used by all of
> the inodes in the block group.  The problem is we aren't opening the
> inodes by pathname, so we couldn't create a handle in the first place
> (since in order to create a handle, we need the containing directory).
>
> The bigger question is whether the defrag code is asking the right
> question in the first place.  The issue is that is that it currently
> assumes that in order to find the owner of a particular block (or more
> generally, extent), you should search the inodes in the block's
> blockgroup.  The problem is that for a very fragmented filesystem,
> most of the blocks' owners might not be in their block group.  In
> fact, over time, if you use defrag -f frequently, it will move blocks
> belonging to inodes in one block group to other block groups, which
> will make defrag -f's job even harder, and eventually impossible, for
> inodes belonging to other block groups.
>
>> Akira, can you please comment on these issues before going on?
>> I think the generation issue is a particularly important one if you
>> want to allow defrag by normal users.
>
> I'm not at all sure that it makes sense to allow "defrag -f" to be
> used by normal users.  The problem here is we're fighting multiple
> constraints.  First of all, we want to keep policy in the kernel to an
> absolute minimum, since debugging kernel code is a mess, and I don't
> think we want the complexity of a full-fledge defragger in the kernel.
> Secondly, though, if we are going to do this in userspace, we need to
> push a huge amount of information to the userspace defrag program,
> that immediately raises some very serious security issues, because we
> don't want to leak information unduly to random userspace programs.

All right.
I will change "defrag -f" to admit only root user.

>>> Btw, any reason the XFS approach of passing in *file descriptors* for both
>>> the inode to be defragmented and the "donor" inode doesn't work for you?
>
> I agree this is probably the better approach; it would certainly
> reduce the amount of new code that needs to be added to the kernel.
> Right now the "donor"/temporary inode is created and allocated in the
> kernel, and then the kernel decides whether or not the temporary inode
> is an improvement.  If we make the userspace code responsible for
> creating the temporary inode and then using fallocate() to allocate
> the new blocks, then userspace can call FIEMAP and decide whether it
> is an improvement.

There is no problem if it is only for normal defrag,
but I think fallocate is not enough for the other defrag mode (-r and -f)
because we can't specify physical block offset.
For example, in defrag -r, physical block offset of parent directory is
set to the goal for mballoc.
Also in defrag -f, physical block offset is used to allocate specified
range as well.

Do you mean that defrag -r and -f are unnecessary?
I think these options are advantages
if we compare ext4 defrag with other FS's defrag feature.

>
> P.S. I've been looking at ext4_defrag_partial(), and the page locking
> looks wrong.  The page is only locked if it it hasn't been read into
> memory yet?   And at the end of the function, we do this?
>
>        	      if (PageLocked(page))
> 			unlock_page(page);

In case defrag failed between write_begin() and write_end(),
we have to call unlock_page() because we've already have the page lock
with write_begin().
So unlock_page() is called in the end of ext4_defrag_partial() as error case.
Is there something wrong?

Regards,
Akira Fujita

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2008-11-13 11:35 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-10-24 10:09 [RFC][PATCH 7/9]ext4: Add the EXT4_IOC_FIEMAP_INO ioctl Akira Fujita
2008-10-26  8:40 ` Andreas Dilger
2008-10-26  8:48   ` Christoph Hellwig
2008-10-26  8:49     ` Christoph Hellwig
2008-10-31 10:05     ` Christoph Hellwig
2008-11-06  7:39       ` Akira Fujita
2008-11-06 16:15       ` Theodore Tso
2008-10-27 10:21   ` Akira Fujita
2008-10-27 19:55     ` Andreas Dilger
2008-10-31  9:46       ` Akira Fujita
2008-11-04 21:42         ` Andreas Dilger
  -- strict thread matches above, loose matches on Subject: below --
2008-11-13 11:34 Akira Fujita

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).