From: Theodore Ts'o <tytso@mit.edu>
To: Ext4 Developers List <linux-ext4@vger.kernel.org>
Cc: Zheng Liu <gnehzuil.liu@gmail.com>, Theodore Ts'o <tytso@mit.edu>
Subject: [PATCH 5/5] ext4: add new ioctl EXT4_IOC_PRECACHE_EXTENTS
Date: Tue, 16 Jul 2013 11:18:03 -0400 [thread overview]
Message-ID: <1373987883-4466-6-git-send-email-tytso@mit.edu> (raw)
In-Reply-To: <1373987883-4466-1-git-send-email-tytso@mit.edu>
Add a new ioctl which forces the all of the extents in an inode to be
cached in the extent_status tree. This is critically important when
using AIO to a preallocated file, since if we need to read in blocks
from the extent tree, the io_submit(2) system call becomes
synchronous, and the AIO is no longer "A", which is bad.
In addition, for most files which have an external leaf tree block,
the cost of caching the information in the extent status tree will be
less than caching the entire 4k block in the buffer cache. So it is
generally a win to keep the extent information cached.
---
fs/ext4/ext4.h | 17 +++++++-----
fs/ext4/extents.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/extents_status.c | 19 ++++++++++----
fs/ext4/ioctl.c | 3 +++
4 files changed, 93 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a8497b0..16b531c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -561,15 +561,16 @@ enum {
#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
/*
- * The bit position of this flag must not overlap with any of the
- * EXT4_GET_BLOCKS_*. It is used by ext4_ext_find_extent(),
+ * The bit position of these flags must not overlap with any of the
+ * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
* read_extent_tree_block(), ext4_split_extent_at(),
- * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf() to
- * indicate that the we shouldn't be caching the extents when reading
- * from the extent tree while a truncate or punch hole operation
- * is in progress.
+ * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
+ * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
+ * caching the extents when reading from the extent tree while a
+ * truncate or punch hole operation is in progress.
*/
#define EXT4_EX_NOCACHE 0x0400
+#define EXT4_EX_FORCE_CACHE 0x0800
/*
* Flags used by ext4_free_blocks
@@ -601,6 +602,7 @@ enum {
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
+#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -1386,6 +1388,7 @@ enum {
nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
+ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -2704,7 +2707,7 @@ extern int ext4_find_delalloc_range(struct inode *inode,
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
-
+extern int ext4_ext_precache(struct inode *inode);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 671dd91..f8f0f91 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -482,7 +482,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
if (err < 0)
goto errout;
}
- if (buffer_verified(bh))
+ if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
err = __ext4_ext_check(function, line, inode,
ext_block_hdr(bh), depth, pblk);
@@ -526,6 +526,71 @@ errout:
__read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
(depth), (flags))
+/*
+ * This function is called to cache a file's extent information in the
+ * extent status tree
+ */
+int ext4_ext_precache(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_ext_path *path = NULL;
+ struct buffer_head *bh = 0;
+ int i = 0, depth, ret = 0;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return 0; /* not an extent-mapped inode */
+
+ down_read(&ei->i_data_sem);
+ depth = ext_depth(inode);
+
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
+ GFP_NOFS);
+ if (path == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* Don't cache anything if there are no external extent blocks */
+ if (depth == 0)
+ goto out;
+ path[0].p_hdr = ext_inode_hdr(inode);
+ ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
+ if (ret)
+ goto out;
+ path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
+ while (i >= 0) {
+ /*
+ * If this is a leaf block or we've reached the end of
+ * the index block, go up
+ */
+ if ((i == depth) ||
+ path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
+ brelse(path[i].p_bh);
+ path[i].p_bh = NULL;
+ i--;
+ continue;
+ }
+ bh = read_extent_tree_block(inode,
+ ext4_idx_pblock(path[i].p_idx++),
+ depth - i - 1,
+ EXT4_EX_FORCE_CACHE);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
+ break;
+ }
+ i++;
+ path[i].p_bh = bh;
+ path[i].p_hdr = ext_block_hdr(bh);
+ path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+ }
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+out:
+ up_read(&ei->i_data_sem);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
+}
+
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 1dc5df0..a1cbbcf 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -710,11 +710,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
write_lock(&EXT4_I(inode)->i_es_lock);
es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
- if (es && ((es->es_lblk <= lblk) || (es->es_lblk <= end)))
- goto out;
-
- __es_insert_extent(inode, &newes);
-out:
+ if (!es || es->es_lblk > end)
+ __es_insert_extent(inode, &newes);
write_unlock(&EXT4_I(inode)->i_es_lock);
}
@@ -930,6 +927,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
eia = list_entry(a, struct ext4_inode_info, i_es_lru);
eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+ if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return 1;
+ if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
+ ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
+ return -1;
if (eia->i_touch_when == eib->i_touch_when)
return 0;
if (time_after(eia->i_touch_when, eib->i_touch_when))
@@ -1069,10 +1072,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
struct rb_node *node;
struct extent_status *es;
int nr_shrunk = 0;
+ static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (ei->i_es_lru_nr == 0)
return 0;
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+ __ratelimit(&_rs))
+ ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
node = rb_first(&tree->root);
while (node != NULL) {
es = rb_entry(node, struct extent_status, rb_node);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9491ac0..3c7304c 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -622,6 +622,8 @@ resizefs_out:
return 0;
}
+ case EXT4_IOC_PRECACHE_EXTENTS:
+ return ext4_ext_precache(inode);
default:
return -ENOTTY;
@@ -686,6 +688,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_MOVE_EXT:
case FITRIM:
case EXT4_IOC_RESIZE_FS:
+ case EXT4_IOC_PRECACHE_EXTENTS:
break;
default:
return -ENOIOCTLCMD;
--
1.7.12.rc0.22.gcdd159b
next prev parent reply other threads:[~2013-07-16 15:18 UTC|newest]
Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-07-16 15:17 [PATCH 0/5 v2] add extent status tree caching Theodore Ts'o
2013-07-16 15:17 ` [PATCH 1/5] ext4: refactor code to read the extent tree block Theodore Ts'o
2013-07-16 15:18 ` [PATCH 2/5] ext4: print the block number of invalid extent tree blocks Theodore Ts'o
2013-07-18 0:56 ` Zheng Liu
2013-07-16 15:18 ` [PATCH 3/5] ext4: use unsigned int for es_status values Theodore Ts'o
2013-07-16 15:18 ` [PATCH 4/5] ext4: cache all of an extent tree's leaf block upon reading Theodore Ts'o
2013-07-16 15:18 ` Theodore Ts'o [this message]
2013-07-18 1:19 ` [PATCH 5/5] ext4: add new ioctl EXT4_IOC_PRECACHE_EXTENTS Zheng Liu
2013-07-18 2:50 ` Theodore Ts'o
2013-07-18 13:06 ` Zheng Liu
2013-07-18 15:21 ` Theodore Ts'o
2013-07-18 18:35 ` [PATCH 0/5 v2] add extent status tree caching Eric Sandeen
2013-07-18 18:53 ` Theodore Ts'o
2013-07-19 0:56 ` Eric Sandeen
2013-07-19 2:59 ` Theodore Ts'o
2013-07-19 3:33 ` Dave Chinner
2013-07-19 14:22 ` Jeff Moyer
2013-07-19 16:19 ` Theodore Ts'o
2013-07-22 1:38 ` Dave Chinner
2013-07-22 2:17 ` Zheng Liu
2013-07-22 10:02 ` Dave Chinner
2013-07-22 12:57 ` Zheng Liu
2013-07-30 3:08 ` Dave Chinner
2013-08-04 1:27 ` Theodore Ts'o
2013-08-13 3:10 ` Dave Chinner
2013-08-13 3:21 ` Eric Sandeen
2013-08-13 13:04 ` Theodore Ts'o
2013-08-16 3:21 ` Dave Chinner
2013-08-16 14:39 ` Theodore Ts'o
2013-07-18 23:54 ` Zheng Liu
2013-07-19 0:07 ` Theodore Ts'o
2013-07-19 1:03 ` Zheng Liu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1373987883-4466-6-git-send-email-tytso@mit.edu \
--to=tytso@mit.edu \
--cc=gnehzuil.liu@gmail.com \
--cc=linux-ext4@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).