From: "Darrick J. Wong" <darrick.wong@oracle.com>
To: Zhi Yong Wu <zwu.kernel@gmail.com>
Cc: linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org,
linux-btrfs@vger.kernel.org, linux-kernel@vger.kernel.org,
linuxram@linux.vnet.ibm.com, viro@zeniv.linux.org.uk,
david@fromorbit.com, tytso@mit.edu, cmm@us.ibm.com,
wuzhy@linux.vnet.ibm.com, wenqing.lz@taobao.com
Subject: Re: [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function
Date: Wed, 7 Nov 2012 10:49:52 -0800 [thread overview]
Message-ID: <20121107184952.GA4143@blackbox.djwong.org> (raw)
In-Reply-To: <CAEH94LgDuTqV2sddxs9mvp-2bKjwcSPag7iXa4UFYCTixwJFFA@mail.gmail.com>
On Wed, Nov 07, 2012 at 04:27:05PM +0800, Zhi Yong Wu wrote:
> On Wed, Nov 7, 2012 at 6:45 AM, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Mon, Oct 29, 2012 at 12:30:45PM +0800, zwu.kernel@gmail.com wrote:
> >> From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> >>
> >> Add some util helpers to update access frequencies
> >> for one file or its range.
> >>
> >> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> >> ---
> >> fs/hot_tracking.c | 179 ++++++++++++++++++++++++++++++++++++++++++
> >> fs/hot_tracking.h | 7 ++
> >> include/linux/hot_tracking.h | 2 +
> >> 3 files changed, 188 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
> >> index 68591f0..0a7d9a3 100644
> >> --- a/fs/hot_tracking.c
> >> +++ b/fs/hot_tracking.c
> >> @@ -172,6 +172,137 @@ static void hot_inode_tree_exit(struct hot_info *root)
> >> }
> >> }
> >>
> >> +struct hot_inode_item
> >> +*hot_inode_item_find(struct hot_info *root, u64 ino)
> >> +{
> >> + struct hot_inode_item *he;
> >> + int ret;
> >> +
> >> +again:
> >> + spin_lock(&root->lock);
> >> + he = radix_tree_lookup(&root->hot_inode_tree, ino);
> >> + if (he) {
> >> + kref_get(&he->hot_inode.refs);
> >> + spin_unlock(&root->lock);
> >> + return he;
> >> + }
> >> + spin_unlock(&root->lock);
> >> +
> >> + he = kmem_cache_zalloc(hot_inode_item_cachep,
> >> + GFP_KERNEL | GFP_NOFS);
> >> + if (!he)
> >> + return ERR_PTR(-ENOMEM);
> >> +
> >> + hot_inode_item_init(he, ino, &root->hot_inode_tree);
> >> +
> >> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
> >> + if (ret) {
> >> + kmem_cache_free(hot_inode_item_cachep, he);
> >> + return ERR_PTR(ret);
> >> + }
> >> +
> >> + spin_lock(&root->lock);
> >> + ret = radix_tree_insert(&root->hot_inode_tree, ino, he);
> >> + if (ret == -EEXIST) {
> >> + kmem_cache_free(hot_inode_item_cachep, he);
> >> + spin_unlock(&root->lock);
> >> + radix_tree_preload_end();
> >> + goto again;
> >> + }
> >> + spin_unlock(&root->lock);
> >> + radix_tree_preload_end();
> >> +
> >> + kref_get(&he->hot_inode.refs);
> >> + return he;
> >> +}
> >> +EXPORT_SYMBOL_GPL(hot_inode_item_find);
> >> +
> >> +static struct hot_range_item
> >> +*hot_range_item_find(struct hot_inode_item *he,
> >> + u32 start)
> >> +{
> >> + struct hot_range_item *hr;
> >> + int ret;
> >> +
> >> +again:
> >> + spin_lock(&he->lock);
> >> + hr = radix_tree_lookup(&he->hot_range_tree, start);
> >> + if (hr) {
> >> + kref_get(&hr->hot_range.refs);
> >> + spin_unlock(&he->lock);
> >> + return hr;
> >> + }
> >> + spin_unlock(&he->lock);
> >> +
> >> + hr = kmem_cache_zalloc(hot_range_item_cachep,
> >> + GFP_KERNEL | GFP_NOFS);
> >> + if (!hr)
> >> + return ERR_PTR(-ENOMEM);
> >> +
> >> + hot_range_item_init(hr, start, he);
> >> +
> >> + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
> >> + if (ret) {
> >> + kmem_cache_free(hot_range_item_cachep, hr);
> >> + return ERR_PTR(ret);
> >> + }
> >> +
> >> + spin_lock(&he->lock);
> >> + ret = radix_tree_insert(&he->hot_range_tree, start, hr);
> >> + if (ret == -EEXIST) {
> >> + kmem_cache_free(hot_range_item_cachep, hr);
> >> + spin_unlock(&he->lock);
> >> + radix_tree_preload_end();
> >> + goto again;
> >> + }
> >> + spin_unlock(&he->lock);
> >> + radix_tree_preload_end();
> >> +
> >> + kref_get(&hr->hot_range.refs);
> >> + return hr;
> >> +}
> >> +
> >> +/*
> >> + * This function does the actual work of updating
> >> + * the frequency numbers, whatever they turn out to be.
> >> + */
> >> +static u64 hot_average_update(struct timespec old_atime,
> >> + struct timespec cur_time, u64 old_avg)
> >> +{
> >> + struct timespec delta_ts;
> >> + u64 new_avg;
> >> + u64 new_delta;
> >> +
> >> + delta_ts = timespec_sub(cur_time, old_atime);
> >> + new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
> >> +
> >> + new_avg = (old_avg << FREQ_POWER) - old_avg + new_delta;
> >> + new_avg = new_avg >> FREQ_POWER;
> >> +
> >> + return new_avg;
> >> +}
> >> +
> >> +static void hot_freq_data_update(struct hot_freq_data *freq_data, bool write)
> >> +{
> >> + struct timespec cur_time = current_kernel_time();
> >> +
> >> + if (write) {
> >> + freq_data->nr_writes += 1;
> >> + freq_data->avg_delta_writes = hot_average_update(
> >> + freq_data->last_write_time,
> >> + cur_time,
> >> + freq_data->avg_delta_writes);
> >> + freq_data->last_write_time = cur_time;
> >> + } else {
> >> + freq_data->nr_reads += 1;
> >> + freq_data->avg_delta_reads = hot_average_update(
> >> + freq_data->last_read_time,
> >> + cur_time,
> >> + freq_data->avg_delta_reads);
> >
> > I think you could just pass in a pointer to
> > freq_data->avg_delta_{writes,reads} here...
> why?
freq_data->avg_delta_{reads,writes} seems to be an in/out parameter, but by
specifying it once as an in parameter and again as an lvalue, you're increasing
the chances that someone will screw it up some time later -- you're not
preventing me from accidentally writing this:
freq_data->avg_delta_writes = hot_average_update(..., freq_data->avg_delta_reads);
...which (at least in my head) becomes an easier mistake to make once you start
mixing in the function pointers a few patches later, and (my) brain has to wrap
itself around all the punctuation.
> >> + freq_data->last_read_time = cur_time;
> >> + }
> >> +}
> >> +
> >> /*
> >> * Initialize kmem cache for hot_inode_item and hot_range_item.
> >> */
> >> @@ -199,6 +330,54 @@ err:
> >> EXPORT_SYMBOL_GPL(hot_cache_init);
> >>
> >> /*
> >> + * Main function to update access frequency from read/writepage(s) hooks
> >> + */
> >> +void hot_update_freqs(struct inode *inode, u64 start,
> >> + u64 len, int rw)
> >> +{
> >> + struct hot_info *root = inode->i_sb->s_hot_root;
> >> + struct hot_inode_item *he;
> >> + struct hot_range_item *hr;
> >> + u32 cur, end;
> >> +
> >> + if (!root || (len == 0))
> >> + return;
> >> +
> >> + he = hot_inode_item_find(root, inode->i_ino);
> >> + if (IS_ERR(he)) {
> >> + WARN_ON(1);
> >> + return;
> >> + }
> >> +
> >> + spin_lock(&he->hot_inode.lock);
> >> + hot_freq_data_update(&he->hot_inode.hot_freq_data, rw);
> >> + spin_unlock(&he->hot_inode.lock);
> >> +
> >> + /*
> >> + * Align ranges on RANGE_SIZE boundary
> >> + * to prevent proliferation of range structs
> >> + */
> >> + end = (start + len + RANGE_SIZE - 1) >> RANGE_BITS;
> >> + for (cur = (start >> RANGE_BITS); cur < end; cur++) {
> >
> > Hm... start is u64, cur is u32, RANGE_BITS is 20. Doesn't this overflow if,
> > say, I have a sparse file with blocks way out at 2^53 bytes?
> ah, good catch, thanks.
Actually, I should go further -- why not use loff_t? The rest of the fs/ code
does.
> > Also, RANGE_SIZE means that the hot tracking range granularity is 1MiB? How
> yes.
> > did you decide on that? Will we ever want to change that?
> It is just an assumption; do you think 1 MB is not appropriate? Do you
> mean adding a proc file interface for it?
I don't know about a procfs interface -- debugfs, perhaps?
But actually, I was thinking that the fs might have a better idea of the range
granularity that it wants to handle. Possibly it might be useful to try to
align with raid stripes or other topology, too... though that's difficult.
Also, for the filesystems that use allocation units (clusters), it might be
useful to collect heat data per-cluster.
On the other hand, it might not make much of a difference since most files tend
to fit in ~4K anyway, and the extra granularity will increase memory
consumption for large files. I don't mind having a 1MB default, but having a
knob would certainly make it easier to tune, or in the future, to test if that
1MB default still makes sense.
--D
> >
> >> + hr = hot_range_item_find(he, cur);
> >> + if (IS_ERR(hr)) {
> >> + WARN_ON(1);
> >
> > > WARN(1, "hot_range_item_find returns %ld\n", PTR_ERR(hr)); ?
> OK, done.
> >
> > --D
> >
> >> + hot_inode_item_put(he);
> >> + return;
> >> + }
> >> +
> >> + spin_lock(&hr->hot_range.lock);
> >> + hot_freq_data_update(&hr->hot_range.hot_freq_data, rw);
> >> + spin_unlock(&hr->hot_range.lock);
> >> +
> >> + hot_range_item_put(hr);
> >> + }
> >> +
> >> + hot_inode_item_put(he);
> >> +}
> >> +EXPORT_SYMBOL_GPL(hot_update_freqs);
> >> +
> >> +/*
> >> * Initialize the data structures for hot data tracking.
> >> */
> >> int hot_track_init(struct super_block *sb)
> >> diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
> >> index e7ba121..cc4666e 100644
> >> --- a/fs/hot_tracking.h
> >> +++ b/fs/hot_tracking.h
> >> @@ -20,6 +20,13 @@
> >> #define FREQ_DATA_TYPE_INODE (1 << 0)
> >> #define FREQ_DATA_TYPE_RANGE (1 << 1)
> >>
> >> +/* size of sub-file ranges */
> >> +#define RANGE_BITS 20
> >> +#define RANGE_SIZE (1 << RANGE_BITS)
> >> +
> >> +#define FREQ_POWER 4
> >> +
> >> void hot_inode_item_put(struct hot_inode_item *he);
> >> +struct hot_inode_item *hot_inode_item_find(struct hot_info *root, u64 ino);
> >>
> >> #endif /* __HOT_TRACKING__ */
> >> diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
> >> index 4233207..e2d6028 100644
> >> --- a/include/linux/hot_tracking.h
> >> +++ b/include/linux/hot_tracking.h
> >> @@ -71,5 +71,7 @@ struct hot_info {
> >> extern void __init hot_cache_init(void);
> >> extern int hot_track_init(struct super_block *sb);
> >> extern void hot_track_exit(struct super_block *sb);
> >> +extern void hot_update_freqs(struct inode *inode, u64 start,
> >> + u64 len, int rw);
> >>
> >> #endif /* _LINUX_HOTTRACK_H */
> >> --
> >> 1.7.6.5
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
>
> --
> Regards,
>
> Zhi Yong Wu
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2012-11-07 18:49 UTC|newest]
Thread overview: 60+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-10-29 4:30 [RFC v4+ hot_track 00/19] vfs: hot data tracking zwu.kernel
2012-10-29 4:30 ` [RFC v4+ hot_track 01/19] vfs: introduce private radix tree structures zwu.kernel
2012-10-29 4:30 ` [RFC v4+ hot_track 02/19] vfs: initialize and free data structures zwu.kernel
2012-11-06 22:24 ` David Sterba
2012-11-07 6:55 ` Zhi Yong Wu
2012-11-16 6:16 ` Zhi Yong Wu
2012-10-29 4:30 ` [RFC v4+ hot_track 03/19] vfs: add I/O frequency update function zwu.kernel
2012-11-05 11:07 ` Steven Whitehouse
2012-11-05 11:47 ` Zhi Yong Wu
2012-11-06 22:37 ` David Sterba
2012-11-07 7:03 ` Zhi Yong Wu
2012-11-06 22:45 ` Darrick J. Wong
2012-11-07 8:27 ` Zhi Yong Wu
2012-11-07 18:49 ` Darrick J. Wong [this message]
2012-11-08 2:52 ` Zhi Yong Wu
2012-10-29 4:30 ` [RFC v4+ hot_track 04/19] vfs: add two map arrays zwu.kernel
2012-10-29 4:30 ` [RFC v4+ hot_track 05/19] vfs: add hooks to enable hot tracking zwu.kernel
2012-11-06 22:51 ` David Sterba
2012-11-07 7:06 ` Zhi Yong Wu
2012-10-29 4:30 ` [RFC v4+ hot_track 06/19] vfs: add temp calculation function zwu.kernel
2012-10-29 4:30 ` [RFC v4+ hot_track 07/19] vfs: add map info update function zwu.kernel
2012-10-29 4:30 ` [RFC v4+ hot_track 08/19] vfs: add aging function zwu.kernel
2012-10-29 4:30 ` [RFC v4+ hot_track 09/19] vfs: add one work queue zwu.kernel
2012-11-05 11:21 ` Steven Whitehouse
2012-11-05 11:55 ` Zhi Yong Wu
2012-11-05 12:07 ` Steven Whitehouse
2012-11-05 12:20 ` Zhi Yong Wu
2012-10-29 4:30 ` [RFC v4+ hot_track 10/19] vfs: introduce hot func register framework zwu.kernel
2012-11-06 23:14 ` David Sterba
2012-11-07 7:18 ` Zhi Yong Wu
2012-11-06 23:30 ` Darrick J. Wong
2012-11-07 8:34 ` Zhi Yong Wu
2012-11-07 18:58 ` Darrick J. Wong
2012-11-08 2:59 ` Zhi Yong Wu
2012-10-29 4:30 ` [RFC v4+ hot_track 11/19] vfs: register one shrinker zwu.kernel
2012-10-29 4:30 ` [RFC v4+ hot_track 12/19] vfs: add one ioctl interface zwu.kernel
2012-11-06 23:30 ` David Sterba
2012-11-07 7:36 ` Zhi Yong Wu
2012-10-29 4:30 ` [RFC v4+ hot_track 13/19] debugfs: introduce one function zwu.kernel
2012-10-29 18:11 ` Greg KH
2012-10-29 22:25 ` Zhi Yong Wu
2012-10-29 22:34 ` Greg KH
2012-10-29 22:45 ` Zhi Yong Wu
2012-10-29 22:54 ` Greg KH
2012-10-29 22:58 ` Zhi Yong Wu
2012-10-29 4:30 ` [RFC v4+ hot_track 14/19] vfs: add debugfs support zwu.kernel
2012-11-06 23:45 ` David Sterba
2012-11-07 7:49 ` Zhi Yong Wu
2012-10-29 4:30 ` [RFC v4+ hot_track 15/19] sysfs: add two hot_track proc files zwu.kernel
2012-10-29 18:10 ` Greg KH
2012-10-29 22:26 ` Zhi Yong Wu
2012-10-29 4:30 ` [RFC v4+ hot_track 16/19] btrfs: add hot tracking support zwu.kernel
2012-11-07 0:00 ` David Sterba
2012-11-07 8:03 ` Zhi Yong Wu
2012-10-29 4:30 ` [RFC v4+ hot_track 17/19] xfs: " zwu.kernel
2012-10-29 4:31 ` [RFC v4+ hot_track 18/19] ext4: " zwu.kernel
2012-10-29 4:31 ` [RFC v4+ hot_track 19/19] vfs: add documentation zwu.kernel
2012-10-29 10:30 ` [RFC v4+ hot_track 00/19] vfs: hot data tracking Andi Kleen
2012-10-29 12:31 ` Zhi Yong Wu
2012-11-07 8:51 ` Zhi Yong Wu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20121107184952.GA4143@blackbox.djwong.org \
--to=darrick.wong@oracle.com \
--cc=cmm@us.ibm.com \
--cc=david@fromorbit.com \
--cc=linux-btrfs@vger.kernel.org \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linuxram@linux.vnet.ibm.com \
--cc=tytso@mit.edu \
--cc=viro@zeniv.linux.org.uk \
--cc=wenqing.lz@taobao.com \
--cc=wuzhy@linux.vnet.ibm.com \
--cc=zwu.kernel@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).