From: zwu.kernel@gmail.com
To: viro@zeniv.linux.org.uk
Cc: torvalds@linux-foundation.org, linux-fsdevel@vger.kernel.org,
Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>,
Chandra Seetharaman <sekharan@us.ibm.com>
Subject: [[RESEND]PATCH v4 02/10] VFS hot tracking: Track IO and record heat information
Date: Mon, 12 Aug 2013 10:20:16 +0800 [thread overview]
Message-ID: <1376274024-28689-3-git-send-email-zwu.kernel@gmail.com> (raw)
In-Reply-To: <1376274024-28689-1-git-send-email-zwu.kernel@gmail.com>
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
This patch hooks the read/write code paths, namely read_pages(),
do_writepages(), do_generic_file_read() and __blockdev_direct_IO(),
to record heat information.
When real disk i/o for an inode is done, its own hot_inode_item will
be created or updated in the RB tree for the filesystem, and the i/o freq for
all of its extents will also be created/updated in the RB-tree per inode.
Each of the two structures hot_inode_item and hot_range_item
contains a hot_freq struct with its frequency of access metrics
(number of {reads, writes}, last {read,write} time, frequency of
{reads,writes}).
Each hot_inode_item contains one hot_range_tree struct which is keyed by
{inode, offset, length} and used to keep track of all the ranges in this file.
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
fs/direct-io.c | 5 +
fs/hot_tracking.c | 234 +++++++++++++++++++++++++++++++++++++++++++
fs/hot_tracking.h | 1 +
fs/namei.c | 3 +
include/linux/hot_tracking.h | 28 ++++++
mm/filemap.c | 6 ++
mm/page-writeback.c | 12 +++
mm/readahead.c | 6 ++
8 files changed, 295 insertions(+)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7ab90f5..0d7f8c7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -38,6 +38,7 @@
#include <linux/atomic.h>
#include <linux/prefetch.h>
#include <linux/aio.h>
+#include "hot_tracking.h"
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -1295,6 +1296,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
prefetch(bdev->bd_queue);
prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+ /* Hot tracking */
+ hot_update_freqs(inode, offset,
+ iov_length(iov, nr_segs), rw & WRITE);
+
return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
nr_segs, get_block, end_io,
submit_io, flags);
diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 8a65472..e53fd36 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -23,6 +23,8 @@ static void hot_range_item_init(struct hot_range_item *hr,
struct hot_inode_item *he, loff_t start)
{
kref_init(&hr->refs);
+ hr->freq.avg_delta_reads = (u64) -1;
+ hr->freq.avg_delta_writes = (u64) -1;
hr->start = start;
hr->len = hot_bit_shift(1, RANGE_BITS, true);
hr->hot_inode = he;
@@ -62,6 +64,64 @@ void hot_range_item_put(struct hot_range_item *hr)
}
EXPORT_SYMBOL_GPL(hot_range_item_put);
+/*
+ * Find the hot_range_item of @he that covers @start, optionally
+ * creating it.
+ *
+ * @he:    inode item whose range tree is searched
+ * @start: position to look up; it is passed through
+ *         hot_bit_shift(start, RANGE_BITS, true) before the search
+ * @alloc: when non-zero, a missing item is allocated and inserted
+ *
+ * Returns the item with an extra reference taken for the caller (drop
+ * it with hot_range_item_put()), ERR_PTR(-ENOENT) if the item does not
+ * exist and @alloc is zero, or ERR_PTR(-ENOMEM) if allocation fails.
+ *
+ * The allocation is done with he->i_lock released, so the tree is
+ * walked a second time afterwards; if another thread inserted the
+ * range meanwhile, the new item is dropped and the existing one is
+ * returned.
+ */
+struct hot_range_item
+*hot_range_item_lookup(struct hot_inode_item *he, loff_t start, int alloc)
+{
+	struct rb_node **p;
+	struct rb_node *parent;
+	struct hot_range_item *hr, *hr_new = NULL;
+
+	start = hot_bit_shift(start, RANGE_BITS, true);
+
+	/* walk tree to find insertion point */
+redo:
+	spin_lock(&he->i_lock);
+	/*
+	 * Forget the parent found by a previous walk: the tree may have
+	 * changed (or become empty) while the lock was dropped, and a
+	 * stale parent must never reach rb_link_node().
+	 */
+	parent = NULL;
+	p = &he->hot_range_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		hr = rb_entry(parent, struct hot_range_item, rb_node);
+		if (start < hr->start)
+			p = &(*p)->rb_left;
+		else if (start > (hr->start + hr->len - 1))
+			p = &(*p)->rb_right;
+		else {
+			hot_range_item_get(hr);
+			if (hr_new) {
+				/*
+				 * Lost the race. Somebody else inserted
+				 * the item for the range. Free the
+				 * newly allocated item.
+				 */
+				hot_range_item_put(hr_new);
+			}
+			spin_unlock(&he->i_lock);
+
+			return hr;
+		}
+	}
+
+	if (hr_new) {
+		rb_link_node(&hr_new->rb_node, parent, p);
+		rb_insert_color(&hr_new->rb_node, &he->hot_range_tree);
+		hot_range_item_get(hr_new); /* For the caller */
+		spin_unlock(&he->i_lock);
+		return hr_new;
+	}
+	spin_unlock(&he->i_lock);
+
+	if (!alloc)
+		return ERR_PTR(-ENOENT);
+
+	hr_new = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
+	if (!hr_new)
+		return ERR_PTR(-ENOMEM);
+
+	hot_range_item_init(hr_new, he, start);
+
+	goto redo;
+}
+EXPORT_SYMBOL_GPL(hot_range_item_lookup);
+
/*
* Free the entire hot_range_tree.
*/
@@ -85,6 +145,8 @@ static void hot_inode_item_init(struct hot_inode_item *he,
struct hot_info *root, u64 ino)
{
kref_init(&he->refs);
+ he->freq.avg_delta_reads = (u64) -1;
+ he->freq.avg_delta_writes = (u64) -1;
he->i_ino = ino;
he->hot_root = root;
spin_lock_init(&he->i_lock);
@@ -124,6 +186,126 @@ void hot_inode_item_put(struct hot_inode_item *he)
}
EXPORT_SYMBOL_GPL(hot_inode_item_put);
+/*
+ * Find the hot_inode_item for inode number @ino in @root's inode tree,
+ * optionally creating it.
+ *
+ * @root:  per-filesystem hot tracking root
+ * @ino:   inode number used as the tree key
+ * @alloc: when non-zero, a missing item is allocated and inserted
+ *
+ * Returns the item with an extra reference taken for the caller (drop
+ * it with hot_inode_item_put()), ERR_PTR(-ENOENT) if the item does not
+ * exist and @alloc is zero, or ERR_PTR(-ENOMEM) if allocation fails.
+ *
+ * The allocation is done with root->t_lock released, so the tree is
+ * walked a second time afterwards; if another thread inserted the
+ * inode meanwhile, the new item is dropped and the existing one is
+ * returned.
+ */
+struct hot_inode_item
+*hot_inode_item_lookup(struct hot_info *root, u64 ino, int alloc)
+{
+	struct rb_node **p;
+	struct rb_node *parent;
+	struct hot_inode_item *he, *he_new = NULL;
+
+	/* walk tree to find insertion point */
+redo:
+	spin_lock(&root->t_lock);
+	/*
+	 * Forget the parent found by a previous walk: the tree may have
+	 * changed (or become empty) while the lock was dropped, and a
+	 * stale parent must never reach rb_link_node().
+	 */
+	parent = NULL;
+	p = &root->hot_inode_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		he = rb_entry(parent, struct hot_inode_item, rb_node);
+		if (ino < he->i_ino)
+			p = &(*p)->rb_left;
+		else if (ino > he->i_ino)
+			p = &(*p)->rb_right;
+		else {
+			hot_inode_item_get(he);
+			if (he_new) {
+				/*
+				 * Lost the race. Somebody else inserted
+				 * the item for the inode. Free the
+				 * newly allocated item.
+				 */
+				hot_inode_item_put(he_new);
+			}
+			spin_unlock(&root->t_lock);
+
+			return he;
+		}
+	}
+
+	if (he_new) {
+		rb_link_node(&he_new->rb_node, parent, p);
+		rb_insert_color(&he_new->rb_node, &root->hot_inode_tree);
+		hot_inode_item_get(he_new); /* For the caller */
+		spin_unlock(&root->t_lock);
+		return he_new;
+	}
+	spin_unlock(&root->t_lock);
+
+	if (!alloc)
+		return ERR_PTR(-ENOENT);
+
+	he_new = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
+	if (!he_new)
+		return ERR_PTR(-ENOMEM);
+
+	hot_inode_item_init(he_new, root, ino);
+
+	goto redo;
+}
+EXPORT_SYMBOL_GPL(hot_inode_item_lookup);
+
+/*
+ * Drop the hot tracking item of an inode; vfs_unlink() calls this once
+ * the link count has reached zero.
+ *
+ * Nothing is done when the superblock has no hot tracking root, when
+ * the inode is not a regular file, or when no item exists for the
+ * inode (the lookup is done with alloc == 0 and so never creates one).
+ * Otherwise two references are dropped while holding root->t_lock.
+ * NOTE(review): the first put presumably releases the reference that
+ * keeps the item in the tree - confirm against the release callback.
+ */
+void hot_inode_item_unlink(struct inode *inode)
+{
+ struct hot_info *root = inode->i_sb->s_hot_root;
+ struct hot_inode_item *he;
+
+ if (!root || !S_ISREG(inode->i_mode))
+ return;
+
+ he = hot_inode_item_lookup(root, inode->i_ino, 0);
+ if (IS_ERR(he))
+ return;
+
+ spin_lock(&root->t_lock);
+ hot_inode_item_put(he);
+ hot_inode_item_put(he); /* For the caller */
+ spin_unlock(&root->t_lock);
+}
+EXPORT_SYMBOL_GPL(hot_inode_item_unlink);
+
+/*
+ * This function does the actual work of updating
+ * the frequency numbers.
+ *
+ * @old_atime: time of the previous access of this kind
+ * @cur_time:  time of the access being recorded
+ * @avg:       avg_delta_{reads,writes} value, updated in place
+ *
+ * avg_delta_{reads,writes} are intended as a kind of simple moving
+ * average of the time difference between each of the last
+ * 2^(FREQ_POWER) reads/writes. If there have not yet been that
+ * many reads or writes, the values are meant to stay very large: they
+ * are initialized to the largest possible value for the data type so
+ * that a few fast accesses to a file do not automatically make it
+ * appear very hot.
+ *
+ * The update computed below is
+ *   *avg = ((*avg << FREQ_POWER) - *avg + (delta_ns >> FREQ_POWER))
+ *          >> FREQ_POWER
+ * in u64 arithmetic.
+ * NOTE(review): for *avg >= 2^(64 - FREQ_POWER), which includes the
+ * initial (u64)-1, the left shift discards the high bits - confirm
+ * that this wraparound is intended.
+ */
+static void hot_freq_calc(struct timespec old_atime,
+ struct timespec cur_time, u64 *avg)
+{
+ struct timespec delta_ts;
+ u64 new_delta;
+
+ /* time since the previous access, in ns, pre-scaled by 2^FREQ_POWER */
+ delta_ts = timespec_sub(cur_time, old_atime);
+ new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
+
+ *avg = (*avg << FREQ_POWER) - *avg + new_delta;
+ *avg = *avg >> FREQ_POWER;
+}
+
+/*
+ * Record one access in @freq: bump the read or write counter, fold the
+ * time since the previous access of the same kind into the matching
+ * average via hot_freq_calc(), and remember the current time as the
+ * new last-access time. @write selects the write-side fields, otherwise
+ * the read-side fields are used. @root is not used here.
+ */
+static void hot_freq_update(struct hot_info *root,
+			struct hot_freq *freq, bool write)
+{
+	struct timespec now = current_kernel_time();
+	struct timespec *last;
+	u64 *avg_delta;
+	u32 *nr;
+
+	/* pick the set of fields that matches the access direction */
+	if (write) {
+		nr = &freq->nr_writes;
+		last = &freq->last_write_time;
+		avg_delta = &freq->avg_delta_writes;
+	} else {
+		nr = &freq->nr_reads;
+		last = &freq->last_read_time;
+		avg_delta = &freq->avg_delta_reads;
+	}
+
+	*nr += 1;
+	hot_freq_calc(*last, now, avg_delta);
+	*last = now;
+}
+
/*
* Initialize kmem cache for hot_inode_item and hot_range_item.
*/
@@ -145,6 +327,58 @@ void __init hot_cache_init(void)
}
EXPORT_SYMBOL_GPL(hot_cache_init);
+/*
+ * Main function to update i/o access frequencies, and it will be called
+ * from read/writepages() hooks, which are read_pages(), do_writepages(),
+ * do_generic_file_read(), and __blockdev_direct_IO().
+ *
+ * @inode: inode the i/o is done on
+ * @start: byte offset of the i/o
+ * @len:   length of the i/o in bytes
+ * @rw:    non-zero for a write, zero for a read
+ *
+ * Nothing is recorded when the superblock has no hot tracking root,
+ * when @len is zero or when the inode is not a regular file. Failures
+ * to look up or allocate tracking items are not reported to the
+ * caller; the update is simply cut short.
+ */
+void hot_update_freqs(struct inode *inode, loff_t start,
+			size_t len, int rw)
+{
+	struct hot_info *root = inode->i_sb->s_hot_root;
+	struct hot_inode_item *he;
+	struct hot_range_item *hr;
+	u64 range_size;
+	loff_t cur, end;
+
+	if (!root || (len == 0) || !S_ISREG(inode->i_mode))
+		return;
+
+	he = hot_inode_item_lookup(root, inode->i_ino, 1);
+	if (IS_ERR(he))
+		return;
+
+	hot_freq_update(root, &he->freq, rw);
+
+	/*
+	 * Align ranges on range size boundary
+	 * to prevent proliferation of range structs
+	 */
+	range_size = hot_bit_shift(1, RANGE_BITS, true);
+	end = hot_bit_shift((start + len + range_size - 1),
+			RANGE_BITS, false);
+	cur = hot_bit_shift(start, RANGE_BITS, false);
+	for (; cur < end; cur++) {
+		hr = hot_range_item_lookup(he, cur, 1);
+		if (IS_ERR(hr)) {
+			WARN(1, "hot_range_item_lookup returns %ld\n",
+				PTR_ERR(hr));
+			/*
+			 * Stop updating ranges, but still fall through
+			 * to drop the reference held on the inode item.
+			 */
+			break;
+		}
+
+		hot_freq_update(root, &hr->freq, rw);
+
+		spin_lock(&he->i_lock);
+		hot_range_item_put(hr);
+		spin_unlock(&he->i_lock);
+	}
+
+	/* release the reference taken by hot_inode_item_lookup() */
+	spin_lock(&root->t_lock);
+	hot_inode_item_put(he);
+	spin_unlock(&root->t_lock);
+}
+EXPORT_SYMBOL_GPL(hot_update_freqs);
+
static struct hot_info *hot_tree_init(struct super_block *sb)
{
struct hot_info *root;
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index 2776092..bb4cb16 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -16,5 +16,6 @@
/* size of sub-file ranges */
#define RANGE_BITS 20
+#define FREQ_POWER 4
#endif /* __HOT_TRACKING__ */
diff --git a/fs/namei.c b/fs/namei.c
index 89a612e..5ef5e8c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3454,6 +3454,9 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
}
mutex_unlock(&dentry->d_inode->i_mutex);
+ if (!error && !dentry->d_inode->i_nlink)
+ hot_inode_item_unlink(dentry->d_inode);
+
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
fsnotify_link_count(dentry->d_inode);
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index a7d128d..80b198e 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -34,8 +34,24 @@ enum {
MAX_TYPES,
};
+/*
+ * A frequency data struct holds values that are used to
+ * determine temperature of files and file ranges. These structs
+ * are members of hot_inode_item and hot_range_item
+ */
+struct hot_freq {
+ struct timespec last_read_time; /* time of the most recent recorded read */
+ struct timespec last_write_time; /* time of the most recent recorded write */
+ u32 nr_reads; /* number of recorded reads */
+ u32 nr_writes; /* number of recorded writes */
+ u64 avg_delta_reads; /* averaged time between reads; starts at (u64)-1 */
+ u64 avg_delta_writes; /* averaged time between writes; starts at (u64)-1 */
+ u32 last_temp; /* not written by this patch; presumably the last computed temperature - TODO confirm */
+};
+
/* An item representing an inode and its access frequency */
struct hot_inode_item {
+ struct hot_freq freq; /* frequency data */
struct kref refs;
struct rb_node rb_node; /* rbtree index */
struct rcu_head rcu;
@@ -50,6 +66,7 @@ struct hot_inode_item {
* an inode whose frequency is being tracked
*/
struct hot_range_item {
+ struct hot_freq freq; /* frequency data */
struct kref refs;
struct rb_node rb_node; /* rbtree index */
struct rcu_head rcu;
@@ -70,6 +87,17 @@ extern void hot_range_item_put(struct hot_range_item *hr);
extern void hot_inode_item_put(struct hot_inode_item *he);
extern void hot_range_item_get(struct hot_range_item *hr);
extern void hot_inode_item_get(struct hot_inode_item *he);
+extern void hot_update_freqs(struct inode *inode,
+ loff_t start, size_t len, int rw);
+extern struct hot_range_item
+*hot_range_item_lookup(struct hot_inode_item *he,
+ loff_t start, int alloc);
+extern struct hot_inode_item
+*hot_inode_item_lookup(struct hot_info *root,
+ u64 ino, int alloc);
+extern void hot_inode_item_unlink(struct inode *inode);
+extern void hot_update_freqs(struct inode *inode, loff_t start,
+ size_t len, int rw);
static inline u64 hot_bit_shift(u64 counter, u32 bits, bool dir)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index 4b51ac1..28d3ad9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
+#include <linux/hot_tracking.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -1242,6 +1243,11 @@ readpage:
* PG_error will be set again if readpage fails.
*/
ClearPageError(page);
+
+ /* Hot tracking */
+ hot_update_freqs(inode, page->index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f0c895..5b1744b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,6 +36,7 @@
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
+#include <linux/hot_tracking.h>
#include <trace/events/writeback.h>
/*
@@ -1921,13 +1922,24 @@ EXPORT_SYMBOL(generic_writepages);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;
+	loff_t start = 0;
+	size_t count = 0;
	if (wbc->nr_to_write <= 0)
		return 0;
+
+	/*
+	 * Remember where writeback starts and how many pages were
+	 * requested, so that the amount actually written can be derived
+	 * from what is left in wbc->nr_to_write afterwards. Widen the
+	 * page index to loff_t before shifting so that the byte offset
+	 * is not truncated where the index type is narrower than loff_t.
+	 */
+	start = (loff_t)mapping->writeback_index << PAGE_CACHE_SHIFT;
+	count = wbc->nr_to_write;
+
	if (mapping->a_ops->writepages)
		ret = mapping->a_ops->writepages(mapping, wbc);
	else
		ret = generic_writepages(mapping, wbc);
+
+	/* Hot tracking */
+	hot_update_freqs(mapping->host, start,
+			(count - wbc->nr_to_write) * PAGE_CACHE_SIZE, 1);
+
	return ret;
}
diff --git a/mm/readahead.c b/mm/readahead.c
index 829a77c..5867265 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
+#include <linux/hot_tracking.h>
/*
* Initialise a struct file's readahead state. Assumes that the caller has
@@ -115,6 +116,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
unsigned page_idx;
int ret;
+ /* Hot tracking */
+ hot_update_freqs(mapping->host,
+ list_to_page(pages)->index << PAGE_CACHE_SHIFT,
+ (size_t)nr_pages * PAGE_CACHE_SIZE, 0);
+
blk_start_plug(&plug);
if (mapping->a_ops->readpages) {
--
1.7.11.7
next prev parent reply other threads:[~2013-08-12 2:22 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-08-12 2:20 [[RESEND]PATCH v4 00/10] VFS hot tracking zwu.kernel
2013-08-12 2:20 ` [[RESEND]PATCH v4 01/10] VFS hot tracking: Define basic data structures and functions zwu.kernel
2013-08-12 2:20 ` zwu.kernel [this message]
2013-08-12 2:20 ` [[RESEND]PATCH v4 03/10] VFS hot tracking: Add a workqueue to move items between hot maps zwu.kernel
2013-08-12 2:20 ` [[RESEND]PATCH v4 04/10] VFS hot tracking: Add shrinker functionality to curtail memory usage zwu.kernel
2013-08-12 2:20 ` [[RESEND]PATCH v4 05/10] VFS hot tracking: Add an ioctl to get hot tracking information zwu.kernel
2013-08-12 2:20 ` [[RESEND]PATCH v4 06/10] VFS hot tracking: Add a /proc interface to make the interval tunable zwu.kernel
2013-08-12 2:20 ` [[RESEND]PATCH v4 07/10] VFS hot tracking: Add a /proc interfaces to control memory usage zwu.kernel
2013-08-12 2:20 ` [[RESEND]PATCH v4 08/10] VFS hot tracking: Add documentation zwu.kernel
2013-08-12 2:20 ` [[RESEND]PATCH v4 09/10] VFS hot tracking, btrfs: Add hot tracking support zwu.kernel
2013-08-12 2:20 ` [[RESEND]PATCH v4 10/10] VFS hot tracking, xfs: " zwu.kernel
2013-08-12 2:27 ` [[RESEND]PATCH v4 00/10] VFS hot tracking Zhi Yong Wu
2013-08-13 18:32 ` Jörn Engel
2013-08-13 21:22 ` Al Viro
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1376274024-28689-3-git-send-email-zwu.kernel@gmail.com \
--to=zwu.kernel@gmail.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=sekharan@us.ibm.com \
--cc=torvalds@linux-foundation.org \
--cc=viro@zeniv.linux.org.uk \
--cc=wuzhy@linux.vnet.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).