All of lore.kernel.org
 help / color / mirror / Atom feed
From: zwu.kernel@gmail.com
To: viro@zeniv.linux.org.uk
Cc: torvalds@linux-foundation.org, linux-fsdevel@vger.kernel.org,
	Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>,
	Chandra Seetharaman <sekharan@us.ibm.com>
Subject: [[RESEND]PATCH v4 02/10] VFS hot tracking: Track IO and record heat information
Date: Mon, 12 Aug 2013 10:20:16 +0800	[thread overview]
Message-ID: <1376274024-28689-3-git-send-email-zwu.kernel@gmail.com> (raw)
In-Reply-To: <1376274024-28689-1-git-send-email-zwu.kernel@gmail.com>

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

This patch adds hooks to the read/write code paths, namely read_pages(),
do_writepages(), do_generic_file_read() and __blockdev_direct_IO(),
to record heat information.

When real disk I/O is done for an inode, its hot_inode_item is created or
updated in the filesystem's RB-tree, and the I/O frequency data for each of
the affected ranges is created or updated in that inode's own RB-tree.

Each of the two structures hot_inode_item and hot_range_item
contains a hot_freq_data struct with its frequency of access metrics
(number of {reads, writes}, last {read,write} time, frequency of
{reads,writes}).

Each hot_inode_item contains one hot_range_tree struct which is keyed by
{inode, offset, length} and used to keep track of all the ranges in this file.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/direct-io.c               |   5 +
 fs/hot_tracking.c            | 234 +++++++++++++++++++++++++++++++++++++++++++
 fs/hot_tracking.h            |   1 +
 fs/namei.c                   |   3 +
 include/linux/hot_tracking.h |  28 ++++++
 mm/filemap.c                 |   6 ++
 mm/page-writeback.c          |  12 +++
 mm/readahead.c               |   6 ++
 8 files changed, 295 insertions(+)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7ab90f5..0d7f8c7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -38,6 +38,7 @@
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
 #include <linux/aio.h>
+#include "hot_tracking.h"
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -1295,6 +1296,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	prefetch(bdev->bd_queue);
 	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
 
+	/* Hot tracking */
+	hot_update_freqs(inode, offset,
+			iov_length(iov, nr_segs), rw & WRITE);
+
 	return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
 				     nr_segs, get_block, end_io,
 				     submit_io, flags);
diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 8a65472..e53fd36 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -23,6 +23,8 @@ static void hot_range_item_init(struct hot_range_item *hr,
 			struct hot_inode_item *he, loff_t start)
 {
 	kref_init(&hr->refs);
+	hr->freq.avg_delta_reads = (u64) -1;
+	hr->freq.avg_delta_writes = (u64) -1;
 	hr->start = start;
 	hr->len = hot_bit_shift(1, RANGE_BITS, true);
 	hr->hot_inode = he;
@@ -62,6 +64,64 @@ void hot_range_item_put(struct hot_range_item *hr)
 }
 EXPORT_SYMBOL_GPL(hot_range_item_put);
 
+/*
+ * Find the range item covering @start (after it has been scaled with
+ * hot_bit_shift()) in @he's tree; it is returned with a reference taken
+ * for the caller. A missing item is allocated and inserted when @alloc
+ * is set. Returns ERR_PTR(-ENOENT) or ERR_PTR(-ENOMEM) on failure.
+ */
+struct hot_range_item
+*hot_range_item_lookup(struct hot_inode_item *he, loff_t start, int alloc)
+{
+	struct rb_node **p, *parent;
+	struct hot_range_item *hr, *hr_new = NULL;
+
+	start = hot_bit_shift(start, RANGE_BITS, true);
+
+redo:
+	/* The tree may have changed while unlocked; forget the old parent. */
+	parent = NULL;
+	spin_lock(&he->i_lock);
+	p = &he->hot_range_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		hr = rb_entry(parent, struct hot_range_item, rb_node);
+		if (start < hr->start)
+			p = &(*p)->rb_left;
+		else if (start > (hr->start + hr->len - 1))
+			p = &(*p)->rb_right;
+		else {
+			hot_range_item_get(hr);
+			/* Lost the race to insert: drop our unused item. */
+			if (hr_new)
+				hot_range_item_put(hr_new);
+			spin_unlock(&he->i_lock);
+			return hr;
+		}
+	}
+
+	if (hr_new) {
+		rb_link_node(&hr_new->rb_node, parent, p);
+		rb_insert_color(&hr_new->rb_node, &he->hot_range_tree);
+		hot_range_item_get(hr_new); /* For the caller */
+		spin_unlock(&he->i_lock);
+		return hr_new;
+	}
+	spin_unlock(&he->i_lock);
+
+	if (!alloc)
+		return ERR_PTR(-ENOENT);
+
+	/* Allocate unlocked, then rescan in case of a racing insert. */
+	hr_new = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
+	if (!hr_new)
+		return ERR_PTR(-ENOMEM);
+
+	hot_range_item_init(hr_new, he, start);
+	goto redo;
+}
+EXPORT_SYMBOL_GPL(hot_range_item_lookup);
+
 /*
  * Free the entire hot_range_tree.
  */
@@ -85,6 +145,8 @@ static void hot_inode_item_init(struct hot_inode_item *he,
 			struct hot_info *root, u64 ino)
 {
 	kref_init(&he->refs);
+	he->freq.avg_delta_reads = (u64) -1;
+	he->freq.avg_delta_writes = (u64) -1;
 	he->i_ino = ino;
 	he->hot_root = root;
 	spin_lock_init(&he->i_lock);
@@ -124,6 +186,126 @@ void hot_inode_item_put(struct hot_inode_item *he)
 }
 EXPORT_SYMBOL_GPL(hot_inode_item_put);
 
+/*
+ * Find the item for inode number @ino in @root's tree and return it with
+ * a reference taken for the caller. A missing item is allocated and
+ * inserted when @alloc is set. Returns ERR_PTR(-ENOENT) if it is missing
+ * and @alloc is clear, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct hot_inode_item
+*hot_inode_item_lookup(struct hot_info *root, u64 ino, int alloc)
+{
+	struct rb_node **p, *parent;
+	struct hot_inode_item *he, *he_new = NULL;
+
+redo:
+	/* The tree may have changed while unlocked; forget the old parent. */
+	parent = NULL;
+	spin_lock(&root->t_lock);
+	p = &root->hot_inode_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		he = rb_entry(parent, struct hot_inode_item, rb_node);
+		if (ino < he->i_ino)
+			p = &(*p)->rb_left;
+		else if (ino > he->i_ino)
+			p = &(*p)->rb_right;
+		else {
+			hot_inode_item_get(he);
+			/* Lost the race to insert: drop our unused item. */
+			if (he_new)
+				hot_inode_item_put(he_new);
+			spin_unlock(&root->t_lock);
+			return he;
+		}
+	}
+
+	if (he_new) {
+		rb_link_node(&he_new->rb_node, parent, p);
+		rb_insert_color(&he_new->rb_node, &root->hot_inode_tree);
+		hot_inode_item_get(he_new); /* For the caller */
+		spin_unlock(&root->t_lock);
+		return he_new;
+	}
+	spin_unlock(&root->t_lock);
+
+	if (!alloc)
+		return ERR_PTR(-ENOENT);
+
+	/* Allocate unlocked, then rescan in case of a racing insert. */
+	he_new = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
+	if (!he_new)
+		return ERR_PTR(-ENOMEM);
+
+	hot_inode_item_init(he_new, root, ino);
+	goto redo;
+}
+EXPORT_SYMBOL_GPL(hot_inode_item_lookup);
+
+/* Put both the lookup and the initial reference of @inode's item. */
+void hot_inode_item_unlink(struct inode *inode)
+{
+	struct hot_info *root = inode->i_sb->s_hot_root;
+	struct hot_inode_item *he;
+
+	if (!root || !S_ISREG(inode->i_mode))
+		return;
+	he = hot_inode_item_lookup(root, inode->i_ino, 0);
+	if (IS_ERR(he))
+		return;
+
+	spin_lock(&root->t_lock);
+	hot_inode_item_put(he); /* reference taken by the lookup above */
+	hot_inode_item_put(he); /* initial reference from kref_init() */
+	spin_unlock(&root->t_lock);
+}
+EXPORT_SYMBOL_GPL(hot_inode_item_unlink);
+
+/*
+ * Fold the time elapsed between @old_atime and @cur_time into *@avg.
+ *
+ * avg_delta_{reads,writes} are a kind of moving average of the time
+ * difference between successive reads/writes, weighted over roughly the
+ * last 2^(FREQ_POWER) accesses. They are initialized to the largest
+ * possible value for the data type so that a few fast accesses to a
+ * file do not automatically make it appear very hot.
+ *
+ * NOTE(review): with *avg still at (u64)-1, "*avg << FREQ_POWER" wraps
+ * around, so the result may not stay large as intended - confirm.
+ */
+static void hot_freq_calc(struct timespec old_atime,
+		struct timespec cur_time, u64 *avg)
+{
+	struct timespec delta_ts;
+	u64 new_delta;
+
+	delta_ts = timespec_sub(cur_time, old_atime);
+	new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
+
+	*avg = (*avg << FREQ_POWER) - *avg + new_delta;
+	*avg = *avg >> FREQ_POWER;
+}
+
+/* Account one access on @freq: bump the counter, average and timestamp. */
+static void hot_freq_update(struct hot_info *root,
+		struct hot_freq *freq, bool write)
+{
+	struct timespec now = current_kernel_time();
+
+	if (!write) {
+		freq->nr_reads++;
+		hot_freq_calc(freq->last_read_time, now,
+				&freq->avg_delta_reads);
+		freq->last_read_time = now;
+		return;
+	}
+
+	freq->nr_writes++;
+	hot_freq_calc(freq->last_write_time, now,
+			&freq->avg_delta_writes);
+	freq->last_write_time = now;
+}
+
 /*
  * Initialize kmem cache for hot_inode_item and hot_range_item.
  */
@@ -145,6 +327,58 @@ void __init hot_cache_init(void)
 }
 EXPORT_SYMBOL_GPL(hot_cache_init);
 
+/*
+ * Record a read (@rw == 0) or write (@rw != 0) of @len bytes at @start
+ * against @inode's item and each range item the I/O touches. Called from
+ * the read/writepages() hooks, which are read_pages(), do_writepages(),
+ * do_generic_file_read(), and __blockdev_direct_IO().
+ */
+void hot_update_freqs(struct inode *inode, loff_t start,
+			size_t len, int rw)
+{
+	struct hot_info *root = inode->i_sb->s_hot_root;
+	struct hot_inode_item *he;
+	struct hot_range_item *hr;
+	u64 range_size;
+	loff_t cur, end;
+
+	if (!root || (len == 0) || !S_ISREG(inode->i_mode))
+		return;
+
+	he = hot_inode_item_lookup(root, inode->i_ino, 1);
+	if (IS_ERR(he))
+		return;
+
+	hot_freq_update(root, &he->freq, rw);
+
+	/*
+	 * Align ranges on range size boundary
+	 * to prevent proliferation of range structs
+	 */
+	range_size = hot_bit_shift(1, RANGE_BITS, true);
+	end = hot_bit_shift(start + len + range_size - 1, RANGE_BITS, false);
+	cur = hot_bit_shift(start, RANGE_BITS, false);
+	for (; cur < end; cur++) {
+		hr = hot_range_item_lookup(he, cur, 1);
+		if (IS_ERR(hr)) {
+			WARN(1, "hot_range_item_lookup returns %ld\n",
+				PTR_ERR(hr));
+			break; /* must still drop our reference on he */
+		}
+
+		hot_freq_update(root, &hr->freq, rw);
+
+		spin_lock(&he->i_lock);
+		hot_range_item_put(hr);
+		spin_unlock(&he->i_lock);
+	}
+
+	spin_lock(&root->t_lock);
+	hot_inode_item_put(he);
+	spin_unlock(&root->t_lock);
+}
+EXPORT_SYMBOL_GPL(hot_update_freqs);
+
 static struct hot_info *hot_tree_init(struct super_block *sb)
 {
 	struct hot_info *root;
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index 2776092..bb4cb16 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -16,5 +16,6 @@
 
 /* size of sub-file ranges */
 #define RANGE_BITS 20
+#define FREQ_POWER 4
 
 #endif /* __HOT_TRACKING__ */
diff --git a/fs/namei.c b/fs/namei.c
index 89a612e..5ef5e8c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3454,6 +3454,9 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
+	if (!error && !dentry->d_inode->i_nlink)
+		hot_inode_item_unlink(dentry->d_inode);
+
 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
 		fsnotify_link_count(dentry->d_inode);
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index a7d128d..80b198e 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -34,8 +34,24 @@ enum {
 	MAX_TYPES,
 };
 
+/*
+ * A frequency data struct holds values that are used to
+ * determine temperature of files and file ranges. These structs
+ * are members of hot_inode_item and hot_range_item.
+ */
+struct hot_freq {
+	struct timespec last_read_time;	/* set to "now" on each read */
+	struct timespec last_write_time;	/* set to "now" on each write */
+	u32 nr_reads;			/* incremented on each read */
+	u32 nr_writes;			/* incremented on each write */
+	u64 avg_delta_reads;	/* see hot_freq_calc(); starts at (u64)-1 */
+	u64 avg_delta_writes;	/* see hot_freq_calc(); starts at (u64)-1 */
+	u32 last_temp;		/* NOTE(review): not touched in this patch */
+};
+
 /* An item representing an inode and its access frequency */
 struct hot_inode_item {
+	struct hot_freq freq;           /* frequency data */
 	struct kref refs;
 	struct rb_node rb_node;         /* rbtree index */
 	struct rcu_head rcu;
@@ -50,6 +66,7 @@ struct hot_inode_item {
  * an inode whose frequency is being tracked
  */
 struct hot_range_item {
+	struct hot_freq freq;                   /* frequency data */
 	struct kref refs;
 	struct rb_node rb_node;                 /* rbtree index */
 	struct rcu_head rcu;
@@ -70,6 +87,17 @@ extern void hot_range_item_put(struct hot_range_item *hr);
 extern void hot_inode_item_put(struct hot_inode_item *he);
 extern void hot_range_item_get(struct hot_range_item *hr);
 extern void hot_inode_item_get(struct hot_inode_item *he);
+/* Account an I/O of @len bytes at @start; @rw is non-zero for writes. */
+extern void hot_update_freqs(struct inode *inode,
+			loff_t start, size_t len, int rw);
+/* Lookups return a referenced item or an ERR_PTR(); see @alloc. */
+extern struct hot_range_item
+*hot_range_item_lookup(struct hot_inode_item *he,
+			loff_t start, int alloc);
+extern struct hot_inode_item
+*hot_inode_item_lookup(struct hot_info *root,
+			u64 ino, int alloc);
+extern void hot_inode_item_unlink(struct inode *inode);
 
 static inline u64 hot_bit_shift(u64 counter, u32 bits, bool dir)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index 4b51ac1..28d3ad9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
+#include <linux/hot_tracking.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -1242,6 +1243,11 @@ readpage:
 		 * PG_error will be set again if readpage fails.
 		 */
 		ClearPageError(page);
+
+		/* Hot tracking: widen before shifting so large offsets fit */
+		hot_update_freqs(inode, (loff_t)page->index << PAGE_CACHE_SHIFT,
+				PAGE_CACHE_SIZE, 0);
+
 		/* Start the actual read. The read will unlock the page. */
 		error = mapping->a_ops->readpage(filp, page);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f0c895..5b1744b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,6 +36,7 @@
 #include <linux/pagevec.h>
 #include <linux/timer.h>
 #include <linux/sched/rt.h>
+#include <linux/hot_tracking.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -1921,13 +1922,24 @@ EXPORT_SYMBOL(generic_writepages);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	int ret;
+	loff_t start = 0;
+	size_t count = 0;
 
 	if (wbc->nr_to_write <= 0)
 		return 0;
+
+	start = (loff_t)mapping->writeback_index << PAGE_CACHE_SHIFT;
+	count = wbc->nr_to_write; /* page budget before writeback */
+
 	if (mapping->a_ops->writepages)
 		ret = mapping->a_ops->writepages(mapping, wbc);
 	else
 		ret = generic_writepages(mapping, wbc);
+
+	/* Hot tracking: account the pages consumed from wbc->nr_to_write */
+	hot_update_freqs(mapping->host, start,
+			(count - wbc->nr_to_write) * PAGE_CACHE_SIZE, 1);
+
 	return ret;
 }
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 829a77c..5867265 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
+#include <linux/hot_tracking.h>
 
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
@@ -115,6 +116,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 	unsigned page_idx;
 	int ret;
 
+	/* Hot tracking: widen before shifting so large offsets fit */
+	hot_update_freqs(mapping->host,
+			(loff_t)list_to_page(pages)->index << PAGE_CACHE_SHIFT,
+			(size_t)nr_pages * PAGE_CACHE_SIZE, 0);
+
 	blk_start_plug(&plug);
 
 	if (mapping->a_ops->readpages) {
-- 
1.7.11.7


  parent reply	other threads:[~2013-08-12  2:22 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-08-12  2:20 [[RESEND]PATCH v4 00/10] VFS hot tracking zwu.kernel
2013-08-12  2:20 ` [[RESEND]PATCH v4 01/10] VFS hot tracking: Define basic data structures and functions zwu.kernel
2013-08-12  2:20 ` zwu.kernel [this message]
2013-08-12  2:20 ` [[RESEND]PATCH v4 03/10] VFS hot tracking: Add a workqueue to move items between hot maps zwu.kernel
2013-08-12  2:20 ` [[RESEND]PATCH v4 04/10] VFS hot tracking: Add shrinker functionality to curtail memory usage zwu.kernel
2013-08-12  2:20 ` [[RESEND]PATCH v4 05/10] VFS hot tracking: Add an ioctl to get hot tracking information zwu.kernel
2013-08-12  2:20 ` [[RESEND]PATCH v4 06/10] VFS hot tracking: Add a /proc interface to make the interval tunable zwu.kernel
2013-08-12  2:20 ` [[RESEND]PATCH v4 07/10] VFS hot tracking: Add a /proc interfaces to control memory usage zwu.kernel
2013-08-12  2:20 ` [[RESEND]PATCH v4 08/10] VFS hot tracking: Add documentation zwu.kernel
2013-08-12  2:20 ` [[RESEND]PATCH v4 09/10] VFS hot tracking, btrfs: Add hot tracking support zwu.kernel
2013-08-12  2:20 ` [[RESEND]PATCH v4 10/10] VFS hot tracking, xfs: " zwu.kernel
2013-08-12  2:27 ` [[RESEND]PATCH v4 00/10] VFS hot tracking Zhi Yong Wu
2013-08-13 18:32 ` Jörn Engel
2013-08-13 21:22 ` Al Viro

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1376274024-28689-3-git-send-email-zwu.kernel@gmail.com \
    --to=zwu.kernel@gmail.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=sekharan@us.ibm.com \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=wuzhy@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.