All of lore.kernel.org
 help / color / mirror / Atom feed
From: zwu.kernel@gmail.com
To: linux-fsdevel@vger.kernel.org
Cc: viro@zeniv.linux.org.uk, sekharan@us.ibm.com,
	linuxram@us.ibm.com, david@fromorbit.com,
	chris.mason@fusionio.com, jbacik@fusionio.com,
	Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Subject: [PATCH v3 02/13] VFS hot tracking: add i/o freq tracking hooks
Date: Fri, 21 Jun 2013 20:17:11 +0800	[thread overview]
Message-ID: <1371817042-8556-3-git-send-email-zwu.kernel@gmail.com> (raw)
In-Reply-To: <1371817042-8556-1-git-send-email-zwu.kernel@gmail.com>

From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>

  Add i/o freq tracking hooks in real read/write code paths
which include read_pages(), do_writepages(), do_generic_file_read(),
and __blockdev_direct_IO().
  Currently whole FS has one RB tree to track i/o freqs for
all inodes which had real disk i/o, while every inode has its
own one RB tree to track i/o freqs for all of its extents.
  When real disk i/o for the inode are done, its own i/o freq will
be created or updated in the RB tree per FS, and the i/o freq for
all of its extents will also be done in the RB-tree per inode.
  Also, Each of the two structures hot_inode_item and hot_range_item
contains a hot_freq_data struct with its frequency of access metrics
(number of {reads, writes}, last {read,write} time, frequency of
{reads,writes}).
  Also, each hot_inode_item contains one hot_range_tree
struct which is keyed by {inode, offset, length}
and used to keep track of all the ranges in this file.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 fs/direct-io.c               |   5 +
 fs/hot_tracking.c            | 284 +++++++++++++++++++++++++++++++++++++++++++
 fs/hot_tracking.h            |   4 +
 fs/namei.c                   |   2 +
 include/linux/hot_tracking.h |  17 +++
 mm/filemap.c                 |   6 +
 mm/page-writeback.c          |  12 ++
 mm/readahead.c               |   6 +
 8 files changed, 336 insertions(+)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7ab90f5..6cb0598 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -38,6 +38,7 @@
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
 #include <linux/aio.h>
+#include "hot_tracking.h"
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -1295,6 +1296,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	prefetch(bdev->bd_queue);
 	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
 
+	/* Hot data tracking */
+	hot_update_freqs(inode, offset, iov_length(iov, nr_segs),
+			rw & WRITE);
+
 	return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
 				     nr_segs, get_block, end_io,
 				     submit_io, flags);
diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
index 6bf4229..cc899f4 100644
--- a/fs/hot_tracking.c
+++ b/fs/hot_tracking.c
@@ -26,6 +26,26 @@ static struct kmem_cache *hot_range_item_cachep __read_mostly;
 
 static void hot_inode_item_free(struct kref *kref);
 
+static void hot_comm_item_init(struct hot_comm_item *ci, int type)
+{
+	kref_init(&ci->refs);
+	clear_bit(HOT_DELETING, &ci->delete_flag);
+	memset(&ci->hot_freq_data, 0, sizeof(struct hot_freq_data));
+	ci->hot_freq_data.avg_delta_reads = (u64) -1;
+	ci->hot_freq_data.avg_delta_writes = (u64) -1;
+	ci->hot_freq_data.flags = type;
+}
+
+static void hot_range_item_init(struct hot_range_item *hr,
+			struct hot_inode_item *he, loff_t start)
+{
+	hr->start = start;
+	hr->len = hot_shift(1, RANGE_BITS, true);
+	hr->hot_inode = he;
+	hr->storage_type = -1;
+	hot_comm_item_init(&hr->hot_range, TYPE_RANGE);
+}
+
 static void hot_comm_item_free_cb(struct rcu_head *head)
 {
 	struct hot_comm_item *ci = container_of(head,
@@ -65,10 +85,27 @@ void hot_comm_item_put(struct hot_comm_item *ci)
 }
 EXPORT_SYMBOL_GPL(hot_comm_item_put);
 
+/*
+ * root->t_lock or he->i_lock is acquired in this function
+ */
 static void hot_comm_item_unlink(struct hot_info *root,
 				struct hot_comm_item *ci)
 {
 	if (!test_and_set_bit(HOT_DELETING, &ci->delete_flag)) {
+		if (ci->hot_freq_data.flags == TYPE_RANGE) {
+			struct hot_range_item *hr = container_of(ci,
+					struct hot_range_item, hot_range);
+			struct hot_inode_item *he = hr->hot_inode;
+
+			spin_lock(&he->i_lock);
+			rb_erase(&ci->rb_node, &he->hot_range_tree);
+			spin_unlock(&he->i_lock);
+		} else {
+			spin_lock(&root->t_lock);
+			rb_erase(&ci->rb_node, &root->hot_inode_tree);
+			spin_unlock(&root->t_lock);
+		}
+
 		hot_comm_item_put(ci);
 	}
 }
@@ -94,6 +131,15 @@ static void hot_range_tree_free(struct hot_inode_item *he)
 
 }
 
+static void hot_inode_item_init(struct hot_inode_item *he,
+			struct hot_info *hot_root, u64 ino)
+{
+	he->i_ino = ino;
+	he->hot_root = hot_root;
+	spin_lock_init(&he->i_lock);
+	hot_comm_item_init(&he->hot_inode, TYPE_INODE);
+}
+
 static void hot_inode_item_free(struct kref *kref)
 {
 	struct hot_comm_item *ci = container_of(kref,
@@ -107,6 +153,195 @@ static void hot_inode_item_free(struct kref *kref)
 	call_rcu(&he->hot_inode.c_rcu, hot_comm_item_free_cb);
 }
 
+/* root->t_lock is acquired in this function. */
+struct hot_inode_item
+*hot_inode_item_lookup(struct hot_info *root, u64 ino, int alloc)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct hot_comm_item *ci;
+	struct hot_inode_item *he, *he_new = NULL;
+
+	/* walk tree to find insertion point */
+redo:
+	spin_lock(&root->t_lock);
+	p = &root->hot_inode_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		ci = rb_entry(parent, struct hot_comm_item, rb_node);
+		he = container_of(ci, struct hot_inode_item, hot_inode);
+		if (ino < he->i_ino)
+			p = &(*p)->rb_left;
+		else if (ino > he->i_ino)
+			p = &(*p)->rb_right;
+		else {
+			hot_comm_item_get(&he->hot_inode);
+			spin_unlock(&root->t_lock);
+			if (he_new)
+				/*
+				 * Lost the race. Somebody else inserted
+				 * the item for the inode. Free the
+				 * newly allocated item.
+				 */
+				kmem_cache_free(hot_inode_item_cachep, he_new);
+
+			if (test_bit(HOT_DELETING, &he->hot_inode.delete_flag))
+				return ERR_PTR(-ENOENT);
+
+			return he;
+		}
+	}
+
+	if (he_new) {
+		rb_link_node(&he_new->hot_inode.rb_node, parent, p);
+		rb_insert_color(&he_new->hot_inode.rb_node,
+				&root->hot_inode_tree);
+		hot_comm_item_get(&he_new->hot_inode);
+		spin_unlock(&root->t_lock);
+		return he_new;
+	}
+	spin_unlock(&root->t_lock);
+
+	if (!alloc)
+		return ERR_PTR(-ENOENT);
+
+	he_new = kmem_cache_zalloc(hot_inode_item_cachep, GFP_NOFS);
+	if (!he_new)
+		return ERR_PTR(-ENOMEM);
+
+	hot_inode_item_init(he_new, root, ino);
+
+	goto redo;
+}
+EXPORT_SYMBOL_GPL(hot_inode_item_lookup);
+
+void hot_inode_item_delete(struct inode *inode)
+{
+	struct hot_info *root = inode->i_sb->s_hot_root;
+	struct hot_inode_item *he;
+
+	if (!root || !S_ISREG(inode->i_mode))
+		return;
+
+	he = hot_inode_item_lookup(root, inode->i_ino, 0);
+	if (IS_ERR(he))
+		return;
+
+	hot_comm_item_put(&he->hot_inode); /* for lookup */
+	hot_comm_item_unlink(root, &he->hot_inode);
+}
+EXPORT_SYMBOL_GPL(hot_inode_item_delete);
+
+/* he->i_lock is acquired in this function. */
+struct hot_range_item
+*hot_range_item_lookup(struct hot_inode_item *he, loff_t start, int alloc)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct hot_comm_item *ci;
+	struct hot_range_item *hr, *hr_new = NULL;
+
+	start = hot_shift(start, RANGE_BITS, true);
+
+	/* walk tree to find insertion point */
+redo:
+	spin_lock(&he->i_lock);
+	p = &he->hot_range_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		ci = rb_entry(parent, struct hot_comm_item, rb_node);
+		hr = container_of(ci, struct hot_range_item, hot_range);
+		if (start < hr->start)
+			p = &(*p)->rb_left;
+		else if (start > (hr->start + hr->len - 1))
+			p = &(*p)->rb_right;
+		else {
+			hot_comm_item_get(&hr->hot_range);
+			spin_unlock(&he->i_lock);
+			if(hr_new)
+				/*
+				 * Lost the race. Somebody else inserted
+				 * the item for the range. Free the
+				 * newly allocated item.
+				 */
+				kmem_cache_free(hot_range_item_cachep, hr_new);
+
+			if (test_bit(HOT_DELETING, &hr->hot_range.delete_flag))
+				return ERR_PTR(-ENOENT);
+
+			return hr;
+		}
+	}
+
+	if (hr_new) {
+		rb_link_node(&hr_new->hot_range.rb_node, parent, p);
+		rb_insert_color(&hr_new->hot_range.rb_node,
+				&he->hot_range_tree);
+		hot_comm_item_get(&hr_new->hot_range);
+		spin_unlock(&he->i_lock);
+		return hr_new;
+	}
+	spin_unlock(&he->i_lock);
+
+	if (!alloc)
+		return ERR_PTR(-ENOENT);
+
+	hr_new = kmem_cache_zalloc(hot_range_item_cachep, GFP_NOFS);
+	if (!hr_new)
+		return ERR_PTR(-ENOMEM);
+
+	hot_range_item_init(hr_new, he, start);
+
+	goto redo;
+}
+EXPORT_SYMBOL_GPL(hot_range_item_lookup);
+
+/*
+ * This function does the actual work of updating
+ * the frequency numbers.
+ *
+ * avg_delta_{reads,writes} are indeed a kind of simple moving
+ * average of the time difference between each of the last
+ * 2^(FREQ_POWER) reads/writes. If there have not yet been that
+ * many reads or writes, it's likely that the values will be very
+ * large; They are initialized to the largest possible value for the
+ * data type. Simply, we don't want a few fast access to a file to
+ * automatically make it appear very hot.
+ */
+static void hot_freq_calc(struct timespec old_atime,
+		struct timespec cur_time, u64 *avg)
+{
+	struct timespec delta_ts;
+	u64 new_delta;
+
+	delta_ts = timespec_sub(cur_time, old_atime);
+	new_delta = timespec_to_ns(&delta_ts) >> FREQ_POWER;
+
+	*avg = (*avg << FREQ_POWER) - *avg + new_delta;
+	*avg = *avg >> FREQ_POWER;
+}
+
+static void hot_freq_update(struct hot_info *root,
+		struct hot_comm_item *ci, bool write)
+{
+	struct timespec cur_time = current_kernel_time();
+	struct hot_freq_data *freq_data = &ci->hot_freq_data;
+
+	if (write) {
+		freq_data->nr_writes += 1;
+		hot_freq_calc(freq_data->last_write_time,
+				cur_time,
+				&freq_data->avg_delta_writes);
+		freq_data->last_write_time = cur_time;
+	} else {
+		freq_data->nr_reads += 1;
+		hot_freq_calc(freq_data->last_read_time,
+				cur_time,
+				&freq_data->avg_delta_reads);
+		freq_data->last_read_time = cur_time;
+	}
+}
+
 /*
  * Initialize kmem cache for hot_inode_item and hot_range_item.
  */
@@ -128,6 +363,55 @@ void __init hot_cache_init(void)
 }
 EXPORT_SYMBOL_GPL(hot_cache_init);
 
+/*
+ * Main function to update i/o access frequencies, and it will be called
+ * from read/writepages() hooks, which are read_pages(), do_writepages(),
+ * do_generic_file_read(), and __blockdev_direct_IO().
+ */
+void hot_update_freqs(struct inode *inode, loff_t start,
+			size_t len, int rw)
+{
+	struct hot_info *root = inode->i_sb->s_hot_root;
+	struct hot_inode_item *he;
+	struct hot_range_item *hr;
+	u64 range_size;
+	loff_t cur, end;
+
+	if (!root || (len == 0) || !S_ISREG(inode->i_mode))
+		return;
+
+	he = hot_inode_item_lookup(root, inode->i_ino, 1);
+	if (IS_ERR(he))
+		return;
+
+	hot_freq_update(root, &he->hot_inode, rw);
+
+	/*
+	 * Align ranges on range size boundary
+	 * to prevent proliferation of range structs
+	 */
+	range_size  = hot_shift(1, RANGE_BITS, true);
+	end = hot_shift((start + len + range_size - 1),
+			RANGE_BITS, false);
+	cur = hot_shift(start, RANGE_BITS, false);
+	for (; cur < end; cur++) {
+		hr = hot_range_item_lookup(he, cur, 1);
+		if (IS_ERR(hr)) {
+			WARN(1, "hot_range_item_lookup returns %ld\n",
+				PTR_ERR(hr));
+			hot_comm_item_put(&he->hot_inode);
+			return;
+		}
+
+		hot_freq_update(root, &hr->hot_range, rw);
+
+		hot_comm_item_put(&hr->hot_range);
+	}
+
+	hot_comm_item_put(&he->hot_inode);
+}
+EXPORT_SYMBOL_GPL(hot_update_freqs);
+
 static struct hot_info *hot_tree_init(struct super_block *sb)
 {
 	struct hot_info *root;
diff --git a/fs/hot_tracking.h b/fs/hot_tracking.h
index a2ee95f..bb4cb16 100644
--- a/fs/hot_tracking.h
+++ b/fs/hot_tracking.h
@@ -14,4 +14,8 @@
 
 #include <linux/hot_tracking.h>
 
+/* size of sub-file ranges */
+#define RANGE_BITS 20
+#define FREQ_POWER 4
+
 #endif /* __HOT_TRACKING__ */
diff --git a/fs/namei.c b/fs/namei.c
index 9ed9361..5685445 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3394,6 +3394,8 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->unlink)
 		return -EPERM;
 
+	hot_inode_item_delete(dentry->d_inode);
+
 	mutex_lock(&dentry->d_inode->i_mutex);
 	if (d_mountpoint(dentry))
 		error = -EBUSY;
diff --git a/include/linux/hot_tracking.h b/include/linux/hot_tracking.h
index b57de1f..1437248 100644
--- a/include/linux/hot_tracking.h
+++ b/include/linux/hot_tracking.h
@@ -67,6 +67,8 @@ struct hot_inode_item {
 	struct hot_comm_item hot_inode; /* node in hot_inode_tree */
 	struct rb_root hot_range_tree;	/* tree of ranges */
 	spinlock_t i_lock;		/* protect above tree */
+	struct hot_info *hot_root;	/* associated hot_info */
+	u64 i_ino;			/* inode number from inode */
 };
 
 /*
@@ -76,6 +78,9 @@ struct hot_inode_item {
 struct hot_range_item {
 	struct hot_comm_item hot_range;
 	struct hot_inode_item *hot_inode;	/* associated hot_inode_item */
+	loff_t start;				/* offset in bytes */
+	size_t len;				/* length in bytes */
+	int storage_type;			/* type of storage */
 };
 
 struct hot_info {
@@ -89,6 +94,13 @@ extern void __init hot_cache_init(void);
 extern int hot_track_init(struct super_block *sb);
 extern void hot_track_exit(struct super_block *sb);
 extern void hot_comm_item_put(struct hot_comm_item *ci);
+extern void hot_update_freqs(struct inode *inode, loff_t start,
+				size_t len, int rw);
+extern struct hot_inode_item *hot_inode_item_lookup(struct hot_info *root,
+						u64 ino, int alloc);
+extern struct hot_range_item *hot_range_item_lookup(struct hot_inode_item *he,
+						loff_t start, int alloc);
+extern void hot_inode_item_delete(struct inode *inode);
 
 static inline u64 hot_shift(u64 counter, u32 bits, bool dir)
 {
@@ -98,6 +110,11 @@ static inline u64 hot_shift(u64 counter, u32 bits, bool dir)
 		return counter >> bits;
 }
 
+static inline void hot_comm_item_get(struct hot_comm_item *ci)
+{
+	kref_get(&ci->refs);
+}
+
 #endif /* __KERNEL__ */
 
 #endif  /* _LINUX_HOTTRACK_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 7905fe7..eb64c49 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
+#include <linux/hot_tracking.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -1242,6 +1243,11 @@ readpage:
 		 * PG_error will be set again if readpage fails.
 		 */
 		ClearPageError(page);
+
+		/* Hot data tracking */
+		hot_update_freqs(inode, (loff_t)page->index << PAGE_CACHE_SHIFT,
+				PAGE_CACHE_SIZE, 0);
+
 		/* Start the actual read. The read will unlock the page. */
 		error = mapping->a_ops->readpage(filp, page);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4514ad7..4bbca3a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,6 +36,7 @@
 #include <linux/pagevec.h>
 #include <linux/timer.h>
 #include <linux/sched/rt.h>
+#include <linux/hot_tracking.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -1921,13 +1922,24 @@ EXPORT_SYMBOL(generic_writepages);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	int ret;
+	loff_t start = 0;
+	size_t count = 0;
 
 	if (wbc->nr_to_write <= 0)
 		return 0;
+
+	start = mapping->writeback_index << PAGE_CACHE_SHIFT;
+	count = wbc->nr_to_write;
+
 	if (mapping->a_ops->writepages)
 		ret = mapping->a_ops->writepages(mapping, wbc);
 	else
 		ret = generic_writepages(mapping, wbc);
+
+	/* Hot data tracking */
+	hot_update_freqs(mapping->host, start,
+			(count - wbc->nr_to_write) * PAGE_CACHE_SIZE, 1);
+
 	return ret;
 }
 
diff --git a/mm/readahead.c b/mm/readahead.c
index daed28d..901396b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
+#include <linux/hot_tracking.h>
 
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
@@ -115,6 +116,11 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 	unsigned page_idx;
 	int ret;
 
+	/* Hot data tracking */
+	hot_update_freqs(mapping->host,
+			list_to_page(pages)->index << PAGE_CACHE_SHIFT,
+			(size_t)nr_pages * PAGE_CACHE_SIZE, 0);
+
 	blk_start_plug(&plug);
 
 	if (mapping->a_ops->readpages) {
-- 
1.7.11.7


  parent reply	other threads:[~2013-06-21 12:48 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-06-21 12:17 [PATCH v3 00/13] VFS hot tracking zwu.kernel
2013-06-21 12:17 ` [PATCH v3 01/13] VFS hot tracking: introduce some data structures zwu.kernel
2013-06-21 12:17 ` zwu.kernel [this message]
2013-06-21 12:17 ` [PATCH v3 03/13] VFS hot tracking: add one wq to update hot map zwu.kernel
2013-06-21 12:17 ` [PATCH v3 04/13] VFS hot tracking: register one shrinker zwu.kernel
2013-06-21 12:17 ` [PATCH v3 05/13] VFS hot tracking, rcu: introduce one rcu macro for list zwu.kernel
2013-06-21 12:17 ` [PATCH v3 06/13] VFS hot tracking, seq_file: add seq_list rcu interfaces zwu.kernel
2013-06-21 12:17 ` [PATCH v3 07/13] VFS hot tracking: add debugfs support zwu.kernel
2013-06-21 12:17 ` [PATCH v3 08/13] VFS hot tracking: add one ioctl interface zwu.kernel
2013-06-21 12:17 ` [PATCH v3 09/13] VFS hot tracking, procfs: add one proc interface zwu.kernel
2013-06-21 12:17 ` [PATCH v3 10/13] VFS hot tracking: add memory caping function zwu.kernel
2013-06-21 12:17 ` [PATCH v3 11/13] VFS hot tracking, btrfs: add hot tracking support zwu.kernel
2013-06-21 12:17 ` [PATCH v3 12/13] VFS hot tracking: add documentation zwu.kernel
2013-06-21 12:17 ` [PATCH v3 13/13] VFS hot tracking: add fs hot type support zwu.kernel
2013-06-24 13:41 ` [PATCH v3 00/13] VFS hot tracking Zhi Yong Wu
2013-06-28 16:03 ` Al Viro
2013-07-01 13:19   ` Zhi Yong Wu
2013-07-03 13:30     ` Al Viro
2013-07-03 15:16       ` Zhi Yong Wu
2013-07-08 12:44         ` Zhi Yong Wu
2013-07-02 12:45   ` Zhi Yong Wu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1371817042-8556-3-git-send-email-zwu.kernel@gmail.com \
    --to=zwu.kernel@gmail.com \
    --cc=chris.mason@fusionio.com \
    --cc=david@fromorbit.com \
    --cc=jbacik@fusionio.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linuxram@us.ibm.com \
    --cc=sekharan@us.ibm.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=wuzhy@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.