linux-kernel.vger.kernel.org archive mirror
* [PATCH 1/2] [RFC] Range tree implementation
@ 2012-02-10  0:16 John Stultz
  0 siblings, 0 replies; 15+ messages in thread
From: John Stultz @ 2012-02-10  0:16 UTC
  To: linux-kernel
  Cc: John Stultz, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel

After Andrew suggested something like his mumbletree idea
to better store a list of ranges, I worked on a few different
approaches, and this is what I've finally managed to get working.

I suspect range-tree isn't a totally accurate name, but I
couldn't quite make out the difference between range trees
and interval trees, so I just picked one to call it. Do
let me know if you have a better name.

The idea of storing ranges in a tree is nice, but has a number
of complications. When adding a range, it's possible that a
large range will consume and merge a number of smaller ranges.
When removing a range, it's possible you may end up splitting an
existing range, causing one range to become two. This makes it
very difficult to provide generic list_head-like behavior, as
the parent structures would need to be duplicated and removed,
and that has lots of memory ownership issues.

So, this is a much simplified and more list_head-like
implementation. You can add a node to a tree, or remove a node
from a tree, but the generic implementation doesn't do the
merging or splitting for you. It does, however, provide helpers
to find overlapping and adjacent ranges.
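
As a rough, untested sketch of the sort of merging a caller might
layer on top of those helpers (my_alloc_node()/my_free_node() are
hypothetical caller-owned allocation helpers, not part of this patch):

	/* Add [start, end], consuming any ranges that overlap or abut it */
	static struct range_tree_node *add_and_merge(
					struct range_tree_node *root,
					u64 start, u64 end)
	{
		struct range_tree_node *node, *new;

		while ((node = range_tree_in_range_adjacent(root, start, end))) {
			start = min(start, node->start);
			end = max(end, node->end);
			root = range_tree_remove(root, node);
			my_free_node(node);	/* hypothetical */
		}

		new = my_alloc_node();		/* hypothetical */
		if (!new)
			return root;		/* caller must handle ENOMEM */
		range_tree_node_init(new);
		new->start = start;
		new->end = end;
		return range_tree_add(root, new);
	}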

I made the tree self-balancing via splaying, as it seemed easier
to handle with the merging/splitting cases I originally tried to
make the generic code handle, but since I've dropped that, I
suspect it can be reworked to use an rbtree. I just wanted to get
this out for initial review.

Andrew also really wanted this range-tree implementation to be
reusable so we don't duplicate the file locking logic. I'm not
totally convinced that the requirements between the volatile
ranges and file locking are really equivalent, but this reduced
implementation may make it possible.

Do let me know what you think or if you have other ideas for
better ways to do the same.

thanks
-john

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Android Kernel Team <kernel-team@android.com>
CC: Robert Love <rlove@google.com>
CC: Mel Gorman <mel@csn.ul.ie>
CC: Hugh Dickins <hughd@google.com>
CC: Dave Hansen <dave@linux.vnet.ibm.com>
CC: Rik van Riel <riel@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/rangetree.h |   35 +++++
 lib/Makefile              |    2 +-
 lib/rangetree.c           |  325 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 361 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/rangetree.h
 create mode 100644 lib/rangetree.c

diff --git a/include/linux/rangetree.h b/include/linux/rangetree.h
new file mode 100644
index 0000000..998ebcc
--- /dev/null
+++ b/include/linux/rangetree.h
@@ -0,0 +1,35 @@
+#ifndef _LINUX_RANGETREE_H
+#define _LINUX_RANGETREE_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+
+struct range_tree_node {
+	struct range_tree_node *parent;
+	struct range_tree_node *left;
+	struct range_tree_node *right;
+	u64 start;
+	u64 end;
+};
+
+static inline void range_tree_node_init(struct range_tree_node *node)
+{
+	node->parent = NULL;
+	node->left = NULL;
+	node->right = NULL;
+	node->start = 0;
+	node->end = 0;
+}
+
+extern struct range_tree_node *range_tree_in_range(struct range_tree_node *root,
+							 u64 start, u64 end);
+extern struct range_tree_node *range_tree_in_range_adjacent(
+						struct range_tree_node *root,
+							 u64 start, u64 end);
+extern struct range_tree_node *range_tree_add(struct range_tree_node *root,
+						struct range_tree_node *node);
+extern struct range_tree_node *range_tree_remove(struct range_tree_node *root,
+						struct range_tree_node *node);
+#endif
+
+
diff --git a/lib/Makefile b/lib/Makefile
index 18515f0..f43ef0d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -12,7 +12,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o show_mem.o \
-	 is_single_threaded.o plist.o decompress.o
+	 is_single_threaded.o plist.o decompress.o rangetree.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/rangetree.c b/lib/rangetree.c
new file mode 100644
index 0000000..db20665
--- /dev/null
+++ b/lib/rangetree.c
@@ -0,0 +1,325 @@
+#include <linux/rangetree.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+/**
+ * rotate_right - Splay tree helper
+ * @node: node to be rotated right
+ * @root: tree root
+ *
+ * Returns the tree root after rotating the node right
+ */
+static struct range_tree_node *rotate_right(struct range_tree_node *node,
+						struct range_tree_node *root)
+{
+	struct range_tree_node *new_root, *new_right;
+
+	if (!node->left)
+		return root;
+
+	new_root = node->left;
+	new_right = node;
+
+	if (root == node)
+		root = new_root;
+
+	new_right->left = new_root->right;
+	new_root->parent = new_right->parent;
+	if (new_root->parent) {
+		if (new_root->parent->right == new_right)
+			new_root->parent->right = new_root;
+		else
+			new_root->parent->left = new_root;
+	}
+	new_right->parent = new_root;
+
+	new_root->right = new_right;
+
+	if (new_right->left)
+		new_right->left->parent = new_right;
+
+
+	/* Paranoid sanity checking */
+	if (new_root->left)
+		BUG_ON(new_root->left->parent != new_root);
+	if (new_root->right)
+		BUG_ON(new_root->right->parent != new_root);
+	if (new_right->left)
+		BUG_ON(new_right->left->parent != new_right);
+	if (new_right->right)
+		BUG_ON(new_right->right->parent != new_right);
+
+
+	return root;
+
+}
+
+/**
+ * rotate_left - Splay tree helper
+ * @node: node to be rotated left
+ * @root: tree root
+ *
+ * Returns the tree root after rotating the node left
+ */
+static struct range_tree_node *rotate_left(struct range_tree_node *node,
+						struct range_tree_node *root)
+{
+	struct range_tree_node *new_root, *new_left;
+
+	if (!node->right)
+		return root;
+
+	new_root = node->right;
+	new_left = node;
+
+	if (root == node)
+		root = new_root;
+
+	new_left->right = new_root->left;
+	if (new_left->right)
+		new_left->right->parent = new_left;
+	new_root->parent = new_left->parent;
+	if (new_root->parent) {
+		if (new_root->parent->left == new_left)
+			new_root->parent->left = new_root;
+		else
+			new_root->parent->right = new_root;
+	}
+	new_left->parent = new_root;
+	new_root->left = new_left;
+
+
+	/* Paranoid sanity checking */
+	if (new_root->left)
+		BUG_ON(new_root->left->parent != new_root);
+	if (new_root->right)
+		BUG_ON(new_root->right->parent != new_root);
+	if (new_left->left)
+		BUG_ON(new_left->left->parent != new_left);
+	if (new_left->right)
+		BUG_ON(new_left->right->parent != new_left);
+
+	return root;
+}
+
+/**
+ * splay_tree - Splays a node to the top of a tree
+ * @root: root of the splay tree
+ * @node: node to be splayed to the root
+ *
+ * Returns the root of a tree after splaying
+ */
+static struct range_tree_node *splay_tree(struct range_tree_node *root,
+						struct range_tree_node *node)
+{
+restart:
+	if (root == node)
+		return root;
+
+	if (node->parent == root) {
+		if (root->left == node)
+			root = rotate_right(root, root);
+		else
+			root = rotate_left(root, root);
+		return root;
+	} else {
+		struct range_tree_node *parent, *grandparent;
+
+		parent = node->parent;
+		grandparent = parent->parent;
+
+		if ((node == parent->left) && (parent == grandparent->left)) {
+			root = rotate_right(grandparent, root);
+			root = rotate_right(parent, root);
+		} else if ((node == parent->right) &&
+					(parent == grandparent->right)) {
+			root = rotate_left(grandparent, root);
+			root = rotate_left(parent, root);
+		} else if ((node == parent->right) &&
+					(parent == grandparent->left)) {
+			root = rotate_left(parent, root);
+			root = rotate_right(grandparent, root);
+		} else if ((node == parent->left) &&
+					(parent == grandparent->right)) {
+			root = rotate_right(parent, root);
+			root = rotate_left(grandparent, root);
+		} else {
+			BUG_ON(1); /* Something is odd */
+		}
+		goto restart;
+	}
+}
+
+
+/**
+ * range_tree_in_range - Returns the first node that overlaps with the
+ *                       given range
+ * @root: range_tree root
+ * @start: range start
+ * @end: range end
+ *
+ */
+struct range_tree_node *range_tree_in_range(struct range_tree_node *root,
+						u64 start, u64 end)
+{
+	struct range_tree_node *candidate = root;
+
+	if (!candidate)
+		return 0;
+restart:
+	if (end < candidate->start) {
+		if (candidate->left) {
+			candidate = candidate->left;
+			goto restart;
+		}
+	} else if (start > candidate->end) {
+		if (candidate->right) {
+			candidate = candidate->right;
+			goto restart;
+		}
+	} else
+		return candidate;
+	return 0;
+}
+
+
+/**
+ * range_tree_in_range - Returns the first node that overlaps or is adjacent
+ *                       with the given range
+ * @root: range_tree root
+ * @start: range start
+ * @end: range end
+ *
+ */
+struct range_tree_node *range_tree_in_range_adjacent(
+					struct range_tree_node *root,
+					u64 start, u64 end)
+{
+	struct range_tree_node *candidate = root;
+
+	if (!candidate)
+		return 0;
+restart:
+	if (end + 1 < candidate->start) {
+		if (candidate->left) {
+			candidate = candidate->left;
+			goto restart;
+		}
+	} else if (start > candidate->end + 1) {
+		if (candidate->right) {
+			candidate = candidate->right;
+			goto restart;
+		}
+	} else
+		return candidate;
+	return 0;
+}
+
+/**
+ * range_tree_add - Add a node to a range tree
+ * @root: range tree to be added to
+ * @node: range_tree_node to be added
+ *
+ * Adds a node to the range tree.
+ */
+struct range_tree_node *range_tree_add(struct range_tree_node *root,
+					struct range_tree_node *node)
+{
+	struct range_tree_node *candidate;
+	/* make sure it's not connected */
+	BUG_ON(node->parent || node->left || node->right);
+
+	if (!root)
+		return node;
+
+	candidate = root;
+restart:
+	if (node->start < candidate->start) {
+		if (candidate->left) {
+			candidate = candidate->left;
+			goto restart;
+		}
+		candidate->left = node;
+		node->parent = candidate;
+	} else if (node->start > candidate->start) {
+		if (candidate->right) {
+			candidate = candidate->right;
+			goto restart;
+		}
+		candidate->right = node;
+		node->parent = candidate;
+	}
+
+	root = splay_tree(root, node);
+	return root;
+}
+
+/**
+ * range_tree_merge - Helper to merge two range sub-trees
+ * @left: left subtree to be merged
+ * @right: right subtree to be merged
+ *
+ * Returns a merged range tree of two subtrees. The left subtree
+ * must be entirely less than the right subtree.
+ */
+static struct range_tree_node *range_tree_merge(struct range_tree_node *left,
+						struct range_tree_node *right)
+{
+	struct range_tree_node *merge;
+
+	if (!left)
+		return right;
+	if (!right)
+		return left;
+
+	merge = left;
+	/* grab the right-most node on the left side */
+	while (merge->right)
+		merge = merge->right;
+	merge->right = right;
+	if (right)
+		right->parent = merge;
+
+	return left;
+}
+
+/**
+ * null_node: Helper that clears node data
+ * @node: Node to be cleared
+ */
+static void null_node(struct range_tree_node *node)
+{
+	node->left = node->right = node->parent = NULL;
+	node->start = node->end = 0;
+}
+
+/**
+ * range_tree_remove: Removes a given node from the tree
+ * @root: root of tree
+ * @node: Node to be removed
+ *
+ * Removes a node and splays the tree
+ */
+struct range_tree_node *range_tree_remove(struct range_tree_node *root,
+						struct range_tree_node *node)
+{
+	struct range_tree_node *subtree;
+
+	subtree = range_tree_merge(node->left, node->right);
+
+	if (subtree)
+		subtree->parent = node->parent;
+
+	if (node->parent && node->parent->left == node)
+		node->parent->left = subtree;
+	if (node->parent && node->parent->right == node)
+		node->parent->right = subtree;
+
+	null_node(node);
+	if (node == root)
+		return subtree;
+
+	if (subtree)
+		root = splay_tree(root, subtree);
+	return root;
+}
-- 
1.7.3.2.146.gca209



* [PATCH 1/2] [RFC] Range tree implementation
  2012-03-16 22:51 [PATCH 0/2] [RFC] Volatile ranges (v4) John Stultz
@ 2012-03-16 22:51 ` John Stultz
  2012-03-20 10:00   ` Dmitry Adamushko
  2012-03-20 16:44   ` Aneesh Kumar K.V
  0 siblings, 2 replies; 15+ messages in thread
From: John Stultz @ 2012-03-16 22:51 UTC
  To: linux-kernel
  Cc: John Stultz, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel,
	Dmitry Adamushko, Dave Chinner, Neil Brown, Andrea Righi,
	Aneesh Kumar K.V

After Andrew suggested something like his mumbletree idea
to better store a list of ranges, I worked on a few different
approaches, and this is what I've finally managed to get working.

I suspect range-tree isn't a totally accurate name, but I
couldn't quite make out the difference between range trees
and interval trees, so I just picked one to call it. Do
let me know if you have a better name.

The idea of storing ranges in a tree is nice, but has a number
of complications. When adding a range, it's possible that a
large range will consume and merge a number of smaller ranges.
When removing a range, it's possible you may end up splitting an
existing range, causing one range to become two. This makes it
very difficult to provide generic list_head-like behavior, as
the parent structures would need to be duplicated and removed,
and that has lots of memory ownership issues.

So, this is a much simplified and more list_head-like
implementation. You can add a node to a tree, or remove a node
from a tree, but the generic implementation doesn't do the
merging or splitting for you. It does, however, provide helpers
to find overlapping and adjacent ranges.
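
As a rough, untested sketch of the caller-side split on remove
(covering just the case where the hole [start, end] sits entirely
inside one existing range; my_alloc_node() is a hypothetical
caller-owned helper, not part of this patch):

	static int remove_and_split(struct range_tree_root *root,
					u64 start, u64 end)
	{
		struct range_tree_node *node, *right;

		node = range_tree_in_range(root, start, end);
		if (!node || node->start >= start || node->end <= end)
			return -EINVAL;	/* only the fully-enclosed case */

		right = my_alloc_node();	/* hypothetical */
		if (!right)
			return -ENOMEM;
		range_tree_node_init(right);

		/*
		 * Shrink the existing node in place: its start is
		 * unchanged, so the rbtree ordering (keyed on start)
		 * still holds. Then insert the new right-hand piece.
		 */
		right->start = end + 1;
		right->end = node->end;
		node->end = start - 1;

		range_tree_add(root, right);
		return 0;
	}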

Andrew also really wanted this range-tree implementation to be
reusable so we don't duplicate the file locking logic. I'm not
totally convinced that the requirements between the volatile
ranges and file locking are really equivalent, but this reduced
implementation may make it possible.

Do let me know what you think or if you have other ideas for
better ways to do the same.

Changelog:
v2:
* Reworked code to use an rbtree instead of splaying

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Android Kernel Team <kernel-team@android.com>
CC: Robert Love <rlove@google.com>
CC: Mel Gorman <mel@csn.ul.ie>
CC: Hugh Dickins <hughd@google.com>
CC: Dave Hansen <dave@linux.vnet.ibm.com>
CC: Rik van Riel <riel@redhat.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Dave Chinner <david@fromorbit.com>
CC: Neil Brown <neilb@suse.de>
CC: Andrea Righi <andrea@betterlinux.com>
CC: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/rangetree.h |   53 +++++++++++++++++++++++
 lib/Makefile              |    2 +-
 lib/rangetree.c           |  105 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 159 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/rangetree.h
 create mode 100644 lib/rangetree.c

diff --git a/include/linux/rangetree.h b/include/linux/rangetree.h
new file mode 100644
index 0000000..ca03821
--- /dev/null
+++ b/include/linux/rangetree.h
@@ -0,0 +1,53 @@
+#ifndef _LINUX_RANGETREE_H
+#define _LINUX_RANGETREE_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct range_tree_node {
+	struct rb_node rb;
+	u64 start;
+	u64 end;
+};
+
+struct range_tree_root {
+	struct rb_root head;
+};
+
+static inline void range_tree_init(struct range_tree_root *root)
+{
+	root->head = RB_ROOT;
+}
+
+static inline void range_tree_node_init(struct range_tree_node *node)
+{
+	rb_init_node(&node->rb);
+	node->start = 0;
+	node->end = 0;
+}
+
+static inline int range_tree_empty(struct range_tree_root *root)
+{
+	return RB_EMPTY_ROOT(&root->head);
+}
+
+static inline
+struct range_tree_node *range_tree_root_node(struct range_tree_root *root)
+{
+	struct range_tree_node *ret;
+	ret = container_of(root->head.rb_node, struct range_tree_node, rb);
+	return ret;
+}
+
+extern struct range_tree_node *range_tree_in_range(struct range_tree_root *root,
+							 u64 start, u64 end);
+extern struct range_tree_node *range_tree_in_range_adjacent(
+						struct range_tree_root *root,
+							 u64 start, u64 end);
+extern void range_tree_add(struct range_tree_root *root,
+						struct range_tree_node *node);
+extern void range_tree_remove(struct range_tree_root *root,
+						struct range_tree_node *node);
+#endif
+
+
diff --git a/lib/Makefile b/lib/Makefile
index 18515f0..f43ef0d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -12,7 +12,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o show_mem.o \
-	 is_single_threaded.o plist.o decompress.o
+	 is_single_threaded.o plist.o decompress.o rangetree.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/rangetree.c b/lib/rangetree.c
new file mode 100644
index 0000000..0f6208a
--- /dev/null
+++ b/lib/rangetree.c
@@ -0,0 +1,105 @@
+#include <linux/rangetree.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+
+/**
+ * range_tree_in_range - Returns the first node that overlaps with the
+ *                       given range
+ * @root: range_tree root
+ * @start: range start
+ * @end: range end
+ *
+ */
+struct range_tree_node *range_tree_in_range(struct range_tree_root *root,
+						u64 start, u64 end)
+{
+	struct rb_node **p = &root->head.rb_node;
+	struct range_tree_node *candidate;
+
+	while (*p) {
+		candidate = rb_entry(*p, struct range_tree_node, rb);
+		if (end < candidate->start)
+			p = &(*p)->rb_left;
+		else if (start > candidate->end)
+			p = &(*p)->rb_right;
+		else
+			return candidate;
+	}
+
+	return 0;
+}
+
+
+/**
+ * range_tree_in_range - Returns the first node that overlaps or is adjacent
+ *                       with the given range
+ * @root: range_tree root
+ * @start: range start
+ * @end: range end
+ *
+ */
+struct range_tree_node *range_tree_in_range_adjacent(
+					struct range_tree_root *root,
+					u64 start, u64 end)
+{
+	struct rb_node **p = &root->head.rb_node;
+	struct range_tree_node *candidate;
+
+	while (*p) {
+		candidate = rb_entry(*p, struct range_tree_node, rb);
+		if (end+1 < candidate->start)
+			p = &(*p)->rb_left;
+		else if (start > candidate->end + 1)
+			p = &(*p)->rb_right;
+		else
+			return candidate;
+	}
+	return 0;
+}
+
+/**
+ * range_tree_add - Add a node to a range tree
+ * @root: range tree to be added to
+ * @node: range_tree_node to be added
+ *
+ * Adds a node to the range tree.
+ */
+void range_tree_add(struct range_tree_root *root,
+					struct range_tree_node *node)
+{
+	struct rb_node **p = &root->head.rb_node;
+	struct rb_node *parent = NULL;
+	struct range_tree_node *ptr;
+
+	WARN_ON_ONCE(!RB_EMPTY_NODE(&node->rb));
+
+	while (*p) {
+		parent = *p;
+		ptr = rb_entry(parent, struct range_tree_node, rb);
+		if (node->start < ptr->start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&node->rb, parent, p);
+	rb_insert_color(&node->rb, &root->head);
+
+}
+
+
+/**
+ * range_tree_remove: Removes a given node from the tree
+ * @root: root of tree
+ * @node: Node to be removed
+ *
+ * Removes a node from the range tree
+ */
+void range_tree_remove(struct range_tree_root *root,
+						struct range_tree_node *node)
+{
+	WARN_ON_ONCE(RB_EMPTY_NODE(&node->rb));
+
+	rb_erase(&node->rb, &root->head);
+	RB_CLEAR_NODE(&node->rb);
+}
-- 
1.7.3.2.146.gca209



* Re: [PATCH 1/2] [RFC] Range tree implementation
  2012-03-16 22:51 ` [PATCH 1/2] [RFC] Range tree implementation John Stultz
@ 2012-03-20 10:00   ` Dmitry Adamushko
  2012-03-20 18:04     ` John Stultz
  2012-03-20 16:44   ` Aneesh Kumar K.V
  1 sibling, 1 reply; 15+ messages in thread
From: Dmitry Adamushko @ 2012-03-20 10:00 UTC
  To: John Stultz
  Cc: linux-kernel, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel, Dave Chinner,
	Neil Brown, Andrea Righi, Aneesh Kumar K.V

Hi John,

On 16 March 2012 23:51, John Stultz <john.stultz@linaro.org> wrote:
> After Andrew suggested something like his mumbletree idea
> to better store a list of ranges, I worked on a few different
> approaches, and this is what I've finally managed to get working.
>
> I suspect range-tree isn't a totally accurate name, but I
> couldn't quite make out the difference between range trees
> and interval trees, so I just picked one to call it. Do
> let me know if you have a better name.
>
> The idea of storing ranges in a tree is nice, but has a number
> of complications. When adding a range, it's possible that a
> large range will consume and merge a number of smaller ranges.

Have you considered using 'prio_tree' (include/linux/prio_tree.h)? If
we aim at addressing a wide range of possible use-cases (different
patterns of adding/removing volatile ranges), then, at first glance,
prio_tree looks like a better approach.

e.g. for the "consume and merge a number of smaller ranges" scenario
above, prio_tree gives O(log n) [ O(log n + m) ] behavior iso O(m log
n) in your case.

--Dmitry


* Re: [PATCH 1/2] [RFC] Range tree implementation
  2012-03-16 22:51 ` [PATCH 1/2] [RFC] Range tree implementation John Stultz
  2012-03-20 10:00   ` Dmitry Adamushko
@ 2012-03-20 16:44   ` Aneesh Kumar K.V
  1 sibling, 0 replies; 15+ messages in thread
From: Aneesh Kumar K.V @ 2012-03-20 16:44 UTC
  To: John Stultz, linux-kernel
  Cc: John Stultz, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel,
	Dmitry Adamushko, Dave Chinner, Neil Brown, Andrea Righi

John Stultz <john.stultz@linaro.org> writes:


....

> +/**
> + * range_tree_in_range - Returns the first node that overlaps with the
> + *                       given range
> + * @root: range_tree root
> + * @start: range start
> + * @end: range end
> + *
> + */
> +struct range_tree_node *range_tree_in_range(struct range_tree_root *root,
> +						u64 start, u64 end)
> +{
> +	struct rb_node **p = &root->head.rb_node;
> +	struct range_tree_node *candidate;
> +
> +	while (*p) {
> +		candidate = rb_entry(*p, struct range_tree_node, rb);
> +		if (end < candidate->start)
> +			p = &(*p)->rb_left;
> +		else if (start > candidate->end)
> +			p = &(*p)->rb_right;
> +		else
> +			return candidate;
> +	}
> +
> +	return 0;


return NULL ?

> +}
> +
> +
> +/**
> + * range_tree_in_range - Returns the first node that overlaps or is adjacent
> + *                       with the given range
> + * @root: range_tree root
> + * @start: range start
> + * @end: range end
> + *
> + */


The comment needs an update.

> +struct range_tree_node *range_tree_in_range_adjacent(
> +					struct range_tree_root *root,
> +					u64 start, u64 end)
> +{
> +	struct rb_node **p = &root->head.rb_node;
> +	struct range_tree_node *candidate;
> +
> +	while (*p) {
> +		candidate = rb_entry(*p, struct range_tree_node, rb);
> +		if (end+1 < candidate->start)
> +			p = &(*p)->rb_left;
> +		else if (start > candidate->end + 1)
> +			p = &(*p)->rb_right;
> +		else
> +			return candidate;
> +	}
> +	return 0;
> +}
> +


Below is my hack to get the hugetlbfs code converted. The patch compiles.
Will test and send a signed-off-by version later.

not-signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

 fs/hugetlbfs/inode.c    |    3 +-
 include/linux/hugetlb.h |    2 +
 mm/hugetlb.c            |  291 ++++++++++++++++++++++-------------------------
 3 files changed, 139 insertions(+), 157 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index ca4fa70..8309f5e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -455,6 +455,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		info = HUGETLBFS_I(inode);
 		mpol_shared_policy_init(&info->policy, NULL);
+		range_tree_init(&info->rg_root);
 		inode->i_op = &hugetlbfs_dir_inode_operations;
 		inode->i_fop = &simple_dir_operations;
 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -478,7 +479,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		INIT_LIST_HEAD(&inode->i_mapping->private_list);
 		info = HUGETLBFS_I(inode);
 		/*
 		 * The policy is initialized here even if we are creating a
@@ -488,6 +488,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		 * the rb tree will still be empty.
 		 */
 		mpol_shared_policy_init(&info->policy, NULL);
+		range_tree_init(&info->rg_root);
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 32e948c..b785541a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -5,6 +5,7 @@
 #include <linux/fs.h>
 #include <linux/hugetlb_inline.h>
 #include <linux/cgroup.h>
+#include <linux/rangetree.h>
 
 struct ctl_table;
 struct user_struct;
@@ -150,6 +151,7 @@ struct hugetlbfs_sb_info {
 
 struct hugetlbfs_inode_info {
 	struct shared_policy policy;
+	struct range_tree_root rg_root;
 	struct inode vfs_inode;
 };
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4e1462d..a83727d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -69,148 +69,94 @@ static DEFINE_SPINLOCK(hugetlb_lock);
  *	down_read(&mm->mmap_sem);
  *	mutex_lock(&hugetlb_instantiation_mutex);
  */
-struct file_region {
-	struct list_head link;
-	long from;
-	long to;
-};
-
-static long region_add(struct list_head *head, long f, long t)
+static long region_chg(struct range_tree_root *rg_root, long start, long end,
+		       struct range_tree_node **rg_nodep)
 {
-	struct file_region *rg, *nrg, *trg;
+	long chg = 0;
+	struct range_tree_node *rg_node;
 
-	/* Locate the region we are either in or before. */
-	list_for_each_entry(rg, head, link)
-		if (f <= rg->to)
-			break;
+	rg_node = range_tree_in_range_adjacent(rg_root, start, end);
+	/*
+	 * If we need to allocate a new range_tree_node, we return a charge
+	 * with NULL *rg_node;
+	 */
+	if (rg_node == NULL)
+		return end - start;
 
-	/* Round our left edge to the current segment if it encloses us. */
-	if (f > rg->from)
-		f = rg->from;
+	if (start < rg_node->start)
+		chg +=  rg_node->start - start;
+	if (rg_node->end < end)
+		chg += end - rg_node->end;
 
-	/* Check for and consume any regions we now overlap with. */
-	nrg = rg;
-	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		if (rg->from > t)
-			break;
-
-		/* If this area reaches higher then extend our area to
-		 * include it completely.  If this is not the first area
-		 * which we intend to reuse, free it. */
-		if (rg->to > t)
-			t = rg->to;
-		if (rg != nrg) {
-			list_del(&rg->link);
-			kfree(rg);
-		}
-	}
-	nrg->from = f;
-	nrg->to = t;
-	return 0;
+	*rg_nodep = rg_node;
+	return chg;
 }
 
-static long region_chg(struct list_head *head, long f, long t)
+static void region_add(struct range_tree_root *rg_root, long start, long end,
+		       struct range_tree_node *rg_node)
 {
-	struct file_region *rg, *nrg;
-	long chg = 0;
-
-	/* Locate the region we are before or in. */
-	list_for_each_entry(rg, head, link)
-		if (f <= rg->to)
-			break;
+	if (rg_node == NULL)
+		return;
 
-	/* If we are below the current region then a new region is required.
-	 * Subtle, allocate a new region at the position but make it zero
-	 * size such that we can guarantee to record the reservation. */
-	if (&rg->link == head || t < rg->from) {
-		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
-		if (!nrg)
-			return -ENOMEM;
-		nrg->from = f;
-		nrg->to   = f;
-		INIT_LIST_HEAD(&nrg->link);
-		list_add(&nrg->link, rg->link.prev);
+	if (start < rg_node->start)
+		rg_node->start = start;
 
-		return t - f;
-	}
+	if (end > rg_node->end)
+		rg_node->end = end;
 
-	/* Round our left edge to the current segment if it encloses us. */
-	if (f > rg->from)
-		f = rg->from;
-	chg = t - f;
-
-	/* Check for and consume any regions we now overlap with. */
-	list_for_each_entry(rg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		if (rg->from > t)
-			return chg;
-
-		/* We overlap with this area, if it extends further than
-		 * us then we must extend ourselves.  Account for its
-		 * existing reservation. */
-		if (rg->to > t) {
-			chg += rg->to - t;
-			t = rg->to;
-		}
-		chg -= rg->to - rg->from;
-	}
-	return chg;
+	range_tree_add(rg_root, rg_node);
 }
 
-static long region_truncate(struct list_head *head, long end)
+static long region_truncate(struct range_tree_root *rg_root, long off)
 {
-	struct file_region *rg, *trg;
 	long chg = 0;
-
-	/* Locate the region we are either in or before. */
-	list_for_each_entry(rg, head, link)
-		if (end <= rg->to)
-			break;
-	if (&rg->link == head)
-		return 0;
-
-	/* If we are in the middle of a region then adjust it. */
-	if (end > rg->from) {
-		chg = rg->to - end;
-		rg->to = end;
-		rg = list_entry(rg->link.next, typeof(*rg), link);
-	}
-
-	/* Drop any remaining regions. */
-	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-		if (&rg->link == head)
-			break;
-		chg += rg->to - rg->from;
-		list_del(&rg->link);
-		kfree(rg);
+	struct rb_node *rb_node;
+
+restart:
+	rb_node = rb_first(&rg_root->head);
+	while (rb_node) {
+		struct range_tree_node *rg_node;
+		rg_node = rb_entry(rb_node, struct range_tree_node, rb);
+		if (rg_node->end <= off) {
+			rb_node = rb_next(rb_node);
+			continue;
+		}
+		if (rg_node->start < off) {
+			chg += rg_node->end - off;
+			rg_node->end = off;
+			rb_node = rb_next(rb_node);
+			continue;
+		}
+		chg += rg_node->end - rg_node->start;
+		rb_erase(rb_node, &rg_root->head);
+		goto restart;
 	}
 	return chg;
 }
 
-static long region_count(struct list_head *head, long f, long t)
+static long region_count(struct range_tree_root *rg_root, long start, long end)
 {
-	struct file_region *rg;
 	long chg = 0;
+	struct rb_node *rb_node;
 
-	/* Locate each segment we overlap with, and count that overlap. */
-	list_for_each_entry(rg, head, link) {
-		int seg_from;
-		int seg_to;
+	rb_node = rb_first(&rg_root->head);
+	while (rb_node) {
+		int seg_from, seg_to;
+		struct range_tree_node *rg_node;
 
-		if (rg->to <= f)
+		rg_node = rb_entry(rb_node, struct range_tree_node, rb);
+		if (rg_node->end <= start) {
+			rb_node = rb_next(rb_node);
 			continue;
-		if (rg->from >= t)
+		}
+		if (rg_node->start >= end)
 			break;
-
-		seg_from = max(rg->from, f);
-		seg_to = min(rg->to, t);
+		seg_from = max(rg_node->start, (u64)start);
+		seg_to = min(rg_node->end, (u64)end);
 
 		chg += seg_to - seg_from;
+		rb_node = rb_next(rb_node);
 	}
-
 	return chg;
 }
 
@@ -302,7 +248,7 @@ static void set_vma_private_data(struct vm_area_struct *vma,
 
 struct resv_map {
 	struct kref refs;
-	struct list_head regions;
+	struct range_tree_root rg_root;
 };
 
 static struct resv_map *resv_map_alloc(void)
@@ -312,7 +258,7 @@ static struct resv_map *resv_map_alloc(void)
 		return NULL;
 
 	kref_init(&resv_map->refs);
-	INIT_LIST_HEAD(&resv_map->regions);
+	range_tree_init(&resv_map->rg_root);
 
 	return resv_map;
 }
@@ -322,7 +268,7 @@ static void resv_map_release(struct kref *ref)
 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
 
 	/* Clear out any active regions before we release the map. */
-	region_truncate(&resv_map->regions, 0);
+	region_truncate(&resv_map->rg_root, 0);
 	kfree(resv_map);
 }
 
@@ -980,16 +926,19 @@ static void return_unused_surplus_pages(struct hstate *h,
  * No action is required on failure.
  */
 static long vma_needs_reservation(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long addr)
+				  struct vm_area_struct *vma,
+				  unsigned long addr,
+				  struct range_tree_node **rg_node)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
+	struct hugetlbfs_inode_info *hinfo = HUGETLBFS_I(inode);
 
+	*rg_node = NULL;
 	if (vma->vm_flags & VM_MAYSHARE) {
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
-		return region_chg(&inode->i_mapping->private_list,
-							idx, idx + 1);
 
+		return region_chg(&hinfo->rg_root, idx, idx + 1, rg_node);
 	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 		return 1;
 
@@ -998,28 +947,34 @@ static long vma_needs_reservation(struct hstate *h,
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		struct resv_map *reservations = vma_resv_map(vma);
 
-		err = region_chg(&reservations->regions, idx, idx + 1);
+		err = region_chg(&reservations->rg_root, idx, idx + 1, rg_node);
 		if (err < 0)
 			return err;
 		return 0;
 	}
 }
 static void vma_commit_reservation(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long addr)
+				   struct vm_area_struct *vma,
+				   unsigned long addr,
+				   struct range_tree_node *rg_node)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
+	struct hugetlbfs_inode_info *hinfo = HUGETLBFS_I(inode);
+
+	if (rg_node == NULL)
+		return;
 
 	if (vma->vm_flags & VM_MAYSHARE) {
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
-		region_add(&inode->i_mapping->private_list, idx, idx + 1);
+		region_add(&hinfo->rg_root, idx, idx + 1, rg_node);
 
 	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		struct resv_map *reservations = vma_resv_map(vma);
 
 		/* Mark this page used in the map. */
-		region_add(&reservations->regions, idx, idx + 1);
+		region_add(&reservations->rg_root, idx, idx + 1, rg_node);
 	}
 }
 
@@ -1027,6 +982,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
 	int ret, idx;
+	struct range_tree_node *rg_node;
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
 	struct mem_cgroup *memcg;
@@ -1042,18 +998,24 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 * MAP_NORESERVE mappings may also need pages and quota allocated
 	 * if no reserve mapping overlaps.
 	 */
-	chg = vma_needs_reservation(h, vma, addr);
-	if (chg < 0)
-		return ERR_PTR(-ENOMEM);
-	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
-			return ERR_PTR(-ENOSPC);
+	chg = vma_needs_reservation(h, vma, addr, &rg_node);
+	if (chg > 0 && rg_node == NULL) {
+		rg_node = kzalloc(sizeof(*rg_node), GFP_KERNEL);
+		if (rg_node == NULL)
+			return ERR_PTR(-ENOMEM);
+	}
 
+	if (chg) {
+		if (hugetlb_get_quota(inode->i_mapping, chg)) {
+			ret = -ENOSPC;
+			goto err_out;
+		}
+	}
 	ret = mem_cgroup_hugetlb_charge_page(idx, pages_per_huge_page(h),
 					     &memcg);
 	if (ret) {
-		hugetlb_put_quota(inode->i_mapping, chg);
-		return ERR_PTR(-ENOSPC);
+		ret = -ENOSPC;
+		goto err_out_quota;
 	}
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
@@ -1062,21 +1024,26 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			mem_cgroup_hugetlb_uncharge_memcg(idx,
-							 pages_per_huge_page(h),
-							 memcg);
-			hugetlb_put_quota(inode->i_mapping, chg);
-			return ERR_PTR(-ENOSPC);
+			ret = -ENOSPC;
+			goto err_out_uncharge;
 		}
 	}
 
 	set_page_private(page, (unsigned long) mapping);
 
-	vma_commit_reservation(h, vma, addr);
+	vma_commit_reservation(h, vma, addr, rg_node);
 	/* update page cgroup details */
 	mem_cgroup_hugetlb_commit_charge(idx, pages_per_huge_page(h),
 					 memcg, page);
 	return page;
+
+err_out_uncharge:
+	mem_cgroup_hugetlb_uncharge_memcg(idx, pages_per_huge_page(h), memcg);
+err_out_quota:
+	hugetlb_put_quota(inode->i_mapping, chg);
+err_out:
+	kfree(rg_node);
+	return ERR_PTR(ret);
 }
 
 int __weak alloc_bootmem_huge_page(struct hstate *h)
@@ -2170,7 +2137,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 		end = vma_hugecache_offset(h, vma, vma->vm_end);
 
 		reserve = (end - start) -
-			region_count(&reservations->regions, start, end);
+			region_count(&reservations->rg_root, start, end);
 
 		kref_put(&reservations->refs, resv_map_release);
 
@@ -2697,11 +2664,13 @@ retry:
 	 * any allocations necessary to record that reservation occur outside
 	 * the spinlock.
 	 */
-	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
-		if (vma_needs_reservation(h, vma, address) < 0) {
+	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+		struct range_tree_node *rg_node;
+		if (vma_needs_reservation(h, vma, address, &rg_node) < 0) {
 			ret = VM_FAULT_OOM;
 			goto backout_unlocked;
 		}
+	}
 
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> huge_page_shift(h);
@@ -2789,7 +2758,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * consumed.
 	 */
 	if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
-		if (vma_needs_reservation(h, vma, address) < 0) {
+		struct range_tree_node *rg_node;
+		if (vma_needs_reservation(h, vma, address, &rg_node) < 0) {
 			ret = VM_FAULT_OOM;
 			goto out_mutex;
 		}
@@ -2975,7 +2945,9 @@ int hugetlb_reserve_pages(struct inode *inode,
 					vm_flags_t vm_flags)
 {
 	long ret, chg;
+	struct range_tree_node *rg_node;
 	struct hstate *h = hstate_inode(inode);
+	struct hugetlbfs_inode_info *hinfo = HUGETLBFS_I(inode);
 
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
@@ -2992,25 +2964,27 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * called to make the mapping read-write. Assume !vma is a shm mapping
 	 */
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
-		chg = region_chg(&inode->i_mapping->private_list, from, to);
+		chg = region_chg(&hinfo->rg_root, from, to, &rg_node);
 	else {
 		struct resv_map *resv_map = resv_map_alloc();
 		if (!resv_map)
 			return -ENOMEM;
 
-		chg = to - from;
-
+		chg = region_chg(&resv_map->rg_root, from, to, &rg_node);
 		set_vma_resv_map(vma, resv_map);
 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
-
-	if (chg < 0)
-		return chg;
-
+	if (chg > 0 && rg_node == NULL ) {
+		/* We need allocate a new node */
+		rg_node = kzalloc(sizeof(*rg_node), GFP_KERNEL);
+		if (rg_node == NULL)
+			return -ENOMEM;
+	}
 	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
-		return -ENOSPC;
-
+	if (hugetlb_get_quota(inode->i_mapping, chg)) {
+		ret = -ENOSPC;
+		goto err_out;
+	}
 	/*
 	 * Check enough hugepages are available for the reservation.
 	 * Hand back the quota if there are not
@@ -3018,7 +2992,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugetlb_put_quota(inode->i_mapping, chg);
-		return ret;
+		goto err_out;
 	}
 
 	/*
@@ -3033,14 +3007,19 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * else has to be done for private mappings here
 	 */
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
-		region_add(&inode->i_mapping->private_list, from, to);
+		region_add(&hinfo->rg_root, from, to, rg_node);
 	return 0;
+err_out:
+	kfree(rg_node);
+	return ret;
 }
 
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
 	struct hstate *h = hstate_inode(inode);
-	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+	struct hugetlbfs_inode_info *hinfo = HUGETLBFS_I(inode);
+
+	long chg = region_truncate(&hinfo->rg_root, offset);
 
 	spin_lock(&inode->i_lock);
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);



* Re: [PATCH 1/2] [RFC] Range tree implementation
  2012-03-20 10:00   ` Dmitry Adamushko
@ 2012-03-20 18:04     ` John Stultz
  0 siblings, 0 replies; 15+ messages in thread
From: John Stultz @ 2012-03-20 18:04 UTC
  To: Dmitry Adamushko
  Cc: linux-kernel, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel, Dave Chinner,
	Neil Brown, Andrea Righi, Aneesh Kumar K.V

On 03/20/2012 03:00 AM, Dmitry Adamushko wrote:
> Hi John,
>
> On 16 March 2012 23:51, John Stultz<john.stultz@linaro.org>  wrote:
>> After Andrew suggested something like his mumbletree idea
>> to better store a list of ranges, I worked on a few different
>> approaches, and this is what I've finally managed to get working.
>>
>> I suspect range-tree isn't a totally accurate name, but I
>> couldn't quite make out the difference between range trees
>> and interval trees, so I just picked one to call it. Do
>> let me know if you have a better name.
>>
>> The idea of storing ranges in a tree is nice, but has a number
>> of complications. When adding a range, it's possible that a
>> large range will consume and merge a number of smaller ranges.
> Have you considered using 'prio_tree' (include/linux/prio_tree.h)? If
> we aim at addressing a wide range of possible use-cases (different
> patterns of adding/removing volatile ranges), then, at first glance,
> prio_tree looks like a better approach.
I'll take a closer look at that!

> e.g. for the "consume and merge a number of smaller ranges" scenario
> above, prio_tree gives O(log n) [ O(log n + m) ] behavior iso O(m log
> n) in your case.
Yeah, one of the items I was looking at yesterday was to improve the
range insert/remove usage, since I end up starting each lookup from the
root node over and over. I'm thinking of adding an iterate-next type
call so that we don't re-start the lookup on each iteration of the loop
once we've found an item.
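
Roughly, the usage I have in mind (untested sketch):

	struct range_tree_node *node;

	for (node = range_tree_in_range(root, start, end); node;
	     node = range_tree_next_in_range(node, start, end)) {
		/* visit each range overlapping [start, end] in order */
	}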

Thanks again for the great feedback!

-john



* [PATCH 1/2] [RFC] Range tree implementation
  2012-03-21  4:15 [PATCH 0/2] [RFC] fadivse volatile & range tree (v5) John Stultz
@ 2012-03-21  4:15 ` John Stultz
  0 siblings, 0 replies; 15+ messages in thread
From: John Stultz @ 2012-03-21  4:15 UTC
  To: linux-kernel
  Cc: John Stultz, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel,
	Dmitry Adamushko, Dave Chinner, Neil Brown, Andrea Righi,
	Aneesh Kumar K.V

After Andrew suggested something like his mumbletree idea
to better store a list of ranges, I worked on a few different
approaches, and this is what I've finally managed to get working.

I suspect range-tree isn't a totally accurate name, but I
couldn't quite make out the difference between range trees
and interval trees, so I just picked one to call it. Do
let me know if you have a better name.

The idea of storing ranges in a tree is nice, but has a number
of complications. When adding a range, it's possible that a
large range will consume and merge a number of smaller ranges.
When removing a range, it's possible you may end up splitting an
existing range, causing one range to become two. This makes it
very difficult to provide generic list_head-like behavior, as
the parent structures would need to be duplicated and removed,
and that has lots of memory ownership issues.

So, this is a much simplified and more list_head-like
implementation. You can add a node to a tree, or remove a node
from a tree, but the generic implementation doesn't do the
merging or splitting for you. It does, however, provide helpers
to find overlapping and adjacent ranges.

Andrew also really wanted this range-tree implementation to be
reusable so we don't duplicate the file locking logic. I'm not
totally convinced that the requirements between the volatile
ranges and file locking are really equivalent, but this reduced
implementation may make it possible.

Do let me know what you think or if you have other ideas for
better ways to do the same.

Changelog:
v2:
* Reworked code to use an rbtree instead of splaying

v3:
* Added range_tree_next_in_range() to avoid having to start
  lookups from the root every time.
* Fixed some comments and return NULL instead of 0, as suggested
  by Aneesh Kumar K.V

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Android Kernel Team <kernel-team@android.com>
CC: Robert Love <rlove@google.com>
CC: Mel Gorman <mel@csn.ul.ie>
CC: Hugh Dickins <hughd@google.com>
CC: Dave Hansen <dave@linux.vnet.ibm.com>
CC: Rik van Riel <riel@redhat.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Dave Chinner <david@fromorbit.com>
CC: Neil Brown <neilb@suse.de>
CC: Andrea Righi <andrea@betterlinux.com>
CC: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/rangetree.h |   56 ++++++++++++++++++++
 lib/Makefile              |    2 +-
 lib/rangetree.c           |  124 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 181 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/rangetree.h
 create mode 100644 lib/rangetree.c

diff --git a/include/linux/rangetree.h b/include/linux/rangetree.h
new file mode 100644
index 0000000..c61ce7c
--- /dev/null
+++ b/include/linux/rangetree.h
@@ -0,0 +1,56 @@
+#ifndef _LINUX_RANGETREE_H
+#define _LINUX_RANGETREE_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct range_tree_node {
+	struct rb_node rb;
+	u64 start;
+	u64 end;
+};
+
+struct range_tree_root {
+	struct rb_root head;
+};
+
+static inline void range_tree_init(struct range_tree_root *root)
+{
+	root->head = RB_ROOT;
+}
+
+static inline void range_tree_node_init(struct range_tree_node *node)
+{
+	rb_init_node(&node->rb);
+	node->start = 0;
+	node->end = 0;
+}
+
+static inline int range_tree_empty(struct range_tree_root *root)
+{
+	return RB_EMPTY_ROOT(&root->head);
+}
+
+static inline
+struct range_tree_node *range_tree_root_node(struct range_tree_root *root)
+{
+	struct range_tree_node *ret;
+	ret = container_of(root->head.rb_node, struct range_tree_node, rb);
+	return ret;
+}
+
+extern struct range_tree_node *range_tree_in_range(struct range_tree_root *root,
+							 u64 start, u64 end);
+extern struct range_tree_node *range_tree_in_range_adjacent(
+						struct range_tree_root *root,
+							 u64 start, u64 end);
+extern struct range_tree_node *range_tree_next_in_range(
+						struct range_tree_node *node,
+						u64 start, u64 end);
+extern void range_tree_add(struct range_tree_root *root,
+						struct range_tree_node *node);
+extern void range_tree_remove(struct range_tree_root *root,
+						struct range_tree_node *node);
+#endif
+
+
diff --git a/lib/Makefile b/lib/Makefile
index 18515f0..f43ef0d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -12,7 +12,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o show_mem.o \
-	 is_single_threaded.o plist.o decompress.o
+	 is_single_threaded.o plist.o decompress.o rangetree.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/rangetree.c b/lib/rangetree.c
new file mode 100644
index 0000000..a42b28e
--- /dev/null
+++ b/lib/rangetree.c
@@ -0,0 +1,124 @@
+#include <linux/rangetree.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+
+/**
+ * range_tree_in_range - Returns the first node that overlaps with the
+ *                       given range
+ * @root: range_tree root
+ * @start: range start
+ * @end: range end
+ *
+ */
+struct range_tree_node *range_tree_in_range(struct range_tree_root *root,
+						u64 start, u64 end)
+{
+	struct rb_node **p = &root->head.rb_node;
+	struct range_tree_node *candidate;
+
+	while (*p) {
+		candidate = rb_entry(*p, struct range_tree_node, rb);
+		if (end < candidate->start)
+			p = &(*p)->rb_left;
+		else if (start > candidate->end)
+			p = &(*p)->rb_right;
+		else
+			return candidate;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * range_tree_in_range_adjacent - Returns the first node that overlaps or
+ *                                is adjacent with the given range.
+ * @root: range_tree root
+ * @start: range start
+ * @end: range end
+ *
+ */
+struct range_tree_node *range_tree_in_range_adjacent(
+					struct range_tree_root *root,
+					u64 start, u64 end)
+{
+	struct rb_node **p = &root->head.rb_node;
+	struct range_tree_node *candidate;
+
+	while (*p) {
+		candidate = rb_entry(*p, struct range_tree_node, rb);
+		if (end+1 < candidate->start)
+			p = &(*p)->rb_left;
+		else if (start > candidate->end + 1)
+			p = &(*p)->rb_right;
+		else
+			return candidate;
+	}
+	return NULL;
+}
+
+struct range_tree_node *range_tree_next_in_range(struct range_tree_node *node,
+							u64 start, u64 end)
+{
+	struct rb_node *next;
+	struct range_tree_node *candidate;
+	if (!node)
+		return NULL;
+	next = rb_next(&node->rb);
+	if (!next)
+		return NULL;
+
+	candidate = container_of(next, struct range_tree_node, rb);
+
+	if ((candidate->start > end) || (candidate->end < start))
+		return NULL;
+
+	return candidate;
+}
+
+/**
+ * range_tree_add - Add a node to a range tree
+ * @root: range tree to be added to
+ * @node: range_tree_node to be added
+ *
+ * Adds a node to the range tree.
+ */
+void range_tree_add(struct range_tree_root *root,
+					struct range_tree_node *node)
+{
+	struct rb_node **p = &root->head.rb_node;
+	struct rb_node *parent = NULL;
+	struct range_tree_node *ptr;
+
+	WARN_ON_ONCE(!RB_EMPTY_NODE(&node->rb));
+
+	while (*p) {
+		parent = *p;
+		ptr = rb_entry(parent, struct range_tree_node, rb);
+		if (node->start < ptr->start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&node->rb, parent, p);
+	rb_insert_color(&node->rb, &root->head);
+
+}
+
+
+/**
+ * range_tree_remove: Removes a given node from the tree
+ * @root: root of tree
+ * @node: Node to be removed
+ *
+ * Removes a node from the range tree
+ */
+void range_tree_remove(struct range_tree_root *root,
+						struct range_tree_node *node)
+{
+	WARN_ON_ONCE(RB_EMPTY_NODE(&node->rb));
+
+	rb_erase(&node->rb, &root->head);
+	RB_CLEAR_NODE(&node->rb);
+}
-- 
1.7.3.2.146.gca209



* [PATCH 0/2] [RFC] Volatile Ranges (v6)
@ 2012-04-07  0:08 John Stultz
  2012-04-07  0:08 ` [PATCH 1/2] [RFC] Range tree implementation John Stultz
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: John Stultz @ 2012-04-07  0:08 UTC
  To: linux-kernel
  Cc: John Stultz, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel,
	Dmitry Adamushko, Dave Chinner, Neil Brown, Andrea Righi,
	Aneesh Kumar K.V

Just wanted to send out another iteration of the volatile range code
for review and comment. 

This revision improves the coalescing logic, fixes some bugs in the
range tree management, and uses a hash table to avoid bloating the
address_space structure with range_tree pointers.

Still looking for guidance on what a better interface for this might be,
as well as thoughts on how to best drop the ranges under memory pressure
(vmtruncate_region is pretty close to what I want, but lockdep makes it
clear that it's not really safe to call from a shrinker).

Another detail is that by hanging the volatile ranges off the
address_space, the volatility for tmpfs files persists even when no one
has an open fd on the file. This could cause some surprises if application
A marked some pages volatile and died, then application B opened the file
and had pages dropped out underneath it while it was being used. I suspect
I need to clean up the volatility when all fds are dropped. But any extra
insight would be useful here.


Thanks for the continued advice and feedback!
-john

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Android Kernel Team <kernel-team@android.com>
CC: Robert Love <rlove@google.com>
CC: Mel Gorman <mel@csn.ul.ie>
CC: Hugh Dickins <hughd@google.com>
CC: Dave Hansen <dave@linux.vnet.ibm.com>
CC: Rik van Riel <riel@redhat.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Dave Chinner <david@fromorbit.com>
CC: Neil Brown <neilb@suse.de>
CC: Andrea Righi <andrea@betterlinux.com>
CC: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

John Stultz (2):
  [RFC] Range tree implementation
  [RFC] fadvise: Add _VOLATILE,_ISVOLATILE, and _NONVOLATILE flags

 fs/inode.c                |    2 +
 include/linux/fadvise.h   |    5 +
 include/linux/rangetree.h |   56 ++++++
 include/linux/volatile.h  |   14 ++
 lib/Makefile              |    2 +-
 lib/rangetree.c           |  128 +++++++++++++
 mm/Makefile               |    2 +-
 mm/fadvise.c              |   16 ++-
 mm/volatile.c             |  440 +++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 662 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/rangetree.h
 create mode 100644 include/linux/volatile.h
 create mode 100644 lib/rangetree.c
 create mode 100644 mm/volatile.c

-- 
1.7.3.2.146.gca209



* [PATCH 1/2] [RFC] Range tree implementation
  2012-04-07  0:08 [PATCH 0/2] [RFC] Volatile Ranges (v6) John Stultz
@ 2012-04-07  0:08 ` John Stultz
  2012-04-07 17:36   ` Sasha Levin
  2012-04-07  0:08 ` [PATCH 2/2] [RFC] fadvise: Add _VOLATILE,_ISVOLATILE, and _NONVOLATILE flags John Stultz
  2012-04-07  8:14 ` [PATCH 0/2] [RFC] Volatile Ranges (v6) Dmitry Adamushko
  2 siblings, 1 reply; 15+ messages in thread
From: John Stultz @ 2012-04-07  0:08 UTC
  To: linux-kernel
  Cc: John Stultz, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel,
	Dmitry Adamushko, Dave Chinner, Neil Brown, Andrea Righi,
	Aneesh Kumar K.V

After Andrew suggested something like his mumbletree idea
to better store a list of ranges, I worked on a few different
approaches, and this is what I've finally managed to get working.

I suspect range-tree isn't a totally accurate name, but I
couldn't quite make out the difference between range trees
and interval trees, so I just picked one to call it. Do
let me know if you have a better name.

The idea of storing ranges in a tree is nice, but has a number
of complications. When adding a range, it's possible that a
large range will consume and merge a number of smaller ranges.
When removing a range, it's possible you may end up splitting an
existing range, causing one range to become two. This makes it
very difficult to provide generic list_head-like behavior, as
the parent structures would need to be duplicated and removed,
and that has lots of memory ownership issues.

So, this is a much simplified and more list_head-like
implementation. You can add a node to a tree, or remove a node
from a tree, but the generic implementation doesn't do the
merging or splitting for you. It does, however, provide helpers
to find overlapping and adjacent ranges.

Andrew also really wanted this range-tree implementation to be
reusable so we don't duplicate the file locking logic. I'm not
totally convinced that the requirements between the volatile
ranges and file locking are really equivalent, but this reduced
implementation may make it possible.

Do let me know what you think or if you have other ideas for
better ways to do the same.

Changelog:
v2:
* Reworked code to use an rbtree instead of splaying

v3:
* Added range_tree_next_in_range() to avoid having to start
  lookups from the root every time.
* Fixed some comments and return NULL instead of 0, as suggested
  by Aneesh Kumar K.V

v6:
* Fixed range_tree_in_range() so that it finds the earliest overlapping
  range, rather than the first match encountered (e.g. with [0,5] and
  [3,9] in the tree, a lookup for [4,4] now returns [0,5], and
  next_in_range() then walks on to [3,9]). This allows the
  next_in_range() function to properly cover all the ranges in the tree.
* Minor cleanups to simplify some of the functions

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Android Kernel Team <kernel-team@android.com>
CC: Robert Love <rlove@google.com>
CC: Mel Gorman <mel@csn.ul.ie>
CC: Hugh Dickins <hughd@google.com>
CC: Dave Hansen <dave@linux.vnet.ibm.com>
CC: Rik van Riel <riel@redhat.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Dave Chinner <david@fromorbit.com>
CC: Neil Brown <neilb@suse.de>
CC: Andrea Righi <andrea@betterlinux.com>
CC: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/rangetree.h |   56 ++++++++++++++++++++
 lib/Makefile              |    2 +-
 lib/rangetree.c           |  128 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 185 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/rangetree.h
 create mode 100644 lib/rangetree.c

diff --git a/include/linux/rangetree.h b/include/linux/rangetree.h
new file mode 100644
index 0000000..c61ce7c
--- /dev/null
+++ b/include/linux/rangetree.h
@@ -0,0 +1,56 @@
+#ifndef _LINUX_RANGETREE_H
+#define _LINUX_RANGETREE_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct range_tree_node {
+	struct rb_node rb;
+	u64 start;
+	u64 end;
+};
+
+struct range_tree_root {
+	struct rb_root head;
+};
+
+static inline void range_tree_init(struct range_tree_root *root)
+{
+	root->head = RB_ROOT;
+}
+
+static inline void range_tree_node_init(struct range_tree_node *node)
+{
+	rb_init_node(&node->rb);
+	node->start = 0;
+	node->end = 0;
+}
+
+static inline int range_tree_empty(struct range_tree_root *root)
+{
+	return RB_EMPTY_ROOT(&root->head);
+}
+
+static inline
+struct range_tree_node *range_tree_root_node(struct range_tree_root *root)
+{
+	struct range_tree_node *ret;
+	ret = container_of(root->head.rb_node, struct range_tree_node, rb);
+	return ret;
+}
+
+extern struct range_tree_node *range_tree_in_range(struct range_tree_root *root,
+							 u64 start, u64 end);
+extern struct range_tree_node *range_tree_in_range_adjacent(
+						struct range_tree_root *root,
+							 u64 start, u64 end);
+extern struct range_tree_node *range_tree_next_in_range(
+						struct range_tree_node *node,
+						u64 start, u64 end);
+extern void range_tree_add(struct range_tree_root *root,
+						struct range_tree_node *node);
+extern void range_tree_remove(struct range_tree_root *root,
+						struct range_tree_node *node);
+#endif
+
+
diff --git a/lib/Makefile b/lib/Makefile
index 18515f0..f43ef0d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -12,7 +12,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o show_mem.o \
-	 is_single_threaded.o plist.o decompress.o
+	 is_single_threaded.o plist.o decompress.o rangetree.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/rangetree.c b/lib/rangetree.c
new file mode 100644
index 0000000..08185bc
--- /dev/null
+++ b/lib/rangetree.c
@@ -0,0 +1,128 @@
+#include <linux/rangetree.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+
+
+/**
+ * range_tree_in_range - Returns the earliest node that overlaps with the
+ *                       given range
+ * @root: range_tree root
+ * @start: range start
+ * @end: range end
+ *
+ */
+struct range_tree_node *range_tree_in_range(struct range_tree_root *root,
+						u64 start, u64 end)
+{
+	struct rb_node *p = root->head.rb_node;
+	struct range_tree_node *candidate, *match = NULL;
+
+	while (p) {
+		candidate = rb_entry(p, struct range_tree_node, rb);
+		if (end < candidate->start)
+			p = p->rb_left;
+		else if (start > candidate->end)
+			p = p->rb_right;
+		else {
+			/* We found one, but try to find an earlier match */
+			match = candidate;
+			p = p->rb_left;
+		}
+	}
+
+	return match;
+}
+
+
+/**
+ * range_tree_in_range_adjacent - Returns the first node that overlaps or
+ *                                is adjacent with the given range.
+ * @root: range_tree root
+ * @start: range start
+ * @end: range end
+ *
+ */
+struct range_tree_node *range_tree_in_range_adjacent(
+					struct range_tree_root *root,
+					u64 start, u64 end)
+{
+	struct rb_node *p = root->head.rb_node;
+	struct range_tree_node *candidate;
+
+	while (p) {
+		candidate = rb_entry(p, struct range_tree_node, rb);
+		if (end+1 < candidate->start)
+			p = p->rb_left;
+		else if (start > candidate->end + 1)
+			p = p->rb_right;
+		else
+			return candidate;
+	}
+	return NULL;
+}
+
+struct range_tree_node *range_tree_next_in_range(struct range_tree_node *node,
+							u64 start, u64 end)
+{
+	struct rb_node *next;
+	struct range_tree_node *candidate;
+	if (!node)
+		return NULL;
+	next = rb_next(&node->rb);
+	if (!next)
+		return NULL;
+
+	candidate = container_of(next, struct range_tree_node, rb);
+
+	if ((candidate->start > end) || (candidate->end < start))
+		return NULL;
+
+	return candidate;
+}
+
+/**
+ * range_tree_add - Add a node to a range tree
+ * @root: range tree to be added to
+ * @node: range_tree_node to be added
+ *
+ * Adds a node to the range tree.
+ */
+void range_tree_add(struct range_tree_root *root,
+					struct range_tree_node *node)
+{
+	struct rb_node **p = &root->head.rb_node;
+	struct rb_node *parent = NULL;
+	struct range_tree_node *ptr;
+
+	WARN_ON_ONCE(!RB_EMPTY_NODE(&node->rb));
+
+	while (*p) {
+		parent = *p;
+		ptr = rb_entry(parent, struct range_tree_node, rb);
+		if (node->start < ptr->start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&node->rb, parent, p);
+	rb_insert_color(&node->rb, &root->head);
+
+}
+
+
+/**
+ * range_tree_remove: Removes a given node from the tree
+ * @root: root of tree
+ * @node: Node to be removed
+ *
+ * Removes a node from the range tree
+ */
+void range_tree_remove(struct range_tree_root *root,
+						struct range_tree_node *node)
+{
+	WARN_ON_ONCE(RB_EMPTY_NODE(&node->rb));
+
+	rb_erase(&node->rb, &root->head);
+	RB_CLEAR_NODE(&node->rb);
+}
-- 
1.7.3.2.146.gca209


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/2] [RFC] fadvise: Add _VOLATILE,_ISVOLATILE, and _NONVOLATILE flags
  2012-04-07  0:08 [PATCH 0/2] [RFC] Volatile Ranges (v6) John Stultz
  2012-04-07  0:08 ` [PATCH 1/2] [RFC] Range tree implementation John Stultz
@ 2012-04-07  0:08 ` John Stultz
  2012-04-07  8:14 ` [PATCH 0/2] [RFC] Volatile Ranges (v6) Dmitry Adamushko
  2 siblings, 0 replies; 15+ messages in thread
From: John Stultz @ 2012-04-07  0:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: John Stultz, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel,
	Dmitry Adamushko, Dave Chinner, Neil Brown, Andrea Righi,
	Aneesh Kumar K.V

This patch provides new fadvise flags that can be used to mark
file pages as volatile, which allows them to be discarded if the
kernel wants to reclaim memory.

This is useful for userspace to allocate things like caches, and lets
the kernel destructively (but safely) reclaim them when there's memory
pressure.

It's different from FADV_DONTNEED since the pages are not immediately
discarded; they are only discarded under pressure.
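
As a rough userspace sketch of the intended usage (flag values taken
from this patch; the raw syscall is used because a libc posix_fadvise()
wrapper may not pass through the positive "purged" return value, and
the argument layout shown is the x86_64 fadvise64 one):

	#include <unistd.h>
	#include <sys/syscall.h>

	#define POSIX_FADV_VOLATILE	8	/* values from this patch */
	#define POSIX_FADV_NONVOLATILE	9

	/* While a cache file sits idle, let the kernel reclaim it
	 * under pressure. */
	static void cache_idle(int fd, off_t len)
	{
		syscall(SYS_fadvise64, fd, (off_t)0, len,
			POSIX_FADV_VOLATILE);
	}

	/* Take the pages back before reuse; returns nonzero if any
	 * were purged and the cache must be rebuilt. */
	static int cache_reuse(int fd, off_t len)
	{
		return syscall(SYS_fadvise64, fd, (off_t)0, len,
				POSIX_FADV_NONVOLATILE) > 0;
	}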

This is very much influenced by the Android Ashmem interface by
Robert Love, so credits to him and the Android developers.
In many cases the code & logic come directly from the ashmem patch.
The intent of this patch is to allow for ashmem-like behavior, but
embeds the idea a little deeper into the VM code, instead of isolating
it into a specific driver.

I'm very much a newbie at the VM code, so at this point I just want
to try to get some input on the patch. If you have another idea
for using something other than fadvise, or other thoughts on how the
volatile ranges are stored, I'd be really interested in hearing them.
So let me know if you have any comments or feedback!

Also many thanks to Dave Hansen who helped design and develop the
initial version of this patch, and has provided continued review and
mentoring for me in the VM code.

v2:
* After the valid critique that just dropping pages would poke holes
in volatile ranges, and that we should instead zap an entire range if we
drop any of it, I changed the code to more closely mimic the ashmem
implementation, which zaps entire ranges via a shrinker using an lru
list that tracks which range has been marked volatile the longest.

v3:
* Reworked to use range tree implementation.

v4:
* Renamed functions to avoid confusion.
* More consistent PAGE_CACHE_SHIFT usage, suggested by Dmitry
  Adamushko
* Fixed an exit-without-unlocking issue found by Dmitry Adamushko
* Migrate to rbtree based rangetree implementation
* Simplified locking to use global lock (we were grabbing global
  lru lock every time anyway).
* Avoid ENOMEM issues by allocating before we get into complicated
  code.
* Add some documentation to the volatile.c file from Neil Brown

v5:
* More fixes suggested by Dmitry Adamushko
* Improve range coalescing so that we don't coalesce neighboring
  purged ranges.
* Utilize range_tree_next_in_range to avoid doing every lookup
  from the tree's root.

v6:
* Immediately zap range if we coalesce overlapping purged range.
* Use hash table to do mapping->rangetree lookup instead of
  bloating the address_space structure

Known issues:
* Lockdep doesn't like calling vmtruncate_range() from a shrinker.
  Any help here on how to address this would be appreciated.
  I've tried switching to invalidate_inode_pages2_range, but
  that always returns EBUSY in my testing, and I don't really
  want to launder dirty pages; instead I want to zap them.

* Volatile range persistence needs to be thought through. Currently
  the volatility follows the inode in memory, which for tmpfs sticks
  around. This means application A could open a file, mark it volatile,
  close it. Then application B opens the file, and as it's using it finds
  the pages earlier marked volatile disappearing under it. I think it
  probably makes more sense to drop all volatile ranges after all the
  fds to the file have been closed. This may also be something that
  changes if we switch to a different interface. Suggestions here
  would be great.

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Android Kernel Team <kernel-team@android.com>
CC: Robert Love <rlove@google.com>
CC: Mel Gorman <mel@csn.ul.ie>
CC: Hugh Dickins <hughd@google.com>
CC: Dave Hansen <dave@linux.vnet.ibm.com>
CC: Rik van Riel <riel@redhat.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Dave Chinner <david@fromorbit.com>
CC: Neil Brown <neilb@suse.de>
CC: Andrea Righi <andrea@betterlinux.com>
CC: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 fs/inode.c               |    2 +
 include/linux/fadvise.h  |    5 +
 include/linux/volatile.h |   14 ++
 mm/Makefile              |    2 +-
 mm/fadvise.c             |   16 ++-
 mm/volatile.c            |  451 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 488 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/volatile.h
 create mode 100644 mm/volatile.c

diff --git a/fs/inode.c b/fs/inode.c
index 9f4f5fe..590d314 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,7 @@
 #include <linux/prefetch.h>
 #include <linux/buffer_head.h> /* for inode_has_buffers */
 #include <linux/ratelimit.h>
+#include <linux/volatile.h>
 #include "internal.h"
 
 /*
@@ -244,6 +245,7 @@ void __destroy_inode(struct inode *inode)
 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_default_acl);
 #endif
+	mapping_clear_volatile_ranges(&inode->i_data);
 	this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
diff --git a/include/linux/fadvise.h b/include/linux/fadvise.h
index e8e7471..443951c 100644
--- a/include/linux/fadvise.h
+++ b/include/linux/fadvise.h
@@ -18,4 +18,9 @@
 #define POSIX_FADV_NOREUSE	5 /* Data will be accessed once.  */
 #endif
 
+#define POSIX_FADV_VOLATILE	8  /* _can_ toss, but don't toss now */
+#define POSIX_FADV_NONVOLATILE	9  /* Remove VOLATILE flag */
+
+
+
 #endif	/* FADVISE_H_INCLUDED */
diff --git a/include/linux/volatile.h b/include/linux/volatile.h
new file mode 100644
index 0000000..5460d7b
--- /dev/null
+++ b/include/linux/volatile.h
@@ -0,0 +1,14 @@
+#ifndef _LINUX_VOLATILE_H
+#define _LINUX_VOLATILE_H
+
+#include <linux/fs.h>
+
+extern long mapping_range_volatile(struct address_space *mapping,
+				pgoff_t start_index, pgoff_t end_index);
+extern long mapping_range_nonvolatile(struct address_space *mapping,
+				pgoff_t start_index, pgoff_t end_index);
+extern long mapping_range_isvolatile(struct address_space *mapping,
+				pgoff_t start_index, pgoff_t end_index);
+extern void mapping_clear_volatile_ranges(struct address_space *mapping);
+
+#endif /* _LINUX_VOLATILE_H */
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00e..7b6c7a8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -13,7 +13,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o mmu_context.o percpu.o \
-			   $(mmu-y)
+			   volatile.o $(mmu-y)
 obj-y += init-mm.o
 
 ifdef CONFIG_NO_BOOTMEM
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0..3e33845 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -17,6 +17,7 @@
 #include <linux/fadvise.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
+#include <linux/volatile.h>
 
 #include <asm/unistd.h>
 
@@ -106,7 +107,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		nrpages = end_index - start_index + 1;
 		if (!nrpages)
 			nrpages = ~0UL;
-		
+
 		ret = force_page_cache_readahead(mapping, file,
 				start_index,
 				nrpages);
@@ -128,6 +129,19 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 			invalidate_mapping_pages(mapping, start_index,
 						end_index);
 		break;
+	case POSIX_FADV_VOLATILE:
+		/* First and last PARTIAL page! */
+		start_index = offset >> PAGE_CACHE_SHIFT;
+		end_index = endbyte >> PAGE_CACHE_SHIFT;
+		ret = mapping_range_volatile(mapping, start_index, end_index);
+		break;
+	case POSIX_FADV_NONVOLATILE:
+		/* First and last PARTIAL page! */
+		start_index = offset >> PAGE_CACHE_SHIFT;
+		end_index = endbyte >> PAGE_CACHE_SHIFT;
+		ret = mapping_range_nonvolatile(mapping, start_index,
+								end_index);
+		break;
 	default:
 		ret = -EINVAL;
 	}
diff --git a/mm/volatile.c b/mm/volatile.c
new file mode 100644
index 0000000..12450be
--- /dev/null
+++ b/mm/volatile.c
@@ -0,0 +1,451 @@
+/* mm/volatile.c
+ *
+ * Volatile page range management.
+ *      Copyright 2011 Linaro
+ *
+ * Based on mm/ashmem.c
+ *      by Robert Love <rlove@google.com>
+ *      Copyright (C) 2008 Google, Inc.
+ *
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ *
+ * The goal behind volatile ranges is to allow applications to interact
+ * with the kernel's cache management infrastructure.  In particular an
+ * application can say "this memory contains data that might be useful in
+ * the future, but can be reconstructed if necessary, so if the kernel
+ * needs to, it can zap and reclaim this memory without having to swap it out."
+ *
+ * The proposed mechanism - at a high level - is for user-space to be able
+ * to say "This memory is volatile" and then later "this memory is no longer
+ * volatile".  If the content of the memory is still available the second
+ * request succeeds.  If not, the memory is marked non-volatile and an
+ * error is returned to denote that the contents have been lost.
+ *
+ * Credits to Neil Brown for the above description.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/volatile.h>
+#include <linux/rangetree.h>
+#include <linux/hash.h>
+
+static DEFINE_MUTEX(volatile_mutex);
+
+struct volatile_range {
+	struct list_head	lru;
+	struct range_tree_node	range_node;
+	unsigned int		purged;
+	struct address_space	*mapping;
+};
+
+/* LRU list of volatile page ranges */
+static LIST_HEAD(volatile_lru_list);
+
+/* Count of pages on our LRU list */
+static u64 lru_count;
+
+
+/*
+ * To avoid bloating the address_space structure, we use
+ * a hash structure to map from address_space mappings to
+ * the range_tree root that stores volatile ranges
+ */
+static struct hlist_head *mapping_hash;
+static long mapping_hash_shift = 8;
+
+struct mapping_hash_entry {
+	struct range_tree_root root;
+	struct address_space *mapping;
+	struct hlist_node hnode;
+};
+
+static inline
+struct range_tree_root *mapping_allocate_root(struct address_space *mapping)
+{
+	struct mapping_hash_entry *entry;
+	struct range_tree_root *dblchk;
+
+	/* Drop the volatile_mutex to avoid lockdep deadlock warnings */
+	mutex_unlock(&volatile_mutex);
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	mutex_lock(&volatile_mutex);
+
+	/* Since we dropped the lock, double-check that no one else
+	 * created a root for this mapping while we slept */
+	dblchk = mapping_to_root(mapping);
+	if (dblchk) {
+		kfree(entry);
+		return dblchk;
+	}
+
+	INIT_HLIST_NODE(&entry->hnode);
+	entry->mapping = mapping;
+	range_tree_init(&entry->root);
+
+	hlist_add_head_rcu(&entry->hnode,
+		&mapping_hash[hash_ptr(mapping, mapping_hash_shift)]);
+
+	return &entry->root;
+}
+
+static inline
+struct range_tree_root *mapping_to_root(struct address_space *mapping)
+{
+	struct hlist_node *elem;
+	struct mapping_hash_entry *entry;
+
+	hlist_for_each_entry_rcu(entry, elem,
+			&mapping_hash[hash_ptr(mapping, mapping_hash_shift)],
+				hnode)
+		if (entry->mapping == mapping)
+			return &entry->root;
+
+	return NULL;
+}
+
+static inline void mapping_free_root(struct range_tree_root *root)
+{
+	struct mapping_hash_entry *entry;
+
+	entry = container_of(root, struct mapping_hash_entry, root);
+
+	hlist_del_rcu(&entry->hnode);
+	kfree(entry);
+}
+
+
+/* Range tree helpers */
+static inline u64 range_size(struct volatile_range *range)
+{
+	return range->range_node.end - range->range_node.start + 1;
+}
+
+static inline void lru_add(struct volatile_range *range)
+{
+	list_add_tail(&range->lru, &volatile_lru_list);
+	lru_count += range_size(range);
+}
+
+static inline void lru_del(struct volatile_range *range)
+{
+	list_del(&range->lru);
+	lru_count -= range_size(range);
+}
+
+#define range_on_lru(range) (!(range)->purged)
+
+
+static inline void volatile_range_resize(struct volatile_range *range,
+				pgoff_t start_index, pgoff_t end_index)
+{
+	size_t pre = range_size(range);
+
+	range->range_node.start = start_index;
+	range->range_node.end = end_index;
+
+	if (range_on_lru(range))
+		lru_count -= pre - range_size(range);
+}
+
+static struct volatile_range *vrange_alloc(void)
+{
+	struct volatile_range *new;
+
+	new = kzalloc(sizeof(struct volatile_range), GFP_KERNEL);
+	if (!new)
+		return NULL;
+	range_tree_node_init(&new->range_node);
+	return new;
+}
+
+static void vrange_del(struct range_tree_root *root,
+				struct volatile_range *vrange)
+{
+	if (range_on_lru(vrange))
+		lru_del(vrange);
+	range_tree_remove(root, &vrange->range_node);
+	kfree(vrange);
+}
+
+
+
+/*
+ * Mark a region as volatile, allowing dirty pages to be purged
+ * under memory pressure
+ */
+long mapping_range_volatile(struct address_space *mapping,
+				pgoff_t start_index, pgoff_t end_index)
+{
+	struct volatile_range *new;
+	struct range_tree_node *node;
+	struct volatile_range *vrange;
+	struct range_tree_root *root;
+	u64 start, end;
+	int purged = 0;
+	start = (u64)start_index;
+	end = (u64)end_index;
+
+	new = vrange_alloc();
+	if (!new)
+		return -ENOMEM;
+
+	mutex_lock(&volatile_mutex);
+
+
+	root = mapping_to_root(mapping);
+	if (!root)
+		root = mapping_allocate_root(mapping);
+
+	/* Find any existing ranges that overlap */
+	node = range_tree_in_range(root, start, end);
+	while (node) {
+		/* Already entirely marked volatile, so we're done */
+		if (node->start < start && node->end > end) {
+			/* don't need the allocated value */
+			kfree(new);
+			goto out;
+		}
+
+		/* Grab containing volatile range */
+		vrange = container_of(node, struct volatile_range, range_node);
+
+		/* resize range */
+		start = min_t(u64, start, node->start);
+		end = max_t(u64, end, node->end);
+		purged |= vrange->purged;
+
+		node = range_tree_next_in_range(&vrange->range_node,
+								start, end);
+		vrange_del(root, vrange);
+	}
+
+	/* Coalesce left-adjacent ranges */
+	node = range_tree_in_range(root, start-1, start);
+	if (node) {
+		vrange = container_of(node, struct volatile_range, range_node);
+		/* Only coalesce if both are either purged or unpurged */
+		if (vrange->purged == purged) {
+			/* resize range */
+			start = min_t(u64, start, node->start);
+			end = max_t(u64, end, node->end);
+			vrange_del(root, vrange);
+		}
+	}
+
+	/* Coalesce right-adjacent ranges */
+	node = range_tree_in_range(root, end, end+1);
+	if (node) {
+		vrange = container_of(node, struct volatile_range, range_node);
+		/* Only coalesce if both are either purged or unpurged */
+		if (vrange->purged == purged) {
+			/* resize range */
+			start = min_t(u64, start, node->start);
+			end = max_t(u64, end, node->end);
+			vrange_del(root, vrange);
+		}
+	}
+
+	new->mapping = mapping;
+	new->range_node.start = start;
+	new->range_node.end = end;
+	new->purged = purged;
+
+	if (purged) {
+		struct inode *inode;
+		loff_t pstart, pend;
+
+		inode = mapping->host;
+		pstart = start << PAGE_CACHE_SHIFT;
+		pend = ((end + 1) << PAGE_CACHE_SHIFT) - 1;
+		vmtruncate_range(inode, pstart, pend);
+	}
+	range_tree_add(root, &new->range_node);
+	if (range_on_lru(new))
+		lru_add(new);
+
+out:
+	mutex_unlock(&volatile_mutex);
+
+	return 0;
+}
+
+/*
+ * Mark a region as nonvolatile, returns 1 if any pages in the region
+ * were purged.
+ */
+long mapping_range_nonvolatile(struct address_space *mapping,
+				pgoff_t start_index, pgoff_t end_index)
+{
+	struct volatile_range *new;
+	struct range_tree_node *node;
+	struct range_tree_root *root;
+	int ret  = 0;
+	u64 start, end;
+	int used_new = 0;
+
+	start = (u64)start_index;
+	end = (u64)end_index;
+
+	/* create new node */
+	new = vrange_alloc();
+	if (!new)
+		return -ENOMEM;
+
+	mutex_lock(&volatile_mutex);
+	root = mapping_to_root(mapping);
+	if (!root)
+		root = mapping_allocate_root(mapping);
+
+	node = range_tree_in_range(root, start, end);
+	while (node) {
+		struct volatile_range *vrange;
+		vrange = container_of(node, struct volatile_range, range_node);
+
+		ret |= vrange->purged;
+
+		if (start <= node->start && end >= node->end) {
+			/* delete: volatile range is totally within range */
+			node = range_tree_next_in_range(&vrange->range_node,
+								start, end);
+			vrange_del(root, vrange);
+		} else if (node->start >= start) {
+			/* resize: volatile range right-overlaps range */
+			volatile_range_resize(vrange, end+1, node->end);
+			node = range_tree_next_in_range(&vrange->range_node,
+								start, end);
+
+		} else if (node->end <= end) {
+			/* resize: volatile range left-overlaps range */
+			volatile_range_resize(vrange, node->start, start-1);
+			node = range_tree_next_in_range(&vrange->range_node,
+								start, end);
+		} else {
+			/* split: range is totally within a volatile range */
+			used_new = 1; /* we only do this once */
+			new->mapping = mapping;
+			new->range_node.start = end + 1;
+			new->range_node.end = node->end;
+			new->purged = vrange->purged;
+			range_tree_add(root, &new->range_node);
+			if (range_on_lru(new))
+				lru_add(new);
+			volatile_range_resize(vrange, node->start, start-1);
+
+			break;
+		}
+	}
+	mutex_unlock(&volatile_mutex);
+
+	if (!used_new)
+		kfree(new);
+
+	return ret;
+}
+
+
+/*
+ * Cleans up any volatile ranges.
+ */
+void mapping_clear_volatile_ranges(struct address_space *mapping)
+{
+	struct volatile_range *tozap;
+	struct range_tree_root *root;
+
+	mutex_lock(&volatile_mutex);
+
+	root = mapping_to_root(mapping);
+	if (!root)
+		goto out;
+
+	while (!range_tree_empty(root)) {
+		struct range_tree_node *tmp;
+		tmp = range_tree_root_node(root);
+		tozap = container_of(tmp, struct volatile_range, range_node);
+		vrange_del(root, tozap);
+	}
+	mapping_free_root(root);
+out:
+	mutex_unlock(&volatile_mutex);
+}
+
+/*
+ * Purges volatile ranges when under memory pressure
+ */
+static int volatile_shrink(struct shrinker *ignored, struct shrink_control *sc)
+{
+	struct volatile_range *range, *next;
+	s64 nr_to_scan = sc->nr_to_scan;
+	const gfp_t gfp_mask = sc->gfp_mask;
+
+	if (nr_to_scan && !(gfp_mask & __GFP_FS))
+		return -1;
+	if (!nr_to_scan)
+		return lru_count;
+
+	mutex_lock(&volatile_mutex);
+	list_for_each_entry_safe(range, next, &volatile_lru_list, lru) {
+		struct inode *inode;
+		loff_t start, end;
+
+		inode = range->mapping->host;
+
+		start = range->range_node.start << PAGE_CACHE_SHIFT;
+		end = ((range->range_node.end + 1) << PAGE_CACHE_SHIFT) - 1;
+
+		/*
+		 * XXX - calling vmtruncate_range from a shrinker causes
+		 * lockdep warnings. Revisit this!
+		 */
+		if (!vmtruncate_range(inode, start, end)) {
+			lru_del(range);
+			range->purged = 1;
+			nr_to_scan -= range_size(range);
+		}
+
+		if (nr_to_scan <= 0)
+			break;
+	}
+	mutex_unlock(&volatile_mutex);
+
+	return lru_count;
+}
+
+static struct shrinker volatile_shrinker = {
+	.shrink = volatile_shrink,
+	.seeks = DEFAULT_SEEKS,
+};
+
+static int __init volatile_init(void)
+{
+	int i, size;
+
+	size = 1U << mapping_hash_shift;
+
+	mapping_hash = kzalloc(sizeof(*mapping_hash) * size, GFP_KERNEL);
+	if (!mapping_hash)
+		return -ENOMEM;
+
+	for (i = 0; i < size; i++)
+		INIT_HLIST_HEAD(&mapping_hash[i]);
+
+	register_shrinker(&volatile_shrinker);
+
+
+	return 0;
+}
+
+arch_initcall(volatile_init);
-- 
1.7.3.2.146.gca209


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/2] [RFC] Volatile Ranges (v6)
  2012-04-07  0:08 [PATCH 0/2] [RFC] Volatile Ranges (v6) John Stultz
  2012-04-07  0:08 ` [PATCH 1/2] [RFC] Range tree implementation John Stultz
  2012-04-07  0:08 ` [PATCH 2/2] [RFC] fadvise: Add _VOLATILE,_ISVOLATILE, and _NONVOLATILE flags John Stultz
@ 2012-04-07  8:14 ` Dmitry Adamushko
  2012-04-09 17:56   ` John Stultz
  2 siblings, 1 reply; 15+ messages in thread
From: Dmitry Adamushko @ 2012-04-07  8:14 UTC (permalink / raw)
  To: John Stultz
  Cc: linux-kernel, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel, Dave Chinner,
	Neil Brown, Andrea Righi, Aneesh Kumar K.V

On 7 April 2012 02:08, John Stultz <john.stultz@linaro.org> wrote:
>
> Another detail is that by hanging the volatile ranges off of the
> address_space, the volatility for tmpfs files persists even when no one
> has an open fd on the file. This could cause some surprises if application
> A marked some pages volatile and died, then application B opened the file
> and had pages dropped out underneath it while it was being used. I suspect
> I need to clean up the volatility when all fds are dropped.

And how do you handle the regions that have already been purged by
that point? Unless B has some specific mechanism to verify the
consistency of the content, a sensible way would be to always mark
the regions as non-volatile before accessing them and check the
return code to see if there are holes.

More generally, what if B opens the file while A is still working with
it? Besides the use of normal synchronization mechanisms, B should not
make any assumptions about the current state of the regions (unless there
is a high-level protocol between A and B to share this info). So an
explicit mark-as-non-volatile step could be a simple generic mechanism.

--Dmitry

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] [RFC] Range tree implementation
  2012-04-07  0:08 ` [PATCH 1/2] [RFC] Range tree implementation John Stultz
@ 2012-04-07 17:36   ` Sasha Levin
  2012-04-09 18:04     ` John Stultz
  0 siblings, 1 reply; 15+ messages in thread
From: Sasha Levin @ 2012-04-07 17:36 UTC (permalink / raw)
  To: John Stultz
  Cc: linux-kernel, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel,
	Dmitry Adamushko, Dave Chinner, Neil Brown, Andrea Righi,
	Aneesh Kumar K.V, Pekka Enberg, Ingo Molnar

On Sat, Apr 7, 2012 at 2:08 AM, John Stultz <john.stultz@linaro.org> wrote:
> After Andrew suggested something like his mumbletree idea
> to better store a list of ranges, I worked on a few different
> approaches, and this is what I've finally managed to get working.
>
> I suspect range-tree isn't a totally accurate name, but I
> couldn't quite make out the difference between range trees
> and interval trees, so I just picked one to call it. Do
> let me know if you have a better name.
>
> The idea of storing ranges in a tree is nice, but has a number
> of complications. When adding a range, it's possible that a
> large range will consume and merge a number of smaller ranges.
> When removing a range, it's possible you may end up splitting an
> existing range, causing one range to become two. This makes it
> very difficult to provide generic list_head like behavior, as
> the parent structures would need to be duplicated and removed,
> and that has lots of memory ownership issues.
>
> So, this is a much simplified and more list_head like
> implementation. You can add a node to a tree, or remove a node
> to a tree, but the generic implementation doesn't do the
> merging or splitting for you. But it does provide helpers to
> find overlapping and adjacent ranges.
>
> Andrew also really wanted this range-tree implementation to be
> reusable so we don't duplicate the file locking logic. I'm not
> totally convinced that the requirements between the volatile
> ranges and file locking are really equivalent, but this reduced
> implementation may make it possible.
>
> Do let me know what you think or if you have other ideas for
> better ways to do the same.

Hi John,

I have implemented an interval lookup tree using the augmented
features of the in-kernel rbtree for the KVM tools project. We
currently use it for several things in the framework code, such as
MMIO memory mapping.

From what I see in the patch, you haven't fully implemented the
interval structure (it needs to keep track of additional parameters
as the tree is built, and the search is a bit different and is based
on those parameters).

I could send that code as a patch against the kernel tree if something
like that is actually required for the kernel at this point.

Thanks.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/2] [RFC] Volatile Ranges (v6)
  2012-04-07  8:14 ` [PATCH 0/2] [RFC] Volatile Ranges (v6) Dmitry Adamushko
@ 2012-04-09 17:56   ` John Stultz
  0 siblings, 0 replies; 15+ messages in thread
From: John Stultz @ 2012-04-09 17:56 UTC (permalink / raw)
  To: Dmitry Adamushko
  Cc: linux-kernel, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel, Dave Chinner,
	Neil Brown, Andrea Righi, Aneesh Kumar K.V

On 04/07/2012 01:14 AM, Dmitry Adamushko wrote:
> On 7 April 2012 02:08, John Stultz<john.stultz@linaro.org>  wrote:
>> Another detail is that by hanging the volatile ranges off of the
>> address_space, the volatility for tmpfs files persists even when no one
>> has an open fd on the file. This could cause some surprises if application
>> A marked some pages volatile and died, then application B opened the file
>> and had pages dropped out underneath it while it was being used. I suspect
>> I need to clean up the volatility when all fds are dropped.
> And how do you handle the regions that have already been purged by
> this moment? Unless B has some specific mechanism to verify the
> consistency of the content, a sensible way would be to always mark off
> the regions as non-volatile before accessing them and verify the
> return code to see if there are holes.
>
> More generally, what if B opens the file while A is still working with
> it? Besides the use of normal synchronization mechanisms, B should not
> make any assumption on the current state of the regions (unless there
> is a high-level protocol between A and B to share this info). So an
> explicit mark-off-as-non_volatile could be a simple generic mechanism.
>

So yes, marking as non-volatile before you use pages would be a way to 
avoid the issue.  But it still rubs me the wrong way.

I think the main issue I have with it is that it makes volatility the 
assumed state. So unless you mark it non-volatile to begin with, the 
file could be volatile somewhere.  I feel like volatility should be the 
special state, not the assumed one, so normal applications that don't 
think about volatility are less likely to be surprised.

Now, when you have concurrent users of a file, you have to coordinate, 
and things can change under you. That's an expectation people already 
have.  But if volatile ranges persist, it's sort of introducing a form of
concurrency to non-concurrent access, where a killed application can
reach from the grave and zap a page in a file someone else is using.  I
think this is too unexpected.

The case that bit me in particular was in testing this patch: I had an
application (call it A) that had a bug and was marking a larger range
volatile than it later reset to non-volatile.  Then, when using the same
file later with a different test application (call it B), I was seeing
those further pages get zapped unexpectedly.  It took me a while to
realize that it wasn't a problem with the B application, or the patch
itself, but a persistent range that had been set much earlier by A.

So I suspect it would be better if volatile ranges were simply
cleared out when the last fd is closed.

thanks
-john





^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] [RFC] Range tree implementation
  2012-04-07 17:36   ` Sasha Levin
@ 2012-04-09 18:04     ` John Stultz
  2012-04-09 18:44       ` Sasha Levin
  0 siblings, 1 reply; 15+ messages in thread
From: John Stultz @ 2012-04-09 18:04 UTC (permalink / raw)
  To: Sasha Levin
  Cc: linux-kernel, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel,
	Dmitry Adamushko, Dave Chinner, Neil Brown, Andrea Righi,
	Aneesh Kumar K.V, Pekka Enberg, Ingo Molnar

On 04/07/2012 10:36 AM, Sasha Levin wrote:
> On Sat, Apr 7, 2012 at 2:08 AM, John Stultz<john.stultz@linaro.org>  wrote:
>> After Andrew suggested something like his mumbletree idea
>> to better store a list of ranges, I worked on a few different
>> approaches, and this is what I've finally managed to get working.
>>
>> I suspect range-tree isn't a totally accurate name, but I
>> couldn't quite make out the difference between range trees
>> and interval trees, so I just picked one to call it. Do
>> let me know if you have a better name.
>>
>> The idea of storing ranges in a tree is nice, but has a number
>> of complications. When adding a range, it's possible that a
>> large range will consume and merge a number of smaller ranges.
>> When removing a range, it's possible you may end up splitting an
>> existing range, causing one range to become two. This makes it
>> very difficult to provide generic list_head like behavior, as
>> the parent structures would need to be duplicated and removed,
>> and that has lots of memory ownership issues.
>>
>> So, this is a much simplified and more list_head like
>> implementation. You can add a node to a tree, or remove a node
>> from a tree, but the generic implementation doesn't do the
>> merging or splitting for you. But it does provide helpers to
>> find overlapping and adjacent ranges.
>>
>> Andrew also really wanted this range-tree implementation to be
>> reusable so we don't duplicate the file locking logic. I'm not
>> totally convinced that the requirements between the volatile
>> ranges and file locking are really equivalent, but this reduced
>> implementation may make it possible.
>>
>> Do let me know what you think or if you have other ideas for
>> better ways to do the same.
> Hi John,
>
> I have implemented an interval lookup tree using the augmented
> features of the in-kernel rbtree for the KVM tools project. We
> currently use it for several things as a framework code such as MMIO
> memory mapping.
>
>  From what I see in the patch, you haven't fully implemented the
> interval structure (it needs to keep track of additional parameters
> when building it, and the search is a bit different and is based on
> those parameters).
Any more details on what's missing/different? Or the pros/cons of the
different approaches?

> I could send that code as a patch against the kernel tree if something
> like that is actually required for the kernel at this point.
>
Sure.  I'm not married to this implementation (it's just the only one so
far that solves my needs - Dmitry already pointed out that the priotree 
might be close to sufficient, but I've not yet tried to adapt it), and 
whatever goes upstream needs to be generic enough to solve the related 
problems that a number of folks are all working on solving.  So seeing 
your approach might be good too.

thanks
-john



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] [RFC] Range tree implementation
  2012-04-09 18:04     ` John Stultz
@ 2012-04-09 18:44       ` Sasha Levin
  0 siblings, 0 replies; 15+ messages in thread
From: Sasha Levin @ 2012-04-09 18:44 UTC (permalink / raw)
  To: John Stultz
  Cc: linux-kernel, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel,
	Dmitry Adamushko, Dave Chinner, Neil Brown, Andrea Righi,
	Aneesh Kumar K.V, Pekka Enberg, Ingo Molnar

On Mon, Apr 9, 2012 at 8:04 PM, John Stultz <john.stultz@linaro.org> wrote:
> On 04/07/2012 10:36 AM, Sasha Levin wrote:
>>
>> On Sat, Apr 7, 2012 at 2:08 AM, John Stultz<john.stultz@linaro.org>
>>  wrote:
>>>
>>> After Andrew suggested something like his mumbletree idea
>>> to better store a list of ranges, I worked on a few different
>>> approaches, and this is what I've finally managed to get working.
>>>
>>> I suspect range-tree isn't a totally accurate name, but I
>>> couldn't quite make out the difference between range trees
>>> and interval trees, so I just picked one to call it. Do
>>> let me know if you have a better name.
>>>
>>> The idea of storing ranges in a tree is nice, but has a number
>>> of complications. When adding a range, it's possible that a
>>> large range will consume and merge a number of smaller ranges.
>>> When removing a range, it's possible you may end up splitting an
>>> existing range, causing one range to become two. This makes it
>>> very difficult to provide generic list_head like behavior, as
>>> the parent structures would need to be duplicated and removed,
>>> and that has lots of memory ownership issues.
>>>
>>> So, this is a much simplified and more list_head like
>>> implementation. You can add a node to a tree, or remove a node
>>> from a tree, but the generic implementation doesn't do the
>>> merging or splitting for you. But it does provide helpers to
>>> find overlapping and adjacent ranges.
>>>
>>> Andrew also really wanted this range-tree implementation to be
>>> reusable so we don't duplicate the file locking logic. I'm not
>>> totally convinced that the requirements between the volatile
>>> ranges and file locking are really equivalent, but this reduced
>>> implementation may make it possible.
>>>
>>> Do let me know what you think or if you have other ideas for
>>> better ways to do the same.
>>
>> Hi John,
>>
>> I have implemented an interval lookup tree using the augmented
>> features of the in-kernel rbtree for the KVM tools project. We
>> currently use it for several things as a framework code such as MMIO
>> memory mapping.
>>
>>  From what I see in the patch, you haven't fully implemented the
>> interval structure (it needs to keep track of additional parameters
>> when building it, and the search is a bit different and is based on
>> those parameters).
>
> Any more details on whats missing/different? Or the pros/cons of the
> different approaches?

I took the implementation from 'Introduction to Algorithms', which
calls for keeping, in the root of each sub-tree, the maximum high
endpoint of all the nodes in that sub-tree. I don't remember exactly
what stood behind that, but I can look it up when I get back to the book :)
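
(For context: that max field is what lets a lookup prune any subtree
whose recorded maximum endpoint falls below the start of the query
range. A minimal sketch of the scheme, with hypothetical names, not
the actual KVM tool code:)

	struct it_node {
		struct rb_node rb;
		u64 low, high;	/* this node's interval */
		u64 max_high;	/* largest 'high' in this subtree */
	};

	static struct it_node *it_search(struct rb_node *n,
					 u64 low, u64 high)
	{
		while (n) {
			struct it_node *i = rb_entry(n, struct it_node, rb);

			if (i->low <= high && low <= i->high)
				return i;	/* overlap found */
			/* descend left only if an overlap can exist there */
			if (n->rb_left && rb_entry(n->rb_left,
					struct it_node, rb)->max_high >= low)
				n = n->rb_left;
			else
				n = n->rb_right;
		}
		return NULL;
	}

(Keeping max_high correct through inserts and rotations is the part
the rbtree's augmented support is for.)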

>> I could send that code as a patch against the kernel tree if something
>> like that is actually required for the kernel at this point.
>>
> Sure.  I'm not married to this implementation (its just the only one so far
> that solves my needs - Dmitry already pointed out that the priotree might be
> close to sufficient, but I've not yet tried to adapt it), and whatever goes
> upstream needs to be generic enough to solve the related problems that a
> number of folks are all working on solving.  So seeing your approach might
> be good too.

The code is located here:
https://github.com/penberg/linux-kvm/blob/master/tools/kvm/util/rbtree-interval.c

Note that we didn't deal with intersecting ranges simply because there
was no call for that (we used it to map devices in a virtual memory
range, and those can't intersect). But it's not much of an issue to
expand the range intersection function to deal with that.

If the code above looks ok to you I'll format it as a patch and re-send it.

Thanks.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/2] [RFC] Range tree implementation
  2012-04-14  1:07 [PATCH 0/2][RFC] Volatile Ranges (v7) John Stultz
@ 2012-04-14  1:08 ` John Stultz
  0 siblings, 0 replies; 15+ messages in thread
From: John Stultz @ 2012-04-14  1:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: John Stultz, Andrew Morton, Android Kernel Team, Robert Love,
	Mel Gorman, Hugh Dickins, Dave Hansen, Rik van Riel,
	Dmitry Adamushko, Dave Chinner, Neil Brown, Andrea Righi,
	Aneesh Kumar K.V

After Andrew suggested something like his mumbletree idea
to better store a list of ranges, I worked on a few different
approaches, and this is what I've finally managed to get working.

I suspect range-tree isn't a totally accurate name, but I
couldn't quite make out the difference between range trees
and interval trees, so I just picked one to call it. Do
let me know if you have a better name.

The idea of storing ranges in a tree is nice, but has a number
of complications. When adding a range, it's possible that a
large range will consume and merge a number of smaller ranges.
When removing a range, it's possible you may end up splitting an
existing range, causing one range to become two. This makes it
very difficult to provide generic list_head like behavior, as
the parent structures would need to be duplicated and removed,
and that has lots of memory ownership issues.

So, this is a much simplified and more list_head like
implementation. You can add a node to a tree, or remove a node
from a tree, but the generic implementation doesn't do the
merging or splitting for you. But it does provide helpers to
find overlapping and adjacent ranges.

Andrew also really wanted this range-tree implementation to be
reusable so we don't duplicate the file locking logic. I'm not
totally convinced that the requirements between the volatile
ranges and file locking are really equivalent, but this reduced
implementation may make it possible.

Do let me know what you think or if you have other ideas for
better ways to do the same.

Changelog:
v2:
* Reworked code to use an rbtree instead of splaying

v3:
* Added range_tree_next_in_range() to avoid having to start
  lookups from the root every time.
* Fixed some comments and return NULL instead of 0, as suggested
  by Aneesh Kumar K.V

v6:
* Fixed range_tree_in_range() so that it finds the earliest range,
  rather than the first. This allows the next_in_range() function
  to properly cover all the ranges in the tree.
* Minor cleanups to simplify some of the functions

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Android Kernel Team <kernel-team@android.com>
CC: Robert Love <rlove@google.com>
CC: Mel Gorman <mel@csn.ul.ie>
CC: Hugh Dickins <hughd@google.com>
CC: Dave Hansen <dave@linux.vnet.ibm.com>
CC: Rik van Riel <riel@redhat.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Dave Chinner <david@fromorbit.com>
CC: Neil Brown <neilb@suse.de>
CC: Andrea Righi <andrea@betterlinux.com>
CC: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/rangetree.h |   56 ++++++++++++++++++++
 lib/Makefile              |    2 +-
 lib/rangetree.c           |  128 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 185 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/rangetree.h
 create mode 100644 lib/rangetree.c

diff --git a/include/linux/rangetree.h b/include/linux/rangetree.h
new file mode 100644
index 0000000..c61ce7c
--- /dev/null
+++ b/include/linux/rangetree.h
@@ -0,0 +1,56 @@
+#ifndef _LINUX_RANGETREE_H
+#define _LINUX_RANGETREE_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct range_tree_node {
+	struct rb_node rb;
+	u64 start;
+	u64 end;
+};
+
+struct range_tree_root {
+	struct rb_root head;
+};
+
+static inline void range_tree_init(struct range_tree_root *root)
+{
+	root->head = RB_ROOT;
+}
+
+static inline void range_tree_node_init(struct range_tree_node *node)
+{
+	rb_init_node(&node->rb);
+	node->start = 0;
+	node->end = 0;
+}
+
+static inline int range_tree_empty(struct range_tree_root *root)
+{
+	return RB_EMPTY_ROOT(&root->head);
+}
+
+static inline
+struct range_tree_node *range_tree_root_node(struct range_tree_root *root)
+{
+	struct range_tree_node *ret;
+	ret = container_of(root->head.rb_node, struct range_tree_node, rb);
+	return ret;
+}
+
+extern struct range_tree_node *range_tree_in_range(struct range_tree_root *root,
+							 u64 start, u64 end);
+extern struct range_tree_node *range_tree_in_range_adjacent(
+						struct range_tree_root *root,
+							 u64 start, u64 end);
+extern struct range_tree_node *range_tree_next_in_range(
+						struct range_tree_node *node,
+						u64 start, u64 end);
+extern void range_tree_add(struct range_tree_root *root,
+						struct range_tree_node *node);
+extern void range_tree_remove(struct range_tree_root *root,
+						struct range_tree_node *node);
+#endif
+
+
diff --git a/lib/Makefile b/lib/Makefile
index 18515f0..f43ef0d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -12,7 +12,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o show_mem.o \
-	 is_single_threaded.o plist.o decompress.o
+	 is_single_threaded.o plist.o decompress.o rangetree.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/rangetree.c b/lib/rangetree.c
new file mode 100644
index 0000000..08185bc
--- /dev/null
+++ b/lib/rangetree.c
@@ -0,0 +1,128 @@
+#include <linux/rangetree.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+
+
+/**
+ * range_tree_in_range - Returns the earliest node that overlaps with the
+ *                       given range
+ * @root: range_tree root
+ * @start: range start
+ * @end: range end
+ *
+ */
+struct range_tree_node *range_tree_in_range(struct range_tree_root *root,
+						u64 start, u64 end)
+{
+	struct rb_node *p = root->head.rb_node;
+	struct range_tree_node *candidate, *match = NULL;
+
+	while (p) {
+		candidate = rb_entry(p, struct range_tree_node, rb);
+		if (end < candidate->start)
+			p = p->rb_left;
+		else if (start > candidate->end)
+			p = p->rb_right;
+		else {
+			/* We found one, but try to find an earlier match */
+			match = candidate;
+			p = p->rb_left;
+		}
+	}
+
+	return match;
+}
+
+
+/**
+ * range_tree_in_range_adjacent - Returns the first node that overlaps or
+ *                                is adjacent with the given range.
+ * @root: range_tree root
+ * @start: range start
+ * @end: range end
+ *
+ */
+struct range_tree_node *range_tree_in_range_adjacent(
+					struct range_tree_root *root,
+					u64 start, u64 end)
+{
+	struct rb_node *p = root->head.rb_node;
+	struct range_tree_node *candidate;
+
+	while (p) {
+		candidate = rb_entry(p, struct range_tree_node, rb);
+		if (end+1 < candidate->start)
+			p = p->rb_left;
+		else if (start > candidate->end + 1)
+			p = p->rb_right;
+		else
+			return candidate;
+	}
+	return NULL;
+}
+
+struct range_tree_node *range_tree_next_in_range(struct range_tree_node *node,
+							u64 start, u64 end)
+{
+	struct rb_node *next;
+	struct range_tree_node *candidate;
+	if (!node)
+		return NULL;
+	next = rb_next(&node->rb);
+	if (!next)
+		return NULL;
+
+	candidate = container_of(next, struct range_tree_node, rb);
+
+	if ((candidate->start > end) || (candidate->end < start))
+		return NULL;
+
+	return candidate;
+}
+
+/**
+ * range_tree_add - Add a node to a range tree
+ * @root: range tree to be added to
+ * @node: range_tree_node to be added
+ *
+ * Adds a node to the range tree.
+ */
+void range_tree_add(struct range_tree_root *root,
+					struct range_tree_node *node)
+{
+	struct rb_node **p = &root->head.rb_node;
+	struct rb_node *parent = NULL;
+	struct range_tree_node *ptr;
+
+	WARN_ON_ONCE(!RB_EMPTY_NODE(&node->rb));
+
+	while (*p) {
+		parent = *p;
+		ptr = rb_entry(parent, struct range_tree_node, rb);
+		if (node->start < ptr->start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&node->rb, parent, p);
+	rb_insert_color(&node->rb, &root->head);
+
+}
+
+
+/**
+ * range_tree_remove: Removes a given node from the tree
+ * @root: root of tree
+ * @node: Node to be removed
+ *
+ * Removes a node from the range tree
+ */
+void range_tree_remove(struct range_tree_root *root,
+						struct range_tree_node *node)
+{
+	WARN_ON_ONCE(RB_EMPTY_NODE(&node->rb));
+
+	rb_erase(&node->rb, &root->head);
+	RB_CLEAR_NODE(&node->rb);
+}
-- 
1.7.3.2.146.gca209


^ permalink raw reply related	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2012-04-14  1:08 UTC | newest]

Thread overview: 15+ messages:
2012-04-07  0:08 [PATCH 0/2] [RFC] Volatile Ranges (v6) John Stultz
2012-04-07  0:08 ` [PATCH 1/2] [RFC] Range tree implementation John Stultz
2012-04-07 17:36   ` Sasha Levin
2012-04-09 18:04     ` John Stultz
2012-04-09 18:44       ` Sasha Levin
2012-04-07  0:08 ` [PATCH 2/2] [RFC] fadvise: Add _VOLATILE,_ISVOLATILE, and _NONVOLATILE flags John Stultz
2012-04-07  8:14 ` [PATCH 0/2] [RFC] Volatile Ranges (v6) Dmitry Adamushko
2012-04-09 17:56   ` John Stultz
  -- strict thread matches above, loose matches on Subject: below --
2012-04-14  1:07 [PATCH 0/2][RFC] Volatile Ranges (v7) John Stultz
2012-04-14  1:08 ` [PATCH 1/2] [RFC] Range tree implementation John Stultz
2012-03-21  4:15 [PATCH 0/2] [RFC] fadivse volatile & range tree (v5) John Stultz
2012-03-21  4:15 ` [PATCH 1/2] [RFC] Range tree implementation John Stultz
2012-03-16 22:51 [PATCH 0/2] [RFC] Volatile ranges (v4) John Stultz
2012-03-16 22:51 ` [PATCH 1/2] [RFC] Range tree implementation John Stultz
2012-03-20 10:00   ` Dmitry Adamushko
2012-03-20 18:04     ` John Stultz
2012-03-20 16:44   ` Aneesh Kumar K.V
2012-02-10  0:16 John Stultz
