linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: "Theodore Ts'o" <tytso@mit.edu>,
	Andreas Dilger <adilger.kernel@dilger.ca>,
	Jan Kara <jack@suse.com>,
	Andrew Morton <akpm@linux-foundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>,
	Hugh Dickins <hughd@google.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Dave Hansen <dave.hansen@intel.com>,
	Vlastimil Babka <vbabka@suse.cz>,
	Matthew Wilcox <willy@infradead.org>,
	Ross Zwisler <ross.zwisler@linux.intel.com>,
	linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-block@vger.kernel.org,
	Matthew Wilcox <willy@linux.intel.com>,
	"Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCHv4 03/43] radix-tree: Add radix_tree_join
Date: Tue, 25 Oct 2016 03:13:02 +0300	[thread overview]
Message-ID: <20161025001342.76126-4-kirill.shutemov@linux.intel.com> (raw)
In-Reply-To: <20161025001342.76126-1-kirill.shutemov@linux.intel.com>

From: Matthew Wilcox <willy@linux.intel.com>

This new function allows for the replacement of many smaller entries in
the radix tree with one larger multiorder entry.  From the point of view
of an RCU walker, they may see a mixture of the smaller entries and the
large entry during the same walk, but they will never see NULL for an
index which was populated before the join.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 include/linux/radix-tree.h            |   2 +
 lib/radix-tree.c                      | 159 +++++++++++++++++++++++++++-------
 tools/testing/radix-tree/multiorder.c |  32 +++++++
 3 files changed, 163 insertions(+), 30 deletions(-)

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index af3581b8a451..1efd81f21241 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -319,6 +319,8 @@ static inline void radix_tree_preload_end(void)
 	preempt_enable();
 }
 
+int radix_tree_join(struct radix_tree_root *, unsigned long index,
+			unsigned new_order, void *);
 /**
  * struct radix_tree_iter - radix tree iterator state
  *
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 8e6d552c40dd..6a76252c93a6 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -314,18 +314,14 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
 {
 	struct radix_tree_node *node =
 			container_of(head, struct radix_tree_node, rcu_head);
-	int i;
 
 	/*
-	 * must only free zeroed nodes into the slab. radix_tree_shrink
-	 * can leave us with a non-NULL entry in the first slot, so clear
-	 * that here to make sure.
+	 * Must only free zeroed nodes into the slab.  We can be left with
+	 * non-NULL entries by radix_tree_free_nodes, so clear the entries
+	 * and tags here.
 	 */
-	for (i = 0; i < RADIX_TREE_MAX_TAGS; i++)
-		tag_clear(node, i, 0);
-
-	node->slots[0] = NULL;
-	node->count = 0;
+	memset(node->slots, 0, sizeof(node->slots));
+	memset(node->tags, 0, sizeof(node->tags));
 
 	kmem_cache_free(radix_tree_node_cachep, node);
 }
@@ -563,14 +559,14 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 	shift = radix_tree_load_root(root, &child, &maxindex);
 
 	/* Make sure the tree is high enough.  */
+	if (order > 0 && max == ((1UL << order) - 1))
+		max++;
 	if (max > maxindex) {
 		int error = radix_tree_extend(root, max, shift);
 		if (error < 0)
 			return error;
 		shift = error;
 		child = root->rnode;
-		if (order == shift)
-			shift += RADIX_TREE_MAP_SHIFT;
 	}
 
 	while (shift > order) {
@@ -582,6 +578,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 				return -ENOMEM;
 			child->shift = shift;
 			child->offset = offset;
+			child->count = 0;
 			child->parent = node;
 			rcu_assign_pointer(*slot, node_to_entry(child));
 			if (node)
@@ -595,31 +592,113 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 		slot = &node->slots[offset];
 	}
 
+	if (nodep)
+		*nodep = node;
+	if (slotp)
+		*slotp = slot;
+	return 0;
+}
+
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
-	/* Insert pointers to the canonical entry */
-	if (order > shift) {
-		unsigned i, n = 1 << (order - shift);
+/*
+ * Free any nodes below this node.  The tree is presumed to not need
+ * shrinking, and any user data in the tree is presumed to not need a
+ * destructor called on it.  If we need to add a destructor, we can
+ * add that functionality later.  Note that we may not clear tags or
+ * slots from the tree as an RCU walker may still have a pointer into
+ * this subtree.  We could replace the entries with RADIX_TREE_RETRY,
+ * but we'll still have to clear those in rcu_free.
+ */
+static void radix_tree_free_nodes(struct radix_tree_node *node)
+{
+	unsigned offset = 0;
+	struct radix_tree_node *child = entry_to_node(node);
+
+	for (;;) {
+		void *entry = child->slots[offset];
+		if (radix_tree_is_internal_node(entry) &&
+					!is_sibling_entry(child, entry)) {
+			child = entry_to_node(entry);
+			offset = 0;
+			continue;
+		}
+		offset++;
+		while (offset == RADIX_TREE_MAP_SIZE) {
+			struct radix_tree_node *old = child;
+			offset = child->offset + 1;
+			child = child->parent;
+			radix_tree_node_free(old);
+			if (old == entry_to_node(node))
+				return;
+		}
+	}
+}
+
+static inline int insert_entries(struct radix_tree_node *node, void **slot,
+				void *ptr, unsigned order, bool replace)
+{
+	struct radix_tree_node *child;
+	unsigned i, n, tag, offset, tags = 0;
+
+	if (node) {
+		n = 1 << (order - node->shift);
+		offset = get_slot_offset(node, slot);
+	} else {
+		n = 1;
+		offset = 0;
+	}
+
+	if (n > 1) {
 		offset = offset & ~(n - 1);
 		slot = &node->slots[offset];
-		child = node_to_entry(slot);
-		for (i = 0; i < n; i++) {
-			if (slot[i])
+	}
+	child = node_to_entry(slot);
+
+	for (i = 0; i < n; i++) {
+		if (slot[i]) {
+			if (replace) {
+				node->count--;
+				for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+					if (tag_get(node, tag, offset + i))
+						tags |= 1 << tag;
+			} else
 				return -EEXIST;
 		}
+	}
 
-		for (i = 1; i < n; i++) {
+	for (i = 0; i < n; i++) {
+		struct radix_tree_node *old = slot[i];
+		if (i) {
 			rcu_assign_pointer(slot[i], child);
-			node->count++;
+			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+				if (tags & (1 << tag))
+					tag_clear(node, tag, offset + i);
+		} else {
+			rcu_assign_pointer(slot[i], ptr);
+			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+				if (tags & (1 << tag))
+					tag_set(node, tag, offset);
 		}
+		if (radix_tree_is_internal_node(old) &&
+					!is_sibling_entry(node, old))
+			radix_tree_free_nodes(old);
 	}
-#endif
-
-	if (nodep)
-		*nodep = node;
-	if (slotp)
-		*slotp = slot;
-	return 0;
+	if (node)
+		node->count += n;
+	return n;
+}
+#else
+static inline int insert_entries(struct radix_tree_node *node, void **slot,
+				void *ptr, unsigned order, bool replace)
+{
+	if (*slot)
+		return -EEXIST;
+	rcu_assign_pointer(*slot, ptr);
+	if (node)
+		node->count++;
+	return 1;
 }
+#endif
 
 /**
  *	__radix_tree_insert    -    insert into a radix tree
@@ -642,13 +721,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
 	error = __radix_tree_create(root, index, order, &node, &slot);
 	if (error)
 		return error;
-	if (*slot != NULL)
-		return -EEXIST;
-	rcu_assign_pointer(*slot, item);
+
+	error = insert_entries(node, slot, item, order, false);
+	if (error < 0)
+		return error;
 
 	if (node) {
 		unsigned offset = get_slot_offset(node, slot);
-		node->count++;
 		BUG_ON(tag_get(node, 0, offset));
 		BUG_ON(tag_get(node, 1, offset));
 		BUG_ON(tag_get(node, 2, offset));
@@ -746,6 +825,26 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
 }
 EXPORT_SYMBOL(radix_tree_lookup);
 
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+int radix_tree_join(struct radix_tree_root *root, unsigned long index,
+			unsigned order, void *item)
+{
+	struct radix_tree_node *node;
+	void **slot;
+	int error;
+
+	BUG_ON(radix_tree_is_internal_node(item));
+
+	error = __radix_tree_create(root, index, order, &node, &slot);
+	if (!error)
+		error = insert_entries(node, slot, item, order, true);
+	if (error > 0)
+		error = 0;
+
+	return error;
+}
+#endif
+
 /**
  *	radix_tree_tag_set - set a tag on a radix tree node
  *	@root:		radix tree root
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index 05d7bc488971..4c66acc6d0ea 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -325,6 +325,37 @@ void multiorder_tagged_iteration(void)
 	item_kill_tree(&tree);
 }
 
+static void __multiorder_join(unsigned long index,
+				unsigned order1, unsigned order2)
+{
+	unsigned long loc;
+	void *item, *item2 = item_create(index + 1);
+	RADIX_TREE(tree, GFP_KERNEL);
+
+	item_insert_order(&tree, index, order2);
+	item = radix_tree_lookup(&tree, index);
+	radix_tree_join(&tree, index + 1, order1, item2);
+	loc = radix_tree_locate_item(&tree, item);
+	if (loc == -1)
+		free(item);
+	item = radix_tree_lookup(&tree, index + 1);
+	assert(item == item2);
+	item_kill_tree(&tree);
+}
+
+static void multiorder_join(void)
+{
+	int i, j, idx;
+
+	for (idx = 0; idx < 1024; idx = idx * 2 + 3) {
+		for (i = 1; i < 15; i++) {
+			for (j = 0; j < i; j++) {
+				__multiorder_join(idx, i, j);
+			}
+		}
+	}
+}
+
 void multiorder_checks(void)
 {
 	int i;
@@ -342,4 +373,5 @@ void multiorder_checks(void)
 	multiorder_tag_tests();
 	multiorder_iteration();
 	multiorder_tagged_iteration();
+	multiorder_join();
 }
-- 
2.9.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2016-10-25  0:13 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-25  0:12 [PATCHv4 00/43] ext4: support of huge pages Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 01/43] tools: Add WARN_ON_ONCE Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 02/43] radix tree test suite: Allow GFP_ATOMIC allocations to fail Kirill A. Shutemov
2016-10-25  0:13 ` Kirill A. Shutemov [this message]
2016-10-25  0:13 ` [PATCHv4 04/43] radix-tree: Add radix_tree_split Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 05/43] radix-tree: Add radix_tree_split_preload() Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 06/43] mm, shmem: swich huge tmpfs to multi-order radix-tree entries Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 07/43] Revert "radix-tree: implement radix_tree_maybe_preload_order()" Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 08/43] page-flags: relax page flag policy for few flags Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 09/43] mm, rmap: account file thp pages Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 10/43] thp: try to free page's buffers before attempt split Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 11/43] thp: handle write-protection faults for file THP Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 12/43] truncate: make sure invalidate_mapping_pages() can discard huge pages Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 13/43] filemap: allocate huge page in page_cache_read(), if allowed Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 14/43] filemap: handle huge pages in do_generic_file_read() Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 15/43] filemap: allocate huge page in pagecache_get_page(), if allowed Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 16/43] filemap: handle huge pages in filemap_fdatawait_range() Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 17/43] HACK: readahead: alloc huge pages, if allowed Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 18/43] block: define BIO_MAX_PAGES to HPAGE_PMD_NR if huge page cache enabled Kirill A. Shutemov
2016-10-25  7:21   ` Christoph Hellwig
2016-10-25 12:54     ` Kirill A. Shutemov
2016-10-26  4:13       ` Andreas Dilger
2016-10-26  7:30         ` Ming Lei
2016-10-26  7:36           ` Christoph Hellwig
2016-10-26  7:36         ` Christoph Hellwig
2016-10-26  7:35       ` Christoph Hellwig
2016-10-25  0:13 ` [PATCHv4 19/43] brd: make it handle huge pages Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 20/43] mm: make write_cache_pages() work on " Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 21/43] thp: introduce hpage_size() and hpage_mask() Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 22/43] thp: do not threat slab pages as huge in hpage_{nr_pages,size,mask} Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 23/43] thp: make thp_get_unmapped_area() respect S_HUGE_MODE Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 24/43] fs: make block_read_full_page() be able to read huge page Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 25/43] fs: make block_write_{begin,end}() be able to handle huge pages Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 26/43] fs: make block_page_mkwrite() aware about " Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 27/43] truncate: make truncate_inode_pages_range() " Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 28/43] truncate: make invalidate_inode_pages2_range() " Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 29/43] mm, hugetlb: switch hugetlbfs to multi-order radix-tree entries Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 30/43] mm: account huge pages to dirty, writaback, reclaimable, etc Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 31/43] ext4: make ext4_mpage_readpages() hugepage-aware Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 32/43] ext4: make ext4_writepage() work on huge pages Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 33/43] ext4: handle huge pages in ext4_page_mkwrite() Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 34/43] ext4: handle huge pages in __ext4_block_zero_page_range() Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 35/43] ext4: make ext4_block_write_begin() aware about huge pages Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 36/43] ext4: handle huge pages in ext4_da_write_end() Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 37/43] ext4: make ext4_da_page_release_reservation() aware about huge pages Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 38/43] ext4: handle writeback with " Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 39/43] ext4: make EXT4_IOC_MOVE_EXT work " Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 40/43] ext4: fix SEEK_DATA/SEEK_HOLE for " Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 41/43] ext4: make fallocate() operations work with " Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 42/43] mm, fs, ext4: expand use of page_mapping() and page_to_pgoff() Kirill A. Shutemov
2016-10-25  0:13 ` [PATCHv4 43/43] ext4, vfs: add huge= mount option Kirill A. Shutemov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20161025001342.76126-4-kirill.shutemov@linux.intel.com \
    --to=kirill.shutemov@linux.intel.com \
    --cc=aarcange@redhat.com \
    --cc=adilger.kernel@dilger.ca \
    --cc=akpm@linux-foundation.org \
    --cc=dave.hansen@intel.com \
    --cc=hughd@google.com \
    --cc=jack@suse.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ross.zwisler@linux.intel.com \
    --cc=tytso@mit.edu \
    --cc=vbabka@suse.cz \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    --cc=willy@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).