All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andi Kleen <ak@linux.intel.com>, Andi Kleen <andi@firstfloor.org>,
	x86@kernel.org, linux-kernel@vger.kernel.org, oleg@redhat.com,
	paulmck@linux.vnet.ibm.com, rusty@rustcorp.com.au,
	mingo@kernel.org
Subject: Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
Date: Thu, 26 Feb 2015 17:43:56 +0100	[thread overview]
Message-ID: <20150226164356.GU21418@twins.programming.kicks-ass.net> (raw)
In-Reply-To: <2127583772.183982.1424966563927.JavaMail.zimbra@efficios.com>

On Thu, Feb 26, 2015 at 04:02:43PM +0000, Mathieu Desnoyers wrote:
> Perhaps you could use mod_value() below, and introduce a
> "mod_size()" too. This would keep the init vs core selection
> out of the traversal code.

Indeed!

> Is it customary to define static variables in the
> middle of a C file ?

Dunno, it seemed like a good a place as any.

> The rest looks good, especially for use of the latch.
> I'd be tempted to turn "0, 1, 2, 3" into an enum though,
> so we can follow in the code what each of those array
> entry really means.

Agreed.

---
Subject: module: Optimize __module_address() using a latched RB-tree
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Feb 26 10:57:34 CET 2015

Currently __module_address() is using a linear search through all
modules in order to find the module corresponding to the provided
address. With a lot of modules this can take a lot of time.

One of the users of this is __kernel_text_address() which is employed
in many stack unwinders; which in turn are used by perf-callchain and
ftrace (possibly from NMI context).

So by optimizing __module_address() we optimize many stack unwinders
which are used by both perf and tracing in performance sensitive code.

Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/module.h |   19 +++-
 kernel/module.c        |  212 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 224 insertions(+), 7 deletions(-)

--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -17,6 +17,7 @@
 #include <linux/moduleparam.h>
 #include <linux/jump_label.h>
 #include <linux/export.h>
+#include <linux/rbtree.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -210,6 +211,11 @@ enum module_state {
 	MODULE_STATE_UNFORMED,	/* Still setting it up. */
 };
 
+struct module_node {
+	struct module *mod;
+	struct rb_node node;
+};
+
 struct module {
 	enum module_state state;
 
@@ -269,8 +275,15 @@ struct module {
 	/* Startup function. */
 	int (*init)(void);
 
-	/* If this is non-NULL, vfree after init() returns */
-	void *module_init;
+	/*
+	 * If this is non-NULL, vfree after init() returns
+	 *
+	 * cacheline align here, such that:
+	 *   module_init, module_core, init_size, core_size and
+	 *   tree_node[0]
+	 * are on the same cacheline.
+	 */
+	void *module_init	____cacheline_aligned;
 
 	/* Here is the actual code + data, vfree'd on unload. */
 	void *module_core;
@@ -281,6 +294,8 @@ struct module {
 	/* The size of the executable code in each section.  */
 	unsigned int init_text_size, core_text_size;
 
+	struct module_node tree_node[4];
+
 	/* Size of RO sections of the module (text+rodata) */
 	unsigned int init_ro_size, core_ro_size;
 
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -102,6 +102,204 @@
 DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
+
+
+/*
+ * Use a latched RB-tree for __module_address()
+ *
+ * The latch concept is a multi-value concurrency data structure which allows
+ * unserialized access and guarantees at least one version is stable.
+ *
+ * It is employed here to optimize __module_address(), which needs to find the
+ * module (if any) associated with an address. Such questions are best answered
+ * using a binary search tree.
+ *
+ * Since modules use non-overlapping memory ranges we can use a regular RB-tree
+ * (as opposed to the interval tree). However, BSTs cannot be iterated while
+ * modified.
+ *
+ * To overcome this we use the latched RB-tree, it basically consists of two
+ * RB-trees which are modified in order, ensuring one is always consistent.
+ *
+ * Things are somewhat more complicated because there are two ranges per
+ * module, but other than that its pretty straight forward.
+ *
+ */
+
+enum {
+	latch0_core = 0,
+	latch1_core = 1,
+	latch0_init = 2,
+	latch1_init = 3,
+};
+
+enum {
+	latch_bit = 0x01,
+	init_bit  = 0x02,
+};
+
+struct latch_tree_root {
+	seqcount_t	seq;
+	struct rb_root	tree[2];
+};
+
+static unsigned long mod_value(struct module *mod, int idx)
+{
+	if (idx & init_bit)
+		return (unsigned long)mod->module_init;
+	else
+		return (unsigned long)mod->module_core;
+}
+
+static unsigned long mod_size(struct module *mod, int idx)
+{
+	if (idx & init_bit)
+		return mod->init_size;
+	else
+		return mod->core_size;
+}
+
+static struct module *mod_entry(struct rb_node *n)
+{
+	struct module_node *mn = container_of(n, struct module_node, node);
+	return mn->mod;
+}
+
+static int mod_node_idx(struct module *m, struct rb_node *n)
+{
+	struct module_node *mn = container_of(n, struct module_node, node);
+	int idx = mn - m->tree_node;
+
+	BUG_ON(mn->mod != m);
+	BUG_ON((unsigned)idx > 3);
+
+	return idx;
+}
+
+static void __tree_insert(struct latch_tree_root *mod_tree, struct module *mod, int idx)
+{
+	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
+	struct module_node *mn = &mod->tree_node[idx];
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+	unsigned long mod_val, m_val;
+	struct module *m;
+	int i;
+
+	mn->mod = mod;
+	mod_val = mod_value(mod, idx);
+
+	while (*link) {
+		parent = *link;
+		m = mod_entry(parent);
+		i = mod_node_idx(m, parent);
+		m_val = mod_value(m, i);
+
+		if (mod_val < m_val)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&mn->node, parent, link);
+	rb_insert_color(&mn->node, root);
+}
+
+static void __tree_remove(struct latch_tree_root *mod_tree, struct module *mod, int idx)
+{
+	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
+	struct module_node *mn = &mod->tree_node[idx];
+
+	rb_erase(&mn->node, root);
+}
+
+/*
+ * struct module is arranged such that:
+ *
+ *   module_init, module_core, init_size, core_size,
+ *   init_text_size, core_text_size and tree_node[0]
+ *
+ * are on the same cacheline, therefore if the below iteration is
+ * on latch0 and all module init has completed, we'll only hit
+ * tree_node[0] and every intermediate level will hit only a single
+ * cacheline.
+ *
+ * Furthermore, by ensuring init_text_size and core_text_size are
+ * also in this same cacheline we've made sure is_module_text_address()
+ * will also not require additional lines.
+ */
+static struct module *__tree_find(struct rb_root *r, unsigned long addr)
+{
+	struct rb_node *n = r->rb_node;
+	unsigned long m_val, m_size;
+
+	while (n) {
+		struct module *m = mod_entry(n);
+		int idx = mod_node_idx(m, n);
+
+		m_val = mod_value(m, idx);
+		m_size = mod_size(m, idx);
+
+		if (addr < m_val)
+			n = n->rb_left;
+		else if (addr >= m_val + m_size)
+			n = n->rb_right;
+		else
+			return m;
+	}
+
+	return NULL;
+}
+
+static struct latch_tree_root mod_tree;
+
+static void mod_tree_insert(struct module *mod)
+{
+	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_insert(&mod_tree, mod, latch0_core);
+	if (mod->init_size)
+		__tree_insert(&mod_tree, mod, latch0_init);
+	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_insert(&mod_tree, mod, latch1_core);
+	if (mod->init_size)
+		__tree_insert(&mod_tree, mod, latch1_init);
+}
+
+static void mod_tree_remove_init(struct module *mod)
+{
+	raw_write_seqcount_latch(&mod_tree.seq);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, latch0_init);
+	raw_write_seqcount_latch(&mod_tree.seq);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, latch1_init);
+}
+
+static void mod_tree_remove(struct module *mod)
+{
+	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_remove(&mod_tree, mod, latch0_core);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, latch0_init);
+	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_remove(&mod_tree, mod, latch1_core);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, latch1_init);
+}
+
+static struct module *mod_tree_find(unsigned long addr)
+{
+	struct module *m;
+	unsigned int seq;
+
+	do {
+		seq = raw_read_seqcount(&mod_tree.seq);
+		m = __tree_find(&mod_tree.tree[seq & latch_bit], addr);
+	} while (read_seqcount_retry(&mod_tree.seq, seq));
+
+	return m;
+}
+
 #ifdef CONFIG_KGDB_KDB
 struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
 #endif /* CONFIG_KGDB_KDB */
@@ -1854,6 +2052,7 @@ static void free_module(struct module *m
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
 	/* Remove this module from bug list, this uses list_del_rcu */
 	module_bug_cleanup(mod);
 	/* Wait for RCU synchronizing before releasing mod->list and buglist. */
@@ -3098,6 +3297,7 @@ static noinline int do_init_module(struc
 	mod->symtab = mod->core_symtab;
 	mod->strtab = mod->core_strtab;
 #endif
+	mod_tree_remove_init(mod);
 	unset_module_init_ro_nx(mod);
 	module_arch_freeing_init(mod);
 	mod->module_init = NULL;
@@ -3168,6 +3368,7 @@ static int add_unformed_module(struct mo
 		goto out;
 	}
 	list_add_rcu(&mod->list, &modules);
+	mod_tree_insert(mod);
 	err = 0;
 
 out:
@@ -3367,6 +3568,7 @@ static int load_module(struct load_info
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
 	wake_up_all(&module_wq);
 	/* Wait for RCU synchronizing before releasing mod->list. */
 	synchronize_rcu();
@@ -3810,13 +4012,13 @@ struct module *__module_address(unsigned
 	if (addr < module_addr_min || addr > module_addr_max)
 		return NULL;
 
-	list_for_each_entry_rcu(mod, &modules, list) {
+	mod = mod_tree_find(addr);
+	if (mod) {
+		BUG_ON(!within_module(addr, mod));
 		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module(addr, mod))
-			return mod;
+			mod = NULL;
 	}
-	return NULL;
+	return mod;
 }
 EXPORT_SYMBOL_GPL(__module_address);
 

  reply	other threads:[~2015-02-26 16:44 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-02-21  1:38 [PATCH 1/3] x86: Move msr accesses out of line Andi Kleen
2015-02-21  1:38 ` [PATCH 2/3] x86: Add trace point for MSR accesses Andi Kleen
2015-02-21  1:38 ` [PATCH 3/3] perf, x86: Remove old MSR perf tracing code Andi Kleen
2015-02-23 17:04 ` [PATCH 1/3] x86: Move msr accesses out of line Peter Zijlstra
2015-02-23 17:43   ` Andi Kleen
2015-02-25 12:27     ` Peter Zijlstra
2015-02-25 18:20       ` Andi Kleen
2015-02-25 18:34         ` Borislav Petkov
2015-02-26 11:43     ` [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree Peter Zijlstra
2015-02-26 12:00       ` Ingo Molnar
2015-02-26 14:12       ` Peter Zijlstra
2015-02-27 11:51         ` Rusty Russell
2015-02-26 16:02       ` Mathieu Desnoyers
2015-02-26 16:43         ` Peter Zijlstra [this message]
2015-02-26 16:55           ` Mathieu Desnoyers
2015-02-26 17:16             ` Peter Zijlstra
2015-02-26 17:22             ` Peter Zijlstra
2015-02-26 18:28           ` Paul E. McKenney
2015-02-26 19:06             ` Mathieu Desnoyers
2015-02-26 19:13             ` Peter Zijlstra
2015-02-26 19:41               ` Paul E. McKenney
2015-02-26 19:45                 ` Peter Zijlstra
2015-02-26 22:32                   ` Peter Zijlstra
2015-02-26 20:52                 ` Andi Kleen
2015-02-26 22:36                   ` Peter Zijlstra
2015-02-27 10:01                 ` Peter Zijlstra
2015-02-28 23:30                   ` Paul E. McKenney
2015-02-28 16:41               ` Peter Zijlstra
2015-02-28 16:56                 ` Peter Zijlstra
2015-02-28 23:32                   ` Paul E. McKenney
2015-03-02  9:24                     ` Peter Zijlstra
2015-03-02 16:58                       ` Paul E. McKenney
2015-02-27 12:02       ` Rusty Russell
2015-02-27 14:30         ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150226164356.GU21418@twins.programming.kicks-ass.net \
    --to=peterz@infradead.org \
    --cc=ak@linux.intel.com \
    --cc=andi@firstfloor.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mingo@kernel.org \
    --cc=oleg@redhat.com \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=rusty@rustcorp.com.au \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.