* [PATCH v2 1/5] module: Wait for RCU synchronizing before releasing a module
2014-10-23 19:26 [PATCH v2 0/5] module: Remove stop_machine from module unloading Masami Hiramatsu
@ 2014-10-23 19:26 ` Masami Hiramatsu
2014-10-23 19:26 ` [PATCH v2 2/5] module: Unlink module with RCU synchronizing instead of stop_machine Masami Hiramatsu
` (3 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: Masami Hiramatsu @ 2014-10-23 19:26 UTC (permalink / raw)
To: Rusty Russell; +Cc: Lucas De Marchi, Linux Kernel Mailing List, Josh Poimboeuf
Wait for RCU synchronizing on failure path of module loading
before releasing struct module, because the memory of mod->list
can still be accessed by list walkers (e.g. kallsyms).
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
---
kernel/module.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/kernel/module.c b/kernel/module.c
index 88cec1d..331b03f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3326,6 +3326,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
wake_up_all(&module_wq);
+ /* Wait for RCU synchronizing before releasing mod->list. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
module_deallocate(mod, info);
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v2 2/5] module: Unlink module with RCU synchronizing instead of stop_machine
2014-10-23 19:26 [PATCH v2 0/5] module: Remove stop_machine from module unloading Masami Hiramatsu
2014-10-23 19:26 ` [PATCH v2 1/5] module: Wait for RCU synchronizing before releasing a module Masami Hiramatsu
@ 2014-10-23 19:26 ` Masami Hiramatsu
2014-10-23 19:26 ` [PATCH v2 3/5] lib/bug: Use RCU list ops for module_bug_list Masami Hiramatsu
` (2 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: Masami Hiramatsu @ 2014-10-23 19:26 UTC (permalink / raw)
To: Rusty Russell; +Cc: Lucas De Marchi, Linux Kernel Mailing List, Josh Poimboeuf
Unlink module from module list with RCU synchronizing instead
of using stop_machine(). Since module list is already protected
by rcu, we don't need stop_machine() anymore.
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
---
kernel/module.c | 18 +++++-------------
1 file changed, 5 insertions(+), 13 deletions(-)
diff --git a/kernel/module.c b/kernel/module.c
index 331b03f..bed608b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1697,18 +1697,6 @@ static void mod_sysfs_teardown(struct module *mod)
mod_sysfs_fini(mod);
}
-/*
- * unlink the module with the whole machine is stopped with interrupts off
- * - this defends against kallsyms not taking locks
- */
-static int __unlink_module(void *_mod)
-{
- struct module *mod = _mod;
- list_del(&mod->list);
- module_bug_cleanup(mod);
- return 0;
-}
-
#ifdef CONFIG_DEBUG_SET_MODULE_RONX
/*
* LKM RO/NX protection: protect module's text/ro-data
@@ -1860,7 +1848,11 @@ static void free_module(struct module *mod)
/* Now we can delete it from the lists */
mutex_lock(&module_mutex);
- stop_machine(__unlink_module, mod, NULL);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ /* Wait for RCU synchronizing before releasing mod->list. */
+ synchronize_rcu();
+ module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
/* This may be NULL, but that's OK */
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v2 3/5] lib/bug: Use RCU list ops for module_bug_list
2014-10-23 19:26 [PATCH v2 0/5] module: Remove stop_machine from module unloading Masami Hiramatsu
2014-10-23 19:26 ` [PATCH v2 1/5] module: Wait for RCU synchronizing before releasing a module Masami Hiramatsu
2014-10-23 19:26 ` [PATCH v2 2/5] module: Unlink module with RCU synchronizing instead of stop_machine Masami Hiramatsu
@ 2014-10-23 19:26 ` Masami Hiramatsu
2014-10-23 19:27 ` [PATCH v2 4/5] module: Replace module_ref with atomic_t refcnt Masami Hiramatsu
2014-10-23 19:27 ` [PATCH v2 5/5] module: Remove stop_machine from module unloading Masami Hiramatsu
4 siblings, 0 replies; 7+ messages in thread
From: Masami Hiramatsu @ 2014-10-23 19:26 UTC (permalink / raw)
To: Rusty Russell; +Cc: Lucas De Marchi, Linux Kernel Mailing List, Josh Poimboeuf
Actually since module_bug_list should be used in BUG context,
we may not need this. But for someone who want to use this
from normal context, this makes module_bug_list an RCU list.
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
---
kernel/module.c | 5 +++--
lib/bug.c | 20 ++++++++++++++------
2 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/kernel/module.c b/kernel/module.c
index bed608b..d596a30 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1850,9 +1850,10 @@ static void free_module(struct module *mod)
mutex_lock(&module_mutex);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
- /* Wait for RCU synchronizing before releasing mod->list. */
- synchronize_rcu();
+ /* Remove this module from bug list, this uses list_del_rcu */
module_bug_cleanup(mod);
+ /* Wait for RCU synchronizing before releasing mod->list and buglist. */
+ synchronize_rcu();
mutex_unlock(&module_mutex);
/* This may be NULL, but that's OK */
diff --git a/lib/bug.c b/lib/bug.c
index d1d7c78..0c3bd95 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -64,16 +64,22 @@ static LIST_HEAD(module_bug_list);
static const struct bug_entry *module_find_bug(unsigned long bugaddr)
{
struct module *mod;
+ const struct bug_entry *bug = NULL;
- list_for_each_entry(mod, &module_bug_list, bug_list) {
- const struct bug_entry *bug = mod->bug_table;
+ rcu_read_lock();
+ list_for_each_entry_rcu(mod, &module_bug_list, bug_list) {
unsigned i;
+ bug = mod->bug_table;
for (i = 0; i < mod->num_bugs; ++i, ++bug)
if (bugaddr == bug_addr(bug))
- return bug;
+ goto out;
}
- return NULL;
+ bug = NULL;
+out:
+ rcu_read_unlock();
+
+ return bug;
}
void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
@@ -99,13 +105,15 @@ void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
* Strictly speaking this should have a spinlock to protect against
* traversals, but since we only traverse on BUG()s, a spinlock
* could potentially lead to deadlock and thus be counter-productive.
+ * Thus, this uses RCU to safely manipulate the bug list, since BUG
+ * must run in non-interruptive state.
*/
- list_add(&mod->bug_list, &module_bug_list);
+ list_add_rcu(&mod->bug_list, &module_bug_list);
}
void module_bug_cleanup(struct module *mod)
{
- list_del(&mod->bug_list);
+ list_del_rcu(&mod->bug_list);
}
#else
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v2 4/5] module: Replace module_ref with atomic_t refcnt
2014-10-23 19:26 [PATCH v2 0/5] module: Remove stop_machine from module unloading Masami Hiramatsu
` (2 preceding siblings ...)
2014-10-23 19:26 ` [PATCH v2 3/5] lib/bug: Use RCU list ops for module_bug_list Masami Hiramatsu
@ 2014-10-23 19:27 ` Masami Hiramatsu
2014-10-23 19:27 ` [PATCH v2 5/5] module: Remove stop_machine from module unloading Masami Hiramatsu
4 siblings, 0 replies; 7+ messages in thread
From: Masami Hiramatsu @ 2014-10-23 19:27 UTC (permalink / raw)
To: Rusty Russell; +Cc: Lucas De Marchi, Linux Kernel Mailing List, Josh Poimboeuf
Replace module_ref per-cpu complex reference counter with
an atomic_t simple refcnt. This is for code simplification.
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
---
include/linux/module.h | 16 +---------------
include/trace/events/module.h | 2 +-
kernel/module.c | 39 +++++----------------------------------
3 files changed, 7 insertions(+), 50 deletions(-)
diff --git a/include/linux/module.h b/include/linux/module.h
index 71f282a..ebfb0e1 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -210,20 +210,6 @@ enum module_state {
MODULE_STATE_UNFORMED, /* Still setting it up. */
};
-/**
- * struct module_ref - per cpu module reference counts
- * @incs: number of module get on this cpu
- * @decs: number of module put on this cpu
- *
- * We force an alignment on 8 or 16 bytes, so that alloc_percpu()
- * put @incs/@decs in same cache line, with no extra memory cost,
- * since alloc_percpu() is fine grained.
- */
-struct module_ref {
- unsigned long incs;
- unsigned long decs;
-} __attribute((aligned(2 * sizeof(unsigned long))));
-
struct module {
enum module_state state;
@@ -367,7 +353,7 @@ struct module {
/* Destruction function. */
void (*exit)(void);
- struct module_ref __percpu *refptr;
+ atomic_t refcnt;
#endif
#ifdef CONFIG_CONSTRUCTORS
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index 7c5cbfe..81c4c18 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -80,7 +80,7 @@ DECLARE_EVENT_CLASS(module_refcnt,
TP_fast_assign(
__entry->ip = ip;
- __entry->refcnt = __this_cpu_read(mod->refptr->incs) - __this_cpu_read(mod->refptr->decs);
+ __entry->refcnt = atomic_read(&mod->refcnt);
__assign_str(name, mod->name);
),
diff --git a/kernel/module.c b/kernel/module.c
index d596a30..b1d485d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -631,15 +631,11 @@ EXPORT_TRACEPOINT_SYMBOL(module_get);
/* Init the unload section of the module. */
static int module_unload_init(struct module *mod)
{
- mod->refptr = alloc_percpu(struct module_ref);
- if (!mod->refptr)
- return -ENOMEM;
-
INIT_LIST_HEAD(&mod->source_list);
INIT_LIST_HEAD(&mod->target_list);
/* Hold reference count during initialization. */
- raw_cpu_write(mod->refptr->incs, 1);
+ atomic_set(&mod->refcnt, 1);
return 0;
}
@@ -721,8 +717,6 @@ static void module_unload_free(struct module *mod)
kfree(use);
}
mutex_unlock(&module_mutex);
-
- free_percpu(mod->refptr);
}
#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -772,28 +766,7 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
unsigned long module_refcount(struct module *mod)
{
- unsigned long incs = 0, decs = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- decs += per_cpu_ptr(mod->refptr, cpu)->decs;
- /*
- * ensure the incs are added up after the decs.
- * module_put ensures incs are visible before decs with smp_wmb.
- *
- * This 2-count scheme avoids the situation where the refcount
- * for CPU0 is read, then CPU0 increments the module refcount,
- * then CPU1 drops that refcount, then the refcount for CPU1 is
- * read. We would record a decrement but not its corresponding
- * increment so we would see a low count (disaster).
- *
- * Rare situation? But module_refcount can be preempted, and we
- * might be tallying up 4096+ CPUs. So it is not impossible.
- */
- smp_rmb();
- for_each_possible_cpu(cpu)
- incs += per_cpu_ptr(mod->refptr, cpu)->incs;
- return incs - decs;
+ return (unsigned long)atomic_read(&mod->refcnt);
}
EXPORT_SYMBOL(module_refcount);
@@ -935,7 +908,7 @@ void __module_get(struct module *module)
{
if (module) {
preempt_disable();
- __this_cpu_inc(module->refptr->incs);
+ atomic_inc(&module->refcnt);
trace_module_get(module, _RET_IP_);
preempt_enable();
}
@@ -950,7 +923,7 @@ bool try_module_get(struct module *module)
preempt_disable();
if (likely(module_is_live(module))) {
- __this_cpu_inc(module->refptr->incs);
+ atomic_inc(&module->refcnt);
trace_module_get(module, _RET_IP_);
} else
ret = false;
@@ -965,9 +938,7 @@ void module_put(struct module *module)
{
if (module) {
preempt_disable();
- smp_wmb(); /* see comment in module_refcount */
- __this_cpu_inc(module->refptr->decs);
-
+ atomic_dec(&module->refcnt);
trace_module_put(module, _RET_IP_);
preempt_enable();
}
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v2 5/5] module: Remove stop_machine from module unloading
2014-10-23 19:26 [PATCH v2 0/5] module: Remove stop_machine from module unloading Masami Hiramatsu
` (3 preceding siblings ...)
2014-10-23 19:27 ` [PATCH v2 4/5] module: Replace module_ref with atomic_t refcnt Masami Hiramatsu
@ 2014-10-23 19:27 ` Masami Hiramatsu
2014-10-28 1:20 ` Rusty Russell
4 siblings, 1 reply; 7+ messages in thread
From: Masami Hiramatsu @ 2014-10-23 19:27 UTC (permalink / raw)
To: Rusty Russell; +Cc: Lucas De Marchi, Linux Kernel Mailing List, Josh Poimboeuf
Remove stop_machine from module unloading by adding new reference
counting algorithm.
This atomic refcounter works like a semaphore, it can get (be
incremented) only when the counter is not 0. When loading a module,
kmodule subsystem sets the counter MODULE_REF_BASE (= 1). And when
unloading the module, it subtracts MODULE_REF_BASE from the counter.
If no one refers the module, the refcounter becomes 0 and we can
remove the module safely. If someone referes it, we try to recover
the counter by adding MODULE_REF_BASE unless the counter becomes 0,
because the referrer can put the module right before recovering.
If the recovering is failed, we can get the 0 refcount and it
never be incremented again, it can be removed safely too.
Note that __module_get() forcibly gets the module refcounter,
users should use try_module_get() instead of that.
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
---
kernel/module.c | 67 ++++++++++++++++++++++++++++++++-----------------------
1 file changed, 39 insertions(+), 28 deletions(-)
diff --git a/kernel/module.c b/kernel/module.c
index b1d485d..e772595 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
#include <linux/vermagic.h>
#include <linux/notifier.h>
#include <linux/sched.h>
-#include <linux/stop_machine.h>
#include <linux/device.h>
#include <linux/string.h>
#include <linux/mutex.h>
@@ -98,7 +97,7 @@
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
* 3) module_addr_min/module_addr_max.
- * (delete uses stop_machine/add uses RCU list operations). */
+ * (delete and add uses RCU list operations). */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
@@ -628,14 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
EXPORT_TRACEPOINT_SYMBOL(module_get);
+/* MODULE_REF_BASE is the base reference count by kmodule loader. */
+#define MODULE_REF_BASE 1
+
/* Init the unload section of the module. */
static int module_unload_init(struct module *mod)
{
+ /*
+ * Initialize reference counter to MODULE_REF_BASE.
+ * refcnt == 0 means module is going.
+ */
+ atomic_set(&mod->refcnt, MODULE_REF_BASE);
+
INIT_LIST_HEAD(&mod->source_list);
INIT_LIST_HEAD(&mod->target_list);
/* Hold reference count during initialization. */
- atomic_set(&mod->refcnt, 1);
+ atomic_inc(&mod->refcnt);
return 0;
}
@@ -734,39 +742,39 @@ static inline int try_force_unload(unsigned int flags)
}
#endif /* CONFIG_MODULE_FORCE_UNLOAD */
-struct stopref
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
{
- struct module *mod;
- int flags;
- int *forced;
-};
+ int ret;
-/* Whole machine is stopped with interrupts off when this runs. */
-static int __try_stop_module(void *_sref)
-{
- struct stopref *sref = _sref;
+ /* Try to decrement refcnt which we set at loading */
+ ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+ BUG_ON(ret < 0);
+ if (ret)
+ /* Someone can put this right now, recover with checking */
+ ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0);
+
+ return ret;
+}
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
/* If it's not unused, quit unless we're forcing. */
- if (module_refcount(sref->mod) != 0) {
- if (!(*sref->forced = try_force_unload(sref->flags)))
+ if (try_release_module_ref(mod) != 0) {
+ *forced = try_force_unload(flags);
+ if (!(*forced))
return -EWOULDBLOCK;
}
/* Mark it as dying. */
- sref->mod->state = MODULE_STATE_GOING;
- return 0;
-}
-
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
- struct stopref sref = { mod, flags, forced };
+ mod->state = MODULE_STATE_GOING;
- return stop_machine(__try_stop_module, &sref, NULL);
+ return 0;
}
unsigned long module_refcount(struct module *mod)
{
- return (unsigned long)atomic_read(&mod->refcnt);
+ return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE;
}
EXPORT_SYMBOL(module_refcount);
@@ -921,11 +929,11 @@ bool try_module_get(struct module *module)
if (module) {
preempt_disable();
-
- if (likely(module_is_live(module))) {
- atomic_inc(&module->refcnt);
+ /* Note: here, we can fail to get a reference */
+ if (likely(module_is_live(module) &&
+ atomic_inc_not_zero(&module->refcnt) != 0))
trace_module_get(module, _RET_IP_);
- } else
+ else
ret = false;
preempt_enable();
@@ -936,9 +944,12 @@ EXPORT_SYMBOL(try_module_get);
void module_put(struct module *module)
{
+ int ret;
+
if (module) {
preempt_disable();
- atomic_dec(&module->refcnt);
+ ret = atomic_dec_if_positive(&module->refcnt);
+ WARN_ON(ret < 0); /* Failed to put refcount */
trace_module_put(module, _RET_IP_);
preempt_enable();
}
^ permalink raw reply related [flat|nested] 7+ messages in thread