From: Ingo Molnar <mingo@elte.hu>
To: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: linux-kernel@vger.kernel.org, oleg@tv-sign.ru,
dipankar@in.ibm.com, suzannew@cs.pdx.edu,
Andrew Morton <akpm@osdl.org>
Subject: Re: [PATCH] Fixes for RCU handling of task_struct
Date: Mon, 31 Oct 2005 15:04:59 +0100 [thread overview]
Message-ID: <20051031140459.GA5664@elte.hu> (raw)
In-Reply-To: <20051031020535.GA46@us.ibm.com>
* Paul E. McKenney <paulmck@us.ibm.com> wrote:
> Hello!
>
> My earlier code that applies RCU to the task list (in PREEMPT_RT) was
> missing some rcu_dereference() and rcu_assign_pointer() calls. This
> patch fixes these problems.
>
> Signed-off-by: <paulmck@us.ibm.com>
thanks, applied - will show up in -rt2. Find below the rollup of the
sighand-RCU feature in the -rt tree. Andrew, Paul, could/should we try
this in -mm?
Ingo
----
RCU signal handling: send signals RCU-read-locked instead of
tasklist_lock read-locked. This is a scalability improvement on SMP and
a preemption-latency improvement under PREEMPT_RCU.
Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -34,6 +34,7 @@
#include <linux/percpu.h>
#include <linux/topology.h>
#include <linux/seccomp.h>
+#include <linux/rcupdate.h>
#include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
@@ -408,8 +409,16 @@ struct sighand_struct {
atomic_t count;
struct k_sigaction action[_NSIG];
spinlock_t siglock;
+ struct rcu_head rcu;
};
+static inline void sighand_free(struct sighand_struct *sp)
+{
+ extern void sighand_free_cb(struct rcu_head *rhp);
+
+ call_rcu(&sp->rcu, sighand_free_cb);
+}
+
/*
* NOTE! "signal_struct" does not have it's own
* locking, because a shared signal_struct always
@@ -913,6 +922,7 @@ struct task_struct {
int cpuset_mems_generation;
#endif
atomic_t fs_excl; /* holding fs exclusive resources */
+ struct rcu_head rcu;
};
static inline pid_t process_group(struct task_struct *tsk)
@@ -936,8 +946,29 @@ static inline int pid_alive(struct task_
extern void free_task(struct task_struct *tsk);
extern void __put_task_struct(struct task_struct *tsk);
#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
-#define put_task_struct(tsk) \
-do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
+
+static inline int get_task_struct_rcu(struct task_struct *t)
+{
+ int oldusage;
+
+ do {
+ oldusage = atomic_read(&t->usage);
+ if (oldusage == 0) {
+ return 0;
+ }
+ } while (cmpxchg(&t->usage.counter,
+ oldusage, oldusage + 1) != oldusage);
+ return 1;
+}
+
+extern void __put_task_struct_cb(struct rcu_head *rhp);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+ if (atomic_dec_and_test(&t->usage)) {
+ call_rcu(&t->rcu, __put_task_struct_cb);
+ }
+}
/*
* Per process flags
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -176,6 +176,13 @@ static unsigned int task_timeslice(task_
#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
< (long long) (sd)->cache_hot_time)
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+ __put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
/*
* These are the runqueue data structures:
*/
Index: linux/kernel/signal.c
===================================================================
--- linux.orig/kernel/signal.c
+++ linux/kernel/signal.c
@@ -330,13 +330,20 @@ void __exit_sighand(struct task_struct *
/* Ok, we're done with the signal handlers */
tsk->sighand = NULL;
if (atomic_dec_and_test(&sighand->count))
- kmem_cache_free(sighand_cachep, sighand);
+ sighand_free(sighand);
}
void exit_sighand(struct task_struct *tsk)
{
write_lock_irq(&tasklist_lock);
- __exit_sighand(tsk);
+ rcu_read_lock();
+ if (tsk->sighand != NULL) {
+ struct sighand_struct *sighand = tsk->sighand;
+ spin_lock(&sighand->siglock);
+ __exit_sighand(tsk);
+ spin_unlock(&sighand->siglock);
+ }
+ rcu_read_unlock();
write_unlock_irq(&tasklist_lock);
}
@@ -352,6 +359,7 @@ void __exit_signal(struct task_struct *t
BUG();
if (!atomic_read(&sig->count))
BUG();
+ rcu_read_lock();
spin_lock(&sighand->siglock);
posix_cpu_timers_exit(tsk);
if (atomic_dec_and_test(&sig->count)) {
@@ -359,6 +367,7 @@ void __exit_signal(struct task_struct *t
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
tsk->signal = NULL;
+ __exit_sighand(tsk);
spin_unlock(&sighand->siglock);
flush_sigqueue(&sig->shared_pending);
} else {
@@ -390,9 +399,11 @@ void __exit_signal(struct task_struct *t
sig->nvcsw += tsk->nvcsw;
sig->nivcsw += tsk->nivcsw;
sig->sched_time += tsk->sched_time;
+ __exit_sighand(tsk);
spin_unlock(&sighand->siglock);
sig = NULL; /* Marker for below. */
}
+ rcu_read_unlock();
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
flush_sigqueue(&tsk->pending);
if (sig) {
@@ -1119,13 +1130,24 @@ void zap_other_threads(struct task_struc
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
unsigned long flags;
+ struct sighand_struct *sp;
int ret;
+retry:
ret = check_kill_permission(sig, info, p);
- if (!ret && sig && p->sighand) {
- spin_lock_irqsave(&p->sighand->siglock, flags);
+ if (!ret && sig && (sp = p->sighand)) {
+ if (!get_task_struct_rcu(p)) {
+ return -ESRCH;
+ }
+ spin_lock_irqsave(&sp->siglock, flags);
+ if (p->sighand != sp) {
+ spin_unlock_irqrestore(&sp->siglock, flags);
+ put_task_struct(p);
+ goto retry;
+ }
ret = __group_send_sig_info(sig, info, p);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ spin_unlock_irqrestore(&sp->siglock, flags);
+ put_task_struct(p);
}
return ret;
@@ -1172,12 +1194,12 @@ kill_proc_info(int sig, struct siginfo *
int error;
struct task_struct *p;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
p = find_task_by_pid(pid);
error = -ESRCH;
if (p)
error = group_send_sig_info(sig, info, p);
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return error;
}
@@ -1385,16 +1407,47 @@ send_sigqueue(int sig, struct sigqueue *
{
unsigned long flags;
int ret = 0;
+ struct sighand_struct *sh = p->sighand;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
- read_lock(&tasklist_lock);
+
+ /*
+ * The rcu based delayed sighand destroy makes it possible to
+ * run this without tasklist lock held. The task struct itself
+ * cannot go away as create_timer did get_task_struct().
+ *
+ * We return -1, when the task is marked exiting, so
+ * posix_timer_event can redirect it to the group leader
+ *
+ */
+ rcu_read_lock();
if (unlikely(p->flags & PF_EXITING)) {
ret = -1;
goto out_err;
}
- spin_lock_irqsave(&p->sighand->siglock, flags);
+ spin_lock_irqsave(&sh->siglock, flags);
+
+ /*
+ * We do the check here again to handle the following scenario:
+ *
+ * CPU 0 CPU 1
+ * send_sigqueue
+ * check PF_EXITING
+ * interrupt exit code running
+ * __exit_signal
+ * lock sighand->siglock
+ * unlock sighand->siglock
+ * lock sh->siglock
+ * add(tsk->pending) flush_sigqueue(tsk->pending)
+ *
+ */
+
+ if (unlikely(p->flags & PF_EXITING)) {
+ ret = -1;
+ goto out;
+ }
if (unlikely(!list_empty(&q->list))) {
/*
@@ -1412,17 +1465,16 @@ send_sigqueue(int sig, struct sigqueue *
goto out;
}
- q->lock = &p->sighand->siglock;
+ q->lock = &sh->siglock;
list_add_tail(&q->list, &p->pending.list);
sigaddset(&p->pending.signal, sig);
if (!sigismember(&p->blocked, sig))
signal_wake_up(p, sig == SIGKILL);
out:
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ spin_unlock_irqrestore(&sh->siglock, flags);
out_err:
- read_unlock(&tasklist_lock);
-
+ rcu_read_unlock();
return ret;
}
@@ -1433,7 +1485,16 @@ send_group_sigqueue(int sig, struct sigq
int ret = 0;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
- read_lock(&tasklist_lock);
+
+ while(!read_trylock(&tasklist_lock)) {
+ if (!p->sighand)
+ return -1;
+ cpu_relax();
+ }
+ if (unlikely(!p->sighand)) {
+ ret = -1;
+ goto out_err;
+ }
spin_lock_irqsave(&p->sighand->siglock, flags);
handle_stop_signal(sig, p);
@@ -1467,8 +1528,9 @@ send_group_sigqueue(int sig, struct sigq
__group_complete_signal(sig, p);
out:
spin_unlock_irqrestore(&p->sighand->siglock, flags);
+out_err:
read_unlock(&tasklist_lock);
- return(ret);
+ return ret;
}
/*
Index: linux/fs/exec.c
===================================================================
--- linux.orig/fs/exec.c
+++ linux/fs/exec.c
@@ -779,7 +779,7 @@ no_thread_group:
write_unlock_irq(&tasklist_lock);
if (atomic_dec_and_test(&oldsighand->count))
- kmem_cache_free(sighand_cachep, oldsighand);
+ sighand_free(oldsighand);
}
BUG_ON(!thread_group_leader(current));
Index: linux/kernel/exit.c
===================================================================
--- linux.orig/kernel/exit.c
+++ linux/kernel/exit.c
@@ -71,7 +71,6 @@ repeat:
__ptrace_unlink(p);
BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
__exit_signal(p);
- __exit_sighand(p);
/*
* Note that the fastpath in sys_times depends on __exit_signal having
* updated the counters before a task is removed from the tasklist of
Index: linux/kernel/fork.c
===================================================================
--- linux.orig/kernel/fork.c
+++ linux/kernel/fork.c
@@ -754,6 +754,14 @@ int unshare_files(void)
EXPORT_SYMBOL(unshare_files);
+void sighand_free_cb(struct rcu_head *rhp)
+{
+ struct sighand_struct *sp =
+ container_of(rhp, struct sighand_struct, rcu);
+
+ kmem_cache_free(sighand_cachep, sp);
+}
+
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
struct sighand_struct *sig;
Index: linux/kernel/rcupdate.c
===================================================================
--- linux.orig/kernel/rcupdate.c
+++ linux/kernel/rcupdate.c
@@ -35,6 +35,7 @@
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
+#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
Index: linux/include/linux/list.h
===================================================================
--- linux.orig/include/linux/list.h
+++ linux/include/linux/list.h
@@ -208,6 +208,7 @@ static inline void list_replace_rcu(stru
smp_wmb();
new->next->prev = new;
new->prev->next = new;
+ old->prev = LIST_POISON2;
}
/**
@@ -578,6 +579,25 @@ static inline void hlist_del_init(struct
}
}
+/*
+ * hlist_replace_rcu - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * The old entry will be replaced with the new entry atomically.
+ */
+static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new){
+ struct hlist_node *next = old->next;
+
+ new->next = next;
+ new->pprev = old->pprev;
+ smp_wmb();
+ if (next)
+ new->next->pprev = &new->next;
+ *new->pprev = new;
+ old->pprev = LIST_POISON2;
+}
+
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
struct hlist_node *first = h->first;
Index: linux/kernel/pid.c
===================================================================
--- linux.orig/kernel/pid.c
+++ linux/kernel/pid.c
@@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_
struct hlist_node *elem;
struct pid *pid;
- hlist_for_each_entry(pid, elem,
+ hlist_for_each_entry_rcu(pid, elem,
&pid_hash[type][pid_hashfn(nr)], pid_chain) {
if (pid->nr == nr)
return pid;
@@ -151,12 +151,12 @@ int fastcall attach_pid(task_t *task, en
task_pid = &task->pids[type];
pid = find_pid(type, nr);
if (pid == NULL) {
- hlist_add_head(&task_pid->pid_chain,
- &pid_hash[type][pid_hashfn(nr)]);
INIT_LIST_HEAD(&task_pid->pid_list);
+ hlist_add_head_rcu(&task_pid->pid_chain,
+ &pid_hash[type][pid_hashfn(nr)]);
} else {
INIT_HLIST_NODE(&task_pid->pid_chain);
- list_add_tail(&task_pid->pid_list, &pid->pid_list);
+ list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
}
task_pid->nr = nr;
@@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t
pid = &task->pids[type];
if (!hlist_unhashed(&pid->pid_chain)) {
- hlist_del(&pid->pid_chain);
- if (list_empty(&pid->pid_list))
+ if (list_empty(&pid->pid_list)) {
nr = pid->nr;
- else {
+ hlist_del_rcu(&pid->pid_chain);
+ } else {
pid_next = list_entry(pid->pid_list.next,
struct pid, pid_list);
/* insert next pid from pid_list to hash */
- hlist_add_head(&pid_next->pid_chain,
- &pid_hash[type][pid_hashfn(pid_next->nr)]);
+ hlist_replace_rcu(&pid->pid_chain,
+ &pid_next->pid_chain);
}
}
- list_del(&pid->pid_list);
+ list_del_rcu(&pid->pid_list);
pid->nr = 0;
return nr;
next prev parent reply other threads:[~2005-10-31 14:05 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-10-31 2:05 [PATCH] Fixes for RCU handling of task_struct Paul E. McKenney
2005-10-31 14:04 ` Ingo Molnar [this message]
2005-10-31 14:08 ` Ingo Molnar
2005-11-01 4:51 ` Andrew Morton
2005-11-03 19:09 ` Paul E. McKenney
2005-11-04 17:41 ` Oleg Nesterov
2005-11-04 20:08 ` Paul E. McKenney
2005-11-05 16:32 ` Oleg Nesterov
2005-11-05 23:20 ` Paul E. McKenney
2005-11-06 12:01 ` Oleg Nesterov
2005-11-06 22:59 ` Paul E. McKenney
2005-11-07 13:17 ` Oleg Nesterov
2005-11-07 18:28 ` Oleg Nesterov
2005-11-06 21:49 ` Andrew Morton
2005-11-06 22:43 ` Paul E. McKenney
2005-11-07 1:12 ` Nick Piggin
2005-11-07 4:58 ` Paul E. McKenney
2005-11-07 5:51 ` Nick Piggin
2005-11-07 18:10 ` Paul E. McKenney
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20051031140459.GA5664@elte.hu \
--to=mingo@elte.hu \
--cc=akpm@osdl.org \
--cc=dipankar@in.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=oleg@tv-sign.ru \
--cc=paulmck@us.ibm.com \
--cc=suzannew@cs.pdx.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.