* [PATCH RFC 01/13] exit: Clarify choice of new parent in forget_original_parent()
[not found] <20150525162722.5171.15901.stgit@pro>
@ 2015-05-25 17:44 ` Kirill Tkhai
2015-05-25 17:44 ` [PATCH RFC 02/13] rwlock_t: Implement double_write_{,un}lock() Kirill Tkhai
` (11 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:44 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
The second parameter of find_new_reaper(), together with the similarity
between its name and find_child_reaper()'s, confuses the reader.
Rename find_child_reaper() so that its name better matches what the
function does, and delete the second parameter of find_new_reaper().
Both changes improve modularity.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
kernel/exit.c | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index 22fcc05..a29c35d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -450,7 +450,7 @@ static struct task_struct *find_alive_thread(struct task_struct *p)
return NULL;
}
-static struct task_struct *find_child_reaper(struct task_struct *father)
+static void check_pid_ns_reaper_exit(struct task_struct *father)
__releases(&tasklist_lock)
__acquires(&tasklist_lock)
{
@@ -458,12 +458,12 @@ static struct task_struct *find_child_reaper(struct task_struct *father)
struct task_struct *reaper = pid_ns->child_reaper;
if (likely(reaper != father))
- return reaper;
+ return;
reaper = find_alive_thread(father);
if (reaper) {
pid_ns->child_reaper = reaper;
- return reaper;
+ return;
}
write_unlock_irq(&tasklist_lock);
@@ -473,8 +473,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father)
}
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
-
- return father;
}
/*
@@ -484,15 +482,16 @@ static struct task_struct *find_child_reaper(struct task_struct *father)
* child_subreaper for its children (like a service manager)
* 3. give it to the init process (PID 1) in our pid namespace
*/
-static struct task_struct *find_new_reaper(struct task_struct *father,
- struct task_struct *child_reaper)
+static struct task_struct *find_new_reaper(struct task_struct *father)
{
- struct task_struct *thread, *reaper;
+ struct task_struct *thread, *reaper, *child_reaper;
thread = find_alive_thread(father);
if (thread)
return thread;
+ child_reaper = task_active_pid_ns(father)->child_reaper;
+
if (father->signal->has_child_subreaper) {
/*
* Find the first ->is_child_subreaper ancestor in our pid_ns.
@@ -557,11 +556,11 @@ static void forget_original_parent(struct task_struct *father,
exit_ptrace(father, dead);
/* Can drop and reacquire tasklist_lock */
- reaper = find_child_reaper(father);
+ check_pid_ns_reaper_exit(father);
if (list_empty(&father->children))
return;
- reaper = find_new_reaper(father, reaper);
+ reaper = find_new_reaper(father);
list_for_each_entry(p, &father->children, sibling) {
for_each_thread(p, t) {
t->real_parent = reaper;
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 02/13] rwlock_t: Implement double_write_{,un}lock()
[not found] <20150525162722.5171.15901.stgit@pro>
2015-05-25 17:44 ` [PATCH RFC 01/13] exit: Clarify choice of new parent in forget_original_parent() Kirill Tkhai
@ 2015-05-25 17:44 ` Kirill Tkhai
2015-05-25 17:44 ` [PATCH RFC 03/13] pid_ns: Implement rwlock_t pid_ns::cr_lock for locking child_reaper Kirill Tkhai
` (10 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:44 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
New helpers for locking two locks at a time.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
include/linux/spinlock.h | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 3e18379..e1e3054 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -426,4 +426,23 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
#define atomic_dec_and_lock(atomic, lock) \
__cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
+static inline void double_write_lock(rwlock_t *lock1, rwlock_t *lock2)
+{
+ if (lock1 < lock2) {
+ write_lock(lock1);
+ write_lock(lock2);
+ } else if (lock1 > lock2) {
+ write_lock(lock2);
+ write_lock(lock1);
+ } else {
+ write_lock(lock1);
+ }
+}
+static inline void double_write_unlock(rwlock_t *lock1, rwlock_t *lock2)
+{
+ if (lock2 != lock1)
+ write_unlock(lock2);
+ write_unlock(lock1);
+}
+
#endif /* __LINUX_SPINLOCK_H */
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 03/13] pid_ns: Implement rwlock_t pid_ns::cr_lock for locking child_reaper
[not found] <20150525162722.5171.15901.stgit@pro>
2015-05-25 17:44 ` [PATCH RFC 01/13] exit: Clarify choice of new parent in forget_original_parent() Kirill Tkhai
2015-05-25 17:44 ` [PATCH RFC 02/13] rwlock_t: Implement double_write_{,un}lock() Kirill Tkhai
@ 2015-05-25 17:44 ` Kirill Tkhai
2015-05-25 17:44 ` [PATCH RFC 04/13] exit: Small refactoring mm_update_next_owner() Kirill Tkhai
` (9 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:44 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
Protects child_reaper modifications.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
include/linux/pid_namespace.h | 1 +
kernel/exit.c | 15 ++++++++++++---
kernel/fork.c | 1 +
kernel/pid.c | 10 +++++++++-
kernel/pid_namespace.c | 5 +++--
5 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 918b117..3e59d2a 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -28,6 +28,7 @@ struct pid_namespace {
int last_pid;
unsigned int nr_hashed;
struct task_struct *child_reaper;
+ rwlock_t cr_lock;
struct kmem_cache *pid_cachep;
unsigned int level;
struct pid_namespace *parent;
diff --git a/kernel/exit.c b/kernel/exit.c
index a29c35d..a1b2bf7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -455,16 +455,23 @@ static void check_pid_ns_reaper_exit(struct task_struct *father)
__acquires(&tasklist_lock)
{
struct pid_namespace *pid_ns = task_active_pid_ns(father);
- struct task_struct *reaper = pid_ns->child_reaper;
+ struct task_struct *reaper;
+
+ read_lock(&pid_ns->cr_lock);
+ reaper = pid_ns->child_reaper;
+ read_unlock(&pid_ns->cr_lock);
if (likely(reaper != father))
return;
+ write_lock(&pid_ns->cr_lock);
reaper = find_alive_thread(father);
- if (reaper) {
+ if (reaper)
pid_ns->child_reaper = reaper;
+ write_unlock(&pid_ns->cr_lock);
+
+ if (reaper)
return;
- }
write_unlock_irq(&tasklist_lock);
if (unlikely(pid_ns == &init_pid_ns)) {
@@ -560,6 +567,7 @@ static void forget_original_parent(struct task_struct *father,
if (list_empty(&father->children))
return;
+ read_lock(&task_active_pid_ns(father)->cr_lock);
reaper = find_new_reaper(father);
list_for_each_entry(p, &father->children, sibling) {
for_each_thread(p, t) {
@@ -579,6 +587,7 @@ static void forget_original_parent(struct task_struct *father,
reparent_leader(father, p, dead);
}
list_splice_tail_init(&father->children, &reaper->children);
+ read_unlock(&task_active_pid_ns(father)->cr_lock);
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 0bb88b5..66e31eb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1563,6 +1563,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
init_task_pid(p, PIDTYPE_SID, task_session(current));
if (is_child_reaper(pid)) {
+ /* Lockless, as we're the only process in ns */
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fd07d5..39a8b0a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -78,6 +78,7 @@ struct pid_namespace init_pid_ns = {
.nr_hashed = PIDNS_HASH_ADDING,
.level = 0,
.child_reaper = &init_task,
+ .cr_lock = __RW_LOCK_UNLOCKED(&init_pid_ns.cr_lock),
.user_ns = &init_user_ns,
.ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
@@ -259,6 +260,7 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
/* We can be called with write_lock_irq(&tasklist_lock) held */
+ struct task_struct *child_reaper = NULL;
int i;
unsigned long flags;
@@ -274,7 +276,8 @@ void free_pid(struct pid *pid)
* is the reaper wake up the reaper. The reaper
* may be sleeping in zap_pid_ns_processes().
*/
- wake_up_process(ns->child_reaper);
+ child_reaper = ns->child_reaper;
+ get_task_struct(child_reaper);
break;
case PIDNS_HASH_ADDING:
/* Handle a fork failure of the first process */
@@ -288,6 +291,11 @@ void free_pid(struct pid *pid)
}
spin_unlock_irqrestore(&pidmap_lock, flags);
+ if (child_reaper) {
+ wake_up_process(child_reaper);
+ put_task_struct(child_reaper);
+ }
+
for (i = 0; i <= pid->level; i++)
free_pidmap(pid->numbers + i);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba13..bbaa072 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -115,6 +115,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->nr_hashed = PIDNS_HASH_ADDING;
+ rwlock_init(&ns->cr_lock);
INIT_WORK(&ns->proc_work, proc_cleanup_work);
set_bit(0, ns->pidmap[0].page);
@@ -324,9 +325,9 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return -EINVAL;
}
- read_lock(&tasklist_lock);
+ read_lock(&pid_ns->cr_lock);
force_sig(SIGKILL, pid_ns->child_reaper);
- read_unlock(&tasklist_lock);
+ read_unlock(&pid_ns->cr_lock);
do_exit(0);
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 04/13] exit: Small refactoring mm_update_next_owner()
[not found] <20150525162722.5171.15901.stgit@pro>
` (2 preceding siblings ...)
2015-05-25 17:44 ` [PATCH RFC 03/13] pid_ns: Implement rwlock_t pid_ns::cr_lock for locking child_reaper Kirill Tkhai
@ 2015-05-25 17:44 ` Kirill Tkhai
2015-05-25 17:45 ` [PATCH RFC 05/13] fs: Refactoring in get_children_pid() Kirill Tkhai
` (8 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:44 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
This will be used in subsequent patches. No functionality change.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
kernel/exit.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index a1b2bf7..5e82787 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -322,16 +322,20 @@ void mm_update_next_owner(struct mm_struct *mm)
* Search in the children
*/
list_for_each_entry(c, &p->children, sibling) {
- if (c->mm == mm)
+ if (c->mm == mm) {
+ get_task_struct(c);
goto assign_new_owner;
+ }
}
/*
* Search in the siblings
*/
list_for_each_entry(c, &p->real_parent->children, sibling) {
- if (c->mm == mm)
+ if (c->mm == mm) {
+ get_task_struct(c);
goto assign_new_owner;
+ }
}
/*
@@ -341,8 +345,10 @@ void mm_update_next_owner(struct mm_struct *mm)
if (g->flags & PF_KTHREAD)
continue;
for_each_thread(g, c) {
- if (c->mm == mm)
+ if (c->mm == mm) {
+ get_task_struct(c);
goto assign_new_owner;
+ }
if (c->mm)
break;
}
@@ -358,7 +364,6 @@ void mm_update_next_owner(struct mm_struct *mm)
assign_new_owner:
BUG_ON(c == p);
- get_task_struct(c);
/*
* The task_lock protects c->mm from changing.
* We always want mm->owner->mm == mm
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 05/13] fs: Refactoring in get_children_pid()
[not found] <20150525162722.5171.15901.stgit@pro>
` (3 preceding siblings ...)
2015-05-25 17:44 ` [PATCH RFC 04/13] exit: Small refactoring mm_update_next_owner() Kirill Tkhai
@ 2015-05-25 17:45 ` Kirill Tkhai
2015-05-25 17:45 ` [PATCH RFC 06/13] core: Add rwlock_t task_list::kin_lock Kirill Tkhai
` (7 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:45 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
This will be used in subsequent patches. No functionality change.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
fs/proc/array.c | 28 ++++++++++++++++------------
1 file changed, 16 insertions(+), 12 deletions(-)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fd02a9e..e2f21c0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -576,9 +576,7 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
struct task_struct *start, *task;
struct pid *pid = NULL;
- read_lock(&tasklist_lock);
-
- start = pid_task(proc_pid(inode), PIDTYPE_PID);
+ start = get_pid_task(proc_pid(inode), PIDTYPE_PID);
if (!start)
goto out;
@@ -586,18 +584,21 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
* Lets try to continue searching first, this gives
* us significant speedup on children-rich processes.
*/
- if (pid_prev) {
- task = pid_task(pid_prev, PIDTYPE_PID);
- if (task && task->real_parent == start &&
- !(list_empty(&task->sibling))) {
- if (list_is_last(&task->sibling, &start->children))
- goto out;
+ task = get_pid_task(pid_prev, PIDTYPE_PID);
+ if (!task)
+ goto put_start;
+
+ read_lock(&tasklist_lock);
+ if (task->real_parent == start && !(list_empty(&task->sibling))) {
+ put_task_struct(task);
+ if (!list_is_last(&task->sibling, &start->children)) {
task = list_first_entry(&task->sibling,
- struct task_struct, sibling);
+ struct task_struct, sibling);
pid = get_pid(task_pid(task));
- goto out;
}
+ goto unlock;
}
+ put_task_struct(task);
/*
* Slow search case.
@@ -621,8 +622,11 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
}
}
-out:
+unlock:
read_unlock(&tasklist_lock);
+put_start:
+ put_task_struct(start);
+out:
return pid;
}
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 06/13] core: Add rwlock_t task_list::kin_lock
[not found] <20150525162722.5171.15901.stgit@pro>
` (4 preceding siblings ...)
2015-05-25 17:45 ` [PATCH RFC 05/13] fs: Refactoring in get_children_pid() Kirill Tkhai
@ 2015-05-25 17:45 ` Kirill Tkhai
2015-05-25 17:45 ` [PATCH RFC 07/13] kin_lock: Implement helpers for kin_lock locking Kirill Tkhai
` (6 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:45 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
Add a new primitive for synchronization between relatives.
It will be used in subsequent patches.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
include/linux/init_task.h | 1 +
include/linux/sched.h | 1 +
kernel/fork.c | 1 +
3 files changed, 3 insertions(+)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index bb9b075..919243a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -218,6 +218,7 @@ extern struct task_group root_task_group;
.parent = &tsk, \
.children = LIST_HEAD_INIT(tsk.children), \
.sibling = LIST_HEAD_INIT(tsk.sibling), \
+ .kin_lock = __RW_LOCK_UNLOCKED(tsk.kin_lock), \
.group_leader = &tsk, \
RCU_POINTER_INITIALIZER(real_cred, &init_cred), \
RCU_POINTER_INITIALIZER(cred, &init_cred), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index af0eeba..58dc09f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1477,6 +1477,7 @@ struct task_struct {
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */
+ rwlock_t kin_lock;
/*
* ptraced is the list of tasks this task is using ptrace on.
diff --git a/kernel/fork.c b/kernel/fork.c
index 66e31eb..ee389ba 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1330,6 +1330,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
+ rwlock_init(&p->kin_lock);
rcu_copy_process(p);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 07/13] kin_lock: Implement helpers for kin_lock locking.
[not found] <20150525162722.5171.15901.stgit@pro>
` (5 preceding siblings ...)
2015-05-25 17:45 ` [PATCH RFC 06/13] core: Add rwlock_t task_list::kin_lock Kirill Tkhai
@ 2015-05-25 17:45 ` Kirill Tkhai
2015-05-25 17:45 ` [PATCH RFC 08/13] core: Use kin_lock synchronizations between parent and child and for thread group Kirill Tkhai
` (5 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:45 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
include/linux/sched.h | 302 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 302 insertions(+)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 58dc09f..6a810f3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1835,6 +1835,308 @@ static inline bool should_numa_migrate_memory(struct task_struct *p,
}
#endif
+static inline void write_real_parent_lock(struct task_struct *p)
+{
+ struct task_struct *real_parent;
+ rcu_read_lock();
+again:
+ real_parent = p->real_parent;
+ write_lock(&real_parent->kin_lock);
+ if (real_parent != p->real_parent) {
+ write_unlock(&real_parent->kin_lock);
+ goto again;
+ }
+ rcu_read_unlock();
+}
+
+static inline void write_real_parent_lock_irq(struct task_struct *p)
+{
+ local_irq_disable();
+ write_real_parent_lock(p);
+}
+
+static inline void write_real_parent_unlock(struct task_struct *p)
+{
+ write_unlock(&p->real_parent->kin_lock);
+}
+
+static inline void write_real_parent_unlock_irq(struct task_struct *p)
+{
+ write_unlock_irq(&p->real_parent->kin_lock);
+}
+
+static inline void read_parent_lock(struct task_struct *p)
+{
+ struct task_struct *parent;
+ rcu_read_lock();
+again:
+ parent = p->parent;
+ read_lock(&parent->kin_lock);
+ if (parent != p->parent) {
+ read_unlock(&parent->kin_lock);
+ goto again;
+ }
+ rcu_read_unlock();
+
+}
+
+static inline void read_parent_unlock(struct task_struct *p)
+{
+ read_unlock(&p->parent->kin_lock);
+}
+
+static inline void read_real_parent_lock(struct task_struct *p)
+{
+ struct task_struct *real_parent;
+ rcu_read_lock();
+again:
+ real_parent = p->real_parent;
+ read_lock(&real_parent->kin_lock);
+ if (real_parent != p->real_parent) {
+ read_unlock(&real_parent->kin_lock);
+ goto again;
+ }
+ rcu_read_unlock();
+
+}
+
+static inline void read_real_parent_unlock(struct task_struct *p)
+{
+ read_unlock(&p->real_parent->kin_lock);
+}
+
+static inline void write_parent_lock(struct task_struct *p)
+{
+ struct task_struct *parent;
+ rcu_read_lock();
+again:
+ parent = p->parent;
+ write_lock(&parent->kin_lock);
+ if (parent != p->parent) {
+ write_unlock(&parent->kin_lock);
+ goto again;
+ }
+ rcu_read_unlock();
+}
+
+static inline void write_parent_lock_irq(struct task_struct *p)
+{
+ local_irq_disable();
+ write_parent_lock(p);
+}
+
+static inline void write_parent_unlock(struct task_struct *p)
+{
+ write_unlock(&p->parent->kin_lock);
+}
+
+static inline void write_parent_unlock_irq(struct task_struct *p)
+{
+ write_unlock_irq(&p->parent->kin_lock);
+}
+
+static inline void write_parent_and_given_lock(struct task_struct *child,
+ struct task_struct *given)
+{
+ struct task_struct *parent;
+ rcu_read_lock();
+again:
+ parent = child->parent;
+ if (parent <= given) {
+ write_lock(&parent->kin_lock);
+ if (parent != child->parent) {
+ write_unlock(&parent->kin_lock);
+ goto again;
+ }
+ if (parent != given)
+ write_lock(&given->kin_lock);
+ } else {
+ write_lock(&given->kin_lock);
+ write_lock(&parent->kin_lock);
+ if (parent != child->parent) {
+ write_unlock(&given->kin_lock);
+ write_unlock(&parent->kin_lock);
+ goto again;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static inline void write_parent_and_given_lock_irq(struct task_struct *child,
+ struct task_struct *given)
+{
+ local_irq_disable();
+ write_parent_and_given_lock(child, given);
+}
+
+static inline void read_parent_and_given_lock(struct task_struct *child,
+ struct task_struct *given)
+{
+ struct task_struct *parent;
+ rcu_read_lock();
+again:
+ parent = child->parent;
+ if (parent <= given) {
+ read_lock(&parent->kin_lock);
+ if (parent != child->parent) {
+ read_unlock(&parent->kin_lock);
+ goto again;
+ }
+ if (parent != given)
+ read_lock(&given->kin_lock);
+ } else {
+ read_lock(&given->kin_lock);
+ read_lock(&parent->kin_lock);
+ if (parent != child->parent) {
+ read_unlock(&given->kin_lock);
+ read_unlock(&parent->kin_lock);
+ goto again;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static inline void read_real_parent_and_given_lock(struct task_struct *child,
+ struct task_struct *given)
+{
+ struct task_struct *real_parent;
+ rcu_read_lock();
+again:
+ real_parent = child->real_parent;
+ if (real_parent <= given) {
+ read_lock(&real_parent->kin_lock);
+ if (real_parent != child->real_parent) {
+ read_unlock(&real_parent->kin_lock);
+ goto again;
+ }
+ if (real_parent != given)
+ read_lock(&given->kin_lock);
+ } else {
+ read_lock(&given->kin_lock);
+ read_lock(&real_parent->kin_lock);
+ if (real_parent != child->real_parent) {
+ read_unlock(&given->kin_lock);
+ read_unlock(&real_parent->kin_lock);
+ goto again;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static inline void read_real_parent_and_given_unlock(struct task_struct *child,
+ struct task_struct *given)
+{
+ struct task_struct *real_parent = child->real_parent;
+
+ if (real_parent != given)
+ read_unlock(&real_parent->kin_lock);
+ read_unlock(&given->kin_lock);
+}
+static inline void write_parent_and_real_parent_lock(struct task_struct *child)
+{
+ struct task_struct *parent, *real_parent;
+ rcu_read_lock();
+again:
+ parent = child->parent;
+ real_parent = child->real_parent;
+ if (parent <= real_parent) {
+ write_lock(&parent->kin_lock);
+ if (parent != child->parent) {
+ write_unlock(&parent->kin_lock);
+ goto again;
+ }
+ if (parent != real_parent) {
+ write_lock(&real_parent->kin_lock);
+ if (real_parent != child->real_parent) {
+ write_unlock(&parent->kin_lock);
+ write_unlock(&real_parent->kin_lock);
+ goto again;
+ }
+ }
+ } else {
+ write_lock(&real_parent->kin_lock);
+ write_lock(&parent->kin_lock);
+ if (real_parent != child->real_parent ||
+ parent != child->parent) {
+ write_unlock(&real_parent->kin_lock);
+ write_unlock(&parent->kin_lock);
+ goto again;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static inline void write_parent_and_real_parent_lock_irq(struct task_struct *child)
+{
+ local_irq_disable();
+ write_parent_and_real_parent_lock(child);
+}
+
+static inline void write_parent_and_real_parent_unlock(struct task_struct *child)
+{
+ struct task_struct *parent, *real_parent;
+
+ parent = child->parent;
+ real_parent = child->real_parent;
+
+ write_unlock(&real_parent->kin_lock);
+ if (parent != real_parent)
+ write_unlock(&parent->kin_lock);
+}
+
+static inline void write_parent_and_real_parent_unlock_irq(struct task_struct *child)
+{
+ write_parent_and_real_parent_unlock(child);
+ local_irq_enable();
+}
+
+static inline void read_parent_and_real_parent_lock(struct task_struct *child)
+{
+ struct task_struct *parent, *real_parent;
+ rcu_read_lock();
+again:
+ parent = child->parent;
+ real_parent = child->real_parent;
+ if (parent <= real_parent) {
+ read_lock(&parent->kin_lock);
+ if (parent != child->parent) {
+ read_unlock(&parent->kin_lock);
+ goto again;
+ }
+ if (parent != real_parent) {
+ read_lock(&real_parent->kin_lock);
+ if (real_parent != child->real_parent) {
+ read_unlock(&parent->kin_lock);
+ read_unlock(&real_parent->kin_lock);
+ goto again;
+ }
+ }
+ } else {
+ read_lock(&real_parent->kin_lock);
+ read_lock(&parent->kin_lock);
+ if (real_parent != child->real_parent ||
+ parent != child->parent) {
+ read_unlock(&real_parent->kin_lock);
+ read_unlock(&parent->kin_lock);
+ goto again;
+ }
+ }
+ rcu_read_unlock();
+}
+
+static inline void read_parent_and_real_parent_unlock(struct task_struct *child)
+{
+ struct task_struct *parent, *real_parent;
+
+ parent = child->parent;
+ real_parent = child->real_parent;
+
+ read_unlock(&real_parent->kin_lock);
+ if (parent != real_parent)
+ read_unlock(&parent->kin_lock);
+}
+
+
static inline struct pid *task_pid(struct task_struct *task)
{
return task->pids[PIDTYPE_PID].pid;
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 08/13] core: Use kin_lock synchronizations between parent and child and for thread group
[not found] <20150525162722.5171.15901.stgit@pro>
` (6 preceding siblings ...)
2015-05-25 17:45 ` [PATCH RFC 07/13] kin_lock: Implement helpers for kin_lock locking Kirill Tkhai
@ 2015-05-25 17:45 ` Kirill Tkhai
2015-05-25 17:45 ` [PATCH RFC 09/13] exit: Use for_each_thread() in do_wait() Kirill Tkhai
` (4 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:45 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
real_parent->kin_lock protects:
child's threads, place in sibling list etc.
parent->kin_lock protects:
ptrace operations
Note: sighand is still stable under tasklist_lock.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
fs/exec.c | 16 ++++++++---
fs/proc/array.c | 4 +--
kernel/exit.c | 68 +++++++++++++++++++++++++++++++++++++---------
kernel/fork.c | 4 +++
kernel/ptrace.c | 45 ++++++++++++++++++++++++++----
kernel/signal.c | 9 ++++++
kernel/sys.c | 8 ++++-
mm/oom_kill.c | 9 ++++--
security/keys/keyctl.c | 4 +--
security/selinux/hooks.c | 4 +--
10 files changed, 136 insertions(+), 35 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 1977c2a..4de7770 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,14 +932,16 @@ static int de_thread(struct task_struct *tsk)
for (;;) {
threadgroup_change_begin(tsk);
write_lock_irq(&tasklist_lock);
+ write_real_parent_lock(tsk);
/*
- * Do this under tasklist_lock to ensure that
- * exit_notify() can't miss ->group_exit_task
+ * Do this under real_parent's kin_lock to ensure
+ * that exit_notify() can't miss ->group_exit_task.
*/
sig->notify_count = -1;
if (likely(leader->exit_state))
break;
__set_current_state(TASK_KILLABLE);
+ write_real_parent_unlock(tsk);
write_unlock_irq(&tasklist_lock);
threadgroup_change_end(tsk);
schedule();
@@ -995,9 +997,13 @@ static int de_thread(struct task_struct *tsk)
* We are going to release_task()->ptrace_unlink() silently,
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
* the tracer wont't block again waiting for this thread.
+ *
+ * Also, we not need hold leader->parent->kin_lock, because
+ * it can't change till we release real_parent->kin_lock.
*/
if (unlikely(leader->ptrace))
__wake_up_parent(leader, leader->parent);
+ write_real_parent_unlock(tsk);
write_unlock_irq(&tasklist_lock);
threadgroup_change_end(tsk);
@@ -1029,9 +1035,11 @@ static int de_thread(struct task_struct *tsk)
sizeof(newsighand->action));
write_lock_irq(&tasklist_lock);
+ write_parent_and_real_parent_lock(tsk);
spin_lock(&oldsighand->siglock);
rcu_assign_pointer(tsk->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
+ write_parent_and_real_parent_unlock(tsk);
write_unlock_irq(&tasklist_lock);
__cleanup_sighand(oldsighand);
@@ -1042,10 +1050,10 @@ static int de_thread(struct task_struct *tsk)
killed:
/* protects against exit_notify() and __exit_signal() */
- read_lock(&tasklist_lock);
+ read_real_parent_lock(tsk);
sig->group_exit_task = NULL;
sig->notify_count = 0;
- read_unlock(&tasklist_lock);
+ read_real_parent_unlock(tsk);
return -EAGAIN;
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index e2f21c0..820611cc 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -588,7 +588,7 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
if (!task)
goto put_start;
- read_lock(&tasklist_lock);
+ read_lock(&start->kin_lock);
if (task->real_parent == start && !(list_empty(&task->sibling))) {
put_task_struct(task);
if (!list_is_last(&task->sibling, &start->children)) {
@@ -623,7 +623,7 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
}
unlock:
- read_unlock(&tasklist_lock);
+ read_unlock(&start->kin_lock);
put_start:
put_task_struct(start);
out:
diff --git a/kernel/exit.c b/kernel/exit.c
index 5e82787..a268093 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -180,6 +180,7 @@ void release_task(struct task_struct *p)
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
+ write_parent_and_real_parent_lock(p);
ptrace_release_task(p);
__exit_signal(p);
@@ -202,6 +203,7 @@ void release_task(struct task_struct *p)
leader->exit_state = EXIT_DEAD;
}
+ write_parent_and_real_parent_unlock(p);
write_unlock_irq(&tasklist_lock);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
@@ -317,43 +319,50 @@ void mm_update_next_owner(struct mm_struct *mm)
return;
}
- read_lock(&tasklist_lock);
/*
* Search in the children
*/
+ read_lock(&p->kin_lock);
list_for_each_entry(c, &p->children, sibling) {
if (c->mm == mm) {
get_task_struct(c);
+ read_unlock(&p->kin_lock);
goto assign_new_owner;
}
}
+ read_unlock(&p->kin_lock);
/*
* Search in the siblings
*/
+ read_real_parent_lock(p);
list_for_each_entry(c, &p->real_parent->children, sibling) {
if (c->mm == mm) {
get_task_struct(c);
+ read_real_parent_unlock(p);
goto assign_new_owner;
}
}
+ read_real_parent_unlock(p);
/*
* Search through everything else, we should not get here often.
*/
+ rcu_read_lock();
for_each_process(g) {
if (g->flags & PF_KTHREAD)
continue;
for_each_thread(g, c) {
if (c->mm == mm) {
get_task_struct(c);
+ rcu_read_unlock();
goto assign_new_owner;
}
if (c->mm)
break;
}
}
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
/*
* We found no owner yet mm_users > 1: this implies that we are
* most likely racing with swapoff (try_to_unuse()) or /proc or
@@ -369,11 +378,6 @@ void mm_update_next_owner(struct mm_struct *mm)
* We always want mm->owner->mm == mm
*/
task_lock(c);
- /*
- * Delay read_unlock() till we have the task_lock()
- * to ensure that c does not slip away underneath us
- */
- read_unlock(&tasklist_lock);
if (c->mm != mm) {
task_unlock(c);
put_task_struct(c);
@@ -470,9 +474,13 @@ static void check_pid_ns_reaper_exit(struct task_struct *father)
return;
write_lock(&pid_ns->cr_lock);
+ read_real_parent_lock(father);
+
reaper = find_alive_thread(father);
if (reaper)
pid_ns->child_reaper = reaper;
+
+ read_real_parent_unlock(father);
write_unlock(&pid_ns->cr_lock);
if (reaper)
@@ -494,13 +502,22 @@ static void check_pid_ns_reaper_exit(struct task_struct *father)
* child_subreaper for its children (like a service manager)
* 3. give it to the init process (PID 1) in our pid namespace
*/
-static struct task_struct *find_new_reaper(struct task_struct *father)
+static struct task_struct *find_double_lock_new_reaper(struct task_struct *father)
{
struct task_struct *thread, *reaper, *child_reaper;
+ rcu_read_lock();
+again:
thread = find_alive_thread(father);
- if (thread)
+ if (thread) {
+ double_write_lock(&father->kin_lock, &thread->kin_lock);
+ if (thread->flags & PF_EXITING) {
+ double_write_unlock(&father->kin_lock, &thread->kin_lock);
+ goto again;
+ }
+ rcu_read_unlock();
return thread;
+ }
child_reaper = task_active_pid_ns(father)->child_reaper;
@@ -518,12 +535,26 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
break;
if (!reaper->signal->is_child_subreaper)
continue;
+find_again:
thread = find_alive_thread(reaper);
- if (thread)
- return thread;
+ if (thread) {
+ double_write_lock(&father->kin_lock,
+ &thread->kin_lock);
+ if (!(thread->flags & PF_EXITING)) {
+ rcu_read_unlock();
+ return thread;
+ }
+ double_write_unlock(&father->kin_lock,
+ &thread->kin_lock);
+ goto find_again;
+ }
}
}
+ rcu_read_unlock();
+
+ double_write_lock(&child_reaper->kin_lock, &father->kin_lock);
+
return child_reaper;
}
@@ -569,11 +600,18 @@ static void forget_original_parent(struct task_struct *father,
/* Can drop and reacquire tasklist_lock */
check_pid_ns_reaper_exit(father);
- if (list_empty(&father->children))
+
+ /* read_lock() guarantees concurrent thread sees our PF_EXITING */
+ read_lock(&father->kin_lock);
+ if (list_empty(&father->children)) {
+ read_unlock(&father->kin_lock);
return;
+ }
+ read_unlock(&father->kin_lock);
read_lock(&task_active_pid_ns(father)->cr_lock);
- reaper = find_new_reaper(father);
+ reaper = find_double_lock_new_reaper(father);
+
list_for_each_entry(p, &father->children, sibling) {
for_each_thread(p, t) {
t->real_parent = reaper;
@@ -592,6 +630,8 @@ static void forget_original_parent(struct task_struct *father,
reparent_leader(father, p, dead);
}
list_splice_tail_init(&father->children, &reaper->children);
+
+ double_write_unlock(&reaper->kin_lock, &father->kin_lock);
read_unlock(&task_active_pid_ns(father)->cr_lock);
}
@@ -608,6 +648,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
write_lock_irq(&tasklist_lock);
forget_original_parent(tsk, &dead);
+ write_parent_and_real_parent_lock(tsk);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -631,6 +672,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
+ write_parent_and_real_parent_unlock(tsk);
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
diff --git a/kernel/fork.c b/kernel/fork.c
index ee389ba..63e225b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1524,9 +1524,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
+ write_real_parent_lock(current);
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
} else {
+ write_lock(&current->kin_lock);
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
}
@@ -1550,6 +1552,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
recalc_sigpending();
if (signal_pending(current)) {
spin_unlock(&current->sighand->siglock);
+ write_real_parent_unlock(p);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
goto bad_fork_free_pid;
@@ -1591,6 +1594,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
spin_unlock(&current->sighand->siglock);
+ write_real_parent_unlock(p);
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e05..817288d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,6 +181,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
* be changed by us so it's not changing right after this.
*/
read_lock(&tasklist_lock);
+ read_parent_lock(child);
if (child->ptrace && child->parent == current) {
WARN_ON(child->state == __TASK_TRACED);
/*
@@ -190,6 +191,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
if (ignore_state || ptrace_freeze_traced(child))
ret = 0;
}
+ read_parent_unlock(child);
read_unlock(&tasklist_lock);
if (!ret && !ignore_state) {
@@ -275,6 +277,7 @@ static int ptrace_attach(struct task_struct *task, long request,
unsigned long flags)
{
bool seize = (request == PTRACE_SEIZE);
+ struct task_struct *old_parent;
int retval;
retval = -EIO;
@@ -312,11 +315,13 @@ static int ptrace_attach(struct task_struct *task, long request,
goto unlock_creds;
write_lock_irq(&tasklist_lock);
+ write_parent_and_given_lock(task, current);
+ old_parent = task->parent;
retval = -EPERM;
- if (unlikely(task->exit_state))
- goto unlock_tasklist;
- if (task->ptrace)
- goto unlock_tasklist;
+ if (unlikely(task->exit_state) || task->ptrace) {
+ write_unlock(&old_parent->kin_lock);
+ goto unlock_current;
+ }
if (seize)
flags |= PT_SEIZED;
@@ -327,6 +332,7 @@ static int ptrace_attach(struct task_struct *task, long request,
task->ptrace = flags;
__ptrace_link(task, current);
+ write_unlock(&old_parent->kin_lock);
/* SEIZE doesn't trap tracee on attach */
if (!seize)
@@ -358,7 +364,8 @@ static int ptrace_attach(struct task_struct *task, long request,
spin_unlock(&task->sighand->siglock);
retval = 0;
-unlock_tasklist:
+unlock_current:
+ write_unlock(&current->kin_lock);
write_unlock_irq(&tasklist_lock);
unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
@@ -383,6 +390,7 @@ static int ptrace_traceme(void)
int ret = -EPERM;
write_lock_irq(&tasklist_lock);
+ write_parent_lock(current);
/* Are we already being traced? */
if (!current->ptrace) {
ret = security_ptrace_traceme(current->parent);
@@ -392,10 +400,12 @@ static int ptrace_traceme(void)
* pretend ->real_parent untraces us right after return.
*/
if (!ret && !(current->real_parent->flags & PF_EXITING)) {
+ BUG_ON(current->parent != current->real_parent);
current->ptrace = PT_PTRACED;
__ptrace_link(current, current->real_parent);
}
}
+ write_parent_unlock(current);
write_unlock_irq(&tasklist_lock);
return ret;
@@ -456,6 +466,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
static int ptrace_detach(struct task_struct *child, unsigned int data)
{
+ struct task_struct *old_parent;
if (!valid_signal(data))
return -EIO;
@@ -464,6 +475,8 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
write_lock_irq(&tasklist_lock);
+ write_parent_and_real_parent_lock(child);
+ old_parent = child->parent;
/*
* We rely on ptrace_freeze_traced(). It can't be killed and
* untraced by another thread, it can't be a zombie.
@@ -475,6 +488,9 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
*/
child->exit_code = data;
__ptrace_detach(current, child);
+ if (old_parent != child->real_parent)
+ write_unlock(&old_parent->kin_lock);
+ write_real_parent_unlock(child);
write_unlock_irq(&tasklist_lock);
proc_ptrace_connector(child, PTRACE_DETACH);
@@ -488,15 +504,30 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
*/
void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
{
- struct task_struct *p, *n;
+ struct task_struct *p;
+
+ rcu_read_lock();
+ for (;;) {
+ p = list_first_or_null_rcu(&tracer->ptraced, struct task_struct,
+ ptrace_entry);
+ if (!p)
+ break;
+
+ write_parent_and_real_parent_lock(p);
+ if (p->parent != tracer) {
+ write_parent_and_real_parent_unlock(p);
+ continue;
+ }
- list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
if (unlikely(p->ptrace & PT_EXITKILL))
send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
if (__ptrace_detach(tracer, p))
list_add(&p->ptrace_entry, dead);
+
+ write_parent_and_real_parent_unlock(p);
}
+ rcu_read_unlock();
}
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
diff --git a/kernel/signal.c b/kernel/signal.c
index f19833b..8288019 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1882,6 +1882,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
+ read_parent_and_real_parent_lock(current);
if (may_ptrace_stop()) {
/*
* Notify parents of the stop.
@@ -1904,6 +1905,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
* XXX: implement read_unlock_no_resched().
*/
preempt_disable();
+ read_parent_and_real_parent_unlock(current);
read_unlock(&tasklist_lock);
preempt_enable_no_resched();
freezable_schedule();
@@ -1925,6 +1927,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
+ read_parent_and_real_parent_unlock(current);
read_unlock(&tasklist_lock);
}
@@ -2079,7 +2082,9 @@ static bool do_signal_stop(int signr)
*/
if (notify) {
read_lock(&tasklist_lock);
+ read_parent_and_real_parent_lock(current);
do_notify_parent_cldstop(current, false, notify);
+ read_parent_and_real_parent_unlock(current);
read_unlock(&tasklist_lock);
}
@@ -2225,11 +2230,13 @@ int get_signal(struct ksignal *ksig)
* a duplicate.
*/
read_lock(&tasklist_lock);
+ read_parent_and_real_parent_lock(current->group_leader);
do_notify_parent_cldstop(current, false, why);
if (ptrace_reparented(current->group_leader))
do_notify_parent_cldstop(current->group_leader,
true, why);
+ read_parent_and_real_parent_unlock(current->group_leader);
read_unlock(&tasklist_lock);
goto relock;
@@ -2476,7 +2483,9 @@ void exit_signals(struct task_struct *tsk)
*/
if (unlikely(group_stop)) {
read_lock(&tasklist_lock);
+ read_parent_and_real_parent_lock(tsk);
do_notify_parent_cldstop(tsk, false, group_stop);
+ read_parent_and_real_parent_unlock(tsk);
read_unlock(&tasklist_lock);
}
}
diff --git a/kernel/sys.c b/kernel/sys.c
index a4e372b..783aec4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -939,7 +939,9 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
err = -ESRCH;
p = find_task_by_vpid(pid);
if (!p)
- goto out;
+ goto out2;
+
+ write_real_parent_lock(p);
err = -EINVAL;
if (!thread_group_leader(p))
@@ -981,7 +983,9 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
err = 0;
out:
- /* All paths lead to here, thus we are safe. -DaveM */
+ /* All (almost) paths lead to here, thus we are safe. -DaveM */
+ write_real_parent_unlock(p);
+out2:
write_unlock_irq(&tasklist_lock);
rcu_read_unlock();
return err;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2b665da..aa89beb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -538,13 +538,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* parent. This attempts to lose the minimal amount of work done while
* still freeing memory.
*/
- read_lock(&tasklist_lock);
+ rcu_read_lock();
for_each_thread(p, t) {
+ read_lock(&t->kin_lock);
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
if (child->mm == p->mm)
- continue;
+ goto unlock;
/*
* oom_badness() returns 0 if the thread is unkillable
*/
@@ -557,8 +558,10 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
get_task_struct(victim);
}
}
+unlock:
+ read_unlock(&t->kin_lock);
}
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
p = find_lock_task_mm(victim);
if (!p) {
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 0b9ec78..64eea7b 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1491,7 +1491,7 @@ long keyctl_session_to_parent(void)
me = current;
rcu_read_lock();
- write_lock_irq(&tasklist_lock);
+ write_real_parent_lock_irq(me);
ret = -EPERM;
oldwork = NULL;
@@ -1540,7 +1540,7 @@ long keyctl_session_to_parent(void)
if (!ret)
newwork = NULL;
unlock:
- write_unlock_irq(&tasklist_lock);
+ write_real_parent_unlock_irq(me);
rcu_read_unlock();
if (oldwork)
put_cred(container_of(oldwork, struct cred, rcu));
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7dade28..0feda4b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2461,9 +2461,9 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
/* Wake up the parent if it is waiting so that it can recheck
* wait permission to the new task SID. */
- read_lock(&tasklist_lock);
+ read_real_parent_lock(current);
__wake_up_parent(current, current->real_parent);
- read_unlock(&tasklist_lock);
+ read_real_parent_unlock(current);
}
/* superblock security operations */
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 09/13] exit: Use for_each_thread() in do_wait()
[not found] <20150525162722.5171.15901.stgit@pro>
` (7 preceding siblings ...)
2015-05-25 17:45 ` [PATCH RFC 08/13] core: Use kin_lock synchronizations between parent and child and for thread group Kirill Tkhai
@ 2015-05-25 17:45 ` Kirill Tkhai
2015-05-26 19:46 ` Oleg Nesterov
2015-05-25 17:45 ` [PATCH RFC 10/13] exit: Add struct wait_opts's member held_lock and use it for tasklist_lock Kirill Tkhai
` (3 subsequent siblings)
12 siblings, 1 reply; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:45 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
Refactoring, no functionality change.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
kernel/exit.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index a268093..c84f3e4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1538,8 +1538,7 @@ static long do_wait(struct wait_opts *wo)
set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
- tsk = current;
- do {
+ for_each_thread(current, tsk) {
retval = do_wait_thread(wo, tsk);
if (retval)
goto end;
@@ -1550,7 +1549,7 @@ static long do_wait(struct wait_opts *wo)
if (wo->wo_flags & __WNOTHREAD)
break;
- } while_each_thread(current, tsk);
+ }
read_unlock(&tasklist_lock);
notask:
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH RFC 09/13] exit: Use for_each_thread() in do_wait()
2015-05-25 17:45 ` [PATCH RFC 09/13] exit: Use for_each_thread() in do_wait() Kirill Tkhai
@ 2015-05-26 19:46 ` Oleg Nesterov
2015-05-27 9:33 ` Kirill Tkhai
0 siblings, 1 reply; 16+ messages in thread
From: Oleg Nesterov @ 2015-05-26 19:46 UTC (permalink / raw)
To: Kirill Tkhai
Cc: linux-kernel, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
On 05/25, Kirill Tkhai wrote:
>
> Refactoring, no functionality change.
Hmm. unless I missed something this change is wrong.
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1538,8 +1538,7 @@ static long do_wait(struct wait_opts *wo)
>
> set_current_state(TASK_INTERRUPTIBLE);
> read_lock(&tasklist_lock);
> - tsk = current;
> - do {
> + for_each_thread(current, tsk) {
> retval = do_wait_thread(wo, tsk);
> if (retval)
> goto end;
> @@ -1550,7 +1549,7 @@ static long do_wait(struct wait_opts *wo)
>
> if (wo->wo_flags & __WNOTHREAD)
> break;
> - } while_each_thread(current, tsk);
> + }
Please note the __WNOTHREAD check. This is the rare case when we
actually want while_each_thread() (although it should die anyway).
for_each_thread() always starts from ->group_leader, but we need
to start from "current" first.
Oleg.
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH RFC 09/13] exit: Use for_each_thread() in do_wait()
2015-05-26 19:46 ` Oleg Nesterov
@ 2015-05-27 9:33 ` Kirill Tkhai
2015-05-27 9:42 ` Kirill Tkhai
0 siblings, 1 reply; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-27 9:33 UTC (permalink / raw)
To: Oleg Nesterov, Kirill Tkhai
Cc: linux-kernel@vger.kernel.org, Andrew Morton, Ingo Molnar,
Peter Zijlstra, Michal Hocko, Rik van Riel, Ionut Alexa,
Peter Hurley
26.05.2015, 22:47, "Oleg Nesterov" <oleg@redhat.com>:
> On 05/25, Kirill Tkhai wrote:
>> Refactoring, no functionality change.
>
> Hmm. unless I missed something this change is wrong.
>
>> --- a/kernel/exit.c
>> +++ b/kernel/exit.c
>> @@ -1538,8 +1538,7 @@ static long do_wait(struct wait_opts *wo)
>>
>> set_current_state(TASK_INTERRUPTIBLE);
>> read_lock(&tasklist_lock);
>> - tsk = current;
>> - do {
>> + for_each_thread(current, tsk) {
>> retval = do_wait_thread(wo, tsk);
>> if (retval)
>> goto end;
>> @@ -1550,7 +1549,7 @@ static long do_wait(struct wait_opts *wo)
>>
>> if (wo->wo_flags & __WNOTHREAD)
>> break;
>> - } while_each_thread(current, tsk);
>> + }
>
> Please note the __WNOTHREAD check. This is the rare case when we
> actually want while_each_thread() (although it should die anyway).
>
> for_each_thread() always starts from ->group_leader, but we need
> to start from "current" first.
Sure, this must be like below. Thanks!
I won't resend the whole series with only this one patch changed, so as
not to bomb mailboxes. Waiting for the review.
diff --git a/kernel/exit.c b/kernel/exit.c
index a268093..e4963d3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1538,8 +1538,10 @@ static long do_wait(struct wait_opts *wo)
set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
- tsk = current;
- do {
+ for_each_thread(current, tsk) {
+ if (wo->wo_flags & __WNOTHREAD)
+ tsk = current;
+
retval = do_wait_thread(wo, tsk);
if (retval)
goto end;
@@ -1550,7 +1552,7 @@ static long do_wait(struct wait_opts *wo)
if (wo->wo_flags & __WNOTHREAD)
break;
- } while_each_thread(current, tsk);
+ }
read_unlock(&tasklist_lock);
notask:
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH RFC 09/13] exit: Use for_each_thread() in do_wait()
2015-05-27 9:33 ` Kirill Tkhai
@ 2015-05-27 9:42 ` Kirill Tkhai
0 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-27 9:42 UTC (permalink / raw)
To: Oleg Nesterov, Kirill Tkhai
Cc: linux-kernel@vger.kernel.org, Andrew Morton, Ingo Molnar,
Peter Zijlstra, Michal Hocko, Rik van Riel, Ionut Alexa,
Peter Hurley
27.05.2015, 12:34, "Kirill Tkhai" <tkhai@yandex.ru>:
> 26.05.2015, 22:47, "Oleg Nesterov" <oleg@redhat.com>:
>> On 05/25, Kirill Tkhai wrote:
>>> Refactoring, no functionality change.
>> Hmm. unless I missed something this change is wrong.
>>> --- a/kernel/exit.c
>>> +++ b/kernel/exit.c
>>> @@ -1538,8 +1538,7 @@ static long do_wait(struct wait_opts *wo)
>>>
>>> set_current_state(TASK_INTERRUPTIBLE);
>>> read_lock(&tasklist_lock);
>>> - tsk = current;
>>> - do {
>>> + for_each_thread(current, tsk) {
>>> retval = do_wait_thread(wo, tsk);
>>> if (retval)
>>> goto end;
>>> @@ -1550,7 +1549,7 @@ static long do_wait(struct wait_opts *wo)
>>>
>>> if (wo->wo_flags & __WNOTHREAD)
>>> break;
>>> - } while_each_thread(current, tsk);
>>> + }
>> Please note the __WNOTHREAD check. This is the rare case when we
>> actually want while_each_thread() (although it should die anyway).
>>
>> for_each_thread() always starts from ->group_leader, but we need
>> to start from "current" first.
>
> Sure, this must be like below. Thanks!
> I won't resend the whole series with only this one patch changed to
> do not bomb mail boxes. Waiting for the review.
>
> diff --git a/kernel/exit.c b/kernel/exit.c
> index a268093..e4963d3 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1538,8 +1538,10 @@ static long do_wait(struct wait_opts *wo)
>
> set_current_state(TASK_INTERRUPTIBLE);
> read_lock(&tasklist_lock);
> - tsk = current;
> - do {
> + for_each_thread(current, tsk) {
> + if (wo->wo_flags & __WNOTHREAD)
> + tsk = current;
> +
> retval = do_wait_thread(wo, tsk);
> if (retval)
> goto end;
> @@ -1550,7 +1552,7 @@ static long do_wait(struct wait_opts *wo)
>
> if (wo->wo_flags & __WNOTHREAD)
> break;
> - } while_each_thread(current, tsk);
> + }
> read_unlock(&tasklist_lock);
>
> notask:
Hm. Once again: is the problem only with __WNOTHREAD?
Should we first reap our own children in the common case?
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH RFC 10/13] exit: Add struct wait_opts's member held_lock and use it for tasklist_lock
[not found] <20150525162722.5171.15901.stgit@pro>
` (8 preceding siblings ...)
2015-05-25 17:45 ` [PATCH RFC 09/13] exit: Use for_each_thread() in do_wait() Kirill Tkhai
@ 2015-05-25 17:45 ` Kirill Tkhai
2015-05-25 17:46 ` [PATCH RFC 11/13] exit: Synchronize on kin_lock while do_notify_parent() Kirill Tkhai
` (2 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:45 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
These will be used in next patches.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
kernel/exit.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index c84f3e4..bb9d165 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -953,6 +953,7 @@ struct wait_opts {
wait_queue_t child_wait;
int notask_error;
+ rwlock_t *held_lock;
};
static inline
@@ -1034,7 +1035,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
int why;
get_task_struct(p);
- read_unlock(&tasklist_lock);
+ read_unlock(wo->held_lock);
sched_annotate_sleep();
if ((exit_code & 0x7f) == 0) {
@@ -1056,7 +1057,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
/*
* We own this thread, nobody else can reap it.
*/
- read_unlock(&tasklist_lock);
+ read_unlock(wo->held_lock);
sched_annotate_sleep();
/*
@@ -1246,7 +1247,7 @@ static int wait_task_stopped(struct wait_opts *wo,
get_task_struct(p);
pid = task_pid_vnr(p);
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
- read_unlock(&tasklist_lock);
+ read_unlock(wo->held_lock);
sched_annotate_sleep();
if (unlikely(wo->wo_flags & WNOWAIT))
@@ -1309,7 +1310,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
pid = task_pid_vnr(p);
get_task_struct(p);
- read_unlock(&tasklist_lock);
+ read_unlock(wo->held_lock);
sched_annotate_sleep();
if (!wo->wo_info) {
@@ -1538,6 +1539,7 @@ static long do_wait(struct wait_opts *wo)
set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
+ wo->held_lock = &tasklist_lock;
for_each_thread(current, tsk) {
retval = do_wait_thread(wo, tsk);
if (retval)
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 11/13] exit: Syncronize on kin_lock while do_notify_parent()
[not found] <20150525162722.5171.15901.stgit@pro>
` (9 preceding siblings ...)
2015-05-25 17:45 ` [PATCH RFC 10/13] exit: Add struct wait_opts's member held_lock and use it for tasklist_lock Kirill Tkhai
@ 2015-05-25 17:46 ` Kirill Tkhai
2015-05-25 17:46 ` [PATCH RFC 12/13] exit: Delete write dependence on tasklist_lock in exit_notify() Kirill Tkhai
2015-05-25 17:46 ` [PATCH RFC 13/13] core: Nest tasklist_lock into task_struct::kin_lock Kirill Tkhai
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:46 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
Note: do_wait() (working on tsk) may catch its child, which is being traced
by a thread from tsk's thread group. For proper synchronization, we must
hold both parent and real_parent locks when calling do_notify_parent().
Also delete the tasklist_lock dependence in ptrace_attach() etc., because everything
is already synchronized on kin_lock (due to previous patches).
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
kernel/exit.c | 42 ++++++++++++++++++++++++++++--------------
kernel/ptrace.c | 20 ++++++--------------
kernel/signal.c | 11 ++---------
3 files changed, 36 insertions(+), 37 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index bb9d165..aeded00 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -183,6 +183,7 @@ void release_task(struct task_struct *p)
write_parent_and_real_parent_lock(p);
ptrace_release_task(p);
__exit_signal(p);
+ write_unlock(&tasklist_lock);
/*
* If we are the last non-leader member of the thread
@@ -203,8 +204,7 @@ void release_task(struct task_struct *p)
leader->exit_state = EXIT_DEAD;
}
- write_parent_and_real_parent_unlock(p);
- write_unlock_irq(&tasklist_lock);
+ write_parent_and_real_parent_unlock_irq(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
@@ -1016,7 +1016,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
/*
* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * read_lock(wo->held_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
@@ -1036,6 +1036,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
get_task_struct(p);
read_unlock(wo->held_lock);
+ rcu_read_unlock();
sched_annotate_sleep();
if ((exit_code & 0x7f) == 0) {
@@ -1058,6 +1059,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* We own this thread, nobody else can reap it.
*/
read_unlock(wo->held_lock);
+ rcu_read_unlock();
sched_annotate_sleep();
/*
@@ -1152,16 +1154,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
retval = pid;
if (state == EXIT_TRACE) {
- write_lock_irq(&tasklist_lock);
+ struct task_struct *old_parent;
+ write_parent_and_real_parent_lock_irq(p);
+ old_parent = p->parent;
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
+ if (p->parent != old_parent)
+ write_unlock(&old_parent->kin_lock);
+
/* If parent wants a zombie, don't release it now */
state = EXIT_ZOMBIE;
if (do_notify_parent(p, p->exit_signal))
state = EXIT_DEAD;
p->exit_state = state;
- write_unlock_irq(&tasklist_lock);
+ write_parent_unlock_irq(p);
}
if (state == EXIT_DEAD)
release_task(p);
@@ -1191,13 +1198,13 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
*
* CONTEXT:
- * read_lock(&tasklist_lock), which is released if return value is
+ * read_lock(wo->held_lock), which is released if return value is
* non-zero. Also, grabs and releases @p->sighand->siglock.
*
* RETURNS:
* 0 if wait condition didn't exist and search for other wait conditions
* should continue. Non-zero return, -errno on failure and @p's pid on
- * success, implies that tasklist_lock is released and wait condition
+ * success, implies that wo->held_lock is released and wait condition
* search should terminate.
*/
static int wait_task_stopped(struct wait_opts *wo,
@@ -1241,13 +1248,14 @@ static int wait_task_stopped(struct wait_opts *wo,
* Now we are pretty sure this task is interesting.
* Make sure it doesn't get reaped out from under us while we
* give up the lock and then examine it below. We don't want to
- * keep holding onto the tasklist_lock while we call getrusage and
+ * keep holding onto wo->held_lock while we call getrusage and
* possibly take page faults for user memory.
*/
get_task_struct(p);
pid = task_pid_vnr(p);
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(wo->held_lock);
+ rcu_read_unlock();
sched_annotate_sleep();
if (unlikely(wo->wo_flags & WNOWAIT))
@@ -1281,7 +1289,7 @@ static int wait_task_stopped(struct wait_opts *wo,
/*
* Handle do_wait work for one task in a live, non-stopped state.
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * read_lock(wo->held_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
@@ -1311,6 +1319,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
pid = task_pid_vnr(p);
get_task_struct(p);
read_unlock(wo->held_lock);
+ rcu_read_unlock();
sched_annotate_sleep();
if (!wo->wo_info) {
@@ -1334,7 +1343,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
* Consider @p for a wait by @parent.
*
* -ECHILD should be in ->notask_error before the first call.
- * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns nonzero for a final return, when we have unlocked wo->held_lock.
* Returns zero if the search for a child should continue;
* then ->notask_error is 0 if @p is an eligible child,
* or another error from security_task_wait(), or still -ECHILD.
@@ -1392,6 +1401,9 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* If a ptracer wants to distinguish these two events for its
* own children it should create a separate process which takes
* the role of real parent.
+ *
+ * Since we use call do_notify_parent() under both parent's and
+ * real_parent's kin_locks, we are protected from it.
*/
if (!ptrace_reparented(p))
ptrace = 1;
@@ -1460,7 +1472,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Do the work of do_wait() for one thread in the group, @tsk.
*
* -ECHILD should be in ->notask_error before the first call.
- * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns nonzero for a final return, when we have unlocked wo->held_lock.
* Returns zero if the search for a child should continue; then
* ->notask_error is 0 if there were any eligible children,
* or another error from security_task_wait(), or still -ECHILD.
@@ -1538,9 +1550,10 @@ static long do_wait(struct wait_opts *wo)
goto notask;
set_current_state(TASK_INTERRUPTIBLE);
- read_lock(&tasklist_lock);
- wo->held_lock = &tasklist_lock;
+ rcu_read_lock();
for_each_thread(current, tsk) {
+ read_lock(&tsk->kin_lock);
+ wo->held_lock = &tsk->kin_lock;
retval = do_wait_thread(wo, tsk);
if (retval)
goto end;
@@ -1548,11 +1561,12 @@ static long do_wait(struct wait_opts *wo)
retval = ptrace_do_wait(wo, tsk);
if (retval)
goto end;
+ read_unlock(&tsk->kin_lock);
if (wo->wo_flags & __WNOTHREAD)
break;
}
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
notask:
retval = wo->notask_error;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 817288d..6785f66 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -180,7 +180,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
* we are sure that this is our traced child and that can only
* be changed by us so it's not changing right after this.
*/
- read_lock(&tasklist_lock);
read_parent_lock(child);
if (child->ptrace && child->parent == current) {
WARN_ON(child->state == __TASK_TRACED);
@@ -192,7 +191,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
ret = 0;
}
read_parent_unlock(child);
- read_unlock(&tasklist_lock);
if (!ret && !ignore_state) {
if (!wait_task_inactive(child, __TASK_TRACED)) {
@@ -314,8 +312,7 @@ static int ptrace_attach(struct task_struct *task, long request,
if (retval)
goto unlock_creds;
- write_lock_irq(&tasklist_lock);
- write_parent_and_given_lock(task, current);
+ write_parent_and_given_lock_irq(task, current);
old_parent = task->parent;
retval = -EPERM;
if (unlikely(task->exit_state) || task->ptrace) {
@@ -365,8 +362,7 @@ static int ptrace_attach(struct task_struct *task, long request,
retval = 0;
unlock_current:
- write_unlock(&current->kin_lock);
- write_unlock_irq(&tasklist_lock);
+ write_unlock_irq(&current->kin_lock);
unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
@@ -389,8 +385,7 @@ static int ptrace_traceme(void)
{
int ret = -EPERM;
- write_lock_irq(&tasklist_lock);
- write_parent_lock(current);
+ write_parent_lock_irq(current);
/* Are we already being traced? */
if (!current->ptrace) {
ret = security_ptrace_traceme(current->parent);
@@ -405,8 +400,7 @@ static int ptrace_traceme(void)
__ptrace_link(current, current->real_parent);
}
}
- write_parent_unlock(current);
- write_unlock_irq(&tasklist_lock);
+ write_parent_unlock_irq(current);
return ret;
}
@@ -474,8 +468,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
ptrace_disable(child);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
- write_lock_irq(&tasklist_lock);
- write_parent_and_real_parent_lock(child);
+ write_parent_and_real_parent_lock_irq(child);
old_parent = child->parent;
/*
* We rely on ptrace_freeze_traced(). It can't be killed and
@@ -490,8 +483,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
__ptrace_detach(current, child);
if (old_parent != child->real_parent)
write_unlock(&old_parent->kin_lock);
- write_real_parent_unlock(child);
- write_unlock_irq(&tasklist_lock);
+ write_real_parent_unlock_irq(child);
proc_ptrace_connector(child, PTRACE_DETACH);
diff --git a/kernel/signal.c b/kernel/signal.c
index 8288019..2e7b622 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1732,6 +1732,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
struct sighand_struct *sighand;
cputime_t utime, stime;
+ BUG_ON(!same_thread_group(tsk, current));
+
if (for_ptracer) {
parent = tsk->parent;
} else {
@@ -1881,7 +1883,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
task_clear_jobctl_trapping(current);
 spin_unlock_irq(&current->sighand->siglock);
- read_lock(&tasklist_lock);
read_parent_and_real_parent_lock(current);
if (may_ptrace_stop()) {
/*
@@ -1906,7 +1907,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
*/
preempt_disable();
read_parent_and_real_parent_unlock(current);
- read_unlock(&tasklist_lock);
preempt_enable_no_resched();
freezable_schedule();
} else {
@@ -1928,7 +1928,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
if (clear_code)
current->exit_code = 0;
read_parent_and_real_parent_unlock(current);
- read_unlock(&tasklist_lock);
}
/*
@@ -2081,11 +2080,9 @@ static bool do_signal_stop(int signr)
* TASK_TRACED.
*/
if (notify) {
- read_lock(&tasklist_lock);
read_parent_and_real_parent_lock(current);
do_notify_parent_cldstop(current, false, notify);
read_parent_and_real_parent_unlock(current);
- read_unlock(&tasklist_lock);
}
/* Now we don't run again until woken by SIGCONT or SIGKILL */
@@ -2229,7 +2226,6 @@ int get_signal(struct ksignal *ksig)
* the ptracer of the group leader too unless it's gonna be
* a duplicate.
*/
- read_lock(&tasklist_lock);
read_parent_and_real_parent_lock(current->group_leader);
do_notify_parent_cldstop(current, false, why);
@@ -2237,7 +2233,6 @@ int get_signal(struct ksignal *ksig)
do_notify_parent_cldstop(current->group_leader,
true, why);
read_parent_and_real_parent_unlock(current->group_leader);
- read_unlock(&tasklist_lock);
goto relock;
}
@@ -2482,11 +2477,9 @@ void exit_signals(struct task_struct *tsk)
* should always go to the real parent of the group leader.
*/
if (unlikely(group_stop)) {
- read_lock(&tasklist_lock);
read_parent_and_real_parent_lock(tsk);
do_notify_parent_cldstop(tsk, false, group_stop);
read_parent_and_real_parent_unlock(tsk);
- read_unlock(&tasklist_lock);
}
}
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 12/13] exit: Delete write dependence on tasklist_lock in exit_notify()
[not found] <20150525162722.5171.15901.stgit@pro>
` (10 preceding siblings ...)
2015-05-25 17:46 ` [PATCH RFC 11/13] exit: Synchronize on kin_lock while do_notify_parent() Kirill Tkhai
@ 2015-05-25 17:46 ` Kirill Tkhai
2015-05-25 17:46 ` [PATCH RFC 13/13] core: Nest tasklist_lock into task_struct::kin_lock Kirill Tkhai
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:46 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
Relatives and parent notifications are already protected by the appropriate
kin_lock, so we no longer need to take tasklist_lock for writing.
However, we still need it read-locked to protect PGRP and PSID.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
kernel/exit.c | 19 +++++++------------
kernel/ptrace.c | 6 +++---
2 files changed, 10 insertions(+), 15 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index aeded00..99d7aa3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -460,8 +460,6 @@ static struct task_struct *find_alive_thread(struct task_struct *p)
}
static void check_pid_ns_reaper_exit(struct task_struct *father)
- __releases(&tasklist_lock)
- __acquires(&tasklist_lock)
{
struct pid_namespace *pid_ns = task_active_pid_ns(father);
struct task_struct *reaper;
@@ -473,7 +471,7 @@ static void check_pid_ns_reaper_exit(struct task_struct *father)
if (likely(reaper != father))
return;
- write_lock(&pid_ns->cr_lock);
+ write_lock_irq(&pid_ns->cr_lock);
read_real_parent_lock(father);
reaper = find_alive_thread(father);
@@ -481,18 +479,16 @@ static void check_pid_ns_reaper_exit(struct task_struct *father)
pid_ns->child_reaper = reaper;
read_real_parent_unlock(father);
- write_unlock(&pid_ns->cr_lock);
+ write_unlock_irq(&pid_ns->cr_lock);
if (reaper)
return;
- write_unlock_irq(&tasklist_lock);
if (unlikely(pid_ns == &init_pid_ns)) {
panic("Attempted to kill init! exitcode=0x%08x\n",
father->signal->group_exit_code ?: father->exit_code);
}
zap_pid_ns_processes(pid_ns);
- write_lock_irq(&tasklist_lock);
}
/*
@@ -598,7 +594,6 @@ static void forget_original_parent(struct task_struct *father,
if (unlikely(!list_empty(&father->ptraced)))
exit_ptrace(father, dead);
- /* Can drop and reacquire tasklist_lock */
check_pid_ns_reaper_exit(father);
/* read_lock() guarantees concurrent thread sees our PF_EXITING */
@@ -609,7 +604,7 @@ static void forget_original_parent(struct task_struct *father,
}
read_unlock(&father->kin_lock);
- read_lock(&task_active_pid_ns(father)->cr_lock);
+ read_lock_irq(&task_active_pid_ns(father)->cr_lock);
reaper = find_double_lock_new_reaper(father);
list_for_each_entry(p, &father->children, sibling) {
@@ -632,7 +627,7 @@ static void forget_original_parent(struct task_struct *father,
list_splice_tail_init(&father->children, &reaper->children);
double_write_unlock(&reaper->kin_lock, &father->kin_lock);
- read_unlock(&task_active_pid_ns(father)->cr_lock);
+ read_unlock_irq(&task_active_pid_ns(father)->cr_lock);
}
/*
@@ -645,12 +640,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
struct task_struct *p, *n;
LIST_HEAD(dead);
- write_lock_irq(&tasklist_lock);
forget_original_parent(tsk, &dead);
+ read_lock_irq(&tasklist_lock);
write_parent_and_real_parent_lock(tsk);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
+ read_unlock(&tasklist_lock);
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
@@ -672,8 +668,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
- write_parent_and_real_parent_unlock(tsk);
- write_unlock_irq(&tasklist_lock);
+ write_parent_and_real_parent_unlock_irq(tsk);
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6785f66..d6e2d0b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -505,9 +505,9 @@ void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
if (!p)
break;
- write_parent_and_real_parent_lock(p);
+ write_parent_and_real_parent_lock_irq(p);
if (p->parent != tracer) {
- write_parent_and_real_parent_unlock(p);
+ write_parent_and_real_parent_unlock_irq(p);
continue;
}
@@ -517,7 +517,7 @@ void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
if (__ptrace_detach(tracer, p))
list_add(&p->ptrace_entry, dead);
- write_parent_and_real_parent_unlock(p);
+ write_parent_and_real_parent_unlock_irq(p);
}
rcu_read_unlock();
}
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH RFC 13/13] core: Nest tasklist_lock into task_struct::kin_lock
[not found] <20150525162722.5171.15901.stgit@pro>
` (11 preceding siblings ...)
2015-05-25 17:46 ` [PATCH RFC 12/13] exit: Delete write dependence on tasklist_lock in exit_notify() Kirill Tkhai
@ 2015-05-25 17:46 ` Kirill Tkhai
12 siblings, 0 replies; 16+ messages in thread
From: Kirill Tkhai @ 2015-05-25 17:46 UTC (permalink / raw)
To: linux-kernel
Cc: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley,
Kirill Tkhai
tasklist_lock is a global lock, and it should be held for as short a time
as possible. So let's nest it under kin_lock to minimize this
time.
Signed-off-by: Kirill Tkhai <ktkhai@odin.com>
---
fs/exec.c | 20 ++++++++++----------
kernel/exit.c | 13 +++++++------
kernel/fork.c | 15 +++++++--------
kernel/sys.c | 17 ++++++++---------
4 files changed, 32 insertions(+), 33 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 4de7770..0b44752 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -931,8 +931,7 @@ static int de_thread(struct task_struct *tsk)
for (;;) {
threadgroup_change_begin(tsk);
- write_lock_irq(&tasklist_lock);
- write_real_parent_lock(tsk);
+ write_real_parent_lock_irq(tsk);
/*
* Do this under real_parent's kin_lock to ensure
* that exit_notify() can't miss ->group_exit_task.
@@ -941,8 +940,7 @@ static int de_thread(struct task_struct *tsk)
if (likely(leader->exit_state))
break;
__set_current_state(TASK_KILLABLE);
- write_real_parent_unlock(tsk);
- write_unlock_irq(&tasklist_lock);
+ write_real_parent_unlock_irq(tsk);
threadgroup_change_end(tsk);
schedule();
if (unlikely(__fatal_signal_pending(tsk)))
@@ -964,6 +962,8 @@ static int de_thread(struct task_struct *tsk)
BUG_ON(!same_thread_group(leader, tsk));
BUG_ON(has_group_leader_pid(tsk));
+
+ write_lock(&tasklist_lock);
/*
* An exec() starts a new thread group with the
* TGID of the previous thread group. Rehash the
@@ -982,6 +982,7 @@ static int de_thread(struct task_struct *tsk)
transfer_pid(leader, tsk, PIDTYPE_SID);
list_replace_rcu(&leader->tasks, &tsk->tasks);
+ write_unlock(&tasklist_lock);
list_replace_init(&leader->sibling, &tsk->sibling);
tsk->group_leader = tsk;
@@ -1003,8 +1004,7 @@ static int de_thread(struct task_struct *tsk)
*/
if (unlikely(leader->ptrace))
__wake_up_parent(leader, leader->parent);
- write_real_parent_unlock(tsk);
- write_unlock_irq(&tasklist_lock);
+ write_real_parent_unlock_irq(tsk);
threadgroup_change_end(tsk);
release_task(leader);
@@ -1034,13 +1034,13 @@ static int de_thread(struct task_struct *tsk)
memcpy(newsighand->action, oldsighand->action,
sizeof(newsighand->action));
- write_lock_irq(&tasklist_lock);
- write_parent_and_real_parent_lock(tsk);
+ write_parent_and_real_parent_lock_irq(tsk);
+ write_lock(&tasklist_lock);
spin_lock(&oldsighand->siglock);
rcu_assign_pointer(tsk->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
- write_parent_and_real_parent_unlock(tsk);
- write_unlock_irq(&tasklist_lock);
+ write_unlock(&tasklist_lock);
+ write_parent_and_real_parent_unlock_irq(tsk);
__cleanup_sighand(oldsighand);
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 99d7aa3..44641aa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -179,9 +179,9 @@ void release_task(struct task_struct *p)
proc_flush_task(p);
- write_lock_irq(&tasklist_lock);
- write_parent_and_real_parent_lock(p);
+ write_parent_and_real_parent_lock_irq(p);
ptrace_release_task(p);
+ write_lock(&tasklist_lock);
__exit_signal(p);
write_unlock(&tasklist_lock);
@@ -642,11 +642,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
forget_original_parent(tsk, &dead);
- read_lock_irq(&tasklist_lock);
- write_parent_and_real_parent_lock(tsk);
- if (group_dead)
+ write_parent_and_real_parent_lock_irq(tsk);
+ if (group_dead) {
+ read_lock(&tasklist_lock);
kill_orphaned_pgrp(tsk->group_leader, NULL);
- read_unlock(&tasklist_lock);
+ read_unlock(&tasklist_lock);
+ }
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
diff --git a/kernel/fork.c b/kernel/fork.c
index 63e225b..b7cb176 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1520,19 +1520,18 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
- write_lock_irq(&tasklist_lock);
-
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
- write_real_parent_lock(current);
+ write_real_parent_lock_irq(current);
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
} else {
 - write_lock(&current->kin_lock);
 + write_lock_irq(&current->kin_lock);
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
}
+ write_lock(&tasklist_lock);
 spin_lock(&current->sighand->siglock);
/*
@@ -1552,8 +1551,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
recalc_sigpending();
if (signal_pending(current)) {
 spin_unlock(&current->sighand->siglock);
- write_real_parent_unlock(p);
- write_unlock_irq(&tasklist_lock);
+ write_unlock(&tasklist_lock);
+ write_real_parent_unlock_irq(p);
retval = -ERESTARTNOINTR;
goto bad_fork_free_pid;
}
@@ -1594,9 +1593,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
 spin_unlock(&current->sighand->siglock);
- write_real_parent_unlock(p);
+ write_unlock(&tasklist_lock);
+ write_real_parent_unlock_irq(p);
syscall_tracepoint_update(p);
- write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 783aec4..013be3e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -931,17 +931,16 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
return -EINVAL;
rcu_read_lock();
- /* From this point forward we keep holding onto the tasklist lock
- * so that our parent does not change from under us. -DaveM
- */
- write_lock_irq(&tasklist_lock);
-
err = -ESRCH;
p = find_task_by_vpid(pid);
if (!p)
goto out2;
- write_real_parent_lock(p);
+ write_real_parent_lock_irq(p);
+ if (p->flags & PF_EXITPIDONE)
+ goto out3;
+
+ write_lock(&tasklist_lock);
err = -EINVAL;
if (!thread_group_leader(p))
@@ -983,10 +982,10 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
err = 0;
out:
- /* All (almost) paths lead to here, thus we are safe. -DaveM */
- write_real_parent_unlock(p);
+ write_unlock(&tasklist_lock);
+out3:
+ write_real_parent_unlock_irq(p);
out2:
- write_unlock_irq(&tasklist_lock);
rcu_read_unlock();
return err;
}
^ permalink raw reply related [flat|nested] 16+ messages in thread