[PATCH] cgroup: serialize css kill and release paths

BPF List
 help / color / mirror / Atom feed

* [PATCH] cgroup: serialize css kill and release paths
@ 2022-06-03 17:34 Tadeusz Struk
  2022-06-03 18:06 ` Tadeusz Struk
  2022-06-03 18:13 ` [PATCH v2] " Tadeusz Struk
  0 siblings, 2 replies; 6+ messages in thread
From: Tadeusz Struk @ 2022-06-03 17:34 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Tadeusz Struk, Michal Koutny, Zefan Li, Johannes Weiner,
	Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, cgroups, netdev, bpf, stable,
	linux-kernel, syzbot+e42ae441c3b10acf9e9d

Syzbot found a corrupted list bug scenario that can be triggered from
cgroup_subtree_control_write(cgrp). The reproduces writes to
cgroup.subtree_control file, which invokes:
cgroup_apply_control_enable()->css_create()->css_populate_dir(), which
then fails with a fault injected -ENOMEM.
In such scenario the css_killed_work_fn will be en-queued via
cgroup_apply_control_disable(cgrp)->kill_css(css), and bail out to
cgroup_kn_unlock(). Then cgroup_kn_unlock() will call:
cgroup_put(cgrp)->css_put(&cgrp->self), which will try to enqueue
css_release_work_fn for the same css instance, causing a list_add
corruption bug, as can be seen in the syzkaller report [1].

Fix this by synchronizing the css ref_kill and css_release jobs.
css_release() function will check if the css_killed_work_fn() has been
scheduled for the css and only en-queue the css_release_work_fn()
if css_killed_work_fn wasn't already en-queued. Otherwise css_release() will
set the CSS_REL_LATER flag for that css. This will cause the css_release_work_fn()
work to be executed after css_killed_work_fn() is finished.

Two scc flags have been introduced to implement this serialization mechanizm:

 * CSS_KILL_ENQED, which will be set when css_killed_work_fn() is en-queued, and
 * CSS_REL_LATER, which, if set, will cause the css_release_work_fn() to be
   scheduled after the css_killed_work_fn is finished.

There is also a new lock, which will protect the integrity of the css flags.

[1] https://syzkaller.appspot.com/bug?id=e26e54d6eac9d9fb50b221ec3e4627b327465dbd

Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Koutny <mkoutny@suse.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: <cgroups@vger.kernel.org>
Cc: <netdev@vger.kernel.org>
Cc: <bpf@vger.kernel.org>
Cc: <stable@vger.kernel.org>
Cc: <linux-kernel@vger.kernel.org>

Reported-and-tested-by: syzbot+e42ae441c3b10acf9e9d@syzkaller.appspotmail.com
Fixes: 8f36aaec9c92 ("cgroup: Use rcu_work instead of explicit rcu and work item")
Signed-off-by: Tadeusz Struk <tadeusz.struk@linaro.org>
---
 include/linux/cgroup-defs.h |  4 ++++
 kernel/cgroup/cgroup.c      | 35 ++++++++++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1bfcfb1af352..8dc8b4edb242 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -53,6 +53,8 @@ enum {
 	CSS_RELEASED	= (1 << 2), /* refcnt reached zero, released */
 	CSS_VISIBLE	= (1 << 3), /* css is visible to userland */
 	CSS_DYING	= (1 << 4), /* css is dying */
+	CSS_KILL_ENQED	= (1 << 5), /* kill work enqueued for the css */
+	CSS_REL_LATER	= (1 << 6), /* release needs to be done after kill */
 };
 
 /* bits in struct cgroup flags field */
@@ -162,6 +164,8 @@ struct cgroup_subsys_state {
 	 */
 	int id;
 
+	/* lock to protect flags */
+	spinlock_t lock;
 	unsigned int flags;
 
 	/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1779ccddb734..a0ceead4b390 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5210,8 +5210,23 @@ static void css_release(struct percpu_ref *ref)
 	struct cgroup_subsys_state *css =
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
-	INIT_WORK(&css->destroy_work, css_release_work_fn);
-	queue_work(cgroup_destroy_wq, &css->destroy_work);
+	spin_lock_bh(&css->lock);
+
+	/*
+	 * Check if the css_killed_work_fn work has been scheduled for this
+	 * css and enqueue css_release_work_fn only if it wasn't.
+	 * Otherwise set the CSS_REL_LATER flag, which will cause
+	 * release to be enqueued after css_killed_work_fn is finished.
+	 * This is to prevent list corruption by en-queuing two instance
+	 * of the same work struct on the same WQ, namely cgroup_destroy_wq.
+	 */
+	if (!(css->flags & CSS_KILL_ENQED)) {
+		INIT_WORK(&css->destroy_work, css_release_work_fn);
+		queue_work(cgroup_destroy_wq, &css->destroy_work);
+	} else {
+		css->flags |= CSS_REL_LATER;
+	}
+	spin_unlock_bh(&css->lock);
 }
 
 static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5230,6 +5245,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	INIT_LIST_HEAD(&css->rstat_css_node);
 	css->serial_nr = css_serial_nr_next++;
 	atomic_set(&css->online_cnt, 0);
+	spin_lock_init(&css->lock);
 
 	if (cgroup_parent(cgrp)) {
 		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -5545,10 +5561,12 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
  */
 static void css_killed_work_fn(struct work_struct *work)
 {
-	struct cgroup_subsys_state *css =
+	struct cgroup_subsys_state *css_killed, *css =
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 
 	mutex_lock(&cgroup_mutex);
+	css_killed = css;
+	css_killed->flags &= ~CSS_KILL_ENQED;
 
 	do {
 		offline_css(css);
@@ -5557,6 +5575,14 @@ static void css_killed_work_fn(struct work_struct *work)
 		css = css->parent;
 	} while (css && atomic_dec_and_test(&css->online_cnt));
 
+	spin_lock_bh(&css->lock);
+	if (css_killed->flags & CSS_REL_LATER) {
+		/* If css_release work was delayed for the css enqueue it now. */
+		INIT_WORK(&css_killed->destroy_work, css_release_work_fn);
+		queue_work(cgroup_destroy_wq, &css_killed->destroy_work);
+		css_killed->flags &= ~CSS_REL_LATER;
+	}
+	spin_unlock_bh(&css->lock);
 	mutex_unlock(&cgroup_mutex);
 }
 
@@ -5566,10 +5592,13 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 	struct cgroup_subsys_state *css =
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
+	spin_lock_bh(&css->lock);
 	if (atomic_dec_and_test(&css->online_cnt)) {
+		css->flags |= CSS_KILL_ENQED;
 		INIT_WORK(&css->destroy_work, css_killed_work_fn);
 		queue_work(cgroup_destroy_wq, &css->destroy_work);
 	}
+	spin_unlock_bh(&css->lock);
 }
 
 /**
-- 
2.36.1

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] cgroup: serialize css kill and release paths
  2022-06-03 17:34 [PATCH] cgroup: serialize css kill and release paths Tadeusz Struk
@ 2022-06-03 18:06 ` Tadeusz Struk
  2022-06-03 18:13 ` [PATCH v2] " Tadeusz Struk
  1 sibling, 0 replies; 6+ messages in thread
From: Tadeusz Struk @ 2022-06-03 18:06 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Koutny, Zefan Li, Johannes Weiner, Christian Brauner,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, cgroups, netdev, bpf, stable, linux-kernel,
	syzbot+e42ae441c3b10acf9e9d

On 6/3/22 10:34, Tadeusz Struk wrote:
> Syzbot found a corrupted list bug scenario that can be triggered from
> cgroup_subtree_control_write(cgrp). The reproduces writes to
> cgroup.subtree_control file, which invokes:
> cgroup_apply_control_enable()->css_create()->css_populate_dir(), which
> then fails with a fault injected -ENOMEM.
> In such scenario the css_killed_work_fn will be en-queued via
> cgroup_apply_control_disable(cgrp)->kill_css(css), and bail out to
> cgroup_kn_unlock(). Then cgroup_kn_unlock() will call:
> cgroup_put(cgrp)->css_put(&cgrp->self), which will try to enqueue
> css_release_work_fn for the same css instance, causing a list_add
> corruption bug, as can be seen in the syzkaller report [1].
> 
> Fix this by synchronizing the css ref_kill and css_release jobs.
> css_release() function will check if the css_killed_work_fn() has been
> scheduled for the css and only en-queue the css_release_work_fn()
> if css_killed_work_fn wasn't already en-queued. Otherwise css_release() will
> set the CSS_REL_LATER flag for that css. This will cause the css_release_work_fn()
> work to be executed after css_killed_work_fn() is finished.
> 
> Two scc flags have been introduced to implement this serialization mechanizm:
> 
>   * CSS_KILL_ENQED, which will be set when css_killed_work_fn() is en-queued, and
>   * CSS_REL_LATER, which, if set, will cause the css_release_work_fn() to be
>     scheduled after the css_killed_work_fn is finished.
> 
> There is also a new lock, which will protect the integrity of the css flags.
> 
> [1]https://syzkaller.appspot.com/bug?id=e26e54d6eac9d9fb50b221ec3e4627b327465dbd
> 
> Cc: Tejun Heo<tj@kernel.org>
> Cc: Michal Koutny<mkoutny@suse.com>
> Cc: Zefan Li<lizefan.x@bytedance.com>
> Cc: Johannes Weiner<hannes@cmpxchg.org>
> Cc: Christian Brauner<brauner@kernel.org>
> Cc: Alexei Starovoitov<ast@kernel.org>
> Cc: Daniel Borkmann<daniel@iogearbox.net>
> Cc: Andrii Nakryiko<andrii@kernel.org>
> Cc: Martin KaFai Lau<kafai@fb.com>
> Cc: Song Liu<songliubraving@fb.com>
> Cc: Yonghong Song<yhs@fb.com>
> Cc: John Fastabend<john.fastabend@gmail.com>
> Cc: KP Singh<kpsingh@kernel.org>
> Cc:<cgroups@vger.kernel.org>
> Cc:<netdev@vger.kernel.org>
> Cc:<bpf@vger.kernel.org>
> Cc:<stable@vger.kernel.org>
> Cc:<linux-kernel@vger.kernel.org>
> 
> Reported-and-tested-by:syzbot+e42ae441c3b10acf9e9d@syzkaller.appspotmail.com
> Fixes: 8f36aaec9c92 ("cgroup: Use rcu_work instead of explicit rcu and work item")
> Signed-off-by: Tadeusz Struk<tadeusz.struk@linaro.org>

I just spotted an issue with this. I'm holding invalid lock in css_killed_work_fn().
I will follow up with a v2 of the patch soon.

-- 
Thanks,
Tadeusz

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v2] cgroup: serialize css kill and release paths
  2022-06-03 17:34 [PATCH] cgroup: serialize css kill and release paths Tadeusz Struk
  2022-06-03 18:06 ` Tadeusz Struk
@ 2022-06-03 18:13 ` Tadeusz Struk
  2022-06-03 18:17   ` Tadeusz Struk
  2022-06-06 12:39   ` Michal Koutný
  1 sibling, 2 replies; 6+ messages in thread
From: Tadeusz Struk @ 2022-06-03 18:13 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Tadeusz Struk, Michal Koutny, Zefan Li, Johannes Weiner,
	Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, cgroups, netdev, bpf, stable,
	linux-kernel, syzbot+e42ae441c3b10acf9e9d

Syzbot found a corrupted list bug scenario that can be triggered from
cgroup_subtree_control_write(cgrp). The reproduces writes to
cgroup.subtree_control file, which invokes:
cgroup_apply_control_enable()->css_create()->css_populate_dir(), which
then fails with a fault injected -ENOMEM.
In such scenario the css_killed_work_fn will be en-queued via
cgroup_apply_control_disable(cgrp)->kill_css(css), and bail out to
cgroup_kn_unlock(). Then cgroup_kn_unlock() will call:
cgroup_put(cgrp)->css_put(&cgrp->self), which will try to enqueue
css_release_work_fn for the same css instance, causing a list_add
corruption bug, as can be seen in the syzkaller report [1].

Fix this by synchronizing the css ref_kill and css_release jobs.
css_release() function will check if the css_killed_work_fn() has been
scheduled for the css and only en-queue the css_release_work_fn()
if css_killed_work_fn wasn't already en-queued. Otherwise css_release() will
set the CSS_REL_LATER flag for that css. This will cause the
css_release_work_fn() work to be executed after css_killed_work_fn() is finished.

Two scc flags have been introduced to implement this serialization mechanizm:

 * CSS_KILL_ENQED, which will be set when css_killed_work_fn() is en-queued, and
 * CSS_REL_LATER, which, if set, will cause the css_release_work_fn() to be
   scheduled after the css_killed_work_fn is finished.

There is also a new lock, which will protect the integrity of the css flags.

[1] https://syzkaller.appspot.com/bug?id=e26e54d6eac9d9fb50b221ec3e4627b327465dbd

Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Koutny<mkoutny@suse.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: <cgroups@vger.kernel.org>
Cc: <netdev@vger.kernel.org>
Cc: <bpf@vger.kernel.org>
Cc: <stable@vger.kernel.org>
Cc: <linux-kernel@vger.kernel.org>

Reported-and-tested-by: syzbot+e42ae441c3b10acf9e9d@syzkaller.appspotmail.com
Fixes: 8f36aaec9c92 ("cgroup: Use rcu_work instead of explicit rcu and work item")
Signed-off-by: Tadeusz Struk <tadeusz.struk@linaro.org>
---
v2: Use correct lock in css_killed_work_fn()
---
 include/linux/cgroup-defs.h |  4 ++++
 kernel/cgroup/cgroup.c      | 35 ++++++++++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1bfcfb1af352..8dc8b4edb242 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -53,6 +53,8 @@ enum {
 	CSS_RELEASED	= (1 << 2), /* refcnt reached zero, released */
 	CSS_VISIBLE	= (1 << 3), /* css is visible to userland */
 	CSS_DYING	= (1 << 4), /* css is dying */
+	CSS_KILL_ENQED	= (1 << 5), /* kill work enqueued for the css */
+	CSS_REL_LATER	= (1 << 6), /* release needs to be done after kill */
 };
 
 /* bits in struct cgroup flags field */
@@ -162,6 +164,8 @@ struct cgroup_subsys_state {
 	 */
 	int id;
 
+	/* lock to protect flags */
+	spinlock_t lock;
 	unsigned int flags;
 
 	/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1779ccddb734..b1bbd438d426 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5210,8 +5210,23 @@ static void css_release(struct percpu_ref *ref)
 	struct cgroup_subsys_state *css =
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
-	INIT_WORK(&css->destroy_work, css_release_work_fn);
-	queue_work(cgroup_destroy_wq, &css->destroy_work);
+	spin_lock_bh(&css->lock);
+
+	/*
+	 * Check if the css_killed_work_fn work has been scheduled for this
+	 * css and enqueue css_release_work_fn only if it wasn't.
+	 * Otherwise set the CSS_REL_LATER flag, which will cause
+	 * release to be enqueued after css_killed_work_fn is finished.
+	 * This is to prevent list corruption by en-queuing two instance
+	 * of the same work struct on the same WQ, namely cgroup_destroy_wq.
+	 */
+	if (!(css->flags & CSS_KILL_ENQED)) {
+		INIT_WORK(&css->destroy_work, css_release_work_fn);
+		queue_work(cgroup_destroy_wq, &css->destroy_work);
+	} else {
+		css->flags |= CSS_REL_LATER;
+	}
+	spin_unlock_bh(&css->lock);
 }
 
 static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5230,6 +5245,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	INIT_LIST_HEAD(&css->rstat_css_node);
 	css->serial_nr = css_serial_nr_next++;
 	atomic_set(&css->online_cnt, 0);
+	spin_lock_init(&css->lock);
 
 	if (cgroup_parent(cgrp)) {
 		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -5545,10 +5561,12 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
  */
 static void css_killed_work_fn(struct work_struct *work)
 {
-	struct cgroup_subsys_state *css =
+	struct cgroup_subsys_state *css_killed, *css =
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 
 	mutex_lock(&cgroup_mutex);
+	css_killed = css;
+	css_killed->flags &= ~CSS_KILL_ENQED;
 
 	do {
 		offline_css(css);
@@ -5557,6 +5575,14 @@ static void css_killed_work_fn(struct work_struct *work)
 		css = css->parent;
 	} while (css && atomic_dec_and_test(&css->online_cnt));
 
+	spin_lock_bh(&css_killed->lock);
+	if (css_killed->flags & CSS_REL_LATER) {
+		/* If css_release work was delayed for the css enqueue it now. */
+		INIT_WORK(&css_killed->destroy_work, css_release_work_fn);
+		queue_work(cgroup_destroy_wq, &css_killed->destroy_work);
+		css_killed->flags &= ~CSS_REL_LATER;
+	}
+	spin_unlock_bh(&css_killed->lock);
 	mutex_unlock(&cgroup_mutex);
 }
 
@@ -5566,10 +5592,13 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 	struct cgroup_subsys_state *css =
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
+	spin_lock_bh(&css->lock);
 	if (atomic_dec_and_test(&css->online_cnt)) {
+		css->flags |= CSS_KILL_ENQED;
 		INIT_WORK(&css->destroy_work, css_killed_work_fn);
 		queue_work(cgroup_destroy_wq, &css->destroy_work);
 	}
+	spin_unlock_bh(&css->lock);
 }
 
 /**
-- 
2.36.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] cgroup: serialize css kill and release paths
  2022-06-03 18:13 ` [PATCH v2] " Tadeusz Struk
@ 2022-06-03 18:17   ` Tadeusz Struk
  2022-06-06 12:39   ` Michal Koutný
  1 sibling, 0 replies; 6+ messages in thread
From: Tadeusz Struk @ 2022-06-03 18:17 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michal Koutny, Zefan Li, Johannes Weiner, Christian Brauner,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, cgroups, netdev, bpf, stable, linux-kernel,
	syzbot+e42ae441c3b10acf9e9d

On 6/3/22 11:13, Tadeusz Struk wrote:
> Syzbot found a corrupted list bug scenario that can be triggered from
> cgroup_subtree_control_write(cgrp). The reproduces writes to
> cgroup.subtree_control file, which invokes:
> cgroup_apply_control_enable()->css_create()->css_populate_dir(), which
> then fails with a fault injected -ENOMEM.
> In such scenario the css_killed_work_fn will be en-queued via
> cgroup_apply_control_disable(cgrp)->kill_css(css), and bail out to
> cgroup_kn_unlock(). Then cgroup_kn_unlock() will call:
> cgroup_put(cgrp)->css_put(&cgrp->self), which will try to enqueue
> css_release_work_fn for the same css instance, causing a list_add
> corruption bug, as can be seen in the syzkaller report [1].
> 
> Fix this by synchronizing the css ref_kill and css_release jobs.
> css_release() function will check if the css_killed_work_fn() has been
> scheduled for the css and only en-queue the css_release_work_fn()
> if css_killed_work_fn wasn't already en-queued. Otherwise css_release() will
> set the CSS_REL_LATER flag for that css. This will cause the
> css_release_work_fn() work to be executed after css_killed_work_fn() is finished.
> 
> Two scc flags have been introduced to implement this serialization mechanizm:
> 
>   * CSS_KILL_ENQED, which will be set when css_killed_work_fn() is en-queued, and
>   * CSS_REL_LATER, which, if set, will cause the css_release_work_fn() to be
>     scheduled after the css_killed_work_fn is finished.
> 
> There is also a new lock, which will protect the integrity of the css flags.
> 
> [1] https://syzkaller.appspot.com/bug?id=e26e54d6eac9d9fb50b221ec3e4627b327465dbd

This also fixes a similar, cgroup related list corrupt issue:
https://syzkaller.appspot.com/bug?id=3c7ff113ccb695e839b859da3fc481c36eb1cfd5

-- 
Thanks,
Tadeusz

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] cgroup: serialize css kill and release paths
  2022-06-03 18:13 ` [PATCH v2] " Tadeusz Struk
  2022-06-03 18:17   ` Tadeusz Struk
@ 2022-06-06 12:39   ` Michal Koutný
  2022-06-07 18:47     ` Tadeusz Struk
  1 sibling, 1 reply; 6+ messages in thread
From: Michal Koutný @ 2022-06-06 12:39 UTC (permalink / raw)
  To: Tadeusz Struk
  Cc: Tejun Heo, Zefan Li, Johannes Weiner, Christian Brauner,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, cgroups, netdev, bpf, stable, linux-kernel,
	syzbot+e42ae441c3b10acf9e9d

Hello.

On Fri, Jun 03, 2022 at 11:13:21AM -0700, Tadeusz Struk <tadeusz.struk@linaro.org> wrote:
> In such scenario the css_killed_work_fn will be en-queued via
> cgroup_apply_control_disable(cgrp)->kill_css(css), and bail out to
> cgroup_kn_unlock(). Then cgroup_kn_unlock() will call:
> cgroup_put(cgrp)->css_put(&cgrp->self), which will try to enqueue
> css_release_work_fn for the same css instance, causing a list_add
> corruption bug, as can be seen in the syzkaller report [1].

This hypothesis doesn't add up to me (I am sorry).

The kill_css(css) would be a css associated with a subsys (css.ss !=
NULL) whereas css_put(&cgrp->self) is a different css just for the
cgroup (css.ss == NULL).

Michal

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] cgroup: serialize css kill and release paths
  2022-06-06 12:39   ` Michal Koutný
@ 2022-06-07 18:47     ` Tadeusz Struk
  0 siblings, 0 replies; 6+ messages in thread
From: Tadeusz Struk @ 2022-06-07 18:47 UTC (permalink / raw)
  To: Michal Koutný
  Cc: Tejun Heo, Zefan Li, Johannes Weiner, Christian Brauner,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, cgroups, netdev, bpf, stable, linux-kernel,
	syzbot+e42ae441c3b10acf9e9d

On 6/6/22 05:39, Michal Koutný wrote:
> On Fri, Jun 03, 2022 at 11:13:21AM -0700, Tadeusz Struk<tadeusz.struk@linaro.org>  wrote:
>> In such scenario the css_killed_work_fn will be en-queued via
>> cgroup_apply_control_disable(cgrp)->kill_css(css), and bail out to
>> cgroup_kn_unlock(). Then cgroup_kn_unlock() will call:
>> cgroup_put(cgrp)->css_put(&cgrp->self), which will try to enqueue
>> css_release_work_fn for the same css instance, causing a list_add
>> corruption bug, as can be seen in the syzkaller report [1].
> This hypothesis doesn't add up to me (I am sorry).
> 
> The kill_css(css) would be a css associated with a subsys (css.ss !=
> NULL) whereas css_put(&cgrp->self) is a different css just for the
> cgroup (css.ss == NULL).

Yes, you are right. I couldn't figure it out where the extra css_put()
is called from, and the only place that fitted into my theory was from
the cgroup_kn_unlock() in cgroup_apply_control_disable().
After some more debugging I can see that, as you said, the cgrp->self
is a different css. The offending _put() is actually called by the
percpu_ref_kill_and_confirm(), as it not only calls the passed confirm_kill
percpu_ref_func_t, but also it puts the refcnt iself.
Because the cgroup_apply_control_disable() will loop for_each_live_descendant,
and call css_kill() on all css'es, and css_killed_work_fn() will also loop
and call css_put() on all parents, the css_release() will be called on the
first parent prematurely, causing the BUG(). What I think should be done
to balance put/get is to call css_get() for all the parents in kill_css():

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c1e1a5c34e77..3ca61325bc4e 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5527,6 +5527,8 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
   */
  static void kill_css(struct cgroup_subsys_state *css)
  {
+       struct cgroup_subsys_state *_css = css;
+
         lockdep_assert_held(&cgroup_mutex);
  
         if (css->flags & CSS_DYING)
@@ -5541,10 +5543,13 @@ static void kill_css(struct cgroup_subsys_state *css)
         css_clear_dir(css);
  
         /*
-        * Killing would put the base ref, but we need to keep it alive
-        * until after ->css_offline().
+        * Killing would put the base ref, but we need to keep it alive,
+        * and all its parents, until after ->css_offline().
          */
-       css_get(css);
+       do {
+               css_get(_css);
+               _css = _css->parent;
+       } while (_css && atomic_read(&_css->online_cnt));
  
         /*
          * cgroup core guarantees that, by the time ->css_offline() is

This will be then "reverted" in css_killed_work_fn()
Please let me know if it makes sense to you.
I'm still testing it, but syzbot is very slow today.

-- 
Thanks,
Tadeusz

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-06-07 21:02 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-06-03 17:34 [PATCH] cgroup: serialize css kill and release paths Tadeusz Struk
2022-06-03 18:06 ` Tadeusz Struk
2022-06-03 18:13 ` [PATCH v2] " Tadeusz Struk
2022-06-03 18:17   ` Tadeusz Struk
2022-06-06 12:39   ` Michal Koutný
2022-06-07 18:47     ` Tadeusz Struk

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox