public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Greg KH <gregkh@suse.de>
To: linux-kernel@vger.kernel.org, stable@kernel.org
Cc: Justin Forbes <jmforbes@linuxtx.org>,
	Zwane Mwaikambo <zwane@arm.linux.org.uk>,
	"Theodore Ts'o" <tytso@mit.edu>,
	Randy Dunlap <rdunlap@xenotime.net>,
	Dave Jones <davej@redhat.com>,
	Chuck Wolber <chuckw@quantumlinux.com>,
	Chris Wedgwood <reviews@ml.cw.f00f.org>,
	Michael Krufky <mkrufky@linuxtv.org>,
	torvalds@osdl.org, akpm@osdl.org, alan@lxorguk.ukuu.org.uk,
	Patrick McHardy <kaber@trash.net>,
	"David S. Miller" <davem@davemloft.net>,
	Greg Kroah-Hartman <gregkh@suse.de>
Subject: [patch 01/67] NET_SCHED: Fix fallout from dev->qdisc RCU change
Date: Wed, 11 Oct 2006 14:03:30 -0700	[thread overview]
Message-ID: <20061011210330.GB16627@kroah.com> (raw)
In-Reply-To: <20061011210310.GA16627@kroah.com>

[-- Attachment #1: net_sched-fix-fallout-from-dev-qdisc-rcu-change.patch --]
[-- Type: text/plain, Size: 8212 bytes --]


-stable review patch.  If anyone has any objections, please let us know.

------------------
From: Patrick McHardy <kaber@trash.net>

The move of qdisc destruction to a rcu callback broke locking in the
entire qdisc layer by invalidating previously valid assumptions about
the context in which changes to the qdisc tree occur.

The two assumptions were:

- since changes only happen in process context, read_lock doesn't need
  bottom half protection. Now invalid since destruction of inner qdiscs,
  classifiers, actions and estimators happens in the RCU callback unless
  they're manually deleted, resulting in dead-locks when read_lock in
  process context is interrupted by write_lock_bh in bottom half context.

- since changes only happen under the RTNL, no additional locking is
  necessary for data not used during packet processing (f.e. u32_list).
  Again, since destruction now happens in the RCU callback, this assumption
  is not valid anymore, causing races while using this data, which can
  result in corruption or use-after-free.

Instead of "fixing" this by disabling bottom halves everywhere and adding
new locks/refcounting, this patch makes these assumptions valid again by
moving destruction back to process context. Since only the dev->qdisc
pointer is protected by RCU, but ->enqueue and the qdisc tree are still
protected by dev->qdisc_lock, destruction of the tree can be performed
immediately and only the final free needs to happen in the rcu callback
to make sure dev_queue_xmit doesn't access already freed memory.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 net/core/dev.c          |   14 +++++-----
 net/sched/cls_api.c     |    4 +-
 net/sched/sch_api.c     |   16 +++++------
 net/sched/sch_generic.c |   66 +++++++++++++++---------------------------------
 4 files changed, 39 insertions(+), 61 deletions(-)

--- linux-2.6.18.orig/net/core/dev.c
+++ linux-2.6.18/net/core/dev.c
@@ -1478,14 +1478,16 @@ gso:
 	if (q->enqueue) {
 		/* Grab device queue */
 		spin_lock(&dev->queue_lock);
+		q = dev->qdisc;
+		if (q->enqueue) {
+			rc = q->enqueue(skb, q);
+			qdisc_run(dev);
+			spin_unlock(&dev->queue_lock);
 
-		rc = q->enqueue(skb, q);
-
-		qdisc_run(dev);
-
+			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
+			goto out;
+		}
 		spin_unlock(&dev->queue_lock);
-		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
-		goto out;
 	}
 
 	/* The device has no queue. Common case for software devices:
--- linux-2.6.18.orig/net/sched/cls_api.c
+++ linux-2.6.18/net/sched/cls_api.c
@@ -401,7 +401,7 @@ static int tc_dump_tfilter(struct sk_buf
 	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
 		return skb->len;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	if (!tcm->tcm_parent)
 		q = dev->qdisc_sleeping;
 	else
@@ -458,7 +458,7 @@ errout:
 	if (cl)
 		cops->put(q, cl);
 out:
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 	dev_put(dev);
 	return skb->len;
 }
--- linux-2.6.18.orig/net/sched/sch_api.c
+++ linux-2.6.18/net/sched/sch_api.c
@@ -195,14 +195,14 @@ struct Qdisc *qdisc_lookup(struct net_de
 {
 	struct Qdisc *q;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	list_for_each_entry(q, &dev->qdisc_list, list) {
 		if (q->handle == handle) {
-			read_unlock_bh(&qdisc_tree_lock);
+			read_unlock(&qdisc_tree_lock);
 			return q;
 		}
 	}
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 	return NULL;
 }
 
@@ -837,7 +837,7 @@ static int tc_dump_qdisc(struct sk_buff 
 			continue;
 		if (idx > s_idx)
 			s_q_idx = 0;
-		read_lock_bh(&qdisc_tree_lock);
+		read_lock(&qdisc_tree_lock);
 		q_idx = 0;
 		list_for_each_entry(q, &dev->qdisc_list, list) {
 			if (q_idx < s_q_idx) {
@@ -846,12 +846,12 @@ static int tc_dump_qdisc(struct sk_buff 
 			}
 			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
 					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
-				read_unlock_bh(&qdisc_tree_lock);
+				read_unlock(&qdisc_tree_lock);
 				goto done;
 			}
 			q_idx++;
 		}
-		read_unlock_bh(&qdisc_tree_lock);
+		read_unlock(&qdisc_tree_lock);
 	}
 
 done:
@@ -1074,7 +1074,7 @@ static int tc_dump_tclass(struct sk_buff
 	s_t = cb->args[0];
 	t = 0;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	list_for_each_entry(q, &dev->qdisc_list, list) {
 		if (t < s_t || !q->ops->cl_ops ||
 		    (tcm->tcm_parent &&
@@ -1096,7 +1096,7 @@ static int tc_dump_tclass(struct sk_buff
 			break;
 		t++;
 	}
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 
 	cb->args[0] = t;
 
--- linux-2.6.18.orig/net/sched/sch_generic.c
+++ linux-2.6.18/net/sched/sch_generic.c
@@ -45,11 +45,10 @@
    The idea is the following:
    - enqueue, dequeue are serialized via top level device
      spinlock dev->queue_lock.
-   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
+   - tree walking is protected by read_lock(qdisc_tree_lock)
      and this lock is used only in process context.
-   - updates to tree are made under rtnl semaphore or
-     from softirq context (__qdisc_destroy rcu-callback)
-     hence this lock needs local bh disabling.
+   - updates to tree are made only under rtnl semaphore,
+     hence this lock may be made without local bh disabling.
 
    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
  */
@@ -57,14 +56,14 @@ DEFINE_RWLOCK(qdisc_tree_lock);
 
 void qdisc_lock_tree(struct net_device *dev)
 {
-	write_lock_bh(&qdisc_tree_lock);
+	write_lock(&qdisc_tree_lock);
 	spin_lock_bh(&dev->queue_lock);
 }
 
 void qdisc_unlock_tree(struct net_device *dev)
 {
 	spin_unlock_bh(&dev->queue_lock);
-	write_unlock_bh(&qdisc_tree_lock);
+	write_unlock(&qdisc_tree_lock);
 }
 
 /* 
@@ -483,20 +482,6 @@ void qdisc_reset(struct Qdisc *qdisc)
 static void __qdisc_destroy(struct rcu_head *head)
 {
 	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
-	struct Qdisc_ops  *ops = qdisc->ops;
-
-#ifdef CONFIG_NET_ESTIMATOR
-	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
-#endif
-	write_lock(&qdisc_tree_lock);
-	if (ops->reset)
-		ops->reset(qdisc);
-	if (ops->destroy)
-		ops->destroy(qdisc);
-	write_unlock(&qdisc_tree_lock);
-	module_put(ops->owner);
-
-	dev_put(qdisc->dev);
 	kfree((char *) qdisc - qdisc->padded);
 }
 
@@ -504,32 +489,23 @@ static void __qdisc_destroy(struct rcu_h
 
 void qdisc_destroy(struct Qdisc *qdisc)
 {
-	struct list_head cql = LIST_HEAD_INIT(cql);
-	struct Qdisc *cq, *q, *n;
+	struct Qdisc_ops  *ops = qdisc->ops;
 
 	if (qdisc->flags & TCQ_F_BUILTIN ||
-		!atomic_dec_and_test(&qdisc->refcnt))
+	    !atomic_dec_and_test(&qdisc->refcnt))
 		return;
 
-	if (!list_empty(&qdisc->list)) {
-		if (qdisc->ops->cl_ops == NULL)
-			list_del(&qdisc->list);
-		else
-			list_move(&qdisc->list, &cql);
-	}
-
-	/* unlink inner qdiscs from dev->qdisc_list immediately */
-	list_for_each_entry(cq, &cql, list)
-		list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
-			if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
-				if (q->ops->cl_ops == NULL)
-					list_del_init(&q->list);
-				else
-					list_move_tail(&q->list, &cql);
-			}
-	list_for_each_entry_safe(cq, n, &cql, list)
-		list_del_init(&cq->list);
+	list_del(&qdisc->list);
+#ifdef CONFIG_NET_ESTIMATOR
+	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+#endif
+	if (ops->reset)
+		ops->reset(qdisc);
+	if (ops->destroy)
+		ops->destroy(qdisc);
 
+	module_put(ops->owner);
+	dev_put(qdisc->dev);
 	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
 }
 
@@ -549,15 +525,15 @@ void dev_activate(struct net_device *dev
 				printk(KERN_INFO "%s: activation failed\n", dev->name);
 				return;
 			}
-			write_lock_bh(&qdisc_tree_lock);
+			write_lock(&qdisc_tree_lock);
 			list_add_tail(&qdisc->list, &dev->qdisc_list);
-			write_unlock_bh(&qdisc_tree_lock);
+			write_unlock(&qdisc_tree_lock);
 		} else {
 			qdisc =  &noqueue_qdisc;
 		}
-		write_lock_bh(&qdisc_tree_lock);
+		write_lock(&qdisc_tree_lock);
 		dev->qdisc_sleeping = qdisc;
-		write_unlock_bh(&qdisc_tree_lock);
+		write_unlock(&qdisc_tree_lock);
 	}
 
 	if (!netif_carrier_ok(dev))

--

  reply	other threads:[~2006-10-11 21:04 UTC|newest]

Thread overview: 90+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20061011204756.642936754@quad.kroah.org>
2006-10-11 21:03 ` [patch 00/67] 2.6.18-stable review Greg KH
2006-10-11 21:03   ` Greg KH [this message]
2006-10-11 21:03   ` [patch 02/67] uml: allow using again x86/x86_64 crypto code Greg KH
2006-10-11 21:03   ` [patch 03/67] uml: use DEFCONFIG_LIST to avoid reading hosts config Greg KH
2006-10-11 21:03   ` [patch 04/67] UML: Fix UML build failure Greg KH
2006-10-11 21:03   ` [patch 05/67] Video: Fix msp343xG handling regression Greg KH
2006-10-11 21:03   ` [patch 06/67] Video: cx24123: fix PLL divisor setup Greg KH
2006-10-11 21:15     ` Michael Krufky
2006-10-11 21:29       ` Greg KH
2006-10-11 21:36         ` Michael Krufky
2006-10-11 23:01           ` [stable] " Greg KH
2006-10-11 23:58             ` Michael Krufky
2006-10-13 18:48               ` Greg KH
2006-10-11 21:03   ` [patch 07/67] Video: pvrusb2: Solve mutex deadlock Greg KH
2006-10-11 21:04   ` [patch 09/67] Video: pvrusb2: Suppress compiler warning Greg KH
2006-10-11 21:04   ` [patch 10/67] Video: pvrusb2: Limit hor res for 24xxx devices Greg KH
2006-10-11 21:04   ` [patch 11/67] zd1211rw: ZD1211B ASIC/FWT, not jointly decoder Greg KH
2006-10-12 13:41     ` John W. Linville
2006-10-11 21:04   ` [patch 12/67] S390: user readable uninitialised kernel memory (CVE-2006-5174) Greg KH
2006-10-11 21:04   ` [patch 13/67] IB/mthca: Fix lid used for sending traps Greg KH
2006-10-11 21:04   ` [patch 14/67] USB: Allow compile in g_ether, fix typo Greg KH
2006-10-11 21:04   ` [patch 15/67] ALSA: Fix initiailization of user-space controls Greg KH
2006-10-11 21:04   ` [patch 16/67] jbd: fix commit of ordered data buffers Greg KH
2006-10-12 11:55     ` Jan Kara
2006-10-12 17:16       ` Greg KH
2006-10-11 21:04   ` [patch 17/67] Fix longstanding load balancing bug in the scheduler Greg KH
2006-10-12  7:30     ` Arjan van de Ven
2006-10-11 21:04   ` [patch 18/67] zone_reclaim: dynamic slab reclaim Greg KH
2006-10-12  7:31     ` Arjan van de Ven
2006-10-12 10:04       ` Christoph Lameter
2006-10-11 21:04   ` [patch 19/67] mv643xx_eth: fix obvious typo, which caused build breakage Greg KH
2006-10-11 21:05   ` [patch 20/67] netdrvr: lp486e: fix typo Greg KH
2006-10-11 21:05   ` [patch 21/67] sky2: tx pause bug fix Greg KH
2006-10-11 21:05   ` [patch 22/67] sky2 network driver device ids Greg KH
2006-10-11 21:05   ` [patch 23/67] One line per header in Kbuild files to reduce conflicts Greg KH
2006-10-11 21:05   ` [patch 24/67] Fix ARM make headers_check Greg KH
2006-10-11 21:05   ` [patch 25/67] Fix make headers_check on sh Greg KH
2006-10-11 21:05   ` [patch 26/67] Fix make headers_check on sh64 Greg KH
2006-10-11 21:05   ` [patch 27/67] Fix make headers_check on m32r Greg KH
2006-10-11 21:05   ` [patch 28/67] Fix exported headers for SPARC, SPARC64 Greg KH
2006-10-11 21:05   ` [patch 29/67] Fix m68knommu exported headers Greg KH
2006-10-11 21:05   ` [patch 30/67] Fix H8300 " Greg KH
2006-10-11 21:06   ` [patch 31/67] Remove ARM26 header export Greg KH
2006-10-11 21:06   ` [patch 32/67] Remove UML " Greg KH
2006-10-11 21:06   ` [patch 33/67] Dont advertise (or allow) headers_{install,check} where inappropriate Greg KH
2006-10-11 21:06   ` [patch 34/67] Fix v850 exported headers Greg KH
2006-10-11 21:06   ` [patch 35/67] Clean up exported headers on CRIS Greg KH
2006-10-11 21:06   ` [patch 36/67] Remove offsetof() from user-visible <linux/stddef.h> Greg KH
2006-10-11 21:06   ` [patch 37/67] powerpc: fix building gdb against asm/ptrace.h Greg KH
2006-10-11 21:06   ` [patch 38/67] sysfs: remove duplicated dput in sysfs_update_file Greg KH
2006-10-11 21:06   ` [patch 39/67] powerpc: Fix ohare IDE irq workaround on old powermacs Greg KH
2006-10-11 21:07   ` [patch 40/67] i386 bootioremap / kexec fix Greg KH
2006-10-11 21:07   ` [patch 41/67] rtc: lockdep fix/workaround Greg KH
2006-10-11 21:07   ` [patch 42/67] do not free non slab allocated per_cpu_pageset Greg KH
2006-10-11 21:07   ` [patch 43/67] backlight: fix oops in __mutex_lock_slowpath during head /sys/class/graphics/fb0/* Greg KH
2006-10-11 21:07   ` [patch 44/67] cpu to node relationship fixup: acpi_map_cpu2node Greg KH
2006-10-11 21:07   ` [patch 45/67] cpu to node relationship fixup: map cpu to node Greg KH
2006-10-11 21:07   ` [patch 46/67] i386: fix flat mode numa on a real numa system Greg KH
2006-10-11 21:07   ` [patch 47/67] load_module: no BUG if module_subsys uninitialized Greg KH
2006-10-11 21:07   ` [patch 48/67] Fix VIDIOC_ENUMSTD bug Greg KH
2006-10-11 21:46     ` Jonathan Corbet
2006-10-11 21:49       ` Michael Krufky
2006-10-11 22:10         ` Mauro Carvalho Chehab
2006-10-11 23:04           ` [stable] " Greg KH
2006-10-11 21:07   ` [patch 49/67] SPARC64: Fix serious bug in sched_clock() on sparc64 Greg KH
2006-10-11 21:07   ` [patch 50/67] CPUFREQ: Fix some more CPU hotplug locking Greg KH
2006-10-11 21:08   ` [patch 51/67] IPV6: bh_lock_sock_nested on tcp_v6_rcv Greg KH
2006-10-11 21:08   ` [patch 52/67] SPARC64: Fix sparc64 ramdisk handling Greg KH
2006-10-11 21:08   ` [patch 53/67] sata_mv: fix oops Greg KH
2006-10-11 21:08   ` [patch 54/67] PKT_SCHED: cls_basic: Use unsigned int when generating handle Greg KH
2006-10-11 21:08   ` [patch 55/67] IPV6: Disable SG for GSO unless we have checksum Greg KH
2006-10-11 21:08   ` [patch 56/67] MD: Fix problem where hot-added drives are not resynced Greg KH
2006-10-11 21:08   ` [patch 57/67] TCP: Fix and simplify microsecond rtt sampling Greg KH
2006-10-11 21:08   ` [patch 58/67] mm: bug in set_page_dirty_buffers Greg KH
2006-10-11 21:08   ` [patch 59/67] fbdev: correct buffer size limit in fbmem_read_proc() Greg KH
2006-10-11 21:08   ` [patch 60/67] rtc driver rtc-pcf8563 century bit inversed Greg KH
2006-10-11 21:08   ` [patch 61/67] invalidate_inode_pages2(): ignore page refcounts Greg KH
2006-10-11 21:09   ` [patch 62/67] scx200_hrt: fix precedence bug manifesting as 27x clock in 1 MHz mode Greg KH
2006-10-11 21:09   ` [patch 63/67] ide-generic: jmicron fix Greg KH
2006-10-11 21:09   ` [patch 64/67] x86-64: Calgary IOMMU: Fix off by one when calculating register space location Greg KH
2006-10-11 21:09   ` [patch 66/67] NETFILTER: NAT: fix NOTRACK checksum handling Greg KH
2006-10-11 21:09   ` [patch 67/67] block layer: elv_iosched_show should get elv_list_lock Greg KH
2006-10-11 21:36   ` [patch 00/67] 2.6.18-stable review Dave Jones
2006-10-11 21:59     ` Greg KH
2006-10-11 22:17       ` Dave Jones
2006-10-11 22:19       ` Dave Jones
2006-10-11 22:59         ` [stable] " Greg KH
2006-10-12  0:42   ` Theodore Tso
2006-10-12 16:35     ` [stable] " Greg KH
2006-10-12 16:51       ` Dave Jones

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20061011210330.GB16627@kroah.com \
    --to=gregkh@suse.de \
    --cc=akpm@osdl.org \
    --cc=alan@lxorguk.ukuu.org.uk \
    --cc=chuckw@quantumlinux.com \
    --cc=davej@redhat.com \
    --cc=davem@davemloft.net \
    --cc=jmforbes@linuxtx.org \
    --cc=kaber@trash.net \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mkrufky@linuxtv.org \
    --cc=rdunlap@xenotime.net \
    --cc=reviews@ml.cw.f00f.org \
    --cc=stable@kernel.org \
    --cc=torvalds@osdl.org \
    --cc=tytso@mit.edu \
    --cc=zwane@arm.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox