Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 1/2] net/sb1250: register mdio bus in probe
From: Sebastian Andrzej Siewior @ 2010-04-25 21:02 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: netdev, Sebastian Andrzej Siewior

"ifconfig eth0 up && ifconfig eth0 down" triggers:
| kobject (a8000000cfa5a480): tried to init an initialized object, something is seriously wrong.
| Call Trace:
| [<ffffffff8010aabc>] dump_stack+0x8/0x34
| [<ffffffff80293128>] kobject_init+0xe8/0xf0
| [<ffffffff802d922c>] device_initialize+0x2c/0x98
| [<ffffffff802d9cfc>] device_register+0x14/0x28
| [<ffffffff80312cd4>] mdiobus_register+0xdc/0x1e0
| [<ffffffff80314cf0>] sbmac_open+0x58/0x220
| [<ffffffff803519bc>] __dev_open+0x11c/0x180
| [<ffffffff8034d578>] __dev_change_flags+0x120/0x180
| [<ffffffff80351848>] dev_change_flags+0x20/0x78
| [<ffffffff803a753c>] devinet_ioctl+0x7cc/0x820
| [<ffffffff80339ac8>] sock_do_ioctl+0x38/0x90
| [<ffffffff8033a258>] compat_sock_ioctl_trans+0x408/0x1030
| [<ffffffff8033af30>] compat_sock_ioctl+0xb0/0xd0
| [<ffffffff80208b08>] compat_sys_ioctl+0xa0/0x18b8
| [<ffffffff80102f94>] handle_sys+0x114/0x130
|
| sb1250-mac-mdio: probed

mdiobus_register() calls device_register(), which initializes the kobj of
the device. mdiobus_unregister() only calls device_del(), so one reference
is left. That reference is dropped by mdiobus_free(), which is only
called on remove.
Since I don't see any reason why mdiobus_register()/mdiobus_unregister()
should happen in ->open()/->close(), I move them to probe & exit.

Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
---
 drivers/net/sb1250-mac.c |   41 +++++++++++++++++++----------------------
 1 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c
index 9944e5d..d162862 100644
--- a/drivers/net/sb1250-mac.c
+++ b/drivers/net/sb1250-mac.c
@@ -2353,17 +2353,15 @@ static int sbmac_init(struct platform_device *pldev, long long base)
 
 	sc->mii_bus = mdiobus_alloc();
 	if (sc->mii_bus == NULL) {
-		sbmac_uninitctx(sc);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto uninit_ctx;
 	}
 
 	err = register_netdev(dev);
 	if (err) {
 		printk(KERN_ERR "%s.%d: unable to register netdev\n",
 		       sbmac_string, idx);
-		mdiobus_free(sc->mii_bus);
-		sbmac_uninitctx(sc);
-		return err;
+		goto free_mdio;
 	}
 
 	pr_info("%s.%d: registered as %s\n", sbmac_string, idx, dev->name);
@@ -2389,9 +2387,23 @@ static int sbmac_init(struct platform_device *pldev, long long base)
 		sc->mii_bus->irq[i] = SBMAC_PHY_INT;
 
 	sc->mii_bus->parent = &pldev->dev;
+	/*
+	 * Probe PHY address
+	 */
+	err = mdiobus_register(sc->mii_bus);
+	if (err) {
+		printk(KERN_ERR "%s: unable to register MDIO bus\n",
+		       dev->name);
+		goto free_mdio;
+	}
 	dev_set_drvdata(&pldev->dev, sc->mii_bus);
-
 	return 0;
+
+free_mdio:
+	mdiobus_free(sc->mii_bus);
+uninit_ctx:
+	sbmac_uninitctx(sc);
+	return err;
 }
 
 
@@ -2417,16 +2429,6 @@ static int sbmac_open(struct net_device *dev)
 		goto out_err;
 	}
 
-	/*
-	 * Probe PHY address
-	 */
-	err = mdiobus_register(sc->mii_bus);
-	if (err) {
-		printk(KERN_ERR "%s: unable to register MDIO bus\n",
-		       dev->name);
-		goto out_unirq;
-	}
-
 	sc->sbm_speed = sbmac_speed_none;
 	sc->sbm_duplex = sbmac_duplex_none;
 	sc->sbm_fc = sbmac_fc_none;
@@ -2457,11 +2459,7 @@ static int sbmac_open(struct net_device *dev)
 	return 0;
 
 out_unregister:
-	mdiobus_unregister(sc->mii_bus);
-
-out_unirq:
 	free_irq(dev->irq, dev);
-
 out_err:
 	return err;
 }
@@ -2651,8 +2649,6 @@ static int sbmac_close(struct net_device *dev)
 	phy_disconnect(sc->phy_dev);
 	sc->phy_dev = NULL;
 
-	mdiobus_unregister(sc->mii_bus);
-
 	free_irq(dev->irq, dev);
 
 	sbdma_emptyring(&(sc->sbm_txdma));
@@ -2760,6 +2756,7 @@ static int __exit sbmac_remove(struct platform_device *pldev)
 
 	unregister_netdev(dev);
 	sbmac_uninitctx(sc);
+	mdiobus_unregister(sc->mii_bus);
 	mdiobus_free(sc->mii_bus);
 	iounmap(sc->sbm_base);
 	free_netdev(dev);
-- 
1.6.6.1


^ permalink raw reply related

* [PATCH 2/2] net/sb1250: remove CONFIG_SIBYTE_STANDALONE
From: Sebastian Andrzej Siewior @ 2010-04-25 21:02 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: netdev, Sebastian Andrzej Siewior
In-Reply-To: <1272229348-16140-1-git-send-email-sebastian@breakpoint.cc>

CONFIG_SIBYTE_STANDALONE has been gone since v2.6.31-rc1 ("MIPS: Sibyte:
Remove standalone kernel support").
This removes a leftover piece.

Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
---
 drivers/net/sb1250-mac.c |  144 ----------------------------------------------
 1 files changed, 0 insertions(+), 144 deletions(-)

diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c
index d162862..fc503a1 100644
--- a/drivers/net/sb1250-mac.c
+++ b/drivers/net/sb1250-mac.c
@@ -48,23 +48,6 @@
 #include <asm/io.h>
 #include <asm/processor.h>	/* Processor type for cache alignment. */
 
-/* This is only here until the firmware is ready.  In that case,
-   the firmware leaves the ethernet address in the register for us. */
-#ifdef CONFIG_SIBYTE_STANDALONE
-#define SBMAC_ETH0_HWADDR "40:00:00:00:01:00"
-#define SBMAC_ETH1_HWADDR "40:00:00:00:01:01"
-#define SBMAC_ETH2_HWADDR "40:00:00:00:01:02"
-#define SBMAC_ETH3_HWADDR "40:00:00:00:01:03"
-#endif
-
-
-/* These identify the driver base version and may not be removed. */
-#if 0
-static char version1[] __initdata =
-"sb1250-mac.c:1.00 1/11/2001 Written by Mitch Lichtenberg\n";
-#endif
-
-
 /* Operational parameters that usually are not changed. */
 
 #define CONFIG_SBMAC_COALESCE
@@ -2182,85 +2165,6 @@ static void sbmac_setmulti(struct sbmac_softc *sc)
 	}
 }
 
-#if defined(SBMAC_ETH0_HWADDR) || defined(SBMAC_ETH1_HWADDR) || defined(SBMAC_ETH2_HWADDR) || defined(SBMAC_ETH3_HWADDR)
-/**********************************************************************
- *  SBMAC_PARSE_XDIGIT(str)
- *
- *  Parse a hex digit, returning its value
- *
- *  Input parameters:
- *  	   str - character
- *
- *  Return value:
- *  	   hex value, or -1 if invalid
- ********************************************************************* */
-
-static int sbmac_parse_xdigit(char str)
-{
-	int digit;
-
-	if ((str >= '0') && (str <= '9'))
-		digit = str - '0';
-	else if ((str >= 'a') && (str <= 'f'))
-		digit = str - 'a' + 10;
-	else if ((str >= 'A') && (str <= 'F'))
-		digit = str - 'A' + 10;
-	else
-		return -1;
-
-	return digit;
-}
-
-/**********************************************************************
- *  SBMAC_PARSE_HWADDR(str,hwaddr)
- *
- *  Convert a string in the form xx:xx:xx:xx:xx:xx into a 6-byte
- *  Ethernet address.
- *
- *  Input parameters:
- *  	   str - string
- *  	   hwaddr - pointer to hardware address
- *
- *  Return value:
- *  	   0 if ok, else -1
- ********************************************************************* */
-
-static int sbmac_parse_hwaddr(char *str, unsigned char *hwaddr)
-{
-	int digit1,digit2;
-	int idx = 6;
-
-	while (*str && (idx > 0)) {
-		digit1 = sbmac_parse_xdigit(*str);
-		if (digit1 < 0)
-			return -1;
-		str++;
-		if (!*str)
-			return -1;
-
-		if ((*str == ':') || (*str == '-')) {
-			digit2 = digit1;
-			digit1 = 0;
-		}
-		else {
-			digit2 = sbmac_parse_xdigit(*str);
-			if (digit2 < 0)
-				return -1;
-			str++;
-		}
-
-		*hwaddr++ = (digit1 << 4) | digit2;
-		idx--;
-
-		if (*str == '-')
-			str++;
-		if (*str == ':')
-			str++;
-	}
-	return 0;
-}
-#endif
-
 static int sb1250_change_mtu(struct net_device *_dev, int new_mtu)
 {
 	if (new_mtu >  ENET_PACKET_SIZE)
@@ -2768,36 +2672,6 @@ static int __exit sbmac_remove(struct platform_device *pldev)
 static struct platform_device **sbmac_pldev;
 static int sbmac_max_units;
 
-#if defined(SBMAC_ETH0_HWADDR) || defined(SBMAC_ETH1_HWADDR) || defined(SBMAC_ETH2_HWADDR) || defined(SBMAC_ETH3_HWADDR)
-static void __init sbmac_setup_hwaddr(int idx, char *addr)
-{
-	void __iomem *sbm_base;
-	unsigned long start, end;
-	uint8_t eaddr[6];
-	uint64_t val;
-
-	if (idx >= sbmac_max_units)
-		return;
-
-	start = A_MAC_CHANNEL_BASE(idx);
-	end = A_MAC_CHANNEL_BASE(idx + 1) - 1;
-
-	sbm_base = ioremap_nocache(start, end - start + 1);
-	if (!sbm_base) {
-		printk(KERN_ERR "%s: unable to map device registers\n",
-		       sbmac_string);
-		return;
-	}
-
-	sbmac_parse_hwaddr(addr, eaddr);
-	val = sbmac_addr2reg(eaddr);
-	__raw_writeq(val, sbm_base + R_MAC_ETHERNET_ADDR);
-	val = __raw_readq(sbm_base + R_MAC_ETHERNET_ADDR);
-
-	iounmap(sbm_base);
-}
-#endif
-
 static int __init sbmac_platform_probe_one(int idx)
 {
 	struct platform_device *pldev;
@@ -2874,24 +2748,6 @@ static void __init sbmac_platform_probe(void)
 		return;				/* none */
 	}
 
-	/*
-	 * For bringup when not using the firmware, we can pre-fill
-	 * the MAC addresses using the environment variables
-	 * specified in this file (or maybe from the config file?)
-	 */
-#ifdef SBMAC_ETH0_HWADDR
-	sbmac_setup_hwaddr(0, SBMAC_ETH0_HWADDR);
-#endif
-#ifdef SBMAC_ETH1_HWADDR
-	sbmac_setup_hwaddr(1, SBMAC_ETH1_HWADDR);
-#endif
-#ifdef SBMAC_ETH2_HWADDR
-	sbmac_setup_hwaddr(2, SBMAC_ETH2_HWADDR);
-#endif
-#ifdef SBMAC_ETH3_HWADDR
-	sbmac_setup_hwaddr(3, SBMAC_ETH3_HWADDR);
-#endif
-
 	sbmac_pldev = kcalloc(sbmac_max_units, sizeof(*sbmac_pldev),
 			      GFP_KERNEL);
 	if (!sbmac_pldev) {
-- 
1.6.6.1


^ permalink raw reply related

* Re: [PATCH] RCU: don't turn off lockdep when find suspicious rcu_dereference_check() usage
From: Miles Lane @ 2010-04-25 20:20 UTC (permalink / raw)
  To: paulmck
  Cc: Vivek Goyal, Eric Paris, Lai Jiangshan, Ingo Molnar,
	Peter Zijlstra, LKML, nauman, eric.dumazet, netdev, Jens Axboe,
	Gui Jianfeng, Li Zefan, Johannes Berg
In-Reply-To: <w2sa44ae5cd1004250849q73704370vc6af935c70db9b73@mail.gmail.com>

> I am down to seeing three suspicious rcu_dereference_check traces when
> I apply this patch and all the previous patches to 2.6.34-rc5-git6.
>
> 1. The "__sched_setscheduler+0x19d/0x300" trace.
> 2. The two "is_swiotlb_buffer+0x2e/0x3b" traces (waiting to see
> Johannes' patch show up in a Linux snapshot)
>
> Did I miss a patch for the setscheduler issue?

Hmm.  I am still seeing these two messages as well.

[   83.363146] [ INFO: suspicious rcu_dereference_check() usage. ]
[   83.363148] ---------------------------------------------------
[   83.363151] include/net/inet_timewait_sock.h:227 invoked
rcu_dereference_check() without protection!
[   83.363154]
[   83.363155] other info that might help us debug this:
[   83.363156]
[   83.363158]
[   83.363159] rcu_scheduler_active = 1, debug_locks = 1
[   83.363162] 2 locks held by gwibber-service/5076:
[   83.363164]  #0:  (&p->lock){+.+.+.}, at: [<ffffffff8110534a>]
seq_read+0x37/0x381
[   83.363176]  #1:  (&(&hashinfo->ehash_locks[i])->rlock){+.-...},
at: [<ffffffff813ddcd5>] established_get_next+0xc4/0x132
[   83.363186]
[   83.363187] stack backtrace:
[   83.363191] Pid: 5076, comm: gwibber-service Not tainted 2.6.34-rc5-git6 #27
[   83.363194] Call Trace:
[   83.363202]  [<ffffffff81068086>] lockdep_rcu_dereference+0x9d/0xa5
[   83.363207]  [<ffffffff813dc998>] twsk_net+0x4f/0x57
[   83.363212]  [<ffffffff813ddc65>] established_get_next+0x54/0x132
[   83.363216]  [<ffffffff813dde47>] tcp_seq_next+0x5d/0x6a
[   83.363221]  [<ffffffff81105599>] seq_read+0x286/0x381
[   83.363226]  [<ffffffff81105313>] ? seq_read+0x0/0x381
[   83.363231]  [<ffffffff8113503c>] proc_reg_read+0x8d/0xac
[   83.363236]  [<ffffffff810ebf14>] vfs_read+0xa6/0x103
[   83.363241]  [<ffffffff810ec027>] sys_read+0x45/0x69
[   83.363246]  [<ffffffff81002b6b>] system_call_fastpath+0x16/0x1b

[   84.660302] [ INFO: suspicious rcu_dereference_check() usage. ]
[   84.660304] ---------------------------------------------------
[   84.660308] include/net/inet_timewait_sock.h:227 invoked
rcu_dereference_check() without protection!
[   84.660311]
[   84.660312] other info that might help us debug this:
[   84.660313]
[   84.660315]
[   84.660316] rcu_scheduler_active = 1, debug_locks = 1
[   84.660319] no locks held by gwibber-service/5081.
[   84.660321]
[   84.660322] stack backtrace:
[   84.660325] Pid: 5081, comm: gwibber-service Not tainted 2.6.34-rc5-git6 #27
[   84.660328] Call Trace:
[   84.660339]  [<ffffffff81068086>] lockdep_rcu_dereference+0x9d/0xa5
[   84.660345]  [<ffffffff813cad6f>] twsk_net+0x4f/0x57
[   84.660350]  [<ffffffff813cb18f>] __inet_twsk_hashdance+0x50/0x158
[   84.660355]  [<ffffffff813e0bb9>] tcp_time_wait+0x1c1/0x24b
[   84.660360]  [<ffffffff813d3d97>] tcp_fin+0x83/0x162
[   84.660364]  [<ffffffff813d4727>] tcp_data_queue+0x1ff/0xa1e
[   84.660370]  [<ffffffff810496aa>] ? mod_timer+0x1e/0x20
[   84.660375]  [<ffffffff813d8363>] tcp_rcv_state_process+0x89d/0x8f2
[   84.660381]  [<ffffffff813943bb>] ? release_sock+0x30/0x10b
[   84.660386]  [<ffffffff813de772>] tcp_v4_do_rcv+0x2de/0x33f
[   84.660391]  [<ffffffff8139440d>] release_sock+0x82/0x10b
[   84.660395]  [<ffffffff813ce875>] tcp_close+0x1b5/0x37e
[   84.660401]  [<ffffffff813ecdb7>] inet_release+0x50/0x57
[   84.660405]  [<ffffffff81391ae4>] sock_release+0x1a/0x66
[   84.660410]  [<ffffffff81391b52>] sock_close+0x22/0x26
[   84.660415]  [<ffffffff810ece07>] __fput+0x120/0x1cd
[   84.660420]  [<ffffffff810ecec9>] fput+0x15/0x17
[   84.660424]  [<ffffffff810e9d41>] filp_close+0x63/0x6d
[   84.660428]  [<ffffffff810e9e22>] sys_close+0xd7/0x111
[   84.660434]  [<ffffffff81002b6b>] system_call_fastpath+0x16/0x1b

^ permalink raw reply

* [PATCH net-2.6] bridge br_multicast: Ensure to initialize BR_INPUT_SKB_CB(skb)->mrouters_only.
From: YOSHIFUJI Hideaki @ 2010-04-25 18:59 UTC (permalink / raw)
  To: davem; +Cc: yoshfuji, netdev

Even with commit 32dec5dd0233ebffa9cae25ce7ba6daeb7df4467 ("bridge
br_multicast: Don't refer to BR_INPUT_SKB_CB(skb)->mrouters_only
without IGMP snooping."), BR_INPUT_SKB_CB(skb)->mrouters_only is
not appropriately initialized if IGMP snooping support is
compiled in but disabled, so we can see garbage.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/bridge/br_multicast.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 386c153..eaa0e1b 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -957,9 +957,6 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 	unsigned offset;
 	int err;

-	BR_INPUT_SKB_CB(skb)->igmp = 0;
-	BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
-
 	/* We treat OOM as packet loss for now. */
 	if (!pskb_may_pull(skb, sizeof(*iph)))
 		return -EINVAL;
@@ -1049,6 +1046,9 @@ err_out:
 int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
 		     struct sk_buff *skb)
 {
+	BR_INPUT_SKB_CB(skb)->igmp = 0;
+	BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
+
 	if (br->multicast_disabled)
 		return 0;

-- 
1.5.6.5

^ permalink raw reply related

* Re: [PATCH net-next-2.6] netns: call ops_free right after ops_exit
From: Jiri Pirko @ 2010-04-25 18:44 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: David Miller, netdev
In-Reply-To: <m1k4rvvb5x.fsf@fess.ebiederm.org>

Sun, Apr 25, 2010 at 04:50:34PM CEST, ebiederm@xmission.com wrote:
>David Miller <davem@davemloft.net> writes:
>
>> From: Jiri Pirko <jpirko@redhat.com>
>> Date: Sun, 25 Apr 2010 11:26:01 +0200
>>
>>> There's no need to iterate this twice. We can free net generic
>>> variables right after exit is called.
>>>
>>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>
>> Are you sure there are no problems with doing this?
>>
>> What if there are inter-net variable reference dependencies
>> or something like that?
>>
>> I really suspect it is being done this way on purpose, but
>> in the end I defer to experts like Eric B. :-)
>
>I am pretty certain there is a problem.  My memory is fuzzy this
>morning but I believe we can have rcu references between various
>pieces of the networking stack for a single network namespace.  So we
>need to cause all of the network namespace to exit before it is safe
>to free those pieces.

Hmm, that doesn't make much sense to me. Since the allocated memory in question
is used locally, after exit() is called, the memory chunk should not be used by
anyone and if it is, I think it's a bug.

Earlier, when the memory wasn't allocated automatically (by filling in .size),
it was freed individually in exit(). From what I understand from your reply,
you are saying this was buggy?

Jirka

>
>Eric
>
>

^ permalink raw reply

* [PATCH net-next-2.6] bridge br_multicast: Ensure to initialize BR_INPUT_SKB_CB(skb)->mrouters_only.
From: YOSHIFUJI Hideaki @ 2010-04-25 18:06 UTC (permalink / raw)
  To: davem; +Cc: yoshfuji, netdev

Even with commit 32dec5dd0233ebffa9cae25ce7ba6daeb7df4467 ("bridge
br_multicast: Don't refer to BR_INPUT_SKB_CB(skb)->mrouters_only
without IGMP snooping."), BR_INPUT_SKB_CB(skb)->mrouters_only is
not appropriately initialized if IGMP/MLD snooping support is
compiled in but disabled, so we can see garbage.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
---
 net/bridge/br_multicast.c |    9 +++------
 1 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 38d1fbd..e481dbd 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1340,9 +1340,6 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 	unsigned offset;
 	int err;

-	BR_INPUT_SKB_CB(skb)->igmp = 0;
-	BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
-
 	/* We treat OOM as packet loss for now. */
 	if (!pskb_may_pull(skb, sizeof(*iph)))
 		return -EINVAL;
@@ -1440,9 +1437,6 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 	unsigned offset;
 	int err;

-	BR_INPUT_SKB_CB(skb)->igmp = 0;
-	BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
-
 	if (!pskb_may_pull(skb, sizeof(*ip6h)))
 		return -EINVAL;

@@ -1550,6 +1544,9 @@ out:
 int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
 		     struct sk_buff *skb)
 {
+	BR_INPUT_SKB_CB(skb)->igmp = 0;
+	BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
+
 	if (br->multicast_disabled)
 		return 0;

-- 
1.5.6.5

^ permalink raw reply related

* Re: [2.6.34-rc5-git6] EIP: is at inet6_csk_bind_conflict + 06xe/0xb7 [ipv6]
From: Sedat Dilek @ 2010-04-25 17:25 UTC (permalink / raw)
  To: piotr; +Cc: Eric Dumazet, LKML, netdev, David Miller, Jiri Olsa, Jongman Heo
In-Reply-To: <4BD4757E.8020503@example.com>

On Sun, Apr 25, 2010 at 7:01 PM, Piotr Hosowicz <piotr@hosowicz.com> wrote:
> On 25.04.2010 18:55, Sedat Dilek wrote:
>>
[...]
>> On Sun, Apr 25, 2010 at 6:39 PM, Eric Dumazet<eric.dumazet@gmail.com>
>>  wrote:
>>>
>>> Le dimanche 25 avril 2010 à 18:36 +0200, Piotr Hosowicz a écrit :
>>>>
>>>> On 25.04.2010 18:31, Sedat Dilek wrote:
>>>>>
>>>>> [ CCing netdev ML ]
>>>>>
>>>>> Confirmed: The revert-patch [1] fixes the problem here.
>>>>
>>>> I confirm, I've built a git6 kernel and it works fine.
>>>>
>>>>> See also "Bug 15847 -  crash in inet6_csk_bind_conflict" [2].
>>>>>
>>>>> Feel free to add a... Tested-by: Sedat Dilek<sedat.dilek@gmail.com>
>>>>
>>>> I added created and tested phrase in my archive. ;-) Thank you a lot. I
>>>> hope there will be no this error in git7.
>>>>
>>>
>>> Did you test the proposed fix ?
>
> Eric, but where to get the proposed fix as a patch? I looked at kernel's
> bugzilla and there is no such thing. As for now I applied Sedat's reverse
> patch and now I am booted fine in git6 kernel.
>
> Regards,
>
> Piotr Hosowicz
>

To easily find patches proposed on LKML, look at <patchwork.kernel.org> [1].
There you will find other mailing lists and further patches.
Copying and pasting the patch from Eric's email should also work :-).

- Sedat -

[1] https://patchwork.kernel.org/project/LKML/list/
[2] https://patchwork.kernel.org/patch/94961/

^ permalink raw reply

* Re: PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )
From: Michael S. Tsirkin @ 2010-04-25 16:35 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Evgeniy Polyakov, Ben Greear, David Miller, Gaspar Chilingarov,
	netdev
In-Reply-To: <1272212460.2069.9.camel@edumazet-laptop>

On Sun, Apr 25, 2010 at 06:21:00PM +0200, Eric Dumazet wrote:
> Le dimanche 25 avril 2010 à 19:56 +0400, Evgeniy Polyakov a écrit :
> > On Sun, Apr 25, 2010 at 05:26:42PM +0300, Michael S. Tsirkin (mst@redhat.com) wrote:
> > 
> > > > diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> > > > index 0c5e3c3..fb6959c 100644
> > > > --- a/net/ipv6/inet6_connection_sock.c
> > > > +++ b/net/ipv6/inet6_connection_sock.c
> > > > @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
> > > >  		if (sk != sk2 &&
> > > >  		    (!sk->sk_bound_dev_if ||
> > > >  		     !sk2->sk_bound_dev_if ||
> > > > -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> > > > -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> > > > -		     sk2->sk_state == TCP_LISTEN) &&
> > > > -		     ipv6_rcv_saddr_equal(sk, sk2))
> > > > -			break;
> > > > +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> > > > +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> > > > +			     sk2->sk_state == TCP_LISTEN) &&
> > > > +			     ipv6_rcv_saddr_equal(sk, sk2))
> > > > +				break;
> > > > +			else if (sk->sk_reuse && sk2->sk_reuse &&
> > > > +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> > 
> > I suppose above line is guilty when inet6_rcv_saddr() returns NULL?
> > 
> 
> Sorry, I cant test this at this moment (I am travelling)
> 
> Evgeniy, David could you double check ?
> 
> Michael, could you test this patch ?
> 
> Thanks !
> 
> [PATCH] ipv6: Fix inet6_csk_bind_conflict()
> 
> Commit fda48a0d7a84 (tcp: bind() fix when many ports are bound)
> introduced a bug on IPV6 part.
> We should not call ipv6_addr_any(inet6_rcv_saddr(sk2)) but
> ipv6_addr_any(inet6_rcv_saddr(sk)) because sk2 can be IPV4, while sk is
> IPV6.
> 
> Reported-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> index b4b7d40..3a4d92b 100644
> --- a/net/ipv6/inet6_connection_sock.c
> +++ b/net/ipv6/inet6_connection_sock.c
> @@ -48,7 +48,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
>  			     ipv6_rcv_saddr_equal(sk, sk2))
>  				break;
>  			else if (sk->sk_reuse && sk2->sk_reuse &&
> -				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> +				!ipv6_addr_any(inet6_rcv_saddr(sk)) &&
>  				ipv6_rcv_saddr_equal(sk, sk2))
>  				break;
>  		}
> 

works for me
Tested-by: Michael S. Tsirkin <mst@redhat.com>


^ permalink raw reply

* Re: [2.6.34-rc5-git6] EIP: is at inet6_csk_bind_conflict + 06xe/0xb7 [ipv6]
From: Eric Dumazet @ 2010-04-25 17:08 UTC (permalink / raw)
  To: piotr
  Cc: sedat.dilek, Sedat Dilek, LKML, netdev, David Miller, Jiri Olsa,
	Jongman Heo
In-Reply-To: <4BD4757E.8020503@example.com>

Le dimanche 25 avril 2010 à 19:01 +0200, Piotr Hosowicz a écrit :
> On 25.04.2010 18:55, Sedat Dilek wrote:
> > Rebuild ipv6 kernel-modules by:
> >
> > $ make M=net/ipv6
> >
> > ...and copied net/ipv6/*.ko and net/ipv6/netfilter/*.ko files manually
> > to the right place.
> >
> > Applied your patch (seen on netdev ML) already and booted into new kernel.
> > Works, thanks.
> >
> > Tested-by: Sedat Dilek<sedat.dilek@gmail.com>
> >
> > - Sedat -
> >
> > On Sun, Apr 25, 2010 at 6:39 PM, Eric Dumazet<eric.dumazet@gmail.com>  wrote:
> >> Le dimanche 25 avril 2010 à 18:36 +0200, Piotr Hosowicz a écrit :
> >>> On 25.04.2010 18:31, Sedat Dilek wrote:
> >>>> [ CCing netdev ML ]
> >>>>
> >>>> Confirmed: The revert-patch [1] fixes the problem here.
> >>>
> >>> I confirm, I've built a git6 kernel and it works fine.
> >>>
> >>>> See also "Bug 15847 -  crash in inet6_csk_bind_conflict" [2].
> >>>>
> >>>> Feel free to add a... Tested-by: Sedat Dilek<sedat.dilek@gmail.com>
> >>>
> >>> I added created and tested phrase in my archive. ;-) Thank you a lot. I
> >>> hope there will be no this error in git7.
> >>>
> >>
> >> Did you test the proposed fix ?
> 
> Eric, but where to get the proposed fix as a patch? I looked at kernel's 
> bugzilla and there is no such thing. As for now I applied Sedat's 
> reverse patch and now I am booted fine in git6 kernel.
> 

I won't spend my Sunday time filling in bugzilla entries.

I sent a patch on netdev.

I finally could test it (I have a very slow machine while traveling, and
compiling the thing took a *lot* of time)

Now back to family affairs ;)




^ permalink raw reply

* Re: [2.6.34-rc5-git6] EIP: is at inet6_csk_bind_conflict + 06xe/0xb7 [ipv6]
From: Piotr Hosowicz @ 2010-04-25 17:01 UTC (permalink / raw)
  To: sedat.dilek
  Cc: Sedat Dilek, Eric Dumazet, LKML, netdev, David Miller, Jiri Olsa,
	Jongman Heo
In-Reply-To: <j2p2d0a357f1004250955i59969a2cp9fd9eb9f3ef22d4b@mail.gmail.com>

On 25.04.2010 18:55, Sedat Dilek wrote:
> Rebuild ipv6 kernel-modules by:
>
> $ make M=net/ipv6
>
> ...and copied net/ipv6/*.ko and net/ipv6/netfilter/*.ko files manually
> to the right place.
>
> Applied your patch (seen on netdev ML) already and booted into new kernel.
> Works, thanks.
>
> Tested-by: Sedat Dilek<sedat.dilek@gmail.com>
>
> - Sedat -
>
> On Sun, Apr 25, 2010 at 6:39 PM, Eric Dumazet<eric.dumazet@gmail.com>  wrote:
>> Le dimanche 25 avril 2010 à 18:36 +0200, Piotr Hosowicz a écrit :
>>> On 25.04.2010 18:31, Sedat Dilek wrote:
>>>> [ CCing netdev ML ]
>>>>
>>>> Confirmed: The revert-patch [1] fixes the problem here.
>>>
>>> I confirm, I've built a git6 kernel and it works fine.
>>>
>>>> See also "Bug 15847 -  crash in inet6_csk_bind_conflict" [2].
>>>>
>>>> Feel free to add a... Tested-by: Sedat Dilek<sedat.dilek@gmail.com>
>>>
>>> I added created and tested phrase in my archive. ;-) Thank you a lot. I
>>> hope there will be no this error in git7.
>>>
>>
>> Did you test the proposed fix ?

Eric, but where to get the proposed fix as a patch? I looked at kernel's 
bugzilla and there is no such thing. As for now I applied Sedat's 
reverse patch and now I am booted fine in git6 kernel.

Regards,

Piotr Hosowicz

>> [PATCH] ipv6: Fix inet6_csk_bind_conflict()
>>
>> Commit fda48a0d7a84 (tcp: bind() fix when many ports are bound)
>> introduced a bug on IPV6 part.
>> We should not call ipv6_addr_any(inet6_rcv_saddr(sk2)) but
>> ipv6_addr_any(inet6_rcv_saddr(sk)) because sk2 can be IPV4, while sk is
>> IPV6.
>>
>> Reported-by: Michael S. Tsirkin<mst@redhat.com>
>> Signed-off-by: Eric Dumazet<eric.dumazet@gmail.com>
>> ---
>> diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
>> index b4b7d40..3a4d92b 100644
>> --- a/net/ipv6/inet6_connection_sock.c
>> +++ b/net/ipv6/inet6_connection_sock.c
>> @@ -48,7 +48,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
>>                              ipv6_rcv_saddr_equal(sk, sk2))
>>                                 break;
>>                         else if (sk->sk_reuse&&  sk2->sk_reuse&&
>> -                               !ipv6_addr_any(inet6_rcv_saddr(sk2))&&
>> +                               !ipv6_addr_any(inet6_rcv_saddr(sk))&&
>>                                 ipv6_rcv_saddr_equal(sk, sk2))
>>                                 break;
>>                 }
>>
>>
>>


-- 
"Bardzo fajna firma, tylko czasami nie płaci. Do jej głównych
zalet należą: ekspres ciśnieniowy (ale tylko na jednym z pię-
ter) oraz fajne widoki z salki konferencyjnej. No i jakieś
pół roku temu odeszło 90% pracowników działu IT, ale może nie
mieli racji ;)"
NP: Chickenfoot - Soap On A Rope
NB: 2.6.34-rc5-git6

^ permalink raw reply

* Re: [2.6.34-rc5-git6] EIP: is at inet6_csk_bind_conflict + 06xe/0xb7 [ipv6]
From: Sedat Dilek @ 2010-04-25 16:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: piotr, LKML, netdev, David Miller, Jiri Olsa, Jongman Heo
In-Reply-To: <1272213597.2069.32.camel@edumazet-laptop>

[-- Attachment #1: Type: text/plain, Size: 2241 bytes --]

Rebuilt the ipv6 kernel modules with:

$ make M=net/ipv6

...and copied net/ipv6/*.ko and net/ipv6/netfilter/*.ko files manually
to the right place.

Applied your patch (seen on netdev ML) already and booted into new kernel.
Works, thanks.

Tested-by: Sedat Dilek <sedat.dilek@gmail.com>

- Sedat -

On Sun, Apr 25, 2010 at 6:39 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le dimanche 25 avril 2010 à 18:36 +0200, Piotr Hosowicz a écrit :
>> On 25.04.2010 18:31, Sedat Dilek wrote:
>> > [ CCing netdev ML ]
>> >
>> > Confirmed: The revert-patch [1] fixes the problem here.
>>
>> I confirm, I've built a git6 kernel and it works fine.
>>
>> > See also "Bug 15847 -  crash in inet6_csk_bind_conflict" [2].
>> >
>> > Feel free to add a... Tested-by: Sedat Dilek<sedat.dilek@gmail.com>
>>
>> I added created and tested phrase in my archive. ;-) Thank you a lot. I
>> hope there will be no this error in git7.
>>
>
> Did you test the proposed fix ?
>
>
> [PATCH] ipv6: Fix inet6_csk_bind_conflict()
>
> Commit fda48a0d7a84 (tcp: bind() fix when many ports are bound)
> introduced a bug on IPV6 part.
> We should not call ipv6_addr_any(inet6_rcv_saddr(sk2)) but
> ipv6_addr_any(inet6_rcv_saddr(sk)) because sk2 can be IPV4, while sk is
> IPV6.
>
> Reported-by: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
> diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> index b4b7d40..3a4d92b 100644
> --- a/net/ipv6/inet6_connection_sock.c
> +++ b/net/ipv6/inet6_connection_sock.c
> @@ -48,7 +48,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
>                             ipv6_rcv_saddr_equal(sk, sk2))
>                                break;
>                        else if (sk->sk_reuse && sk2->sk_reuse &&
> -                               !ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> +                               !ipv6_addr_any(inet6_rcv_saddr(sk)) &&
>                                ipv6_rcv_saddr_equal(sk, sk2))
>                                break;
>                }
>
>
>

[-- Attachment #2: ipv6-Fix-inet6_csk_bind_conflict.patch --]
[-- Type: text/x-diff, Size: 882 bytes --]

[PATCH] ipv6: Fix inet6_csk_bind_conflict()

Commit fda48a0d7a84 (tcp: bind() fix when many ports are bound)
introduced a bug on IPV6 part.
We should not call ipv6_addr_any(inet6_rcv_saddr(sk2)) but
ipv6_addr_any(inet6_rcv_saddr(sk)) because sk2 can be IPV4, while sk is
IPV6.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index b4b7d40..3a4d92b 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -48,7 +48,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 			     ipv6_rcv_saddr_equal(sk, sk2))
 				break;
 			else if (sk->sk_reuse && sk2->sk_reuse &&
-				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
+				!ipv6_addr_any(inet6_rcv_saddr(sk)) &&
 				ipv6_rcv_saddr_equal(sk, sk2))
 				break;
 		}

^ permalink raw reply related

* Re: [2.6.34-rc5-git6] EIP: is at inet6_csk_bind_conflict + 06xe/0xb7 [ipv6]
From: Eric Dumazet @ 2010-04-25 16:39 UTC (permalink / raw)
  To: piotr
  Cc: sedat.dilek, Sedat Dilek, LKML, netdev, David Miller, Jiri Olsa,
	Jongman Heo
In-Reply-To: <4BD46F9C.5060500@example.com>

Le dimanche 25 avril 2010 à 18:36 +0200, Piotr Hosowicz a écrit :
> On 25.04.2010 18:31, Sedat Dilek wrote:
> > [ CCing netdev ML ]
> >
> > Confirmed: The revert-patch [1] fixes the problem here.
> 
> I confirm, I've built a git6 kernel and it works fine.
> 
> > See also "Bug 15847 -  crash in inet6_csk_bind_conflict" [2].
> >
> > Feel free to add a... Tested-by: Sedat Dilek<sedat.dilek@gmail.com>
> 
> I added created and tested phrase in my archive. ;-) Thank you a lot. I 
> hope there will be no this error in git7.
> 

Did you test the proposed fix ?


[PATCH] ipv6: Fix inet6_csk_bind_conflict()

Commit fda48a0d7a84 (tcp: bind() fix when many ports are bound)
introduced a bug on IPV6 part.
We should not call ipv6_addr_any(inet6_rcv_saddr(sk2)) but
ipv6_addr_any(inet6_rcv_saddr(sk)) because sk2 can be IPV4, while sk is
IPV6.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index b4b7d40..3a4d92b 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -48,7 +48,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 			     ipv6_rcv_saddr_equal(sk, sk2))
 				break;
 			else if (sk->sk_reuse && sk2->sk_reuse &&
-				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
+				!ipv6_addr_any(inet6_rcv_saddr(sk)) &&
 				ipv6_rcv_saddr_equal(sk, sk2))
 				break;
 		}

^ permalink raw reply related

* Re: [2.6.34-rc5-git6] EIP: is at inet6_csk_bind_conflict + 0x6e/0xb7 [ipv6]
From: Piotr Hosowicz @ 2010-04-25 16:36 UTC (permalink / raw)
  To: sedat.dilek
  Cc: Sedat Dilek, LKML, netdev, David Miller, Jiri Olsa, Eric Dumazet,
	Jongman Heo
In-Reply-To: <g2z2d0a357f1004250931pf5880c60l32fd0643e0f14bde@mail.gmail.com>

On 25.04.2010 18:31, Sedat Dilek wrote:
> [ CCing netdev ML ]
>
> Confirmed: The revert-patch [1] fixes the problem here.

I confirm, I've built a git6 kernel and it works fine.

> See also "Bug 15847 -  crash in inet6_csk_bind_conflict" [2].
>
> Feel free to add a... Tested-by: Sedat Dilek<sedat.dilek@gmail.com>

I added created and tested phrase in my archive. ;-) Thanks a lot. I
hope this error will be gone in git7.

Regards,

Piotr Hosowicz

> - Sedat -
>
> [1] https://patchwork.kernel.org/patch/94959/
> [2] https://bugzilla.kernel.org/show_bug.cgi?id=15847
>
> 2010/4/25 Piotr Hosowicz<piotr@hosowicz.com>:
>> On 25.04.2010 17:11, Sedat Dilek wrote:
>>>
>>> [ Please CC - I am not subscribed to LKML ]
>>>
>>> [QUOTE]
>>>
>>> On 25.04.2010 16:26, Jongman Heo wrote:
>>>
>>>> I also hit this bug today.
>>>
>>> I also hit similar bug, maybe it is the same.
>>>
>>>> Doing git bisect, first bad commit was
>>>>
>>>>    commit fda48a0d7a8412cedacda46a9c0bf8ef9cd13559
>>>>    tcp: bind() fix when many ports are bound
>>>>
>>>> Reverting above commit fixes the problem.
>>>
>>> How to do it? Would you please publish a reverting patch?
>>>
>>> Regards,
>>>
>>> Piotr Hosowicz
>>>
>>> [/QUOTE]
>>>
>>> Hi,
>>>
>>> I forgot to mention that 2.6.34-rc5-git5 was OK.
>>>
>>> Reverting this commit did not help:
>>>
>>> commit f4f914b58019f0e50d521bbbadfaee260d766f95
>>> net: ipv6 bind to device issue
>>>
>>> After looking into net-2.6 GIT repository, "tcp: bind() fix when many
>>> ports are bound" could cause indeed the problems here, too.
>>> Building now....
>>>
>>> Regards,
>>> - Sedat -
>>>
>>> P.S.: Attached 0001-Revert-tcp-bind-fix-when-many-ports-are-bound.patch
>>
>> Thanks a lot. Applied and building now.
>>
>> Regards,
>>
>> Piotr Hosowicz
>>
>> --
>> Z cyklu "Uroki demokracji", czyli pytania i odpowiedzi w teledurniejach:
>> - W którym kraju znajduje się Mount Everest?
>> - Hm, to nie Szkocja, prawda?
>> NP: Mark Knopfler - Cleaning My Gun
>> NB: 2.6.34-rc5-git5
>>
>


-- 
Grupa marzeń w eliminacjach MŚ :
Zimbabwe, Alaska, Grenlandia, Antarktyda i Zair.
NP: Chickenfoot - Oh Yeah
NB: 2.6.34-rc5-git6

^ permalink raw reply

* Re: [2.6.34-rc5-git6] EIP: is at inet6_csk_bind_conflict + 0x6e/0xb7 [ipv6]
From: Sedat Dilek @ 2010-04-25 16:31 UTC (permalink / raw)
  To: piotr; +Cc: LKML, netdev, David Miller, Jiri Olsa, Eric Dumazet, Jongman Heo
In-Reply-To: <4BD45E68.4080900@example.com>

[ CCing netdev ML ]

Confirmed: The revert-patch [1] fixes the problem here.

See also "Bug 15847 -  crash in inet6_csk_bind_conflict" [2].

Feel free to add a... Tested-by: Sedat Dilek <sedat.dilek@gmail.com>

- Sedat -

[1] https://patchwork.kernel.org/patch/94959/
[2] https://bugzilla.kernel.org/show_bug.cgi?id=15847

2010/4/25 Piotr Hosowicz <piotr@hosowicz.com>:
> On 25.04.2010 17:11, Sedat Dilek wrote:
>>
>> [ Please CC - I am not subscribed to LKML ]
>>
>> [QUOTE]
>>
>> On 25.04.2010 16:26, Jongman Heo wrote:
>>
>>> I also hit this bug today.
>>
>> I also hit similar bug, maybe it is the same.
>>
>>> Doing git bisect, first bad commit was
>>>
>>>   commit fda48a0d7a8412cedacda46a9c0bf8ef9cd13559
>>>   tcp: bind() fix when many ports are bound
>>>
>>> Reverting above commit fixes the problem.
>>
>> How to do it? Would you please publish a reverting patch?
>>
>> Regards,
>>
>> Piotr Hosowicz
>>
>> [/QUOTE]
>>
>> Hi,
>>
>> I forgot to mention that 2.6.34-rc5-git5 was OK.
>>
>> Reverting this commit did not help:
>>
>> commit f4f914b58019f0e50d521bbbadfaee260d766f95
>> net: ipv6 bind to device issue
>>
>> After looking into net-2.6 GIT repository, "tcp: bind() fix when many
>> ports are bound" could cause indeed the problems here, too.
>> Building now....
>>
>> Regards,
>> - Sedat -
>>
>> P.S.: Attached 0001-Revert-tcp-bind-fix-when-many-ports-are-bound.patch
>
> Thanks a lot. Applied and building now.
>
> Regards,
>
> Piotr Hosowicz
>
> --
> Z cyklu "Uroki demokracji", czyli pytania i odpowiedzi w teledurniejach:
> - W którym kraju znajduje się Mount Everest?
> - Hm, to nie Szkocja, prawda?
> NP: Mark Knopfler - Cleaning My Gun
> NB: 2.6.34-rc5-git5
>

^ permalink raw reply

* Re: PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )
From: Eric Dumazet @ 2010-04-25 16:21 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Michael S. Tsirkin, Ben Greear, David Miller, Gaspar Chilingarov,
	netdev
In-Reply-To: <20100425155600.GA13319@ioremap.net>

Le dimanche 25 avril 2010 à 19:56 +0400, Evgeniy Polyakov a écrit :
> On Sun, Apr 25, 2010 at 05:26:42PM +0300, Michael S. Tsirkin (mst@redhat.com) wrote:
> 
> > > diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> > > index 0c5e3c3..fb6959c 100644
> > > --- a/net/ipv6/inet6_connection_sock.c
> > > +++ b/net/ipv6/inet6_connection_sock.c
> > > @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
> > >  		if (sk != sk2 &&
> > >  		    (!sk->sk_bound_dev_if ||
> > >  		     !sk2->sk_bound_dev_if ||
> > > -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> > > -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> > > -		     sk2->sk_state == TCP_LISTEN) &&
> > > -		     ipv6_rcv_saddr_equal(sk, sk2))
> > > -			break;
> > > +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> > > +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> > > +			     sk2->sk_state == TCP_LISTEN) &&
> > > +			     ipv6_rcv_saddr_equal(sk, sk2))
> > > +				break;
> > > +			else if (sk->sk_reuse && sk2->sk_reuse &&
> > > +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> 
> I suppose above line is guilty when inet6_rcv_saddr() returns NULL?
> 

Sorry, I can't test this at the moment (I am travelling)

Evgeniy, David could you double check ?

Michael, could you test this patch ?

Thanks !

[PATCH] ipv6: Fix inet6_csk_bind_conflict()

Commit fda48a0d7a84 (tcp: bind() fix when many ports are bound)
introduced a bug on IPV6 part.
We should not call ipv6_addr_any(inet6_rcv_saddr(sk2)) but
ipv6_addr_any(inet6_rcv_saddr(sk)) because sk2 can be IPV4, while sk is
IPV6.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index b4b7d40..3a4d92b 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -48,7 +48,7 @@ int inet6_csk_bind_conflict(const struct sock *sk,
 			     ipv6_rcv_saddr_equal(sk, sk2))
 				break;
 			else if (sk->sk_reuse && sk2->sk_reuse &&
-				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
+				!ipv6_addr_any(inet6_rcv_saddr(sk)) &&
 				ipv6_rcv_saddr_equal(sk, sk2))
 				break;
 		}



^ permalink raw reply related

* Re: PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )
From: Eric Dumazet @ 2010-04-25 16:13 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Michael S. Tsirkin, Ben Greear, David Miller, Gaspar Chilingarov,
	netdev
In-Reply-To: <20100425155600.GA13319@ioremap.net>

Le dimanche 25 avril 2010 à 19:56 +0400, Evgeniy Polyakov a écrit :
> On Sun, Apr 25, 2010 at 05:26:42PM +0300, Michael S. Tsirkin (mst@redhat.com) wrote:
> 
> > > diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> > > index 0c5e3c3..fb6959c 100644
> > > --- a/net/ipv6/inet6_connection_sock.c
> > > +++ b/net/ipv6/inet6_connection_sock.c
> > > @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
> > >  		if (sk != sk2 &&
> > >  		    (!sk->sk_bound_dev_if ||
> > >  		     !sk2->sk_bound_dev_if ||
> > > -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> > > -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> > > -		     sk2->sk_state == TCP_LISTEN) &&
> > > -		     ipv6_rcv_saddr_equal(sk, sk2))
> > > -			break;
> > > +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> > > +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> > > +			     sk2->sk_state == TCP_LISTEN) &&
> > > +			     ipv6_rcv_saddr_equal(sk, sk2))
> > > +				break;
> > > +			else if (sk->sk_reuse && sk2->sk_reuse &&
> > > +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> 
> I suppose above line is guilty when inet6_rcv_saddr() returns NULL?
> 

Oh, it's a typo

we should test ipv6_addr_any(inet6_rcv_saddr(sk))

instead of ipv6_addr_any(inet6_rcv_saddr(sk2))

(sk is AF_INET6, while sk2 could be AF_INET)

I'll submit a patch promptly



^ permalink raw reply

* Re: PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )
From: Evgeniy Polyakov @ 2010-04-25 15:56 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Eric Dumazet, Ben Greear, David Miller, Gaspar Chilingarov,
	netdev
In-Reply-To: <20100425142642.GA11411@redhat.com>

On Sun, Apr 25, 2010 at 05:26:42PM +0300, Michael S. Tsirkin (mst@redhat.com) wrote:

> > diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> > index 0c5e3c3..fb6959c 100644
> > --- a/net/ipv6/inet6_connection_sock.c
> > +++ b/net/ipv6/inet6_connection_sock.c
> > @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
> >  		if (sk != sk2 &&
> >  		    (!sk->sk_bound_dev_if ||
> >  		     !sk2->sk_bound_dev_if ||
> > -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> > -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> > -		     sk2->sk_state == TCP_LISTEN) &&
> > -		     ipv6_rcv_saddr_equal(sk, sk2))
> > -			break;
> > +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> > +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> > +			     sk2->sk_state == TCP_LISTEN) &&
> > +			     ipv6_rcv_saddr_equal(sk, sk2))
> > +				break;
> > +			else if (sk->sk_reuse && sk2->sk_reuse &&
> > +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&

I suppose above line is guilty when inet6_rcv_saddr() returns NULL?

-- 
	Evgeniy Polyakov

^ permalink raw reply

* Re: [PATCH] RCU: don't turn off lockdep when find suspicious rcu_dereference_check() usage
From: Miles Lane @ 2010-04-25 15:49 UTC (permalink / raw)
  To: paulmck
  Cc: Vivek Goyal, Eric Paris, Lai Jiangshan, Ingo Molnar,
	Peter Zijlstra, LKML, nauman, eric.dumazet, netdev, Jens Axboe,
	Gui Jianfeng, Li Zefan, Johannes Berg
In-Reply-To: <20100425023455.GM2440@linux.vnet.ibm.com>

On Sat, Apr 24, 2010 at 10:34 PM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Fri, Apr 23, 2010 at 06:59:12PM -0400, Miles Lane wrote:
>> On Fri, Apr 23, 2010 at 3:42 PM, Paul E. McKenney
>> <paulmck@linux.vnet.ibm.com> wrote:
>> > On Fri, Apr 23, 2010 at 08:50:59AM -0400, Miles Lane wrote:
>> >> Hi Paul,
>> >> There has been a bit of back and forth, and I am not sure what patches
>> >> I should test now.
>> >> Could you send me a bundle of whatever needs testing now?
>> >
>> > Hello, Miles,
>> >
>> > I am posting my set as replies to this message.  There are a couple
>> > of KVM fixes that are going up via Avi's tree, and a number of networking
>> > fixes that are going up via Dave Miller's tree -- a number of these
>> > are against quickly changing code, so it didn't make sense for me to
>> > keep them separately.
>> >
>> > I believe that the two splats below are addressed by this patch set
>> > carried in the networking tree:
>> >
>> >        https://patchwork.kernel.org/patch/90754/
>>
>> With your twelve patches and the one linked to above applied to
>> 2.6.34-rc5-git3, here are the warnings I see:
>>
>> [    0.173969] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [    0.174097] ---------------------------------------------------
>> [    0.174226] include/linux/cgroup.h:534 invoked
>> rcu_dereference_check() without protection!
>> [    0.174429]
>> [    0.174430] other info that might help us debug this:
>> [    0.174431]
>> [    0.174792]
>> [    0.174793] rcu_scheduler_active = 1, debug_locks = 1
>> [    0.175037] no locks held by watchdog/0/5.
>> [    0.175162]
>> [    0.175163] stack backtrace:
>> [    0.175405] Pid: 5, comm: watchdog/0 Not tainted 2.6.34-rc5-git3 #22
>> [    0.175534] Call Trace:
>> [    0.175666]  [<ffffffff81067fbe>] lockdep_rcu_dereference+0x9d/0xa5
>> [    0.175799]  [<ffffffff8102d678>] task_subsys_state+0x59/0x70
>> [    0.175931]  [<ffffffff810328fa>] __sched_setscheduler+0x19d/0x300
>> [    0.176064]  [<ffffffff8102b477>] ? need_resched+0x1e/0x28
>> [    0.176196]  [<ffffffff813cd401>] ? schedule+0x5c3/0x66e
>> [    0.176327]  [<ffffffff81091943>] ? watchdog+0x0/0x8c
>> [    0.176457]  [<ffffffff81032a78>] sched_setscheduler+0xe/0x10
>> [    0.176587]  [<ffffffff8109196d>] watchdog+0x2a/0x8c
>> [    0.176677]  [<ffffffff81091943>] ? watchdog+0x0/0x8c
>> [    0.176808]  [<ffffffff81057152>] kthread+0x89/0x91
>> [    0.176939]  [<ffffffff8106891e>] ? trace_hardirqs_on_caller+0x114/0x13f
>> [    0.177073]  [<ffffffff81003994>] kernel_thread_helper+0x4/0x10
>> [    0.177204]  [<ffffffff813cfc40>] ? restore_args+0x0/0x30
>> [    0.177334]  [<ffffffff810570c9>] ? kthread+0x0/0x91
>> [    0.177463]  [<ffffffff81003990>] ? kernel_thread_helper+0x0/0x10
>
> According to Documentation/cgroups/cgroups.txt, we must hold cgroup_mutex,
> the task's task_alloc lock, or be in an RCU read-side critical section.
> We are doing none of these.
>
> I would argue that sched_setscheduler() should take care of
> synchronization, but am not sure which of these three are appropriate
> for sched_setscheduler() to acquire.  Peter, thoughts?
>
>> [    3.173419] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [    3.173419] ---------------------------------------------------
>> [    3.173419] kernel/cgroup.c:4438 invoked rcu_dereference_check()
>> without protection!
>> [    3.173419]
>> [    3.173419] other info that might help us debug this:
>> [    3.173419]
>> [    3.173419]
>> [    3.173419] rcu_scheduler_active = 1, debug_locks = 1
>> [    3.173419] 2 locks held by async/0/668:
>> [    3.173419]  #0:  (&shost->scan_mutex){+.+.+.}, at:
>> [<ffffffff812df020>] __scsi_add_device+0x83/0xe4
>> [    3.173419]  #1:  (&(&blkcg->lock)->rlock){......}, at:
>> [<ffffffff811f2df9>] blkiocg_add_blkio_group+0x29/0x7f
>> [    3.173419]
>> [    3.173419] stack backtrace:
>> [    3.173419] Pid: 668, comm: async/0 Not tainted 2.6.34-rc5-git3 #22
>> [    3.173419] Call Trace:
>> [    3.173419]  [<ffffffff81067fbe>] lockdep_rcu_dereference+0x9d/0xa5
>> [    3.173419]  [<ffffffff8107f9ad>] css_id+0x3f/0x51
>> [    3.173419]  [<ffffffff811f2e08>] blkiocg_add_blkio_group+0x38/0x7f
>> [    3.173419]  [<ffffffff811f4dd0>] cfq_init_queue+0xdf/0x2dc
>> [    3.173419]  [<ffffffff811e33b1>] elevator_init+0xba/0xf5
>> [    3.173419]  [<ffffffff812dbfaa>] ? scsi_request_fn+0x0/0x451
>> [    3.173419]  [<ffffffff811e68d7>] blk_init_queue_node+0x12f/0x135
>> [    3.173419]  [<ffffffff811e68e9>] blk_init_queue+0xc/0xe
>> [    3.173419]  [<ffffffff812dc41c>] __scsi_alloc_queue+0x21/0x111
>> [    3.173419]  [<ffffffff812dc524>] scsi_alloc_queue+0x18/0x64
>> [    3.173419]  [<ffffffff812de520>] scsi_alloc_sdev+0x19e/0x256
>> [    3.173419]  [<ffffffff812de6be>] scsi_probe_and_add_lun+0xe6/0x9c5
>> [    3.173419]  [<ffffffff8106891e>] ? trace_hardirqs_on_caller+0x114/0x13f
>> [    3.173419]  [<ffffffff813ce056>] ? __mutex_lock_common+0x3e4/0x43a
>> [    3.173419]  [<ffffffff812df020>] ? __scsi_add_device+0x83/0xe4
>> [    3.173419]  [<ffffffff812d09dc>] ? transport_setup_classdev+0x0/0x17
>> [    3.173419]  [<ffffffff812df020>] ? __scsi_add_device+0x83/0xe4
>> [    3.173419]  [<ffffffff812df055>] __scsi_add_device+0xb8/0xe4
>> [    3.173419]  [<ffffffff812ea945>] ata_scsi_scan_host+0x74/0x16e
>> [    3.173419]  [<ffffffff81057699>] ? autoremove_wake_function+0x0/0x34
>> [    3.173419]  [<ffffffff812e8de4>] async_port_probe+0xab/0xb7
>> [    3.173419]  [<ffffffff8105e1b1>] ? async_thread+0x0/0x1f4
>> [    3.173419]  [<ffffffff8105e2b6>] async_thread+0x105/0x1f4
>> [    3.173419]  [<ffffffff81033d8e>] ? default_wake_function+0x0/0xf
>> [    3.173419]  [<ffffffff8105e1b1>] ? async_thread+0x0/0x1f4
>> [    3.173419]  [<ffffffff81057152>] kthread+0x89/0x91
>> [    3.173419]  [<ffffffff8106891e>] ? trace_hardirqs_on_caller+0x114/0x13f
>> [    3.173419]  [<ffffffff81003994>] kernel_thread_helper+0x4/0x10
>> [    3.173419]  [<ffffffff813cfc40>] ? restore_args+0x0/0x30
>> [    3.173419]  [<ffffffff810570c9>] ? kthread+0x0/0x91
>> [    3.173419]  [<ffffffff81003990>] ? kernel_thread_helper+0x0/0x10
>
> Please see below for a patch for this based on my earlier conversation
> with Vivek Goyal.  (Vivek, if you are already pushing a fix elsewhere,
> please let me know, and I will drop my patch in favor of yours.)
>
>> [   32.905446] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [   32.905449] ---------------------------------------------------
>> [   32.905453] net/core/dev.c:1993 invoked rcu_dereference_check()
>> without protection!
>> [   32.905456]
>> [   32.905457] other info that might help us debug this:
>> [   32.905458]
>> [   32.905461]
>> [   32.905462] rcu_scheduler_active = 1, debug_locks = 1
>> [   32.905466] 2 locks held by canberra-gtk-pl/4182:
>> [   32.905469]  #0:  (sk_lock-AF_INET){+.+.+.}, at:
>> [<ffffffff81394f7d>] inet_stream_connect+0x3a/0x24d
>> [   32.905483]  #1:  (rcu_read_lock_bh){.+....}, at:
>> [<ffffffff8134a789>] dev_queue_xmit+0x14e/0x4b8
>> [   32.905495]
>> [   32.905496] stack backtrace:
>> [   32.905500] Pid: 4182, comm: canberra-gtk-pl Not tainted 2.6.34-rc5-git3 #22
>> [   32.905504] Call Trace:
>> [   32.905512]  [<ffffffff81067fbe>] lockdep_rcu_dereference+0x9d/0xa5
>> [   32.905518]  [<ffffffff8134a894>] dev_queue_xmit+0x259/0x4b8
>> [   32.905524]  [<ffffffff8134a789>] ? dev_queue_xmit+0x14e/0x4b8
>> [   32.905531]  [<ffffffff81041c66>] ? _local_bh_enable_ip+0xcd/0xda
>> [   32.905538]  [<ffffffff813536da>] neigh_resolve_output+0x234/0x285
>> [   32.905544]  [<ffffffff8136f69f>] ip_finish_output2+0x257/0x28c
>> [   32.905549]  [<ffffffff8136f73c>] ip_finish_output+0x68/0x6a
>> [   32.905554]  [<ffffffff81370433>] T.866+0x52/0x59
>> [   32.905559]  [<ffffffff8137067e>] ip_output+0xaa/0xb4
>> [   32.905565]  [<ffffffff8136eb38>] ip_local_out+0x20/0x24
>> [   32.905571]  [<ffffffff8136f184>] ip_queue_xmit+0x309/0x368
>> [   32.905578]  [<ffffffff810e4226>] ? __kmalloc_track_caller+0x111/0x155
>> [   32.905585]  [<ffffffff8138316f>] ? tcp_connect+0x223/0x3d3
>> [   32.905591]  [<ffffffff813818f1>] tcp_transmit_skb+0x707/0x745
>> [   32.905597]  [<ffffffff813832c2>] tcp_connect+0x376/0x3d3
>> [   32.905604]  [<ffffffff81268a43>] ? secure_tcp_sequence_number+0x55/0x6f
>> [   32.905610]  [<ffffffff81387270>] tcp_v4_connect+0x3df/0x455
>> [   32.905617]  [<ffffffff8133cb59>] ? lock_sock_nested+0xf3/0x102
>> [   32.905623]  [<ffffffff81394fe7>] inet_stream_connect+0xa4/0x24d
>> [   32.905629]  [<ffffffff8133b398>] sys_connect+0x90/0xd0
>> [   32.905636]  [<ffffffff81002b9c>] ? sysret_check+0x27/0x62
>> [   32.905642]  [<ffffffff8106891e>] ? trace_hardirqs_on_caller+0x114/0x13f
>> [   32.905649]  [<ffffffff813cec80>] ? trace_hardirqs_on_thunk+0x3a/0x3f
>> [   32.905655]  [<ffffffff81002b6b>] system_call_fastpath+0x16/0x1b
>
> A fix for the above is already in Dave Miller's tree.
>
>> [   51.912282] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [   51.912285] ---------------------------------------------------
>> [   51.912289] net/mac80211/sta_info.c:886 invoked
>> rcu_dereference_check() without protection!
>> [   51.912293]
>> [   51.912293] other info that might help us debug this:
>> [   51.912295]
>> [   51.912298]
>> [   51.912298] rcu_scheduler_active = 1, debug_locks = 1
>> [   51.912302] no locks held by wpa_supplicant/3951.
>> [   51.912305]
>> [   51.912306] stack backtrace:
>> [   51.912310] Pid: 3951, comm: wpa_supplicant Not tainted 2.6.34-rc5-git3 #22
>> [   51.912314] Call Trace:
>> [   51.912317]  <IRQ>  [<ffffffff81067fbe>] lockdep_rcu_dereference+0x9d/0xa5
>> [   51.912345]  [<ffffffffa014f9ae>]
>> ieee80211_find_sta_by_hw+0x46/0x10f [mac80211]
>> [   51.912358]  [<ffffffffa014fa8e>] ieee80211_find_sta+0x17/0x19 [mac80211]
>> [   51.912373]  [<ffffffffa01e50f2>] iwl_tx_queue_reclaim+0xdb/0x1b1 [iwlcore]
>> [   51.912380]  [<ffffffff8106842b>] ? mark_lock+0x2d/0x235
>> [   51.912391]  [<ffffffffa0252f1c>] iwl5000_rx_reply_tx+0x4a9/0x556 [iwlagn]
>> [   51.912399]  [<ffffffff8120a353>] ? is_swiotlb_buffer+0x2e/0x3b
>> [   51.912407]  [<ffffffffa024bbf4>] iwl_rx_handle+0x163/0x2b5 [iwlagn]
>> [   51.912414]  [<ffffffff81068904>] ? trace_hardirqs_on_caller+0xfa/0x13f
>> [   51.912422]  [<ffffffffa024c3ac>] iwl_irq_tasklet+0x2bb/0x3c0 [iwlagn]
>> [   51.912429]  [<ffffffff810411f3>] tasklet_action+0xa7/0x10f
>> [   51.912435]  [<ffffffff81042205>] __do_softirq+0x144/0x252
>> [   51.912442]  [<ffffffff81003a8c>] call_softirq+0x1c/0x34
>> [   51.912447]  [<ffffffff810050e4>] do_softirq+0x38/0x80
>> [   51.912452]  [<ffffffff81041cd2>] irq_exit+0x45/0x94
>> [   51.912457]  [<ffffffff81004829>] do_IRQ+0xad/0xc4
>> [   51.912463]  [<ffffffff810cbbd3>] ? might_fault+0x63/0xb3
>> [   51.912470]  [<ffffffff813cfb93>] ret_from_intr+0x0/0xf
>> [   51.912474]  <EOI>  [<ffffffff810cbbd3>] ? might_fault+0x63/0xb3
>> [   51.912484]  [<ffffffff8106a75d>] ? lock_release+0x208/0x215
>> [   51.912490]  [<ffffffff810cbc1c>] might_fault+0xac/0xb3
>> [   51.912495]  [<ffffffff810cbbd3>] ? might_fault+0x63/0xb3
>> [   51.912501]  [<ffffffff812025e3>] __clear_user+0x15/0x59
>> [   51.912508]  [<ffffffff8100b2bc>] save_i387_xstate+0x9c/0x1bc
>> [   51.912515]  [<ffffffff81002276>] do_signal+0x240/0x686
>> [   51.912521]  [<ffffffff81002b9c>] ? sysret_check+0x27/0x62
>> [   51.912527]  [<ffffffff8106891e>] ? trace_hardirqs_on_caller+0x114/0x13f
>> [   51.912533]  [<ffffffff813cec80>] ? trace_hardirqs_on_thunk+0x3a/0x3f
>> [   51.912539]  [<ffffffff810026e3>] do_notify_resume+0x27/0x5f
>> [   51.912545]  [<ffffffff813cec80>] ? trace_hardirqs_on_thunk+0x3a/0x3f
>> [   51.912551]  [<ffffffff81002e86>] int_signal+0x12/0x17
>
> This is a repeat from last time that confused me at the time.  I could
> do a hacky "fix" by putting an RCU read-side critical section around
> the for_each_sta_info() in ieee80211_find_sta_by_hw(), but I do not
> understand this code well enough to feel comfortable doing so.
>
> Johannes, any enlightenment?
>
>> [   51.929529] [ INFO: suspicious rcu_dereference_check() usage. ]
>> [   51.929532] ---------------------------------------------------
>> [   51.929536] net/mac80211/sta_info.c:886 invoked
>> rcu_dereference_check() without protection!
>> [   51.929540]
>> [   51.929541] other info that might help us debug this:
>> [   51.929542]
>> [   51.929545]
>> [   51.929546] rcu_scheduler_active = 1, debug_locks = 1
>> [   51.929550] 1 lock held by Xorg/4013:
>> [   51.929553]  #0:  (clock-AF_UNIX){++.+..}, at: [<ffffffff8133cebd>]
>> sock_def_readable+0x19/0x62
>> [   51.929567]
>> [   51.929568] stack backtrace:
>> [   51.929573] Pid: 4013, comm: Xorg Not tainted 2.6.34-rc5-git3 #22
>> [   51.929576] Call Trace:
>> [   51.929579]  <IRQ>  [<ffffffff81067fbe>] lockdep_rcu_dereference+0x9d/0xa5
>> [   51.929603]  [<ffffffffa014f9fe>]
>> ieee80211_find_sta_by_hw+0x96/0x10f [mac80211]
>> [   51.929615]  [<ffffffffa014fa8e>] ieee80211_find_sta+0x17/0x19 [mac80211]
>> [   51.929631]  [<ffffffffa01e50f2>] iwl_tx_queue_reclaim+0xdb/0x1b1 [iwlcore]
>> [   51.929642]  [<ffffffffa0252f1c>] iwl5000_rx_reply_tx+0x4a9/0x556 [iwlagn]
>> [   51.929649]  [<ffffffff81068685>] ? mark_held_locks+0x52/0x70
>> [   51.929656]  [<ffffffff813cf46c>] ? _raw_spin_unlock_irqrestore+0x3a/0x69
>> [   51.929662]  [<ffffffff8120a353>] ? is_swiotlb_buffer+0x2e/0x3b
>> [   51.929671]  [<ffffffffa024bbf4>] iwl_rx_handle+0x163/0x2b5 [iwlagn]
>> [   51.929680]  [<ffffffffa024c3ac>] iwl_irq_tasklet+0x2bb/0x3c0 [iwlagn]
>> [   51.929687]  [<ffffffff810411f3>] tasklet_action+0xa7/0x10f
>> [   51.929693]  [<ffffffff81042205>] __do_softirq+0x144/0x252
>> [   51.929700]  [<ffffffff81003a8c>] call_softirq+0x1c/0x34
>> [   51.929705]  [<ffffffff810050e4>] do_softirq+0x38/0x80
>> [   51.929711]  [<ffffffff81041cd2>] irq_exit+0x45/0x94
>> [   51.929717]  [<ffffffff81019b10>] smp_apic_timer_interrupt+0x87/0x95
>> [   51.929724]  [<ffffffff81003553>] apic_timer_interrupt+0x13/0x20
>> [   51.929727]  <EOI>  [<ffffffff813cf46e>] ?
>> _raw_spin_unlock_irqrestore+0x3c/0x69
>> [   51.929739]  [<ffffffff8102d3fb>] __wake_up_sync_key+0x49/0x52
>> [   51.929745]  [<ffffffff8133cee7>] sock_def_readable+0x43/0x62
>> [   51.929751]  [<ffffffff813b1c61>] unix_stream_sendmsg+0x243/0x2e2
>> [   51.929758]  [<ffffffff8133b912>] ? sock_aio_write+0x0/0xcf
>> [   51.929764]  [<ffffffff81339342>] __sock_sendmsg+0x59/0x64
>> [   51.929770]  [<ffffffff8133b9cd>] sock_aio_write+0xbb/0xcf
>> [   51.929777]  [<ffffffff810e9909>] do_sync_readv_writev+0xbc/0xfb
>> [   51.929785]  [<ffffffff811c1792>] ? selinux_file_permission+0xa2/0xaf
>> [   51.929790]  [<ffffffff810e9690>] ? copy_from_user+0x2a/0x2c
>> [   51.929797]  [<ffffffff811baff1>] ? security_file_permission+0x11/0x13
>> [   51.929804]  [<ffffffff810ea6a6>] do_readv_writev+0xa2/0x122
>> [   51.929810]  [<ffffffff810ead93>] ? fcheck_files+0x8f/0xc9
>> [   51.929816]  [<ffffffff810ea764>] vfs_writev+0x3e/0x49
>> [   51.929821]  [<ffffffff810ea84a>] sys_writev+0x45/0x8e
>> [   51.929828]  [<ffffffff81002b6b>] system_call_fastpath+0x16/0x1b
>
> Ditto.
>
>                                                Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit 0868dd631def762ba00c2f0f397a53c5cdf24ae2
> Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Date:   Sat Apr 24 19:23:30 2010 -0700
>
>    block-cgroup: fix RCU-lockdep splat in blkiocg_add_blkio_group()
>
>    It is necessary to be in an RCU read-side critical section when invoking
>    css_id(), so this patch adds one to blkiocg_add_blkio_group().  This is
>    actually a false positive, because this is called at initialization time,
>    and hence always refers to the root cgroup, which cannot go away.
>
>    Located-by: Miles Lane <miles.lane@gmail.com>
>    Suggested-by: Vivek Goyal <vgoyal@redhat.com>
>    Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 5fe03de..55c8c73 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -71,7 +71,9 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
>
>        spin_lock_irqsave(&blkcg->lock, flags);
>        rcu_assign_pointer(blkg->key, key);
> +       rcu_read_lock();
>        blkg->blkcg_id = css_id(&blkcg->css);
> +       rcu_read_unlock();
>        hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
>        spin_unlock_irqrestore(&blkcg->lock, flags);
>  #ifdef CONFIG_DEBUG_BLK_CGROUP
>

I am down to seeing three suspicious rcu_dereference_check traces when
I apply this patch and all the previous patches to 2.6.34-rc5-git6.

1. The "__sched_setscheduler+0x19d/0x300" trace.
2. The two "is_swiotlb_buffer+0x2e/0x3b" traces (waiting to see
Johannes' patch show up in a Linux snapshot)

Did I miss a patch for the setscheduler issue?

Thanks!
        Miles

^ permalink raw reply

* Re: 2.6.34-rc5+: oops in IPv6
From: Tetsuo Handa @ 2010-04-25 14:58 UTC (permalink / raw)
  To: manuel.lauss, netdev; +Cc: linux-kernel
In-Reply-To: <z2if861ec6f1004250745u94892bdbw9b17db4be57b131b@mail.gmail.com>

Manuel Lauss wrote:
> 2.6.34-rc5-00204-gddc9b34  dies when sshd (openssh 5.5) is started
> Last pull I made on April 23 was fine.

This seems to be a regression introduced while handling the
"PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )"
problem. It is in https://bugzilla.kernel.org/show_bug.cgi?id=15847 .

Regards.

^ permalink raw reply

* Re: [PATCH net-next-2.6] netns: call ops_free right after ops_exit
From: Eric W. Biederman @ 2010-04-25 14:50 UTC (permalink / raw)
  To: David Miller; +Cc: jpirko, netdev
In-Reply-To: <20100425.025902.94572342.davem@davemloft.net>

David Miller <davem@davemloft.net> writes:

> From: Jiri Pirko <jpirko@redhat.com>
> Date: Sun, 25 Apr 2010 11:26:01 +0200
>
>> There's no need to iterate this twice. We can free net generic
>> variables right after exit is called.
>>
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
> Are you sure there are no problems with doing this?
>
> What if there are inter-net variable reference dependencies
> or something like that?
>
> I really suspect it is being done this way on purpose, but
> in the end I defer to experts like Eric B. :-)

I am pretty certain there is a problem.  My memory is fuzzy this
morning but I believe we can have rcu references between various
pieces of the networking stack for a single network namespace.  So we
need to cause all of the network namespace to exit before it is safe
to free those pieces.

Eric

^ permalink raw reply

* 2.6.34-rc5+: oops in IPv6
From: Manuel Lauss @ 2010-04-25 14:45 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

2.6.34-rc5-00204-gddc9b34 dies when sshd (openssh 5.5) is started.
The last pull I made, on April 23, was fine.

(transcribed from a photo):

BUG: unable to handle kernel NULL pointer dereference at 00000004
IP: [<b1535b72>] inet6_csk_bind_conflict+0x6e/0xb0

EIP: 0060:[<b1535b72>] EFLAGS: 00010293 CPU: 0
EAX: 0000 EBX: ed49c8c0 ECX: 00000000 EDX: 00000000
ESI: ed49c8dc EDI: ee223040 EBP: ef940058 ESP: ed7e9e84
 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068

Stack:
 ee223040 eebf12e0 b1927198 b14dee45 00000016 ffffffff 00000016 00000005
<0> e8dd766c c8dd75d0 ed7e9ee0 ee223040 ed7e9f04 ee22346c b1517846
<0> ed7e9ef4 00000000 00000016 0000001c 00000000 b166ae1c ef422800 affdb8ab
Call Trace:
b14dee45  inet_csk_get_port+0x1a5/0x27c
b1517846 inet6_bind+0x1b5/0x293
b14aec6c sys_bind+0x63
b1524fdc ipv6_setsockopt+0x38/0x88
b14e0cd7 tcp_setsockopt+0x1b/0x36
b14afa68 sock_common_setsockopt+0x12
b14ae653 sys_setsockopt+0x5e
sys_socketcall
...

GDB says:

0xb1535b72 is in inet6_csk_bind_conflict
(/usr/src/linux-2.6.git/include/net/ipv6.h:376).
371     void ip6_frag_init(struct inet_frag_queue *q, void *a);
372     int ip6_frag_match(struct inet_frag_queue *q, void *a);
373
374     static inline int ipv6_addr_any(const struct in6_addr *a)
375     {
376             return ((a->s6_addr32[0] | a->s6_addr32[1] |
377                      a->s6_addr32[2] | a->s6_addr32[3] ) == 0);
378     }
379
380     static inline int ipv6_addr_loopback(const struct in6_addr *a)


Thanks,
      Manuel Lauss

^ permalink raw reply

* Re: PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )
From: Michael S. Tsirkin @ 2010-04-25 14:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Evgeniy Polyakov, Ben Greear, David Miller, Gaspar Chilingarov,
	netdev
In-Reply-To: <1271877975.7895.3171.camel@edumazet-laptop>

On Wed, Apr 21, 2010 at 09:26:15PM +0200, Eric Dumazet wrote:
> Le mercredi 21 avril 2010 à 22:58 +0400, Evgeniy Polyakov a écrit :
> 
> > Damn it, I tried multiple times :)
> > You are right of course!
> > 
> 
> Here is a formal patch then :)
> 
> [PATCH] tcp: bind() fix when many ports are bound
> 
> Port autoselection done by kernel only works when number of bound
> sockets is under a threshold (typically 30000).
> 
> When this threshold is over, we must check if there is a conflict before
> exiting first loop in inet_csk_get_port()
> 
> Change inet_csk_bind_conflict() to forbid two reuse-enabled sockets to
> bind on same (address,port) tuple (with a non ANY address)
> 
> Same change for inet6_csk_bind_conflict()
> 
> Reported-by: Gaspar Chilingarov <gasparch@gmail.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  net/ipv4/inet_connection_sock.c  |   16 +++++++++++-----
>  net/ipv6/inet6_connection_sock.c |   15 ++++++++++-----
>  2 files changed, 21 insertions(+), 10 deletions(-)
> 
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index e0a3e35..78cbc39 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -70,13 +70,17 @@ int inet_csk_bind_conflict(const struct sock *sk,
>  		    (!sk->sk_bound_dev_if ||
>  		     !sk2->sk_bound_dev_if ||
>  		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> +			const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
> +
>  			if (!reuse || !sk2->sk_reuse ||
>  			    sk2->sk_state == TCP_LISTEN) {
> -				const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
>  				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
>  				    sk2_rcv_saddr == sk_rcv_saddr)
>  					break;
> -			}
> +			} else if (reuse && sk2->sk_reuse &&
> +				   sk2_rcv_saddr &&
> +				   sk2_rcv_saddr == sk_rcv_saddr)
> +				break;
>  		}
>  	}
>  	return node != NULL;
> @@ -120,9 +124,11 @@ again:
>  						smallest_size = tb->num_owners;
>  						smallest_rover = rover;
>  						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
> -							spin_unlock(&head->lock);
> -							snum = smallest_rover;
> -							goto have_snum;
> +							if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
> +								spin_unlock(&head->lock);
> +								snum = smallest_rover;
> +								goto have_snum;
> +							}
>  						}
>  					}
>  					goto next;
> diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> index 0c5e3c3..fb6959c 100644
> --- a/net/ipv6/inet6_connection_sock.c
> +++ b/net/ipv6/inet6_connection_sock.c
> @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
>  		if (sk != sk2 &&
>  		    (!sk->sk_bound_dev_if ||
>  		     !sk2->sk_bound_dev_if ||
> -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> -		     sk2->sk_state == TCP_LISTEN) &&
> -		     ipv6_rcv_saddr_equal(sk, sk2))
> -			break;
> +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> +			     sk2->sk_state == TCP_LISTEN) &&
> +			     ipv6_rcv_saddr_equal(sk, sk2))
> +				break;
> +			else if (sk->sk_reuse && sk2->sk_reuse &&
> +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> +				ipv6_rcv_saddr_equal(sk, sk2))
> +				break;
> +		}
>  	}
>  
>  	return node != NULL;
> 

With this applied, my box crashes on boot:
rhel6 beta userspace, v2.6.34-rc5-204-gddc9b34 kernel.
2.6.34-rc5 kernel boots fine.
the crash seems to be around net/ipv6/inet6_connection_sock.c:50
after reverting fda48a0d7a8412cedacda46a9c0bf8ef9cd13559,
the crash goes away.

I created https://bugzilla.kernel.org/show_bug.cgi?id=15847
to track this.

Oops below:

BUG: unable to handle kernel NULL pointer dereference at
0000000000000004
IP: [<ffffffffa02b99aa>] inet6_csk_bind_conflict+0x6a/0x110 [ipv6]
PGD 0 
Oops: 0000 [#1] SMP 
last sysfs file:
/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/net/eth0/ifindex
CPU 9 
Modules linked in: ip6t_REJECT nf_conntrack_ipv6 ip6table_filter
ip6_tables ipv6 dm_mirror dm_region_hash dm_log igb i2c_i801 sg iTCO_wdt
iTCO_vendor_support shpchp ioatdma dca pcspkr sr_mod cdrom ext4 mbcache
jbd2 sd_mod ata_generic crc_t10dif pata_acpi ahci pata_jmicron radeon
ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mod [last unloaded:
scsi_wait_scan]

Pid: 1640, comm: master Not tainted 2.6.34-rc5-mst #1 X8DTN/X8DTN
RIP: 0010:[<ffffffffa02b99aa>]  [<ffffffffa02b99aa>]
inet6_csk_bind_conflict+0x6a/0x110 [ipv6]
RSP: 0018:ffff8803357a7d98  EFLAGS: 00010293
RAX: 0000000000000000 RBX: ffff880335709440 RCX: 0000000000000000
RDX: 0000000000020011 RSI: ffff880335709440 RDI: ffff880334c61e78
RBP: ffff8803357a7db8 R08: 0000000000000019 R09: 0000000000000019
R10: 00000000000000d4 R11: 0000000000000400 R12: ffff880335709468
R13: ffff880334c61800 R14: ffff880335489500 R15: ffffffff8225d700
FS:  00007feacd26f7c0(0000) GS:ffff8801c5700000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000004 CR3: 00000003341ef000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process master (pid: 1640, threadinfo ffff8803357a6000, task
ffff880334225540)
Stack:
 0000000000000000 ffffffff8225b500 ffffc9001251ced0 ffff880334c61800
<0> ffff8803357a7e48 ffffffff81418fa8 ffff880300000019 ffffffff8149ceb6
<0> 0000000536306140 0000000000000246 ffff8803357a7e08 0000000000000246
Call Trace:
 [<ffffffff81418fa8>] inet_csk_get_port+0x238/0x450
 [<ffffffff8149ceb6>] ? _raw_spin_lock_bh+0x16/0x40
 [<ffffffff8149ce15>] ? _raw_read_unlock_bh+0x15/0x20
 [<ffffffffa0290226>] ? ipv6_chk_addr+0xe6/0x100 [ipv6]

-- 
MST

^ permalink raw reply

* Re: [RFC][PATCH v4 00/18] Provide a zero-copy method on KVM virtio-net.
From: Michael S. Tsirkin @ 2010-04-25 12:14 UTC (permalink / raw)
  To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mingo, davem, jdike
In-Reply-To: <1272187206-18534-19-git-send-email-xiaohui.xin@intel.com>

On Sun, Apr 25, 2010 at 05:20:06PM +0800, xiaohui.xin@intel.com wrote:
> We provide a zero-copy method by which the driver side may get external
> buffers to DMA. Here external means driver don't use kernel space
> to allocate skb buffers. Currently the external buffer can be from
> guest virtio-net driver.
> 
> The idea is simple, just to pin the guest VM user space and then
> let host NIC driver has the chance to directly DMA to it. 
> The patches are based on vhost-net backend driver. We add a device
> which provides proto_ops as sendmsg/recvmsg to vhost-net to
> send/recv directly to/from the NIC driver. KVM guest who use the
> vhost-net backend may bind any ethX interface in the host side to
> get copyless data transfer thru guest virtio-net frontend.
> 
> patch 01-12:  	net core changes.
> patch 13-17:  	new device as interface to manipulate external buffers.
> patch 18: 	for vhost-net.
> 
> The guest virtio-net driver submits multiple requests thru vhost-net
> backend driver to the kernel. And the requests are queued and then
> completed after corresponding actions in h/w are done.
> 
> For read, user space buffers are dispensed to NIC driver for rx when
> a page constructor API is invoked. Means NICs can allocate user buffers
> from a page constructor. We add a hook in netif_receive_skb() function
> to intercept the incoming packets, and notify the zero-copy device.
> 
> For write, the zero-copy device may allocate a new host skb and put
> payload on the skb_shinfo(skb)->frags, and copy the header to skb->data.
> The request remains pending until the skb is transmitted by h/w.
> 
> Here, we have ever considered 2 ways to utilize the page constructor
> API to dispense the user buffers.
> 
> One:	Modify __alloc_skb() function a bit, it can only allocate a 
> 	structure of sk_buff, and the data pointer is pointing to a 
> 	user buffer which is coming from a page constructor API.
> 	Then the shinfo of the skb is also from guest.
> 	When packet is received from hardware, the skb->data is filled
> 	directly by h/w. What we have done is in this way.
> 
> 	Pros:	We can avoid any copy here.
> 	Cons:	Guest virtio-net driver needs to allocate skb as almost
> 		the same method with the host NIC drivers, say the size
> 		of netdev_alloc_skb() and the same reserved space in the
> 		head of skb. Many NIC drivers are the same with guest and
> 		ok for this. But some of the latest NIC drivers reserve special
> 		room in skb head. To deal with it, we suggest to provide
> 		a method in guest virtio-net driver to ask for parameter
> 		we interest from the NIC driver when we know which device 
> 		we have bind to do zero-copy. Then we ask guest to do so.
> 		Is that reasonable?

Do you still do this?

> Two:	Modify driver to get user buffer allocated from a page constructor
> 	API(to substitute alloc_page()), the user buffer are used as payload
> 	buffers and filled by h/w directly when packet is received. Driver
> 	should associate the pages with skb (skb_shinfo(skb)->frags). For 
> 	the head buffer side, let host allocates skb, and h/w fills it. 
> 	After that, the data filled in host skb header will be copied into
> 	guest header buffer which is submitted together with the payload buffer.
> 
> 	Pros:	We could less care the way how guest or host allocates their
> 		buffers.
> 	Cons:	We still need a bit copy here for the skb header.
> 
> We are not sure which way is the better here. This is the first thing we want
> to get comments from the community. We wish the modification to the network
> part will be generic which not used by vhost-net backend only, but a user
> application may use it as well when the zero-copy device may provides async
> read/write operations later.

I commented on this in the past. Do you still want comments?

> Please give comments especially for the network part modifications.
> 
> 
> We provide multiple submits and asynchronous notification to
> vhost-net too.
> 
> Our goal is to improve the bandwidth and reduce the CPU usage.
> Exact performance data will be provided later. But for simple
> test with netperf, we found bandwidth up and CPU % up too,
> but the bandwidth up ratio is much more than CPU % up ratio.
> 
> What we have not done yet:
> 	packet split support
> 	To support GRO
> 	Performance tuning
> 
> what we have done in v1:
> 	polish the RCU usage
> 	deal with write logging in asynchronous mode in vhost
> 	add notifier block for mp device
> 	rename page_ctor to mp_port in netdevice.h to make it looks generic
> 	add mp_dev_change_flags() for mp device to change NIC state
> 	add CONFIG_VHOST_MPASSTHRU to limit the usage when module is not loaded
> 	a small fix for missing dev_put when fail
> 	using dynamic minor instead of static minor number
> 	a __KERNEL__ protect to mp_get_sock()
> 
> what we have done in v2:
> 	
> 	remove most of the RCU usage, since the ctor pointer is only
> 	changed by BIND/UNBIND ioctl, and during that time, NIC will be
> 	stopped to get good cleanup(all outstanding requests are finished),
> 	so the ctor pointer cannot be raced into wrong situation.
> 
> 	Remove the struct vhost_notifier with struct kiocb.
> 	Let vhost-net backend to alloc/free the kiocb and transfer them
> 	via sendmsg/recvmsg.
> 
> 	use get_user_pages_fast() and set_page_dirty_lock() when read.
> 
> 	Add some comments for netdev_mp_port_prep() and handle_mpassthru().
> 
> what we have done in v3:
> 	the async write logging is rewritten 
> 	a drafted synchronous write function for qemu live migration
> 	a limit for locked pages from get_user_pages_fast() to prevent Dos
> 	by using RLIMIT_MEMLOCK
> 	
> 
> what we have done in v4:
> 	add iocb completion callback from vhost-net to queue iocb in mp device
> 	replace vq->receiver by mp_sock_data_ready()
> 	remove stuff in mp device which access structures from vhost-net
> 	modify skb_reserve() to ignore host NIC driver reserved space
> 	rebase to the latest vhost tree
> 	split large patches into small pieces, especially for net core part.
> 	
> 		
> performance:
> 	using netperf with GSO/TSO disabled, 10G NIC, 
> 	disabled packet split mode, with raw socket case compared to vhost.
> 
> 	bandwidth will be from 1.1Gbps to 1.7Gbps
> 	CPU % from 120%-140% to 140%-160%

That's nice. The thing to do is probably to enable GSO/TSO
and see what we get this way. Also, mergeable buffer support
was recently posted and I hope to merge it for 2.6.35.
You might want to take a look.

-- 
MST

^ permalink raw reply

* Re: [PATCH] can: Add driver for esd CAN-USB/2 device
From: Wolfgang Grandegger @ 2010-04-25 10:53 UTC (permalink / raw)
  To: Matthias Fuchs; +Cc: netdev, socketcan-core
In-Reply-To: <201004231015.16751.matthias.fuchs@esd.eu>

Hi Matthias,

Matthias Fuchs wrote:
> This patch adds a driver for esd's USB high speed
> CAN interface. The driver supports devices with
> multiple CAN interfaces.
> 
> Signed-off-by: Matthias Fuchs <matthias.fuchs@esd.eu>

Could you please add support for the recently added feature:

  commit 52c793f24054f5dc30d228e37e0e19cc8313f086
  Author: Wolfgang Grandegger <wg@grandegger.com>
  Date:   Mon Feb 22 22:21:17 2010 +0000

    can: netlink support for bus-error reporting and counters
    
    This patch makes the bus-error reporting configurable and allows to
    retrieve the CAN TX and RX bus error counters via netlink interface.
    I have added support for the SJA1000. The TX and RX bus error counters
    are also copied to the data fields 6..7 of error messages when state
    changes are reported.

Should not be a big deal. Also, please make a CC to the USB Linux
mailing list. Some minor comments below:

> ---
>  drivers/net/can/usb/Kconfig    |    6 +
>  drivers/net/can/usb/Makefile   |    1 +
>  drivers/net/can/usb/esd_usb2.c | 1107 ++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 1114 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/net/can/usb/esd_usb2.c
> 
...
> diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c
> new file mode 100644
> index 0000000..c714ce9
> --- /dev/null
> +++ b/drivers/net/can/usb/esd_usb2.c
...
> +struct id_filter_msg {
> +	u8 len;
> +	u8 cmd;
> +	u8 net;
> +	u8 option;
> +	__le32 mask[65];

ESD_MAX_ID_SEGMENT + 1 ?

...
> +static netdev_tx_t esd_usb2_start_xmit(struct sk_buff *skb,
> +				      struct net_device *netdev)
> +{
> +	struct esd_usb2_net_priv *priv = netdev_priv(netdev);
> +	struct esd_usb2 *dev = priv->usb2;
> +	struct esd_tx_urb_context *context = NULL;
> +	struct net_device_stats *stats = &netdev->stats;
> +	struct can_frame *cf = (struct can_frame *)skb->data;
> +	struct esd_usb2_msg *msg;
> +	struct urb *urb;
> +	u8 *buf;
> +	int i, err;
> +	int ret = NETDEV_TX_OK;
> +	size_t size = sizeof(struct esd_usb2_msg);
> +
> +	if (can_dropped_invalid_skb(netdev, skb))
> +		return NETDEV_TX_OK;
> +
> +	/* create a URB, and a buffer for it, and copy the data to the URB */
> +	urb = usb_alloc_urb(0, GFP_ATOMIC);
> +	if (!urb) {
> +		dev_err(netdev->dev.parent, "No memory left for URBs\n");
> +		stats->tx_dropped++;
> +		dev_kfree_skb(skb);
> +		goto nourbmem;
> +	}
> +
> +	buf = usb_buffer_alloc(dev->udev, size, GFP_ATOMIC, &urb->transfer_dma);
> +	if (!buf) {
> +		dev_err(netdev->dev.parent, "No memory left for USB buffer\n");
> +		stats->tx_dropped++;
> +		dev_kfree_skb(skb);
> +		goto nobufmem;
> +	}
> +
> +	msg = (struct esd_usb2_msg *)buf;
> +
> +	msg->msg.hdr.len = 3; /* minimal length */
> +	msg->msg.hdr.cmd = CMD_CAN_TX;
> +	msg->msg.tx.net = priv->index;
> +	msg->msg.tx.dlc = cf->can_dlc;
> +	msg->msg.tx.id = cpu_to_le32(cf->can_id & CAN_ERR_MASK);
> +
> +	if (cf->can_id & CAN_RTR_FLAG)
> +		msg->msg.tx.dlc |= ESD_RTR;
> +
> +	if (cf->can_id & CAN_EFF_FLAG)
> +		msg->msg.tx.id |= cpu_to_le32(ESD_EXTID);
> +
> +	for (i = 0; i < cf->can_dlc; i++)
> +		msg->msg.tx.data[i] = cf->data[i];
> +
> +	msg->msg.hdr.len += (cf->can_dlc + 3) >> 2;
> +
> +	for (i = 0; i < MAX_TX_URBS; i++) {
> +		if (priv->tx_contexts[i].echo_index == MAX_TX_URBS) {
> +			context = &priv->tx_contexts[i];
> +			break;
> +		}
> +	}
> +
> +	/*
> +	 * This may never happen.
> +	 */
> +	if (!context) {
> +		dev_warn(netdev->dev.parent, "couldn't find free context\n");
> +		ret = NETDEV_TX_BUSY;
> +		goto releasebuf;
> +	}
> +
> +	context->priv = priv;
> +	context->echo_index = i;
> +	context->dlc = cf->can_dlc;
> +
> +	/* hnd must not be 0 */
> +	msg->msg.tx.hnd = 0x80000000 | i; /* returned in TX done message */

ESD_USB2_UBR ?

Wolfgang.

^ permalink raw reply

* Re: [RFC][PATCH v4 00/18] Provide a zero-copy method on KVM virtio-net.
From: Michael S. Tsirkin @ 2010-04-25 10:46 UTC (permalink / raw)
  To: David Miller; +Cc: xiaohui.xin, netdev, kvm, linux-kernel, mingo, jdike
In-Reply-To: <20100425.025529.123989625.davem@davemloft.net>

On Sun, Apr 25, 2010 at 02:55:29AM -0700, David Miller wrote:
> From: xiaohui.xin@intel.com
> Date: Sun, 25 Apr 2010 17:20:06 +0800
> 
> > The idea is simple, just to pin the guest VM user space and then let
> > host NIC driver has the chance to directly DMA to it.
> 
> Isn't it much easier to map the RX ring of the network device into the
> guest's address space, have DMA map calls translate guest addresses to
> physical/DMA addresses as well as do all of this crazy page pinning
> stuff, and provide the translations and protections via the IOMMU?

This means the guest needs to know how the specific network device works.
So we won't be able to, for example, move guest between different hosts.
There are other problems: many physical systems do not have an iommu,
some guest OS-es do not support DMA map calls, doing VM exit
on each DMA map call might turn out to be very slow. And so on.

> What's being proposed here looks a bit over-engineered.

This is an attempt to reduce overhead for virtio (paravirtualization).
'Don't use PV' is kind of an alternative, but I do not
think it's a simpler one.

-- 
MST

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox