Netdev List
 help / color / mirror / Atom feed
* Re: [net-2.6 PATCH] nete zero kobject in rx_queue_release
From: David Miller @ 2010-11-14 23:15 UTC (permalink / raw)
  To: therbert; +Cc: john.r.fastabend, netdev, eric.dumazet
In-Reply-To: <AANLkTi=1Xzne6HZN-w_3RTeUmqtjGpHPfW211SKvW17w@mail.gmail.com>

From: Tom Herbert <therbert@google.com>
Date: Sun, 14 Nov 2010 14:40:00 -0800

>> So we'll need something like:
>>
>>        if (atomic_dec_and_test(&first->count))
>>                kfree(first);
>>        else
>>                /* clear everything except queue->first */
>>
> 
> The patches to get rid of the separate refcnt should obviate this
> complexity.  Could just clear the queue in kobject release.

True but we'll still need a patch for older kernels.

^ permalink raw reply

* Re: [net-2.6 PATCH] nete zero kobject in rx_queue_release
From: Tom Herbert @ 2010-11-14 22:40 UTC (permalink / raw)
  To: David Miller; +Cc: john.r.fastabend, netdev, eric.dumazet
In-Reply-To: <20101112.130824.68146775.davem@davemloft.net>

> So we'll need something like:
>
>        if (atomic_dec_and_test(&first->count))
>                kfree(first);
>        else
>                /* clear everything except queue->first */
>

The patches to get rid of the separate refcnt should obviate this
complexity.  Could just clear the queue in kobject release.

^ permalink raw reply

* [PATCH] Reduce number of pointer dereferences in IPv6 netfilter LOG module function dump_packet()
From: Jesper Juhl @ 2010-11-14 21:47 UTC (permalink / raw)
  To: Netfilter Core Team
  Cc: Jan Rekorajski, David S. Miller, netfilter-devel, netfilter,
	netdev, linux-kernel


By adding two pointer variables to 
net/ipv6/netfilter/ip6t_LOG.c::dump_packet() we can save 19 bytes of .text 
and many pointer dereferences.

before:
   text    data     bss     dec     hex filename
   6258     600    3088    9946    26da net/ipv6/netfilter/ip6t_LOG.o

after:
   text    data     bss     dec     hex filename
   6239     600    3088    9927    26c7 net/ipv6/netfilter/ip6t_LOG.o


Please Cc me on replies.


Signed-off-by: Jesper Juhl <jj@chaosbits.net>
---
 ip6t_LOG.c |   16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 09c8889..51d10a5 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -46,6 +46,8 @@ static void dump_packet(struct sbuff *m,
 	unsigned int ptr;
 	unsigned int hdrlen = 0;
 	unsigned int logflags;
+	struct sock *sk;
+	struct socket *sk_socket;
 
 	if (info->type == NF_LOG_TYPE_LOG)
 		logflags = info->u.log.logflags;
@@ -358,13 +360,15 @@ static void dump_packet(struct sbuff *m,
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
-	if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
-		read_lock_bh(&skb->sk->sk_callback_lock);
-		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
+	sk = skb->sk;
+	sk_socket = sk->sk_socket;
+	if ((logflags & IP6T_LOG_UID) && recurse && sk) {
+		read_lock_bh(&sk->sk_callback_lock);
+		if (sk_socket && sk_socket->file)
 			sb_add(m, "UID=%u GID=%u ",
-				skb->sk->sk_socket->file->f_cred->fsuid,
-				skb->sk->sk_socket->file->f_cred->fsgid);
-		read_unlock_bh(&skb->sk->sk_callback_lock);
+				sk_socket->file->f_cred->fsuid,
+				sk_socket->file->f_cred->fsgid);
+		read_unlock_bh(&sk->sk_callback_lock);
 	}
 
 	/* Max length: 16 "MARK=0xFFFFFFFF " */



-- 
Jesper Juhl <jj@chaosbits.net>            http://www.chaosbits.net/
Don't top-post http://www.catb.org/~esr/jargon/html/T/top-post.html
Plain text mails only, please.


^ permalink raw reply related

* [PATCH] Reduce number of pointer dereferences in IPv4 netfilter LOG module function dump_packet()
From: Jesper Juhl @ 2010-11-14 21:35 UTC (permalink / raw)
  To: Netfilter Core Team
  Cc: netdev, linux-kernel, netfilter, netfilter-devel, David S. Miller,
	Rusty Russell


By adding two pointer variables to 
net/ipv4/netfilter/ipt_LOG.c::dump_packet() we can save 16 bytes of .text 
and 9 pointer dereferences.

before this patch we did 20 pointer dereferences and had this object file 
size:
   text    data     bss     dec     hex filename
   6233     600    3080    9913    26b9 net/ipv4/netfilter/ipt_LOG.o

after this patch we do just 11 pointer dereferences and have this object 
file size:
   text    data     bss     dec     hex filename
   6217     600    3080    9897    26a9 net/ipv4/netfilter/ipt_LOG.o


Please Cc me on replies.


Signed-off-by: Jesper Juhl <jj@chaosbits.net>
---
 ipt_LOG.c |   16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 72ffc8f..02a92de 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -39,6 +39,8 @@ static void dump_packet(struct sbuff *m,
 	struct iphdr _iph;
 	const struct iphdr *ih;
 	unsigned int logflags;
+	struct sock *sk;
+	struct socket *sk_socket;
 
 	if (info->type == NF_LOG_TYPE_LOG)
 		logflags = info->u.log.logflags;
@@ -335,13 +337,15 @@ static void dump_packet(struct sbuff *m,
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
-	if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
-		read_lock_bh(&skb->sk->sk_callback_lock);
-		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
+	sk = skb->sk;
+	sk_socket = sk->sk_socket;
+	if ((logflags & IPT_LOG_UID) && !iphoff && sk) {
+		read_lock_bh(&sk->sk_callback_lock);
+		if (sk_socket && sk_socket->file)
 			sb_add(m, "UID=%u GID=%u ",
-				skb->sk->sk_socket->file->f_cred->fsuid,
-				skb->sk->sk_socket->file->f_cred->fsgid);
-		read_unlock_bh(&skb->sk->sk_callback_lock);
+				sk_socket->file->f_cred->fsuid,
+				sk_socket->file->f_cred->fsgid);
+		read_unlock_bh(&sk->sk_callback_lock);
 	}
 
 	/* Max length: 16 "MARK=0xFFFFFFFF " */



-- 
Jesper Juhl <jj@chaosbits.net>            http://www.chaosbits.net/
Don't top-post http://www.catb.org/~esr/jargon/html/T/top-post.html
Plain text mails only, please.


^ permalink raw reply related

* [PATCH 2/5] bridge: add proper RCU annotation to should_route_hook
From: Stephen Hemminger @ 2010-11-14 21:12 UTC (permalink / raw)
  To: David Miller, Eric Dumazet; +Cc: netdev, bridge
In-Reply-To: <20101114211201.678755903@vyatta.com>

[-- Attachment #1: bridge-hook-typedef.patch --]
[-- Type: text/plain, Size: 3022 bytes --]

From: Eric Dumazet <eric.dumazet@gmail.com>

Add br_should_route_hook_t typedef, this is the only way we can
get a clean RCU implementation for function pointer.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/linux/if_bridge.h             |    4 +++-
 net/bridge/br.c                       |    4 ----
 net/bridge/br_input.c                 |   10 +++++++---
 net/bridge/netfilter/ebtable_broute.c |    3 ++-
 4 files changed, 12 insertions(+), 9 deletions(-)

--- a/net/bridge/br.c	2010-11-14 11:18:54.048692005 -0800
+++ b/net/bridge/br.c	2010-11-14 11:19:47.347027185 -0800
@@ -22,8 +22,6 @@
 
 #include "br_private.h"
 
-int (*br_should_route_hook)(struct sk_buff *skb);
-
 static const struct stp_proto br_stp_proto = {
 	.rcv	= br_stp_rcv,
 };
@@ -102,8 +100,6 @@ static void __exit br_deinit(void)
 	br_fdb_fini();
 }
 
-EXPORT_SYMBOL(br_should_route_hook);
-
 module_init(br_init)
 module_exit(br_deinit)
 MODULE_LICENSE("GPL");
--- a/net/bridge/br_input.c	2010-11-14 11:18:54.048692005 -0800
+++ b/net/bridge/br_input.c	2010-11-14 11:41:40.558700481 -0800
@@ -21,6 +21,10 @@
 /* Bridge group multicast address 802.1d (pg 51). */
 const u8 br_group_address[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
 
+/* Hook for brouter */
+br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
+EXPORT_SYMBOL(br_should_route_hook);
+
 static int br_pass_frame_up(struct sk_buff *skb)
 {
 	struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
@@ -139,7 +143,7 @@ struct sk_buff *br_handle_frame(struct s
 {
 	struct net_bridge_port *p;
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
-	int (*rhook)(struct sk_buff *skb);
+	br_should_route_hook_t *rhook;
 
 	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
 		return skb;
@@ -173,8 +177,8 @@ forward:
 	switch (p->state) {
 	case BR_STATE_FORWARDING:
 		rhook = rcu_dereference(br_should_route_hook);
-		if (rhook != NULL) {
-			if (rhook(skb))
+		if (rhook) {
+			if ((*rhook)(skb))
 				return skb;
 			dest = eth_hdr(skb)->h_dest;
 		}
--- a/include/linux/if_bridge.h	2010-11-14 11:18:54.048692005 -0800
+++ b/include/linux/if_bridge.h	2010-11-14 11:19:47.351028008 -0800
@@ -102,7 +102,9 @@ struct __fdb_entry {
 #include <linux/netdevice.h>
 
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
-extern int (*br_should_route_hook)(struct sk_buff *skb);
+
+typedef int (*br_should_route_hook_t)(struct sk_buff *skb);
+extern br_should_route_hook_t __rcu *br_should_route_hook;
 
 #endif
 
--- a/net/bridge/netfilter/ebtable_broute.c	2010-11-14 11:20:39.745149494 -0800
+++ b/net/bridge/netfilter/ebtable_broute.c	2010-11-14 11:21:01.020917528 -0800
@@ -87,7 +87,8 @@ static int __init ebtable_broute_init(vo
 	if (ret < 0)
 		return ret;
 	/* see br_input.c */
-	rcu_assign_pointer(br_should_route_hook, ebt_broute);
+	rcu_assign_pointer(br_should_route_hook,
+			   (br_should_route_hook_t *)ebt_broute);
 	return 0;
 }
 



^ permalink raw reply

* [PATCH 4/5] bridge: fix RCU races with bridge port
From: Stephen Hemminger @ 2010-11-14 21:12 UTC (permalink / raw)
  To: David Miller, Eric Dumazet; +Cc: netdev, bridge
In-Reply-To: <20101114211201.678755903@vyatta.com>

[-- Attachment #1: br-port-rcu-race.patch --]
[-- Type: text/plain, Size: 6647 bytes --]

The macro br_port_exists() is not enough protection when only
RCU is being used. There is a tiny race where other CPU has cleared port
handler hook, but is bridge port flag might still be set.


Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 net/bridge/br_fdb.c             |   15 +++++++++------
 net/bridge/br_if.c              |    5 +----
 net/bridge/br_netfilter.c       |   13 +++++++------
 net/bridge/br_netlink.c         |   10 ++++++----
 net/bridge/br_notify.c          |    2 +-
 net/bridge/br_private.h         |   16 +++++++++++++---
 net/bridge/br_stp_bpdu.c        |    8 ++++----
 net/bridge/netfilter/ebtables.c |   11 +++++------
 8 files changed, 46 insertions(+), 34 deletions(-)

--- a/net/bridge/br_netfilter.c	2010-11-14 12:36:22.970441653 -0800
+++ b/net/bridge/br_netfilter.c	2010-11-14 12:44:55.666987357 -0800
@@ -131,17 +131,18 @@ void br_netfilter_rtable_init(struct net
 
 static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
 {
-	if (!br_port_exists(dev))
-		return NULL;
-	return &br_port_get_rcu(dev)->br->fake_rtable;
+	struct net_bridge_port *port;
+
+	port = br_port_get_rcu(dev);
+	return port ? &port->br->fake_rtable : NULL;
 }
 
 static inline struct net_device *bridge_parent(const struct net_device *dev)
 {
-	if (!br_port_exists(dev))
-		return NULL;
+	struct net_bridge_port *port;
 
-	return br_port_get_rcu(dev)->br->dev;
+	port = br_port_get_rcu(dev);
+	return port ? port->br->dev : NULL;
 }
 
 static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
--- a/net/bridge/br_stp_bpdu.c	2010-11-14 12:36:22.982443121 -0800
+++ b/net/bridge/br_stp_bpdu.c	2010-11-14 12:44:55.666987357 -0800
@@ -141,10 +141,6 @@ void br_stp_rcv(const struct stp_proto *
 	struct net_bridge *br;
 	const unsigned char *buf;
 
-	if (!br_port_exists(dev))
-		goto err;
-	p = br_port_get_rcu(dev);
-
 	if (!pskb_may_pull(skb, 4))
 		goto err;
 
@@ -153,6 +149,10 @@ void br_stp_rcv(const struct stp_proto *
 	if (buf[0] != 0 || buf[1] != 0 || buf[2] != 0)
 		goto err;
 
+	p = br_port_get_rcu(dev);
+	if (!p)
+		goto err;
+
 	br = p->br;
 	spin_lock(&br->lock);
 
--- a/net/bridge/netfilter/ebtables.c	2010-11-14 12:36:22.998445081 -0800
+++ b/net/bridge/netfilter/ebtables.c	2010-11-14 12:45:49.393153060 -0800
@@ -128,6 +128,7 @@ ebt_basic_match(const struct ebt_entry *
                 const struct net_device *in, const struct net_device *out)
 {
 	const struct ethhdr *h = eth_hdr(skb);
+	const struct net_bridge_port *p;
 	__be16 ethproto;
 	int verdict, i;
 
@@ -148,13 +149,11 @@ ebt_basic_match(const struct ebt_entry *
 	if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT))
 		return 1;
 	/* rcu_read_lock()ed by nf_hook_slow */
-	if (in && br_port_exists(in) &&
-	    FWINV2(ebt_dev_check(e->logical_in, br_port_get_rcu(in)->br->dev),
-		   EBT_ILOGICALIN))
+	if (in && (p = br_port_get_rcu(in)) != NULL &&
+	    FWINV2(ebt_dev_check(e->logical_in, p->br->dev), EBT_ILOGICALIN))
 		return 1;
-	if (out && br_port_exists(out) &&
-	    FWINV2(ebt_dev_check(e->logical_out, br_port_get_rcu(out)->br->dev),
-		   EBT_ILOGICALOUT))
+	if (out && (p = br_port_get_rcu(out)) != NULL &&
+	    FWINV2(ebt_dev_check(e->logical_out, p->br->dev), EBT_ILOGICALOUT))
 		return 1;
 
 	if (e->bitmask & EBT_SOURCEMAC) {
--- a/net/bridge/br_fdb.c	2010-11-14 12:36:23.022448019 -0800
+++ b/net/bridge/br_fdb.c	2010-11-14 12:44:55.670987817 -0800
@@ -238,15 +238,18 @@ struct net_bridge_fdb_entry *__br_fdb_ge
 int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
 {
 	struct net_bridge_fdb_entry *fdb;
+	struct net_bridge_port *port;
 	int ret;
 
-	if (!br_port_exists(dev))
-		return 0;
-
 	rcu_read_lock();
-	fdb = __br_fdb_get(br_port_get_rcu(dev)->br, addr);
-	ret = fdb && fdb->dst->dev != dev &&
-		fdb->dst->state == BR_STATE_FORWARDING;
+	port = br_port_get_rcu(dev);
+	if (!port)
+		ret = 0;
+	else {
+		fdb = __br_fdb_get(port->br, addr);
+		ret = fdb && fdb->dst->dev != dev &&
+			fdb->dst->state == BR_STATE_FORWARDING;
+	}
 	rcu_read_unlock();
 
 	return ret;
--- a/net/bridge/br_notify.c	2010-11-14 12:36:23.034449489 -0800
+++ b/net/bridge/br_notify.c	2010-11-14 12:44:55.670987817 -0800
@@ -32,7 +32,7 @@ struct notifier_block br_device_notifier
 static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct net_bridge_port *p = br_port_get(dev);
+	struct net_bridge_port *p;
 	struct net_bridge *br;
 	int err;
 
--- a/net/bridge/br_private.h	2010-11-14 12:44:07.257410977 -0800
+++ b/net/bridge/br_private.h	2010-11-14 12:44:55.670987817 -0800
@@ -151,11 +151,21 @@ struct net_bridge_port
 #endif
 };
 
-#define br_port_get_rcu(dev) \
-	((struct net_bridge_port *) rcu_dereference(dev->rx_handler_data))
-#define br_port_get(dev) ((struct net_bridge_port *) dev->rx_handler_data)
 #define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT)
 
+static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev)
+{
+	return br_port_exists(dev) ?
+		rcu_dereference(dev->rx_handler_data) : NULL;
+}
+
+static inline struct net_bridge_port *br_port_get(struct net_device *dev)
+{
+	return br_port_exists(dev) ? dev->rx_handler_data : NULL;
+}
+
+#define br_port_get(dev) ((struct net_bridge_port *) dev->rx_handler_data)
+
 struct br_cpu_netstats {
 	u64			rx_packets;
 	u64			rx_bytes;
--- a/net/bridge/br_if.c	2010-11-14 12:36:23.046450958 -0800
+++ b/net/bridge/br_if.c	2010-11-14 12:44:55.670987817 -0800
@@ -475,11 +475,8 @@ int br_del_if(struct net_bridge *br, str
 {
 	struct net_bridge_port *p;
 
-	if (!br_port_exists(dev))
-		return -EINVAL;
-
 	p = br_port_get(dev);
-	if (p->br != br)
+	if (!p || p->br != br)
 		return -EINVAL;
 
 	del_nbp(p);
--- a/net/bridge/br_netlink.c	2010-11-14 12:36:23.062452917 -0800
+++ b/net/bridge/br_netlink.c	2010-11-14 12:44:55.670987817 -0800
@@ -119,11 +119,13 @@ static int br_dump_ifinfo(struct sk_buff
 
 	idx = 0;
 	for_each_netdev(net, dev) {
+		struct net_bridge_port *port = br_port_get(dev);
+
 		/* not a bridge port */
-		if (!br_port_exists(dev) || idx < cb->args[0])
+		if (!port || idx < cb->args[0])
 			goto skip;
 
-		if (br_fill_ifinfo(skb, br_port_get(dev),
+		if (br_fill_ifinfo(skb, port,
 				   NETLINK_CB(cb->skb).pid,
 				   cb->nlh->nlmsg_seq, RTM_NEWLINK,
 				   NLM_F_MULTI) < 0)
@@ -169,9 +171,9 @@ static int br_rtm_setlink(struct sk_buff
 	if (!dev)
 		return -ENODEV;
 
-	if (!br_port_exists(dev))
-		return -EINVAL;
 	p = br_port_get(dev);
+	if (!p)
+		return -EINVAL;
 
 	/* if kernel STP is running, don't allow changes */
 	if (p->br->stp_enabled == BR_KERNEL_STP)



^ permalink raw reply

* [PATCH 1/5] bridge: add RCU annotation to bridge multicast table
From: Stephen Hemminger @ 2010-11-14 21:12 UTC (permalink / raw)
  To: David Miller, Eric Dumazet; +Cc: netdev, bridge
In-Reply-To: <20101114211201.678755903@vyatta.com>

[-- Attachment #1: bridge-mlock-rcu.patch --]
[-- Type: text/plain, Size: 10144 bytes --]

From: Eric Dumazet <eric.dumazet@gmail.com>

Add modern __rcu annotatations to bridge multicast table.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


---
 net/bridge/br_forward.c   |    4 +-
 net/bridge/br_multicast.c |   78 ++++++++++++++++++++++++++++++----------------
 net/bridge/br_private.h   |    6 +--
 3 files changed, 56 insertions(+), 32 deletions(-)

--- a/net/bridge/br_multicast.c	2010-11-14 12:36:30.383348571 -0800
+++ b/net/bridge/br_multicast.c	2010-11-14 12:36:37.084167303 -0800
@@ -33,6 +33,9 @@
 
 #include "br_private.h"
 
+#define mlock_dereference(X, br) \
+	rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
+
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
 static inline int ipv6_is_local_multicast(const struct in6_addr *addr)
 {
@@ -135,7 +138,7 @@ static struct net_bridge_mdb_entry *br_m
 struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
 					struct sk_buff *skb)
 {
-	struct net_bridge_mdb_htable *mdb = br->mdb;
+	struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb);
 	struct br_ip ip;
 
 	if (br->multicast_disabled)
@@ -235,7 +238,8 @@ static void br_multicast_group_expired(u
 	if (mp->ports)
 		goto out;
 
-	mdb = br->mdb;
+	mdb = mlock_dereference(br->mdb, br);
+
 	hlist_del_rcu(&mp->hlist[mdb->ver]);
 	mdb->size--;
 
@@ -249,16 +253,20 @@ out:
 static void br_multicast_del_pg(struct net_bridge *br,
 				struct net_bridge_port_group *pg)
 {
-	struct net_bridge_mdb_htable *mdb = br->mdb;
+	struct net_bridge_mdb_htable *mdb;
 	struct net_bridge_mdb_entry *mp;
 	struct net_bridge_port_group *p;
-	struct net_bridge_port_group **pp;
+	struct net_bridge_port_group __rcu **pp;
+
+	mdb = mlock_dereference(br->mdb, br);
 
 	mp = br_mdb_ip_get(mdb, &pg->addr);
 	if (WARN_ON(!mp))
 		return;
 
-	for (pp = &mp->ports; (p = *pp); pp = &p->next) {
+	for (pp = &mp->ports;
+	     (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
 		if (p != pg)
 			continue;
 
@@ -294,10 +302,10 @@ out:
 	spin_unlock(&br->multicast_lock);
 }
 
-static int br_mdb_rehash(struct net_bridge_mdb_htable **mdbp, int max,
+static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max,
 			 int elasticity)
 {
-	struct net_bridge_mdb_htable *old = *mdbp;
+	struct net_bridge_mdb_htable *old = rcu_dereference_protected(*mdbp, 1);
 	struct net_bridge_mdb_htable *mdb;
 	int err;
 
@@ -569,7 +577,7 @@ static struct net_bridge_mdb_entry *br_m
 	struct net_bridge *br, struct net_bridge_port *port,
 	struct br_ip *group, int hash)
 {
-	struct net_bridge_mdb_htable *mdb = br->mdb;
+	struct net_bridge_mdb_htable *mdb;
 	struct net_bridge_mdb_entry *mp;
 	struct hlist_node *p;
 	unsigned count = 0;
@@ -577,6 +585,7 @@ static struct net_bridge_mdb_entry *br_m
 	int elasticity;
 	int err;
 
+	mdb = rcu_dereference_protected(br->mdb, 1);
 	hlist_for_each_entry(mp, p, &mdb->mhash[hash], hlist[mdb->ver]) {
 		count++;
 		if (unlikely(br_ip_equal(group, &mp->addr)))
@@ -642,10 +651,11 @@ static struct net_bridge_mdb_entry *br_m
 	struct net_bridge *br, struct net_bridge_port *port,
 	struct br_ip *group)
 {
-	struct net_bridge_mdb_htable *mdb = br->mdb;
+	struct net_bridge_mdb_htable *mdb;
 	struct net_bridge_mdb_entry *mp;
 	int hash;
 
+	mdb = rcu_dereference_protected(br->mdb, 1);
 	if (!mdb) {
 		if (br_mdb_rehash(&br->mdb, BR_HASH_SIZE, 0))
 			return NULL;
@@ -660,7 +670,7 @@ static struct net_bridge_mdb_entry *br_m
 
 	case -EAGAIN:
 rehash:
-		mdb = br->mdb;
+		mdb = rcu_dereference_protected(br->mdb, 1);
 		hash = br_ip_hash(mdb, group);
 		break;
 
@@ -692,7 +702,7 @@ static int br_multicast_add_group(struct
 {
 	struct net_bridge_mdb_entry *mp;
 	struct net_bridge_port_group *p;
-	struct net_bridge_port_group **pp;
+	struct net_bridge_port_group __rcu **pp;
 	unsigned long now = jiffies;
 	int err;
 
@@ -712,7 +722,9 @@ static int br_multicast_add_group(struct
 		goto out;
 	}
 
-	for (pp = &mp->ports; (p = *pp); pp = &p->next) {
+	for (pp = &mp->ports;
+	     (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
 		if (p->port == port)
 			goto found;
 		if ((unsigned long)p->port < (unsigned long)port)
@@ -1106,7 +1118,7 @@ static int br_ip4_multicast_query(struct
 	struct net_bridge_mdb_entry *mp;
 	struct igmpv3_query *ih3;
 	struct net_bridge_port_group *p;
-	struct net_bridge_port_group **pp;
+	struct net_bridge_port_group __rcu **pp;
 	unsigned long max_delay;
 	unsigned long now = jiffies;
 	__be32 group;
@@ -1145,7 +1157,7 @@ static int br_ip4_multicast_query(struct
 	if (!group)
 		goto out;
 
-	mp = br_mdb_ip4_get(br->mdb, group);
+	mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group);
 	if (!mp)
 		goto out;
 
@@ -1157,7 +1169,9 @@ static int br_ip4_multicast_query(struct
 	     try_to_del_timer_sync(&mp->timer) >= 0))
 		mod_timer(&mp->timer, now + max_delay);
 
-	for (pp = &mp->ports; (p = *pp); pp = &p->next) {
+	for (pp = &mp->ports;
+	     (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
 		if (timer_pending(&p->timer) ?
 		    time_after(p->timer.expires, now + max_delay) :
 		    try_to_del_timer_sync(&p->timer) >= 0)
@@ -1178,7 +1192,8 @@ static int br_ip6_multicast_query(struct
 	struct mld_msg *mld = (struct mld_msg *) icmp6_hdr(skb);
 	struct net_bridge_mdb_entry *mp;
 	struct mld2_query *mld2q;
-	struct net_bridge_port_group *p, **pp;
+	struct net_bridge_port_group *p;
+	struct net_bridge_port_group __rcu **pp;
 	unsigned long max_delay;
 	unsigned long now = jiffies;
 	struct in6_addr *group = NULL;
@@ -1214,7 +1229,7 @@ static int br_ip6_multicast_query(struct
 	if (!group)
 		goto out;
 
-	mp = br_mdb_ip6_get(br->mdb, group);
+	mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group);
 	if (!mp)
 		goto out;
 
@@ -1225,7 +1240,9 @@ static int br_ip6_multicast_query(struct
 	     try_to_del_timer_sync(&mp->timer) >= 0))
 		mod_timer(&mp->timer, now + max_delay);
 
-	for (pp = &mp->ports; (p = *pp); pp = &p->next) {
+	for (pp = &mp->ports;
+	     (p = mlock_dereference(*pp, br)) != NULL;
+	     pp = &p->next) {
 		if (timer_pending(&p->timer) ?
 		    time_after(p->timer.expires, now + max_delay) :
 		    try_to_del_timer_sync(&p->timer) >= 0)
@@ -1254,7 +1271,7 @@ static void br_multicast_leave_group(str
 	    timer_pending(&br->multicast_querier_timer))
 		goto out;
 
-	mdb = br->mdb;
+	mdb = mlock_dereference(br->mdb, br);
 	mp = br_mdb_ip_get(mdb, group);
 	if (!mp)
 		goto out;
@@ -1277,7 +1294,9 @@ static void br_multicast_leave_group(str
 		goto out;
 	}
 
-	for (p = mp->ports; p; p = p->next) {
+	for (p = mlock_dereference(mp->ports, br);
+	     p != NULL;
+	     p = mlock_dereference(p->next, br)) {
 		if (p->port != port)
 			continue;
 
@@ -1625,7 +1644,7 @@ void br_multicast_stop(struct net_bridge
 	del_timer_sync(&br->multicast_query_timer);
 
 	spin_lock_bh(&br->multicast_lock);
-	mdb = br->mdb;
+	mdb = mlock_dereference(br->mdb, br);
 	if (!mdb)
 		goto out;
 
@@ -1729,6 +1748,7 @@ int br_multicast_toggle(struct net_bridg
 {
 	struct net_bridge_port *port;
 	int err = 0;
+	struct net_bridge_mdb_htable *mdb;
 
 	spin_lock(&br->multicast_lock);
 	if (br->multicast_disabled == !val)
@@ -1741,15 +1761,16 @@ int br_multicast_toggle(struct net_bridg
 	if (!netif_running(br->dev))
 		goto unlock;
 
-	if (br->mdb) {
-		if (br->mdb->old) {
+	mdb = mlock_dereference(br->mdb, br);
+	if (mdb) {
+		if (mdb->old) {
 			err = -EEXIST;
 rollback:
 			br->multicast_disabled = !!val;
 			goto unlock;
 		}
 
-		err = br_mdb_rehash(&br->mdb, br->mdb->max,
+		err = br_mdb_rehash(&br->mdb, mdb->max,
 				    br->hash_elasticity);
 		if (err)
 			goto rollback;
@@ -1774,6 +1795,7 @@ int br_multicast_set_hash_max(struct net
 {
 	int err = -ENOENT;
 	u32 old;
+	struct net_bridge_mdb_htable *mdb;
 
 	spin_lock(&br->multicast_lock);
 	if (!netif_running(br->dev))
@@ -1782,7 +1804,9 @@ int br_multicast_set_hash_max(struct net
 	err = -EINVAL;
 	if (!is_power_of_2(val))
 		goto unlock;
-	if (br->mdb && val < br->mdb->size)
+
+	mdb = mlock_dereference(br->mdb, br);
+	if (mdb && val < mdb->size)
 		goto unlock;
 
 	err = 0;
@@ -1790,8 +1814,8 @@ int br_multicast_set_hash_max(struct net
 	old = br->hash_max;
 	br->hash_max = val;
 
-	if (br->mdb) {
-		if (br->mdb->old) {
+	if (mdb) {
+		if (mdb->old) {
 			err = -EEXIST;
 rollback:
 			br->hash_max = old;
--- a/net/bridge/br_private.h	2010-11-14 12:36:30.399350527 -0800
+++ b/net/bridge/br_private.h	2010-11-14 12:44:07.257410977 -0800
@@ -72,7 +72,7 @@ struct net_bridge_fdb_entry
 
 struct net_bridge_port_group {
 	struct net_bridge_port		*port;
-	struct net_bridge_port_group	*next;
+	struct net_bridge_port_group __rcu *next;
 	struct hlist_node		mglist;
 	struct rcu_head			rcu;
 	struct timer_list		timer;
@@ -86,7 +86,7 @@ struct net_bridge_mdb_entry
 	struct hlist_node		hlist[2];
 	struct hlist_node		mglist;
 	struct net_bridge		*br;
-	struct net_bridge_port_group	*ports;
+	struct net_bridge_port_group __rcu *ports;
 	struct rcu_head			rcu;
 	struct timer_list		timer;
 	struct timer_list		query_timer;
@@ -227,7 +227,7 @@ struct net_bridge
 	unsigned long			multicast_startup_query_interval;
 
 	spinlock_t			multicast_lock;
-	struct net_bridge_mdb_htable	*mdb;
+	struct net_bridge_mdb_htable __rcu *mdb;
 	struct hlist_head		router_list;
 	struct hlist_head		mglist;
 
--- a/net/bridge/br_forward.c	2010-11-14 12:36:47.833478598 -0800
+++ b/net/bridge/br_forward.c	2010-11-14 12:42:22.001208297 -0800
@@ -223,7 +223,7 @@ static void br_multicast_flood(struct ne
 	struct net_bridge_port_group *p;
 	struct hlist_node *rp;
 
-	rp = rcu_dereference(br->router_list.first);
+	rp = rcu_dereference(hlist_first_rcu(&br->router_list));
 	p = mdst ? rcu_dereference(mdst->ports) : NULL;
 	while (p || rp) {
 		struct net_bridge_port *port, *lport, *rport;
@@ -242,7 +242,7 @@ static void br_multicast_flood(struct ne
 		if ((unsigned long)lport >= (unsigned long)port)
 			p = rcu_dereference(p->next);
 		if ((unsigned long)rport >= (unsigned long)port)
-			rp = rcu_dereference(rp->next);
+			rp = rcu_dereference(hlist_next_rcu(rp->next));
 	}
 
 	if (!prev)



^ permalink raw reply

* [PATCH 0/5] bridge: RCU annotation and cleanup
From: Stephen Hemminger @ 2010-11-14 21:12 UTC (permalink / raw)
  To: David Miller, Eric Dumazet; +Cc: netdev, bridge

This is a split up of what Eric did with a couple of small changes and additions.


^ permalink raw reply

* [PATCH 5/5] bridge: add RCU annotations to bridge port lookup
From: Stephen Hemminger @ 2010-11-14 21:12 UTC (permalink / raw)
  To: David Miller, Eric Dumazet; +Cc: netdev, bridge
In-Reply-To: <20101114211201.678755903@vyatta.com>

[-- Attachment #1: bridgeX.patch --]
[-- Type: text/plain, Size: 2502 bytes --]

From: Eric Dumazet <eric.dumazet@gmail.com>

Add modern __rcu annotations to bridge code, to reduce sparse errors,
and self document code.
(CONFIG_SPARSE_RCU_POINTER=y)

br_port_get() split int br_port_get_rtnl()  and br_port_get_rcu()
to make clear RTNL is held.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 net/bridge/br_if.c      |    2 +-
 net/bridge/br_netlink.c |    4 ++--
 net/bridge/br_notify.c  |    4 ++--
 net/bridge/br_private.h |    9 +++++----
 4 files changed, 10 insertions(+), 9 deletions(-)

--- a/net/bridge/br_netlink.c	2010-11-14 12:24:35.000000000 -0800
+++ b/net/bridge/br_netlink.c	2010-11-14 12:26:47.859892139 -0800
@@ -119,7 +119,7 @@ static int br_dump_ifinfo(struct sk_buff
 
 	idx = 0;
 	for_each_netdev(net, dev) {
-		struct net_bridge_port *port = br_port_get(dev);
+		struct net_bridge_port *port = br_port_get_rtnl(dev);
 
 		/* not a bridge port */
 		if (!port || idx < cb->args[0])
@@ -171,7 +171,7 @@ static int br_rtm_setlink(struct sk_buff
 	if (!dev)
 		return -ENODEV;
 
-	p = br_port_get(dev);
+	p = br_port_get_rtnl(dev);
 	if (!p)
 		return -EINVAL;
 
--- a/net/bridge/br_private.h	2010-11-14 12:25:44.600853008 -0800
+++ b/net/bridge/br_private.h	2010-11-14 12:33:23.880986098 -0800
@@ -159,9 +159,10 @@ static inline struct net_bridge_port *br
 		rcu_dereference(dev->rx_handler_data) : NULL;
 }
 
-static inline struct net_bridge_port *br_port_get(struct net_device *dev)
+static inline struct net_bridge_port *br_port_get_rtnl(struct net_device *dev)
 {
-	return br_port_exists(dev) ? dev->rx_handler_data : NULL;
+	return br_port_exists(dev) ?
+		rtnl_dereference(dev->rx_handler_data) : NULL;
 }
 
 #define br_port_get(dev) ((struct net_bridge_port *) dev->rx_handler_data)
--- a/net/bridge/br_if.c	2010-11-14 12:23:34.000000000 -0800
+++ b/net/bridge/br_if.c	2010-11-14 12:27:24.267934764 -0800
@@ -475,7 +475,7 @@ int br_del_if(struct net_bridge *br, str
 {
 	struct net_bridge_port *p;
 
-	p = br_port_get(dev);
+	p = br_port_get_rtnl(dev);
 	if (!p || p->br != br)
 		return -EINVAL;
 
--- a/net/bridge/br_notify.c	2010-11-14 12:30:46.250267273 -0800
+++ b/net/bridge/br_notify.c	2010-11-14 12:31:11.749076964 -0800
@@ -37,10 +37,10 @@ static int br_device_event(struct notifi
 	int err;
 
 	/* not a port of a bridge */
-	if (!br_port_exists(dev))
+	p = br_port_get_rtnl(dev);
+	if (!p)
 		return NOTIFY_DONE;
 
-	p = br_port_get(dev);
 	br = p->br;
 
 	switch (event) {



^ permalink raw reply

* [PATCH 3/5] netdev: add rcu annotations to receive handler hook
From: Stephen Hemminger @ 2010-11-14 21:12 UTC (permalink / raw)
  To: David Miller, Eric Dumazet; +Cc: netdev, bridge
In-Reply-To: <20101114211201.678755903@vyatta.com>

[-- Attachment #1: rx_handler_rcu.patch --]
[-- Type: text/plain, Size: 595 bytes --]

Suggested by Eric's bridge RCU changes.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/linux/netdevice.h |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/include/linux/netdevice.h	2010-11-14 11:41:53.224298362 -0800
+++ b/include/linux/netdevice.h	2010-11-14 11:42:42.546359900 -0800
@@ -995,8 +995,8 @@ struct net_device {
 	unsigned int		real_num_rx_queues;
 #endif
 
-	rx_handler_func_t	*rx_handler;
-	void			*rx_handler_data;
+	rx_handler_func_t __rcu	*rx_handler;
+	void __rcu		*rx_handler_data;
 
 	struct netdev_queue __rcu *ingress_queue;
 



^ permalink raw reply

* Re: [PATCH] net: bnx2x: fix error value sign
From: Eric Dumazet @ 2010-11-14 20:32 UTC (permalink / raw)
  To: Vasiliy Kulikov; +Cc: kernel-janitors, Eilon Greenstein, netdev, linux-kernel
In-Reply-To: <1289766583.2743.144.camel@edumazet-laptop>

Le dimanche 14 novembre 2010 à 21:29 +0100, Eric Dumazet a écrit :

> I remember sending same patch in the past... it was lost somehow...

Ah, it was another issue, patch was not lost ;)




^ permalink raw reply

* Re: [PATCH] net: bnx2x: fix error value sign
From: Eric Dumazet @ 2010-11-14 20:29 UTC (permalink / raw)
  To: Vasiliy Kulikov; +Cc: kernel-janitors, Eilon Greenstein, netdev, linux-kernel
In-Reply-To: <1289765315-6028-1-git-send-email-segoon@openwall.com>

Le dimanche 14 novembre 2010 à 23:08 +0300, Vasiliy Kulikov a écrit :
> bnx2x_init_one() should return negative value on error.
> By mistake it returns ENODEV instead of -ENODEV.
> 
> Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
> ---
>  Compile tested.
> 
>  drivers/net/bnx2x/bnx2x_main.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
> index 4a6f0ea..be52edc 100644
> --- a/drivers/net/bnx2x/bnx2x_main.c
> +++ b/drivers/net/bnx2x/bnx2x_main.c
> @@ -9064,7 +9064,7 @@ static int __devinit bnx2x_init_one(struct pci_dev *pdev,
>  	default:
>  		pr_err("Unknown board_type (%ld), aborting\n",
>  			   ent->driver_data);
> -		return ENODEV;
> +		return -ENODEV;
>  	}
>  
>  	cid_count += CNIC_CONTEXT_USE;

I remember sending same patch in the past... it was lost somehow...

^ permalink raw reply

* Re: [PATCH] tcp: restrict net.ipv4.tcp_adv_min_scale (#20312)
From: Alexey Dobriyan @ 2010-11-14 20:14 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, shemminger, netdev
In-Reply-To: <1289764183.2743.94.camel@edumazet-laptop>

On Sun, Nov 14, 2010 at 08:49:43PM +0100, Eric Dumazet wrote:
> > +static int _minus_31 = -31;
> > +static int _31 = 31;
> 
> Please use normal symbols, not starting by underscore.

static int thirty_one = 31? :-)

^ permalink raw reply

* Re: [PATCH] net: bnx2x: fix error value sign
From: Eilon Greenstein @ 2010-11-14 20:26 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: kernel-janitors@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <1289765315-6028-1-git-send-email-segoon@openwall.com>

On Sun, 2010-11-14 at 12:08 -0800, Vasiliy Kulikov wrote:
> bnx2x_init_one() should return negative value on error.
> By mistake it returns ENODEV instead of -ENODEV.
> 
> Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Thanks Vasiliy!

Acked-by: Eilon Greenstein <eilong@broadcom.com>
> ---
>  Compile tested.
> 
>  drivers/net/bnx2x/bnx2x_main.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
> index 4a6f0ea..be52edc 100644
> --- a/drivers/net/bnx2x/bnx2x_main.c
> +++ b/drivers/net/bnx2x/bnx2x_main.c
> @@ -9064,7 +9064,7 @@ static int __devinit bnx2x_init_one(struct pci_dev *pdev,
>  	default:
>  		pr_err("Unknown board_type (%ld), aborting\n",
>  			   ent->driver_data);
> -		return ENODEV;
> +		return -ENODEV;
>  	}
>  
>  	cid_count += CNIC_CONTEXT_USE;

^ permalink raw reply

* Re: [RFC] pull linux-2.6 into net-next-2.6 ?
From: David Miller @ 2010-11-14 20:21 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1289636478.2743.19.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 13 Nov 2010 09:21:18 +0100

> Andrew pushed to Linus the patch introducing atomic_inc_not_zero_hint()
> 
> (commit 3f9d35b9514da675)
> 
> Would it be possible to get it in net-next-2.6 so that I can start using
> it in network stack ?

Done.

^ permalink raw reply

* [PATCH] net: bnx2x: fix error value sign
From: Vasiliy Kulikov @ 2010-11-14 20:08 UTC (permalink / raw)
  To: kernel-janitors; +Cc: Eilon Greenstein, netdev, linux-kernel

bnx2x_init_one() should return negative value on error.
By mistake it returns ENODEV instead of -ENODEV.

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
---
 Compile tested.

 drivers/net/bnx2x/bnx2x_main.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
index 4a6f0ea..be52edc 100644
--- a/drivers/net/bnx2x/bnx2x_main.c
+++ b/drivers/net/bnx2x/bnx2x_main.c
@@ -9064,7 +9064,7 @@ static int __devinit bnx2x_init_one(struct pci_dev *pdev,
 	default:
 		pr_err("Unknown board_type (%ld), aborting\n",
 			   ent->driver_data);
-		return ENODEV;
+		return -ENODEV;
 	}
 
 	cid_count += CNIC_CONTEXT_USE;
-- 
1.7.0.4


^ permalink raw reply related

* Re: [PATCH/RFC] netfilter: nf_conntrack_sip: Handle quirky Cisco phones
From: Eric Dumazet @ 2010-11-14 19:57 UTC (permalink / raw)
  To: Kevin Cernekee
  Cc: Patrick McHardy, David S. Miller, Alexey Kuznetsov,
	Pekka Savola (ipv6), James Morris, Hideaki YOSHIFUJI,
	netfilter-devel, netfilter, coreteam, linux-kernel, netdev
In-Reply-To: <AANLkTik4PLga8mX73c8iONPeOtpDiuDhfu3xiPmF07jC@mail.gmail.com>

Le dimanche 14 novembre 2010 à 10:33 -0800, Kevin Cernekee a écrit :
> On Sun, Nov 14, 2010 at 12:59 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > I would like to get an exact SIP exchange to make sure their is not
> > another way to handle this without adding a "Cisco" string somewhere...
> >
> > Please provide a pcap or tcpdump -A
> 
> Existing nf_nat_sip: phone sends unauthenticated REGISTER requests
> over and over again, because it is not seeing the replies sent back to
> port 50070:
> 
> 10:05:53.496479 IP 192.168.2.28.50070 > 67.215.241.250.5060: SIP, length: 723
> E`...[..@.r.....C...........REGISTER sip:losangeles.voip.ms SIP/2.0
> Via: SIP/2.0/
> 

Hmm, partial tcpdump... you should use" tcpdump -s 1000 -A" 

We miss the

Via: SIP/2.0/UDP 192.168.2.28:5060;branch=xxxxxxxx


Maybe a fix would be to use this "5060" port, instead of hardcoding it
like you did ?


> 
> Patched nf_nat_sip: router sends the replies back to port 5060, so the
> phone is now able to register itself and make calls:
> 
> 10:09:46.221631 IP 192.168.2.28.50618 > 67.215.241.250.5060: SIP, length: 723
> E`...G..@.p.....C...........REGISTER sip:losangeles.voip.ms SIP/2.0
> Via: SIP/2.0/
> 
> 10:09:46.253052 IP 67.215.241.250.5060 > 192.168.2.28.5060: SIP, length: 491
> E....+..4..$C...............SIP/2.0 100 Trying
> Via: SIP/2.0/UDP 192.168.2.28:5060
> 
> 10:09:46.253472 IP 67.215.241.250.5060 > 192.168.2.28.5060: SIP, length: 550
> E..B.,..4...C...............SIP/2.0 401 Unauthorized
> Via: SIP/2.0/UDP 192.168.2.2
> 
> 10:09:46.261602 IP 192.168.2.28.50618 > 67.215.241.250.5060: SIP, length: 900
> E`...H..@.p.....C...........REGISTER sip:losangeles.voip.ms SIP/2.0
> Via: SIP/2.0/
> 
> 10:09:46.290211 IP 67.215.241.250.5060 > 192.168.2.28.5060: SIP, length: 491
> E....-..4.."C...............SIP/2.0 100 Trying
> Via: SIP/2.0/UDP 192.168.2.28:5060
> 
> 10:09:46.295041 IP 67.215.241.250.5060 > 192.168.2.28.5060: SIP, length: 579
> E.._....4...C............K..SIP/2.0 200 OK
> Via: SIP/2.0/UDP 192.168.2.28:5060;bra
> 
> 
> BTW, I thought of two possible issues with the original patch:
> 
> 1) Might need to call skb_make_writable() prior to modifying the
> packet.  Presumably the second invocation inside
> nf_nat_mangle_udp_packet() will have no effect.
> 
> (Is there a cleaner way to mangle just the port number?  Most of the
> utility functions only help with modifying the data area.)
> 
> 2) I should probably be checking to make sure request == 0 before
> mangling the packet.  The current behavior is harmless if the SIP
> proxy is on port 5060, but that might not always be the case.
> 
> I can roll these, along with any other suggestions, into v2.



^ permalink raw reply

* Re: [PATCH] ipv4: mitigate an integer underflow when comparing tcp timestamps
From: David Miller @ 2010-11-14 19:55 UTC (permalink / raw)
  To: eric.dumazet
  Cc: r0bertz, netdev, linux-kernel, kuznet, pekkas, jmorris, yoshfuji,
	kaber
In-Reply-To: <1289724745.2743.61.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 14 Nov 2010 09:52:25 +0100

> Really, if you have multiple clients behind a common NAT, you cannot use
> this code at all, since NAT doesnt usually change TCP timestamps.

NAT is %100 incompatible with TW recycling, full stop.

There is no maybe, or maybe not.

If you are behind NAT you must not turn this feature on, ever.

^ permalink raw reply

* Re: [PATCH] tcp: restrict net.ipv4.tcp_adv_min_scale (#20312)
From: Eric Dumazet @ 2010-11-14 19:49 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: davem, shemminger, netdev
In-Reply-To: <20101114151825.GA25137@core2.telecom.by>

Le dimanche 14 novembre 2010 à 17:18 +0200, Alexey Dobriyan a écrit :
> tcp_win_from_space() does the following:
> 
> 	if (sysctl_tcp_adv_win_scale <= 0)
> 		return space >> (-sysctl_tcp_adv_win_scale);
> 	else
> 		return space - (space >> sysctl_tcp_adv_win_scale);
> 
> "space" is int.
> 
> As per C99 6.5.7 (3) shifting int for 32 or more bits is
> undefined behaviour.
> 
> Indeed, if sysctl_tcp_adv_win_scale is exactly 32, space >> 32 equals
> space and function returns 0;
> 
> Which means we busyloop in tcp_fixup_rcvbuf().
> 
> Restrict net.ipv4.tcp_adv_win_scale to [-31, 31].
> 
> Fix https://bugzilla.kernel.org/show_bug.cgi?id=20312
> 
> Steps to reproduce:
> 
> 	echo 32 >/proc/sys/net/ipv4/tcp_adv_win_scale
> 	wget www.kernel.org
> 	[softlockup]
> 
> Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
> ---
> 

Please fix your patch title, you speak of adv_min_scale ...

> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -26,6 +26,8 @@ static int zero;
>  static int tcp_retr1_max = 255;
>  static int ip_local_port_range_min[] = { 1, 1 };
>  static int ip_local_port_range_max[] = { 65535, 65535 };
> +static int _minus_31 = -31;
> +static int _31 = 31;

Please use normal symbols, not starting by underscore.

>  
>  /* Update system visible IP port range */
>  static void set_local_port_range(int range[2])
> @@ -426,7 +428,9 @@ static struct ctl_table ipv4_table[] = {
>  		.data		= &sysctl_tcp_adv_win_scale,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
> -		.proc_handler	= proc_dointvec
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &_minus_31,
> +		.extra2		= &_31,
>  	},
>  	{
>  		.procname	= "tcp_tw_reuse",

Please change Documentation/networking/ip-sysctl.txt to reflect possible
values of adv_win_scale

Thanks



^ permalink raw reply

* Re: [BUG]: skge not working (as module) in 2.6.37-rc1
From: Maciej Rutecki @ 2010-11-14 19:45 UTC (permalink / raw)
  To: Marin Mitov
  Cc: Stephen Hemminger, Stephen Hemminger, netdev, linux-kernel,
	David S. Miller
In-Reply-To: <201011072345.52784.mitov@issp.bas.bg>

On niedziela, 7 listopada 2010 o 22:45:52 Marin Mitov wrote:
> Hi Stephen,
> 
> skge as in 2.6.36 (and before) is working.
> As in 2.6.37-rc1 it is not:
> 

I created a Bugzilla entry at 
https://bugzilla.kernel.org/show_bug.cgi?id=22892
for your bug report, please add your address to the CC list in there, thanks!
-- 
Maciej Rutecki
http://www.maciek.unixy.pl

^ permalink raw reply

* Re: sky2 for on-board 88e8055 fails to detect mac address
From: Stephen Hemminger @ 2010-11-14 18:58 UTC (permalink / raw)
  To: Guillaume Leclanche; +Cc: netdev
In-Reply-To: <AANLkTinR9vpuGoizvNsvnpMBM-UnWS66yMjwMQaKSk_c@mail.gmail.com>

On Sun, 14 Nov 2010 12:42:06 +0100
Guillaume Leclanche <guillaume@leclanche.net> wrote:

> Hello,
> 
> I use a "Marvell Technology Group Ltd. 88E8055 PCI-E Gigabit Ethernet
> Controller".
> It seems that the sky2 module doesn't manage to grab the mac address
> from the HW :
> 
> [    0.741173] sky2: driver version 1.28
> [    0.741206] sky2 0000:02:00.0: PCI INT A -> GSI 17 (level, low) -> IRQ 17
> [    0.741216] sky2 0000:02:00.0: setting latency timer to 64
> [    0.741244] sky2 0000:02:00.0: Yukon-2 EC Ultra chip revision 2
> [    0.741339] sky2 0000:02:00.0: irq 44 for MSI/MSI-X
> [    0.744781] sky2 0000:02:00.0: eth0: addr 00:00:00:00:00:00
> 
> Of course I can set the mac address manually with "ip link set eth0
> addr ..." (and then it works fine) but I guess that's not the normal
> behavior from the driver?
> 
> I'd be happy to troubleshoot but I'm not very used to kernel
> internals, so tell me if you need some output.
> Below a few targeted info about my system.
> 
> Guillaume
> 
> 2.6.35-22-generic #35-Ubuntu SMP Sat Oct 16 20:36:48 UTC 2010 i686 GNU/Linux
> 
> 02:00.0 Ethernet controller: Marvell Technology Group Ltd. 88E8055
> PCI-E Gigabit Ethernet Controller
> 	Subsystem: Marvell Technology Group Ltd. 88E8055 PCI-E Gigabit
> Ethernet Controller
> 	Flags: bus master, fast devsel, latency 0, IRQ 44
> 	Memory at fe8fc000 (64-bit, non-prefetchable) [size=16K]
> 	I/O ports at a800 [size=256]
> 	Expansion ROM at fe8c0000 [disabled] [size=128K]
> 	Capabilities: [48] Power Management version 3
> 	Capabilities: [50] Vital Product Data
> 	Capabilities: [5c] MSI: Enable+ Count=1/1 Maskable- 64bit+
> 	Capabilities: [e0] Express Legacy Endpoint, MSI 00
> 	Capabilities: [100] Advanced Error Reporting
> 	Kernel driver in use: sky2
> 	Kernel modules: sky2

The driver reads the MAC address from EEPROM.
There are a couple of possibilities:
1. The driver could have problem when reading the value because of a timing issue.
   BUT I doubt that because when that happens the driver would see ff:ff:ff:ff:ff:ff
2. The BIOS is screwing it up on boot.
   Did you enable the device in the BIOS?
3. The vendor screwed up and didn't program the address into the hardware.

It doesn't look like a kernel/driver problem at all.




^ permalink raw reply

* Re: [PATCH net-next-2.6] bridge: add __rcu annotations
From: Stephen Hemminger @ 2010-11-14 18:47 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1289685861.2743.38.camel@edumazet-laptop>

On Sat, 13 Nov 2010 23:04:21 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Le samedi 13 novembre 2010 à 10:13 -0800, Stephen Hemminger a écrit :
> > On Sat, 13 Nov 2010 18:58:50 +0100
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > 
> > > Le samedi 13 novembre 2010 à 09:35 -0800, Stephen Hemminger a écrit :
> > > > On Sat, 13 Nov 2010 09:15:28 +0100
> > > > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > > 
> > > > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > > > > index 578debb..ffbd177 100644
> > > > > --- a/include/linux/netdevice.h
> > > > > +++ b/include/linux/netdevice.h
> > > > > @@ -996,7 +996,10 @@ struct net_device {
> > > > >  #endif
> > > > >  
> > > > >  	rx_handler_func_t	*rx_handler;
> > > > > -	void			*rx_handler_data;
> > > > > +	union {
> > > > > +		void				*rx_handler_data;
> > > > > +		struct net_bridge_port __rcu	*br_port_rcu;
> > > > > +	};
> > > > >  
> > > > >  	struct netdev_queue __rcu *ingress_queue;
> > > > 
> > > > I don't like making the generic hook typed again.
> > > > We don't do this for other callbacks, timers, workqueues, ...
> > > > Why is it necessary for RCU notation.
> > > > 
> > > 
> > > because rcu_dereference() needs the type for __CHECKER__/sparse checks
> > > 
> > > #define __rcu_dereference_check(p, c, space) \
> > >         ({ \
> > >                 typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
> > >                 rcu_lockdep_assert(c); \
> > >                 rcu_dereference_sparse(p, space); \
> > >                 smp_read_barrier_depends(); \
> > >                 ((typeof(*p) __force __kernel *)(_________p1)); \
> > >         })
> > > 
> > > So using a "void *ptr" is not an option
> > > 
> > > Its also cleaner to use
> > > 
> > > rcu_dereference(dev->br_port_rcu)
> > > 
> > > instead of 
> > > 
> > > (struct net_bridge_port *)rcu_dereference(dev->rx_handler_data)
> > 
> > There must be a better way. What about use of that hook by macvlan and openvswitch?
> 
> macvlan and openvswitch (is it part of linux yet ???)
> 
> I honestly dont understand your point Stephen, maybe you could explain a
> bit more what is the problem ?
> 
> I use a union, like many other ones in the kernel. This is the first
> time I ear this is not good to add type safety.
> 
> You can use either one or other field at your convenience.
> 
> If you are talking about stacking hooks, that has nothing to do with
> this (cleanup) rcu patch, but previous introduction of
> rx_handler_data/rx_handler ?
> 
> Please run sparse on x86_64 machine and watch all the warnings in bridge
> code. (with CONFIG_SPARSE_RCU_POINTER=y)
> 
> Me confused.
> 
> 

You combined three different sets of changes and introduced a needless
union. I will split them up and show you what I want.


-- 

^ permalink raw reply

* Re: [PATCH/RFC] netfilter: nf_conntrack_sip: Handle quirky Cisco phones
From: Kevin Cernekee @ 2010-11-14 18:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, David S. Miller, Alexey Kuznetsov,
	Pekka Savola (ipv6), James Morris, Hideaki YOSHIFUJI,
	netfilter-devel, netfilter, coreteam, linux-kernel, netdev
In-Reply-To: <1289725175.2743.65.camel@edumazet-laptop>

On Sun, Nov 14, 2010 at 12:59 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> I would like to get an exact SIP exchange to make sure their is not
> another way to handle this without adding a "Cisco" string somewhere...
>
> Please provide a pcap or tcpdump -A

Existing nf_nat_sip: phone sends unauthenticated REGISTER requests
over and over again, because it is not seeing the replies sent back to
port 50070:

10:05:53.496479 IP 192.168.2.28.50070 > 67.215.241.250.5060: SIP, length: 723
E`...[..@.r.....C...........REGISTER sip:losangeles.voip.ms SIP/2.0
Via: SIP/2.0/

10:05:53.587370 IP 67.215.241.250.5060 > 192.168.2.28.50070: SIP, length: 486
E.......3..fC...............SIP/2.0 100 Trying
Via: SIP/2.0/UDP 192.168.2.28:5060

10:05:53.587807 IP 67.215.241.250.5060 > 192.168.2.28.50070: SIP, length: 550
E..B....3..%C...............SIP/2.0 401 Unauthorized
Via: SIP/2.0/UDP 192.168.2.2

10:05:57.496541 IP 192.168.2.28.50070 > 67.215.241.250.5060: SIP, length: 723
E`...\..@.r.....C...........REGISTER sip:losangeles.voip.ms SIP/2.0
Via: SIP/2.0/

10:05:57.526716 IP 67.215.241.250.5060 > 192.168.2.28.50070: SIP, length: 486
E.......3..dC...............SIP/2.0 100 Trying
Via: SIP/2.0/UDP 192.168.2.28:5060

10:05:57.527162 IP 67.215.241.250.5060 > 192.168.2.28.50070: SIP, length: 550
E..B....3..#C...............SIP/2.0 401 Unauthorized
Via: SIP/2.0/UDP 192.168.2.2

10:06:01.486821 IP 192.168.2.28.50070 > 67.215.241.250.5060: SIP, length: 723
E`...]..@.r.....C...........REGISTER sip:losangeles.voip.ms SIP/2.0
Via: SIP/2.0/

10:06:01.515611 IP 67.215.241.250.5060 > 192.168.2.28.50070: SIP, length: 486
E.......3..bC...............SIP/2.0 100 Trying
Via: SIP/2.0/UDP 192.168.2.28:5060

10:06:01.516024 IP 67.215.241.250.5060 > 192.168.2.28.50070: SIP, length: 550
E..B....3..!C...............SIP/2.0 401 Unauthorized
Via: SIP/2.0/UDP 192.168.2.2

... continues forever ...


Patched nf_nat_sip: router sends the replies back to port 5060, so the
phone is now able to register itself and make calls:

10:09:46.221631 IP 192.168.2.28.50618 > 67.215.241.250.5060: SIP, length: 723
E`...G..@.p.....C...........REGISTER sip:losangeles.voip.ms SIP/2.0
Via: SIP/2.0/

10:09:46.253052 IP 67.215.241.250.5060 > 192.168.2.28.5060: SIP, length: 491
E....+..4..$C...............SIP/2.0 100 Trying
Via: SIP/2.0/UDP 192.168.2.28:5060

10:09:46.253472 IP 67.215.241.250.5060 > 192.168.2.28.5060: SIP, length: 550
E..B.,..4...C...............SIP/2.0 401 Unauthorized
Via: SIP/2.0/UDP 192.168.2.2

10:09:46.261602 IP 192.168.2.28.50618 > 67.215.241.250.5060: SIP, length: 900
E`...H..@.p.....C...........REGISTER sip:losangeles.voip.ms SIP/2.0
Via: SIP/2.0/

10:09:46.290211 IP 67.215.241.250.5060 > 192.168.2.28.5060: SIP, length: 491
E....-..4.."C...............SIP/2.0 100 Trying
Via: SIP/2.0/UDP 192.168.2.28:5060

10:09:46.295041 IP 67.215.241.250.5060 > 192.168.2.28.5060: SIP, length: 579
E.._....4...C............K..SIP/2.0 200 OK
Via: SIP/2.0/UDP 192.168.2.28:5060;bra


BTW, I thought of two possible issues with the original patch:

1) Might need to call skb_make_writable() prior to modifying the
packet.  Presumably the second invocation inside
nf_nat_mangle_udp_packet() will have no effect.

(Is there a cleaner way to mangle just the port number?  Most of the
utility functions only help with modifying the data area.)

2) I should probably be checking to make sure request == 0 before
mangling the packet.  The current behavior is harmless if the SIP
proxy is on port 5060, but that might not always be the case.

I can roll these, along with any other suggestions, into v2.

^ permalink raw reply

* Attention Please,  Investment Initiative From Jozef Ruud Roosevelt.
From: Jozef Ruud Roosevelt @ 2010-11-14 14:47 UTC (permalink / raw)





Attention Please,

I am Jozef Ruud Roosevelt, a Dutch National presently residing in
Newcastle United Kingdom, I have a proposition Involving an investment
initiative in your country economy  to discuss with you, It will be of
mutual benefit to both of us, and I believe we can handle it together,
once we have a common understanding and mutual cooperation in the
execution of the modalities. I work with Abn Amro Bank as an auditor.

Should you be interested, please e-mail back to me through this email
address: (jozef.rosevelt@live.co.uk) or you can give me a call (+44 702
4056 175).
I await your earliest response.

Yours Sincerely,
Jozef Ruud Roosevel.


^ permalink raw reply

* [PATCH] net: ipv4: tcp_probe: cleanup snprintf() use
From: Vasiliy Kulikov @ 2010-11-14 17:06 UTC (permalink / raw)
  To: kernel-janitors
  Cc: David S. Miller, Alexey Kuznetsov, Pekka Savola (ipv6),
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, netdev,
	linux-kernel

snprintf() returns number of bytes that were copied if there is no overflow.
This code uses return value as number of copied bytes.  Theoretically format
string '%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n' may be expanded
up to 163 bytes.  In reality tv.tv_sec is just few bytes instead of 20, 2 ports
are just 5 bytes each instead of 10, length is 5 bytes instead of 10.  The rest
is an unstrusted input.  Theoretically if tv_sec is big then copy_to_user() would
overflow tbuf.

tbuf was increased to fit in 163 bytes.  snprintf() is used to follow return
value semantic.

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
---
 Compile tested.

Format length:
	20 for '%lu'
	1 for '.'
	9 for '%09lu'
	1 for ' '
	15 for '%pI4'
	1 for ':'
	10 for '%u'
	1 for ' '
	15 for '%pI4'
	1 for ':'
	10 for '%u'
	1 for ' '
	11 for '%d'
	1 for ' '
	10 for '%#x'
	1 for ' '
	10 for '%#x'
	1 for ' '
	10 for '%u'
	1 for ' '
	10 for '%u'
	1 for ' '
	10 for '%u'
	1 for ' '
	10 for '%u'
	1 for '\n'
163 for '%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n'

 net/ipv4/tcp_probe.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 6211e21..3b7bf19 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -154,7 +154,7 @@ static int tcpprobe_sprint(char *tbuf, int n)
 	struct timespec tv
 		= ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
 
-	return snprintf(tbuf, n,
+	return scnprintf(tbuf, n,
 			"%lu.%09lu %pI4:%u %pI4:%u %d %#x %#x %u %u %u %u\n",
 			(unsigned long) tv.tv_sec,
 			(unsigned long) tv.tv_nsec,
@@ -174,7 +174,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
 		return -EINVAL;
 
 	while (cnt < len) {
-		char tbuf[128];
+		char tbuf[164];
 		int width;
 
 		/* Wait for data in buffer */
-- 
1.7.0.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox