Netdev List

Netdev List
 help / color / mirror / Atom feed

* [Patch 3/3] net: reserve ports for applications using fixed port numbers
From: Amerigo Wang @ 2010-04-12 10:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, Amerigo Wang, David Miller, ebiederm
In-Reply-To: <20100412100744.5302.92442.sendpatchset@localhost.localdomain>

From: Octavian Purdila <opurdila@ixiacom.com>

This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
allows users to reserve ports for third-party applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/Documentation/networking/ip-sysctl.txt
===================================================================
--- linux-2.6.orig/Documentation/networking/ip-sysctl.txt
+++ linux-2.6/Documentation/networking/ip-sysctl.txt
@@ -588,6 +588,37 @@ ip_local_port_range - 2 INTEGERS
 	(i.e. by default) range 1024-4999 is enough to issue up to
 	2000 connections per second to systems supporting timestamps.
 
+ip_local_reserved_ports - list of comma separated ranges
+	Specify the ports which are reserved for known third-party
+	applications. These ports will not be used by automatic port
+	assignments (e.g. when calling connect() or bind() with port
+	number 0). Explicit port allocation behavior is unchanged.
+
+	The format used for both input and output is a comma separated
+	list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and
+	10). Writing to the file will clear all previously reserved
+	ports and update the current list with the one given in the
+	input.
+
+	Note that ip_local_port_range and ip_local_reserved_ports
+	settings are independent and both are considered by the kernel
+	when determining which ports are available for automatic port
+	assignments.
+
+	You can reserve ports which are not in the current
+	ip_local_port_range, e.g.:
+
+	$ cat /proc/sys/net/ipv4/ip_local_port_range
+	32000	61000
+	$ cat /proc/sys/net/ipv4/ip_local_reserved_ports
+	8080,9148
+
+	although this is redundant. However such a setting is useful
+	if later the port range is changed to a value that will
+	include the reserved ports.
+
+	Default: Empty
+
 ip_nonlocal_bind - BOOLEAN
 	If set, allows processes to bind() to non-local IP addresses,
 	which can be quite useful - but may break some applications.
Index: linux-2.6/drivers/infiniband/core/cma.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/core/cma.c
+++ linux-2.6/drivers/infiniband/core/cma.c
@@ -1980,6 +1980,8 @@ retry:
 	/* FIXME: add proper port randomization per like inet_csk_get_port */
 	do {
 		ret = idr_get_new_above(ps, bind_list, next_port, &port);
+		if (!ret && inet_is_reserved_local_port(port))
+			ret = -EAGAIN;
 	} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
 
 	if (ret)
@@ -2995,11 +2997,19 @@ static void cma_remove_one(struct ib_dev
 static int __init cma_init(void)
 {
 	int ret, low, high, remaining;
+	int tries = 10;
 
-	get_random_bytes(&next_port, sizeof next_port);
 	inet_get_local_port_range(&low, &high);
+again:
+	get_random_bytes(&next_port, sizeof next_port);
 	remaining = (high - low) + 1;
 	next_port = ((unsigned int) next_port % remaining) + low;
+	if (inet_is_reserved_local_port(next_port)) {
+		if (tries--)
+			goto again;
+		else
+			return -EBUSY;
+	}
 
 	cma_wq = create_singlethread_workqueue("rdma_cm");
 	if (!cma_wq)
Index: linux-2.6/include/net/ip.h
===================================================================
--- linux-2.6.orig/include/net/ip.h
+++ linux-2.6/include/net/ip.h
@@ -184,6 +184,12 @@ extern struct local_ports {
 } sysctl_local_ports;
 extern void inet_get_local_port_range(int *low, int *high);
 
+extern unsigned long *sysctl_local_reserved_ports;
+static inline int inet_is_reserved_local_port(int port)
+{
+	return test_bit(port, sysctl_local_reserved_ports);
+}
+
 extern int sysctl_ip_default_ttl;
 extern int sysctl_ip_nonlocal_bind;
 
Index: linux-2.6/net/ipv4/af_inet.c
===================================================================
--- linux-2.6.orig/net/ipv4/af_inet.c
+++ linux-2.6/net/ipv4/af_inet.c
@@ -1552,9 +1552,13 @@ static int __init inet_init(void)
 
 	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
 
+	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!sysctl_local_reserved_ports)
+		goto out;
+
 	rc = proto_register(&tcp_prot, 1);
 	if (rc)
-		goto out;
+		goto out_free_reserved_ports;
 
 	rc = proto_register(&udp_prot, 1);
 	if (rc)
@@ -1653,6 +1657,8 @@ out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
 	proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+	kfree(sysctl_local_reserved_ports);
 	goto out;
 }
 
Index: linux-2.6/net/ipv4/inet_connection_sock.c
===================================================================
--- linux-2.6.orig/net/ipv4/inet_connection_sock.c
+++ linux-2.6/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __
 	.range = { 32768, 61000 },
 };
 
+unsigned long *sysctl_local_reserved_ports;
+EXPORT_SYMBOL(sysctl_local_reserved_ports);
+
 void inet_get_local_port_range(int *low, int *high)
 {
 	unsigned seq;
@@ -108,6 +111,8 @@ again:
 
 		smallest_size = -1;
 		do {
+			if (inet_is_reserved_local_port(rover))
+				goto next_nolock;
 			head = &hashinfo->bhash[inet_bhashfn(net, rover,
 					hashinfo->bhash_size)];
 			spin_lock(&head->lock);
@@ -130,6 +135,7 @@ again:
 			break;
 		next:
 			spin_unlock(&head->lock);
+		next_nolock:
 			if (++rover > high)
 				rover = low;
 		} while (--remaining > 0);
Index: linux-2.6/net/ipv4/inet_hashtables.c
===================================================================
--- linux-2.6.orig/net/ipv4/inet_hashtables.c
+++ linux-2.6/net/ipv4/inet_hashtables.c
@@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_time
 		local_bh_disable();
 		for (i = 1; i <= remaining; i++) {
 			port = low + (i + offset) % remaining;
+			if (inet_is_reserved_local_port(port))
+				continue;
 			head = &hinfo->bhash[inet_bhashfn(net, port,
 					hinfo->bhash_size)];
 			spin_lock(&head->lock);
Index: linux-2.6/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- linux-2.6.orig/net/ipv4/sysctl_net_ipv4.c
+++ linux-2.6/net/ipv4/sysctl_net_ipv4.c
@@ -299,6 +299,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_local_port_range,
 	},
+	{
+		.procname	= "ip_local_reserved_ports",
+		.data		= NULL, /* initialized in sysctl_ipv4_init */
+		.maxlen		= 65536,
+		.mode		= 0644,
+		.proc_handler	= proc_do_large_bitmap,
+	},
 #ifdef CONFIG_IP_MULTICAST
 	{
 		.procname	= "igmp_max_memberships",
@@ -736,6 +743,16 @@ static __net_initdata struct pernet_oper
 static __init int sysctl_ipv4_init(void)
 {
 	struct ctl_table_header *hdr;
+	struct ctl_table *i;
+
+	for (i = ipv4_table; i->procname; i++) {
+		if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
+			i->data = sysctl_local_reserved_ports;
+			break;
+		}
+	}
+	if (!i->procname)
+		return -EINVAL;
 
 	hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
 	if (hdr == NULL)
Index: linux-2.6/net/ipv4/udp.c
===================================================================
--- linux-2.6.orig/net/ipv4/udp.c
+++ linux-2.6/net/ipv4/udp.c
@@ -233,7 +233,8 @@ int udp_lib_get_port(struct sock *sk, un
 			 */
 			do {
 				if (low <= snum && snum <= high &&
-				    !test_bit(snum >> udptable->log, bitmap))
+				    !test_bit(snum >> udptable->log, bitmap) &&
+				    !inet_is_reserved_local_port(snum))
 					goto found;
 				snum += rand;
 			} while (snum != first);
Index: linux-2.6/net/sctp/socket.c
===================================================================
--- linux-2.6.orig/net/sctp/socket.c
+++ linux-2.6/net/sctp/socket.c
@@ -5436,6 +5436,8 @@ static long sctp_get_port_local(struct s
 			rover++;
 			if ((rover < low) || (rover > high))
 				rover = low;
+			if (inet_is_reserved_local_port(rover))
+				continue;
 			index = sctp_phashfn(rover);
 			head = &sctp_port_hashtable[index];
 			sctp_spin_lock(&head->lock);

^ permalink raw reply

* Re: [Patch 1/3] sysctl: refactor integer handling proc code
From: Alexey Dobriyan @ 2010-04-12 10:18 UTC (permalink / raw)
  To: Amerigo Wang
  Cc: linux-kernel, Octavian Purdila, Eric Dumazet, penguin-kernel,
	netdev, Neil Horman, ebiederm, David Miller
In-Reply-To: <20100412100754.5302.99552.sendpatchset@localhost.localdomain>

On Mon, Apr 12, 2010 at 06:04:04AM -0400, Amerigo Wang wrote:
> As we are about to add another integer handling proc function a little
> bit of cleanup is in order: add a few helper functions to improve code
> readability and decrease code duplication.
> 
> In the process a bug is also fixed: if the user specifies a number
> with more than 20 digits it will be interpreted as two integers
> (e.g. 10000...13 will be interpreted as 100.... and 13).

ULONG_MAX is not always 22 digits.

The fix is to not rely on simple_strtoul()

I guess it's time to finally remove it. :-(

Also, it's better to copy_from_user() the data once.
Without looking at non-trivial users, one page should be enough.

> Behavior for EFAULT handling was changed as well. Previous to this
> patch, when an EFAULT error occurred in the middle of a write
> operation, although some of the elements were set, that was not
> acknowledged to the user (by shorting the write and returning the
> number of bytes accepted). EFAULT is now treated just like any other
> errors by acknowledging the amount of bytes accepted.

> +static int proc_skip_wspace(char __user **buf, size_t *size)
> +{
> +	char c;
> +
> +	while (*size) {
> +		if (get_user(c, *buf))
> +			return -EFAULT;
> +		if (!isspace(c))
> +			break;
> +		(*size)--;
> +		(*buf)++;
> +	}
> +
> +	return 0;
> +}

yeah, copy_from_user once, so we won't have this.

> +static bool isanyof(char c, const char *v, unsigned len)

A what?
this is memchr()

> +{
> +	int i;
> +
> +	if (!len)
> +		return false;
> +
> +	for (i = 0; i < len; i++)
> +		if (c == v[i])
> +			break;
> +	if (i == len)
> +		return false;
> +
> +	return true;
> +}
> +
> +#define TMPBUFLEN 22
> +/**
> + * proc_get_long - reads an ASCII formatted integer from a user buffer
> + *
> + * @buf - user buffer
> + * @size - size of the user buffer
> + * @val - this is where the number will be stored
> + * @neg - set to %TRUE if number is negative
> + * @perm_tr - a vector which contains the allowed trailers
> + * @perm_tr_len - size of the perm_tr vector
> + * @tr - pointer to store the trailer character
> + *
> + * In case of success 0 is returned and buf and size are updated with
> + * the amount of bytes read. If tr is non NULL and a trailing
> + * character exist (size is non zero after returning from this
> + * function) tr is updated with the trailing character.
> + */
> +static int proc_get_long(char __user **buf, size_t *size,
> +			  unsigned long *val, bool *neg,
> +			  const char *perm_tr, unsigned perm_tr_len, char *tr)
> +{
> +	int len;
> +	char *p, tmp[TMPBUFLEN];
> +
> +	if (!*size)
> +		return -EINVAL;
> +
> +	len = *size;
> +	if (len > TMPBUFLEN-1)
> +		len = TMPBUFLEN-1;
> +
> +	if (copy_from_user(tmp, *buf, len))
> +		return -EFAULT;
> +
> +	tmp[len] = 0;
> +	p = tmp;
> +	if (*p == '-' && *size > 1) {
> +		*neg = 1;
> +		p++;
> +	} else
> +		*neg = 0;
> +	if (!isdigit(*p))
> +		return -EINVAL;
> +
> +	*val = simple_strtoul(p, &p, 0);
> +
> +	len = p - tmp;
> +
> +	/* We don't know if the next char is whitespace thus we may accept
> +	 * invalid integers (e.g. 1234...a) or two integers instead of one
> +	 * (e.g. 123...1). So lets not allow such large numbers. */
> +	if (len == TMPBUFLEN - 1)
> +		return -EINVAL;
> +
> +	if (len < *size && perm_tr_len && !isanyof(*p, perm_tr, perm_tr_len))
> +		return -EINVAL;
> +
> +	if (tr && (len < *size))
> +		*tr = *p;
> +
> +	*buf += len;
> +	*size -= len;
> +
> +	return 0;
> +}

^ permalink raw reply

* Re: [Patch 1/3] sysctl: refactor integer handling proc code
From: Alexey Dobriyan @ 2010-04-12 10:18 UTC (permalink / raw)
  To: Amerigo Wang
  Cc: linux-kernel, Octavian Purdila, Eric Dumazet, penguin-kernel,
	netdev, Neil Horman, ebiederm, David Miller
In-Reply-To: <20100412100754.5302.99552.sendpatchset@localhost.localdomain>

On Mon, Apr 12, 2010 at 06:04:04AM -0400, Amerigo Wang wrote:
> As we are about to add another integer handling proc function a little
> bit of cleanup is in order: add a few helper functions to improve code
> readability and decrease code duplication.
> 
> In the process a bug is also fixed: if the user specifies a number
> with more than 20 digits it will be interpreted as two integers
> (e.g. 10000...13 will be interpreted as 100.... and 13).

ULONG_MAX is not always 22 digits.

The fix is to not rely on simple_strtoul()

I guess it's time to finally remove it. :-(

Also, it's better to copy_from_user() the data once.
Without looking at non-trivial users, one page should be enough.

> Behavior for EFAULT handling was changed as well. Previous to this
> patch, when an EFAULT error occurred in the middle of a write
> operation, although some of the elements were set, that was not
> acknowledged to the user (by shorting the write and returning the
> number of bytes accepted). EFAULT is now treated just like any other
> errors by acknowledging the amount of bytes accepted.

> +static int proc_skip_wspace(char __user **buf, size_t *size)
> +{
> +	char c;
> +
> +	while (*size) {
> +		if (get_user(c, *buf))
> +			return -EFAULT;
> +		if (!isspace(c))
> +			break;
> +		(*size)--;
> +		(*buf)++;
> +	}
> +
> +	return 0;
> +}

yeah, copy_from_user once, so we won't have this.

> +static bool isanyof(char c, const char *v, unsigned len)

A what?
this is memchr()

> +{
> +	int i;
> +
> +	if (!len)
> +		return false;
> +
> +	for (i = 0; i < len; i++)
> +		if (c == v[i])
> +			break;
> +	if (i == len)
> +		return false;
> +
> +	return true;
> +}
> +
> +#define TMPBUFLEN 22
> +/**
> + * proc_get_long - reads an ASCII formatted integer from a user buffer
> + *
> + * @buf - user buffer
> + * @size - size of the user buffer
> + * @val - this is where the number will be stored
> + * @neg - set to %TRUE if number is negative
> + * @perm_tr - a vector which contains the allowed trailers
> + * @perm_tr_len - size of the perm_tr vector
> + * @tr - pointer to store the trailer character
> + *
> + * In case of success 0 is returned and buf and size are updated with
> + * the amount of bytes read. If tr is non NULL and a trailing
> + * character exist (size is non zero after returning from this
> + * function) tr is updated with the trailing character.
> + */
> +static int proc_get_long(char __user **buf, size_t *size,
> +			  unsigned long *val, bool *neg,
> +			  const char *perm_tr, unsigned perm_tr_len, char *tr)
> +{
> +	int len;
> +	char *p, tmp[TMPBUFLEN];
> +
> +	if (!*size)
> +		return -EINVAL;
> +
> +	len = *size;
> +	if (len > TMPBUFLEN-1)
> +		len = TMPBUFLEN-1;
> +
> +	if (copy_from_user(tmp, *buf, len))
> +		return -EFAULT;
> +
> +	tmp[len] = 0;
> +	p = tmp;
> +	if (*p == '-' && *size > 1) {
> +		*neg = 1;
> +		p++;
> +	} else
> +		*neg = 0;
> +	if (!isdigit(*p))
> +		return -EINVAL;
> +
> +	*val = simple_strtoul(p, &p, 0);
> +
> +	len = p - tmp;
> +
> +	/* We don't know if the next char is whitespace thus we may accept
> +	 * invalid integers (e.g. 1234...a) or two integers instead of one
> +	 * (e.g. 123...1). So lets not allow such large numbers. */
> +	if (len == TMPBUFLEN - 1)
> +		return -EINVAL;
> +
> +	if (len < *size && perm_tr_len && !isanyof(*p, perm_tr, perm_tr_len))
> +		return -EINVAL;
> +
> +	if (tr && (len < *size))
> +		*tr = *p;
> +
> +	*buf += len;
> +	*size -= len;
> +
> +	return 0;
> +}

^ permalink raw reply

* Re: [v3 Patch 2/3] bridge: make bridge support netpoll
From: Cong Wang @ 2010-04-12 10:37 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: linux-kernel, netdev, bridge, Andy Gospodarek, Neil Horman,
	Jeff Moyer, Matt Mackall, bonding-devel, Jay Vosburgh,
	David Miller
In-Reply-To: <20100408083710.2b61ee44@nehalam>

Stephen Hemminger wrote:
>> Index: linux-2.6/net/bridge/br_forward.c
>> ===================================================================
>> --- linux-2.6.orig/net/bridge/br_forward.c
>> +++ linux-2.6/net/bridge/br_forward.c
>> @@ -15,6 +15,7 @@
>>  #include <linux/slab.h>
>>  #include <linux/kernel.h>
>>  #include <linux/netdevice.h>
>> +#include <linux/netpoll.h>
>>  #include <linux/skbuff.h>
>>  #include <linux/if_vlan.h>
>>  #include <linux/netfilter_bridge.h>
>> @@ -50,7 +51,13 @@ int br_dev_queue_push_xmit(struct sk_buf
>>  		else {
>>  			skb_push(skb, ETH_HLEN);
>>  
>> -			dev_queue_xmit(skb);
>> +#ifdef CONFIG_NET_POLL_CONTROLLER
>> +			if (skb->dev->priv_flags & IFF_IN_NETPOLL) {
>> +				netpoll_send_skb(skb->dev->npinfo->netpoll, skb);
>> +				skb->dev->priv_flags &= ~IFF_IN_NETPOLL;
>> +			} else
>> +#endif
> 
> There is no protection on dev->priv_flags for SMP access.
> It would be better to use a bit value in dev->state if you are using it as a control flag.
> 
> Then you could use 
> 			if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
> 				netpoll_send_skb(...)
> 
> 

Hmm, I think we can't use ->state here, it is not for this kind of purpose,
according to its comments.

Also, I find other usages of IFF_XXX flags of ->priv_flags are also using
&, | to set or clear the flags. So there must be some other things preventing
the race...


Thanks.

^ permalink raw reply

* Re: [v3 Patch 2/3] bridge: make bridge support netpoll
From: Eric Dumazet @ 2010-04-12 10:38 UTC (permalink / raw)
  To: Cong Wang
  Cc: Stephen Hemminger, linux-kernel, netdev, bridge, Andy Gospodarek,
	Neil Horman, Jeff Moyer, Matt Mackall, bonding-devel,
	Jay Vosburgh, David Miller
In-Reply-To: <4BC2F7E2.7020309@redhat.com>

Le lundi 12 avril 2010 à 18:37 +0800, Cong Wang a écrit :
> Stephen Hemminger wrote:
> > There is no protection on dev->priv_flags for SMP access.
> > It would be better to use a bit value in dev->state if you are using it as a control flag.
> > 
> > Then you could use 
> > 			if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
> > 				netpoll_send_skb(...)
> > 
> > 
> 
> Hmm, I think we can't use ->state here, it is not for this kind of purpose,
> according to its comments.
> 
> Also, I find other usages of IFF_XXX flags of ->priv_flags are also using
> &, | to set or clear the flags. So there must be some other things preventing
> the race...

Yes, it's RTNL that protects priv_flags changes, hopefully...

^ permalink raw reply

* [PATCH] iproute2: add option to build m_xt as a tc module.
From: Andreas Henriksson @ 2010-04-12 11:55 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

Add TC_CONFIG_XT_MODULE option that can be added
either to Config (after ./configure) or as an argument to "make".

This will build the xt module (action ipt) of tc as a
shared object that is linked at runtime by tc if used,
rather than built into tc.

This is similar to how the atm qdisc support
is handled (q_atm.so).

Signed-off-by: Andreas Henriksson <andreas@fatal.se>

---

The reason for this is simply to keep
the tc binary from being linked to libxtables. This way
distributions who ship binary packages can
avoid a dependency on the iptables package by
ignoring m_xt.so in the dependency analysis
and let actual users of the tc arguments "action ipt"
make sure they have iptables installed.
(See http://bugs.debian.org/576953 )

This was not a problem with the old/deprecated
m_ipt module which did runtime linking of
the iptables library.

Having the split inside tc, rather than between tc and the required
library, is preferred. This way we'll notice at build-time
when the required library breaks API/ABI rather
than having to rely on people who use the functionality
to report back when the ABI is broken.
(We've learned this the hard way in Debian after many
angry bug reports.)

I've had jamal pre-review this and he didn't see any
problems with this.

diff --git a/tc/Makefile b/tc/Makefile
index 805c108..3af33cf 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -43,10 +43,18 @@ TCMODULES += em_cmp.o
 TCMODULES += em_u32.o
 TCMODULES += em_meta.o

+TCSO :=
+ifeq ($(TC_CONFIG_ATM),y)
+  TCSO += q_atm.so
+endif

 ifeq ($(TC_CONFIG_XT),y)
-  TCMODULES += m_xt.o
-  LDLIBS += -lxtables
+  ifeq ($(TC_CONFIG_XT_MODULE),y)
+    TCSO += m_xt.so
+  else
+    TCMODULES += m_xt.o
+    LDLIBS += -lxtables
+  endif
 else
   ifeq ($(TC_CONFIG_XT_OLD),y)
     TCMODULES += m_xt_old.o
@@ -81,11 +89,6 @@ ifneq ($(IPT_LIB_DIR),)
 	CFLAGS += -DIPT_LIB_DIR=\"$(IPT_LIB_DIR)\"
 endif

-TCSO :=
-ifeq ($(TC_CONFIG_ATM),y)
-  TCSO += q_atm.so
-endif
-
 YACC := bison
 LEX := flex

@@ -114,6 +117,9 @@ clean:
 q_atm.so: q_atm.c
 	$(CC) $(CFLAGS) $(LDFLAGS) -shared -fpic -o q_atm.so q_atm.c -latm

+m_xt.so: m_xt.c
+	$(CC) $(CFLAGS) $(LDFLAGS) -shared -fpic -o m_xt.so m_xt.c -lxtables
+
 %.yacc.c: %.y
 	$(YACC) $(YACCFLAGS) -o $@ $<

^ permalink raw reply related

* Bug#572201: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 12:39 UTC (permalink / raw)
  To: netdev; +Cc: Ben Hutchings, Eric Dumazet, Ayaz Abdulla, 572201
In-Reply-To: <4BC2EF88.3060203@atlanticlinux.ie>

stephen mulcahy wrote:
> It doesn't - further testing over the weekend saw 6 of 45 machines drop 
> off the network with this problem. Nothing in dmesg or system logs. 
> Happy to run more tests if someone can advise on what should be run.

I also just tried using the 2.6.30-2-amd64 (Debian) forcedeth kernel 
module while running the 2.6.32-3-amd64 (Debian) kernel and experienced 
the same symptoms.

Not sure if that's any help.

-stephen

^ permalink raw reply

* Bug#572201: forcedeth driver hangs under heavy load
From: Eric Dumazet @ 2010-04-12 12:47 UTC (permalink / raw)
  To: stephen mulcahy; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <4BC31486.1090603@gmail.com>

Le lundi 12 avril 2010 à 13:39 +0100, stephen mulcahy a écrit :
> stephen mulcahy wrote:
> > It doesn't - further testing over the weekend saw 6 of 45 machines drop 
> > off the network with this problem. Nothing in dmesg or system logs. 
> > Happy to run more tests if someone can advise on what should be run.
> 
> I also just tried using the 2.6.30-2-amd64 (Debian) forcedeth kernel 
> module while running the 2.6.32-3-amd64 (Debian) kernel and experienced 
> the same symptoms.
> 
> Not sure if that's any help.
> 

I am not sure I understand. Are you saying that using the 2.6.30-2-amd64
kernel also makes your forcedeth adapter non-functional?

Are both directions non-functional (RX and TX), or only one side?






-- 
To UNSUBSCRIBE, email to debian-bugs-dist-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org

^ permalink raw reply

* Re: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 13:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <1271076426.16881.21.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le lundi 12 avril 2010 à 13:39 +0100, stephen mulcahy a écrit :
> I am not sure I understand. Are you saying that using 2.6.30-2-amd64
> kernel also makes your forcedeth adapter being not functional ?

Hi Eric,

If I run my tests with the 2.6.30-2-amd64 kernel the network doesn't 
malfunction.

If I run my tests with the 2.6.32-3-amd64 kernel the network does 
malfunction.

If I take the forcedeth.ko module from the 2.6.30-2-amd64 kernel and 
drop that into /lib/modules/2.6.32-3-amd64/kernel/drivers/net/ and then 
reboot to 2.6.32-3-amd64 and rerun my tests - the network does malfunction.

> Are both way non functional (RX and TX), or only one side ?

What's the best way of testing this? (tcpdump listening on both hosts and
then running pings between the systems?)

-stephen

^ permalink raw reply

* Re: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 13:19 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <4BC31AA0.5070006@gmail.com>

stephen mulcahy wrote:
>> Are both way non functional (RX and TX), or only one side ?
> 
> Whats the best way of testing this? (tcpdump listening on both hosts and 
> then running pings between the systems?)


stephen mulcahy wrote:
 >> Are both way non functional (RX and TX), or only one side ?
 >
 > Whats the best way of testing this? (tcpdump listening on both hosts and
 > then running pings between the systems?)

On one of the nodes that is in the malfunctioning state (node05), I ran

ssh node20

and grabbed the following output from running tcpdump on node20

root@node20:~# tcpdump host node20 and node05
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 96 bytes
14:12:59.612626 IP node05.webstar.cnet.36295 > node20.ssh: Flags [S], 
seq 3677858646, win 5840, options [mss 1460,sackOK,TS val 1599534 ecr 
0,nop,wscale 7], length 0
14:12:59.612656 IP node20.ssh > node05.webstar.cnet.36295: Flags [S.], 
seq 3610575850, ack 3677858647, win 5792, options [mss 1460,sackOK,TS 
val 1598775 ecr 1599534,nop,wscale 7], length 0
14:12:59.612718 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], 
ack 1, win 46, options [nop,nop,TS val 1599534 ecr 1598775], length 0
14:12:59.617434 IP node20.ssh > node05.webstar.cnet.36295: Flags [P.], 
seq 1:33, ack 1, win 46, options [nop,nop,TS val 1598776 ecr 1599534], 
length 32
14:12:59.617522 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], 
ack 33, win 46, options [nop,nop,TS val 1599535 ecr 1598776], length 0
14:12:59.617609 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1599535 ecr 1598776], 
length 32
14:12:59.820434 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294936586:4294936618, ack 2620194849, win 46, options [nop,nop,TS 
val 1599586 ecr 1598776], length 32
14:13:00.229069 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294961734:4294961766, ack 3928358945, win 46, options [nop,nop,TS 
val 1599688 ecr 1598776], length 32
14:13:01.044396 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294964167:4294964199, ack 410320929, win 46, options [nop,nop,TS 
val 1599892 ecr 1598776], length 32
14:13:02.676308 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1600300 ecr 1598776], 
length 32
14:13:05.940804 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 17294:17326, ack 3045851169, win 46, options [nop,nop,TS val 1601116 
ecr 1598776], length 32
14:13:12.468484 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 17294:17326, ack 3045851169, win 46, options [nop,nop,TS val 1602748 
ecr 1598776], length 32
14:13:23.846891 IP node20.ssh > node05.webstar.cnet.36084: Flags [F.], 
seq 2093054475, ack 2175389538, win 46, options [nop,nop,TS val 1604834 
ecr 1575591], length 0
14:13:23.847278 IP node05.webstar.cnet.36084 > node20.ssh: Flags [R], 
seq 2175389538, win 0, length 0
14:13:25.523850 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1606012 ecr 1598776], 
length 32
14:13:50.127509 IP node20.ssh > node05.webstar.cnet.36143: Flags [F.], 
seq 2526196657, ack 2590340885, win 46, options [nop,nop,TS val 1611404 
ecr 1582161], length 0
14:13:50.127879 IP node05.webstar.cnet.36143 > node20.ssh: Flags [R], 
seq 2590340885, win 0, length 0
14:13:51.633934 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294963190:4294963222, ack 9830433, win 46, options [nop,nop,TS val 
1612540 ecr 1598776], length 32
14:13:55.125525 ARP, Request who-has node05.webstar.cnet tell node20, 
length 28
14:13:55.125886 ARP, Reply node05.webstar.cnet is-at 00:30:48:ce:dc:02 
(oui Unknown), length 46
14:14:43.855380 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1625596 ecr 1598776], 
length 32
14:14:48.855143 ARP, Request who-has node20 tell node05.webstar.cnet, 
length 46
14:14:48.855469 ARP, Reply node20 is-at 00:30:48:ce:de:34 (oui Unknown), 
length 28
14:14:59.617675 IP node20.ssh > node05.webstar.cnet.36295: Flags [F.], 
seq 33, ack 1, win 46, options [nop,nop,TS val 1628777 ecr 1599535], 
length 0
14:14:59.618202 IP node05.webstar.cnet.36295 > node20.ssh: Flags [FP.], 
seq 4294959654:4294960446, ack 3930456098, win 46, options [nop,nop,TS 
val 1629536 ecr 1628777], length 792
14:14:59.821527 IP node20.ssh > node05.webstar.cnet.36295: Flags [F.], 
seq 33, ack 1, win 46, options [nop,nop,TS val 1628828 ecr 1599535], 
length 0
14:14:59.821598 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], 
ack 34, win 46, options [nop,nop,TS val 1629587 ecr 1628828,nop,nop,sack 
1 {33:34}], length 0
^C^
27 packets captured
31 packets received by filter
0 packets dropped by kernel


I then did ifdown and ifup on node05 and again ran

ssh node20

and grabbed the following output from running tcpdump on node20

root@node20:~# tcpdump host node20 and node05
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 96 bytes
14:15:50.626410 IP node05.webstar.cnet.36690 > node20.ssh: Flags [S], 
seq 2044900531, win 5840, options [mss 1460,sackOK,TS val 1642289 ecr 
0,nop,wscale 7], length 0
14:15:50.626441 IP node20.ssh > node05.webstar.cnet.36690: Flags [S.], 
seq 1976694445, ack 2044900532, win 5792, options [mss 1460,sackOK,TS 
val 1641529 ecr 1642289,nop,wscale 7], length 0
14:15:50.626482 IP node05.webstar.cnet.36690 > node20.ssh: Flags [.], 
ack 1, win 46, options [nop,nop,TS val 1642289 ecr 1641529], length 0
14:15:50.631138 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 1:33, ack 1, win 46, options [nop,nop,TS val 1641530 ecr 1642289], 
length 32
14:15:50.631218 IP node05.webstar.cnet.36690 > node20.ssh: Flags [.], 
ack 33, win 46, options [nop,nop,TS val 1642290 ecr 1641530], length 0
14:15:50.631267 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1642290 ecr 1641530], 
length 32
14:15:50.631281 IP node20.ssh > node05.webstar.cnet.36690: Flags [.], 
ack 33, win 46, options [nop,nop,TS val 1641530 ecr 1642290], length 0
14:15:50.631367 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 33:825, ack 33, win 46, options [nop,nop,TS val 1642290 ecr 
1641530], length 792
14:15:50.631376 IP node20.ssh > node05.webstar.cnet.36690: Flags [.], 
ack 825, win 58, options [nop,nop,TS val 1641530 ecr 1642290], length 0
14:15:50.631808 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 33:817, ack 825, win 58, options [nop,nop,TS val 1641530 ecr 
1642290], length 784
14:15:50.631950 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 825:849, ack 817, win 58, options [nop,nop,TS val 1642290 ecr 
1641530], length 24
14:15:50.633353 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 817:969, ack 849, win 58, options [nop,nop,TS val 1641530 ecr 
1642290], length 152
14:15:50.633932 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 849:993, ack 969, win 71, options [nop,nop,TS val 1642291 ecr 
1641530], length 144
14:15:50.637998 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 969:1689, ack 993, win 70, options [nop,nop,TS val 1641532 ecr 
1642291], length 720
14:15:50.676465 IP node05.webstar.cnet.36690 > node20.ssh: Flags [.], 
ack 1689, win 83, options [nop,nop,TS val 1642302 ecr 1641532], length 0
14:16:09.776134 IP node05.webstar.cnet.49671 > node20.50060: Flags [S], 
seq 2348078217, win 5840, options [mss 1460,sackOK,TS val 1647077 ecr 
0,nop,wscale 7], length 0
14:16:09.776498 IP node20.50060 > node05.webstar.cnet.49671: Flags [R.], 
seq 0, ack 2348078218, win 0, length 0
^C
17 packets captured
21 packets received by filter
0 packets dropped by kernel


Does that help?

^ permalink raw reply

* Re: [PATCH] rps: add flow director support
From: Tom Herbert @ 2010-04-12 13:34 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev
In-Reply-To: <1271022140-3917-1-git-send-email-xiaosuo@gmail.com>

On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> add rps flow director support
>
> with rps flow director, users can do weighted packet dispatching among CPUs.
> For example, CPU0:CPU1 is 1:3 for eth0's rx-0:
>
"Flow director" is a misnomer here in that it has no per flow
awareness; that is what RFS provides.  Please use a different name.

>  localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows
>  localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3
>
It might be better to put this in its own directory and also do it per
CPU instead of hash entry.  This should result in a lot fewer entries
and I'm not sure how you would deal with holes in the hash table for
unspecified entries.  Also, it would be nice not to have to specify a
number of entries.  Maybe something like:

localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0
localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1

To specify CPU 0 with weight 1, CPU 1 with weight 3.

> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
>  net/core/net-sysfs.c |  176 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 172 insertions(+), 4 deletions(-)
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 1e7fdd6..d904610 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -511,6 +511,109 @@ static struct sysfs_ops rx_queue_sysfs_ops = {
>        .store = rx_queue_attr_store,
>  };
>
> +static DEFINE_MUTEX(rps_map_lock);
> +
> +static ssize_t show_rps_flow(struct netdev_rx_queue *queue,
> +                            struct rx_queue_attribute *attribute, char *buf)
> +{
> +       unsigned long flowid;
> +       struct rps_map *map;
> +       u16 cpu;
> +
> +       strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
> +       rcu_read_lock();
> +       map = rcu_dereference(queue->rps_map);
> +       if (map && flowid < map->len)
> +               cpu = map->cpus[flowid];
> +       else
> +               cpu = 0;
> +       rcu_read_unlock();
> +       return sprintf(buf, "%hu\n", cpu);
> +}
> +
> +static ssize_t store_rps_flow(struct netdev_rx_queue *queue,
> +                             struct rx_queue_attribute *attribute,
> +                             const char *buf, size_t len)
> +{
> +       unsigned long flowid, cpu;
> +       struct rps_map *map;
> +
> +       if (!capable(CAP_NET_ADMIN))
> +               return -EPERM;
> +
> +       if (strict_strtoul(buf, 0, &cpu))
> +               return -EINVAL;
> +       strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
> +
> +       mutex_lock(&rps_map_lock);
> +       map = queue->rps_map;
> +       if (map && flowid < map->len)
> +               map->cpus[flowid] = cpu;
> +       mutex_unlock(&rps_map_lock);
> +
> +       return len;
> +}
> +
> +static struct rx_queue_attribute **rps_flow_attribute;
> +static int rps_flow_attribute_size;
> +
> +/* must be called with rps_map_lock locked */
> +static int update_rps_flow_files(struct kobject *kobj,
> +                                struct rps_map *old_map, struct rps_map *map)
> +{
> +       int i;
> +       int old_map_len = old_map ? old_map->len : 0;
> +       int map_len = map ? map->len : 0;
> +
> +       if (old_map_len >= map_len) {
> +               for (i = map_len; i < old_map_len; i++)
> +                       sysfs_remove_file(kobj, &rps_flow_attribute[i]->attr);
> +               return 0;
> +       }
> +
> +       if (map_len > rps_flow_attribute_size) {
> +               struct rx_queue_attribute **attrs;
> +               char name[sizeof("rps_flow_4294967295")];
> +               char *pname;
> +
> +               attrs = krealloc(rps_flow_attribute, map_len * sizeof(void *),
> +                                GFP_KERNEL);
> +               if (attrs == NULL)
> +                       return -ENOMEM;
> +               rps_flow_attribute = attrs;
> +               for (i = rps_flow_attribute_size; i < map_len; i++) {
> +                       rps_flow_attribute[i] = kmalloc(sizeof(**attrs),
> +                                                       GFP_KERNEL);
> +                       if (rps_flow_attribute[i] == NULL)
> +                               break;
> +                       sprintf(name, "rps_flow_%d", i);
> +                       pname = kstrdup(name, GFP_KERNEL);
> +                       if (pname == NULL) {
> +                               kfree(rps_flow_attribute[i]);
> +                               break;
> +                       }
> +                       rps_flow_attribute[i]->attr.name = pname;
> +                       rps_flow_attribute[i]->attr.mode = S_IRUGO | S_IWUSR;
> +                       rps_flow_attribute[i]->show = show_rps_flow;
> +                       rps_flow_attribute[i]->store = store_rps_flow;
> +               }
> +               rps_flow_attribute_size = i;
> +               if (i != map_len)
> +                       return -ENOMEM;
> +       }
> +
> +       for (i = old_map_len; i < map_len; i++) {
> +               if (sysfs_create_file(kobj, &rps_flow_attribute[i]->attr)) {
> +                       while (--i >= old_map_len)
> +                               sysfs_remove_file(kobj,
> +                                                 &rps_flow_attribute[i]->attr);
> +                       return -ENOMEM;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
>  static ssize_t show_rps_map(struct netdev_rx_queue *queue,
>                            struct rx_queue_attribute *attribute, char *buf)
>  {
> @@ -555,7 +658,6 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>        struct rps_map *old_map, *map;
>        cpumask_var_t mask;
>        int err, cpu, i;
> -       static DEFINE_SPINLOCK(rps_map_lock);
>
>        if (!capable(CAP_NET_ADMIN))
>                return -EPERM;
> @@ -588,10 +690,15 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>                map = NULL;
>        }
>
> -       spin_lock(&rps_map_lock);
> +       mutex_lock(&rps_map_lock);
>        old_map = queue->rps_map;
> -       rcu_assign_pointer(queue->rps_map, map);
> -       spin_unlock(&rps_map_lock);
> +       err = update_rps_flow_files(&queue->kobj, old_map, map);
> +       if (!err)
> +               rcu_assign_pointer(queue->rps_map, map);
> +       mutex_unlock(&rps_map_lock);
> +
> +       if (err)
> +               return err;
>
>        if (old_map)
>                call_rcu(&old_map->rcu, rps_map_release);
> @@ -603,8 +710,69 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>  static struct rx_queue_attribute rps_cpus_attribute =
>        __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
>
> +static ssize_t show_rps_flows(struct netdev_rx_queue *queue,
> +               struct rx_queue_attribute *attribute, char *buf)
> +{
> +       struct rps_map *map;
> +       unsigned int len;
> +
> +       rcu_read_lock();
> +       map = rcu_dereference(queue->rps_map);
> +       len = map ? map->len : 0;
> +       rcu_read_unlock();
> +       return sprintf(buf, "%u\n", len);
> +}
> +
> +static ssize_t store_rps_flows(struct netdev_rx_queue *queue,
> +                              struct rx_queue_attribute *attribute,
> +                              const char *buf, size_t len)
> +{
> +       struct rps_map *old_map, *map;
> +       unsigned long flows;
> +       int err;
> +
> +       if (!capable(CAP_NET_ADMIN))
> +               return -EPERM;
> +
> +       if (strict_strtoul(buf, 0, &flows))
> +               return -EINVAL;
> +       if (flows != 0) {
> +               map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(flows),
> +                                   L1_CACHE_BYTES), GFP_KERNEL);
> +               if (map == NULL)
> +                       return -ENOMEM;
> +               map->len = flows;
> +       } else {
> +               map = NULL;
> +       }
> +
> +       mutex_lock(&rps_map_lock);
> +       old_map = queue->rps_map;
> +       err = update_rps_flow_files(&queue->kobj, old_map, map);
> +       if (!err) {
> +               if (old_map && map)
> +                       memcpy(map->cpus, old_map->cpus,
> +                              sizeof(map->cpus[0]) *
> +                              min_t(unsigned int, flows, old_map->len));
> +               rcu_assign_pointer(queue->rps_map, map);
> +       }
> +       mutex_unlock(&rps_map_lock);
> +
> +       if (err)
> +               return err;
> +
> +       if (old_map)
> +               call_rcu(&old_map->rcu, rps_map_release);
> +
> +       return len;
> +}
> +
> +static struct rx_queue_attribute rps_flows_attribute =
> +       __ATTR(rps_flows, S_IRUGO | S_IWUSR, show_rps_flows, store_rps_flows);
> +
>  static struct attribute *rx_queue_default_attrs[] = {
>        &rps_cpus_attribute.attr,
> +       &rps_flows_attribute.attr,
>        NULL
>  };
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* hdlc_ppp: why no detach()?
From: Michael Barkowski @ 2010-04-12 14:15 UTC (permalink / raw)
  To: Krzysztof Halasa
  Cc: David S. Miller, Julia Lawall, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org

Hello Krzysztof,

I am looking at your hdlc_ppp code and I don't understand: why is there
not the equivalent of fr_detach() in there?

pc8300_drv:cpc_remove_one() frees netdevs quite confidently but I wonder
how it can be so sure that there are not skbs in hdlc_ppp's tx_queue
associated with those devices before freeing them....

Even if you wanted to switch a device from PPP to Frame Relay, I don't
see the method right now.  If I may ask, please, what am I missing?

If you agree there is a need for detach(), I would be happy to work on
it and make a submission.

thanks for your time,

-- 
Michael Barkowski
RuggedCom, Inc.

^ permalink raw reply

* Re: [PATCH] rps: add flow director support
From: Changli Gao @ 2010-04-12 14:27 UTC (permalink / raw)
  To: Tom Herbert; +Cc: David S. Miller, netdev
In-Reply-To: <z2o65634d661004120634h8336409er33af1fb75c2a9d1b@mail.gmail.com>

On Mon, Apr 12, 2010 at 9:34 PM, Tom Herbert <therbert@google.com> wrote:
> On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote:
>> add rps flow director support
>>
>> with rps flow director, users can do weighted packet dispatching among CPUs.
>> For example, CPU0:CPU1 is 1:3 for eth0's rx-0:
>>
> "Flow director" is a misnomer here in that it has no per flow
> awareness, that is what RFS provides.  Please use a different name.

"Flow" here means a bundle of flows, not a flow in the original sense. How about
"rps_buckets" and "rps_bucket_x"?

>
>>  localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows
>>  localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
>>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3
>>
> It might be better to put this in its own directory

I have thought that before, but since they control the same data in
kernel as rps_cpus does, I put them in the same directory.

> and also do it per
> CPU instead of hash entry.  This should result in a lot fewer entries
> and I'm not sure how you would deal with holes in the hash table for
> unspecified entries.  Also, it would be nice not to have to specify a
> number of entries.  Maybe something like:
>
> localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0
> localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1
>
> To specify CPU 0 with weight 1, CPU 1 with weight 3.
>

Your way is simpler and more straightforward. My idea has its own advantages:
1. control the rate precision through rps_flows.
2. do dynamic weighted packet dispatching by migrating some flows from
some CPUs to other CPUs. During these operations, only the migrated
flows are affected, and OOO (out-of-order delivery) only occurs in these flows.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: hdlc_ppp: why no detach()?
From: Michael Barkowski @ 2010-04-12 14:34 UTC (permalink / raw)
  To: Krzysztof Halasa
  Cc: David S. Miller, Julia Lawall, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <4BC32B00.1030600@ruggedcom.com>

Michael Barkowski wrote:
> Hello Krzysztof,
> 
> I am looking at your hdlc_ppp code and I don't understand: why is there
> not the equivalent of fr_detach() in there?
> 
> pc8300_drv:cpc_remove_one() frees netdevs quite confidently but I wonder
> how it can be so sure that there are not skbs in hdlc_ppp's tx_queue
> associated with those devices before freeing them....
> 

the above is the real danger I see - free the netdev, then ppp's timer
comes along and dequeues from tx_queue an skb with invalid device.

> Even if you wanted to switch a device from PPP to Frame Relay, I don't
> see the method right now.  If I may ask, please, what am I missing?
> 

Ok - this part was a momentary lapse on my part - please strike from
the record :)

> If you agree there is a need for detach(), I would be happy to work on
> it and make a submission.
> 
> thanks for your time,
> 


-- 
Michael Barkowski
905-482-4577

^ permalink raw reply

* Re: Strange packet drops with heavy firewalling
From: Benny Lyne Amorsen @ 2010-04-12 14:44 UTC (permalink / raw)
  To: zhigang gong; +Cc: netdev
In-Reply-To: <q2v40c9f5b21004120116p766df82dj88c6af4e4cad55f@mail.gmail.com>

man, 12 04 2010 kl. 16:16 +0800, skrev zhigang gong:

> How do you know the per CPU usage data, by oprofile? I'm just a little
> surprised with the result, as it shows your new core is running 10x
> faster than your old core :). 

Well, the old server had only two CPUs plus hyperthreading, and the
CPUs were Pentium-4-based. Add a slow memory bus to that and you have a
fairly slow system. It's almost 5 years old, so Moore's law says 2**3
increase in number of transistors...

In about the same time frame Linux has gone from being able to fill
1Gbps ethernet to being able to fill 10Gbps ethernet.

> What's the average packet size?

I asked the switch (I can't find a handy equivalent to ifstat which
counts packets instead of bytes). The 5 minute average packet sizes seem
to vary in the range 450 to 550 bytes.

> If your packet size is 64 bytes, then the pps (packets per second) rate
> should be about 585Kpps. As far as I know, this value is almost the best
> result when the standard Linux kernel is processing the networking
> traffic with a normal 1Gb ethernet card (without multi-queue support)
> on an Intel box. If that is the case, buying a better ethernet card with
> multi-queue support should be a good choice. Otherwise, it may not
> help.

I am far from that, perhaps 1/10th of that. I do a lot more processing
on at least some of the packets though (the ones starting new flows).

/Benny

^ permalink raw reply

* Very Important
From: Jiang Jianmin @ 2010-04-12 14:56 UTC (permalink / raw)


Good Day,
 
I have a secured business proposal of $28,272,000.00.Contact me via my private email(cncn1_jiang_jianmin2011@yahoo.com.cn)if interested.
 
Mr Jiang Jianmin.

^ permalink raw reply

* Re: forcedeth driver hangs under heavy load
From: Eric Dumazet @ 2010-04-12 15:24 UTC (permalink / raw)
  To: stephen mulcahy; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <4BC31DDE.7010005@gmail.com>

Le lundi 12 avril 2010 à 14:19 +0100, stephen mulcahy a écrit :

> Does that help?

Well, yes, because it seems a TCP problem.

root@node20:~# tcpdump host node20 and node05
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 96 bytes
14:12:59.612626 IP node05.webstar.cnet.36295 > node20.ssh: Flags [S], seq 3677858646, win 5840, options [mss 1460,sackOK,TS val 1599534 ecr 0,nop,wscale 7], length 0
14:12:59.612656 IP node20.ssh > node05.webstar.cnet.36295: Flags [S.], seq 3610575850, ack 3677858647, win 5792, options [mss 1460,sackOK,TS val 1598775 ecr 1599534,nop,wscale 7], length 0
14:12:59.612718 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], ack 1, win 46, options [nop,nop,TS val 1599534 ecr 1598775], length 0
14:12:59.617434 IP node20.ssh > node05.webstar.cnet.36295: Flags [P.], seq 1:33, ack 1, win 46, options [nop,nop,TS val 1598776 ecr 1599534], length 32
14:12:59.617522 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], ack 33, win 46, options [nop,nop,TS val 1599535 ecr 1598776], length 0
14:12:59.617609 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 1:33, ack 33, win 46, options [nop,nop,TS val 1599535 ecr 1598776], length 32

All following xmitted frames are completely out of sync, this makes no sense.

Sequence number went backward.

14:12:59.820434 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 4294936586:4294936618, ack 2620194849, win 46, options [nop,nop,TS val 1599586 ecr 1598776], length 32
14:13:00.229069 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 4294961734:4294961766, ack 3928358945, win 46, options [nop,nop,TS val 1599688 ecr 1598776], length 32
14:13:01.044396 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 4294964167:4294964199, ack 410320929, win 46, options [nop,nop,TS val 1599892 ecr 1598776], length 32


14:13:02.676308 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 1:33, ack 33, win 46, options [nop,nop,TS val 1600300 ecr 1598776], length 32
14:13:05.940804 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 17294:17326, ack 3045851169, win 46, options [nop,nop,TS val 1601116 ecr 1598776], length 32
14:13:12.468484 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 17294:17326, ack 3045851169, win 46, options [nop,nop,TS val 1602748 ecr 1598776], length 32
14:13:25.523850 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 1:33, ack 33, win 46, options [nop,nop,TS val 1606012 ecr 1598776], length 32
14:13:51.633934 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 4294963190:4294963222, ack 9830433, win 46, options [nop,nop,TS val 1612540 ecr 1598776], length 32
14:14:43.855380 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], seq 1:33, ack 33, win 46, options [nop,nop,TS val 1625596 ecr 1598776], length 32
14:14:59.617675 IP node20.ssh > node05.webstar.cnet.36295: Flags [F.], seq 33, ack 1, win 46, options [nop,nop,TS val 1628777 ecr 1599535], length 0
14:14:59.618202 IP node05.webstar.cnet.36295 > node20.ssh: Flags [FP.], seq 4294959654:4294960446, ack 3930456098, win 46, options [nop,nop,TS val 1629536 ecr 1628777], length 792
14:14:59.821527 IP node20.ssh > node05.webstar.cnet.36295: Flags [F.], seq 33, ack 1, win 46, options [nop,nop,TS val 1628828 ecr 1599535], length 0
14:14:59.821598 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], ack 34, win 46, options [nop,nop,TS val 1629587 ecr 1628828,nop,nop,sack 1 {33:34}], length 0

Do you have any netfilter rules?



^ permalink raw reply

* Re: [Bonding-devel] [v3 Patch 2/3] bridge: make bridge support netpoll
From: Stephen Hemminger @ 2010-04-12 15:38 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Cong Wang, Jay Vosburgh, Neil Horman, netdev, Matt Mackall,
	bridge, linux-kernel, David Miller, Jeff Moyer, Andy Gospodarek,
	bonding-devel
In-Reply-To: <1271068737.16881.18.camel@edumazet-laptop>

On Mon, 12 Apr 2010 12:38:57 +0200
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Le lundi 12 avril 2010 à 18:37 +0800, Cong Wang a écrit :
> > Stephen Hemminger wrote:
> > > There is no protection on dev->priv_flags for SMP access.
> > > It would be better to use a bit value in dev->state if you are using it as a control flag.
> > > 
> > > Then you could use 
> > > 			if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
> > > 				netpoll_send_skb(...)
> > > 
> > > 
> > 
> > Hmm, I think we can't use ->state here, it is not for this kind of purpose,
> > according to its comments.
> > 
> > Also, I find other usages of IFF_XXX flags of ->priv_flags are also using
> > &, | to set or clear the flags. So there must be some other things preventing
> > the race...
> 
> Yes, it's RTNL that protects priv_flags changes, hopefully...
> 

The patch was not protecting priv_flags with RTNL.
For example:


@@ -308,7 +312,9 @@ static void netpoll_send_skb(struct netp
 		     tries > 0; --tries) {
 			if (__netif_tx_trylock(txq)) {
 				if (!netif_tx_queue_stopped(txq)) {
+					dev->priv_flags |= IFF_IN_NETPOLL;
 					status = ops->ndo_start_xmit(skb, dev);
+					dev->priv_flags &= ~IFF_IN_NETPOLL;
 					if (status == NETDEV_TX_OK)
 						txq_trans_update(txq);

^ permalink raw reply

* [PATCH 0/4] IPv6 addrconf related fixes
From: Stephen Hemminger @ 2010-04-12 15:41 UTC (permalink / raw)
  To: davem; +Cc: netdev

These apply to net-next, the problems do not exist in earlier kernels.
The problems started when I added changes to retain IPv6 addresses
when link goes down.

-- 

^ permalink raw reply

* [PATCH 1/4] IPv6: keep route for tentative address
From: Stephen Hemminger @ 2010-04-12 15:41 UTC (permalink / raw)
  To: David S. Miller, Tantilov, Emil S; +Cc: netdev
In-Reply-To: <20100412154130.397252857@vyatta.com>

[-- Attachment #1: ipv6-addrconf1.patch --]
[-- Type: text/plain, Size: 689 bytes --]

Recent changes preserve IPv6 address when link goes down (good).
But would cause address to point to dead dst entry (bad).
The simplest fix is to just not delete route if address is
being held for later use.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/ipv6/addrconf.c	2010-04-11 12:19:37.938082190 -0700
+++ b/net/ipv6/addrconf.c	2010-04-11 12:25:05.349309074 -0700
@@ -4046,7 +4046,8 @@ static void __ipv6_ifa_notify(int event,
 			addrconf_leave_anycast(ifp);
 		addrconf_leave_solict(ifp->idev, &ifp->addr);
 		dst_hold(&ifp->rt->u.dst);
-		if (ip6_del_rt(ifp->rt))
+
+		if (ifp->dead && ip6_del_rt(ifp->rt))
 			dst_free(&ifp->rt->u.dst);
 		break;
 	}

-- 


^ permalink raw reply

* [PATCH 2/4] IPv6: keep tentative addresses in hash table
From: Stephen Hemminger @ 2010-04-12 15:41 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <20100412154130.397252857@vyatta.com>

[-- Attachment #1: ipv6-addrconf2.patch --]
[-- Type: text/plain, Size: 1105 bytes --]

When the link goes down, we want the address to be preserved but in a
tentative state; therefore it has to stay in the hash list.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv6/addrconf.c	2010-04-11 12:25:05.349309074 -0700
+++ b/net/ipv6/addrconf.c	2010-04-11 12:25:10.408996382 -0700
@@ -2703,17 +2703,18 @@ static int addrconf_ifdown(struct net_de
 			/* Flag it for later restoration when link comes up */
 			ifa->flags |= IFA_F_TENTATIVE;
 			in6_ifa_hold(ifa);
+			write_unlock_bh(&idev->lock);
 		} else {
 			list_del(&ifa->if_list);
 			ifa->dead = 1;
-		}
-		write_unlock_bh(&idev->lock);
+			write_unlock_bh(&idev->lock);
 
-		/* clear hash table */
-		spin_lock_bh(&addrconf_hash_lock);
-		hlist_del_init_rcu(&ifa->addr_lst);
-		__in6_ifa_put(ifa);
-		spin_unlock_bh(&addrconf_hash_lock);
+			/* clear hash table */
+			spin_lock_bh(&addrconf_hash_lock);
+			hlist_del_init_rcu(&ifa->addr_lst);
+			__in6_ifa_put(ifa);
+			spin_unlock_bh(&addrconf_hash_lock);
+		}
 
 		__ipv6_ifa_notify(RTM_DELADDR, ifa);
 		atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa);

-- 


^ permalink raw reply

* [PATCH 3/4] ipv6: additional ref count for hash list unnecessary
From: Stephen Hemminger @ 2010-04-12 15:41 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <20100412154130.397252857@vyatta.com>

[-- Attachment #1: ipv6-addrconf3.patch --]
[-- Type: text/plain, Size: 1002 bytes --]

Since an address in hash list has to already have a ref count,
no additional ref count is needed. 

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


--- a/net/ipv6/addrconf.c	2010-04-11 12:25:32.609002374 -0700
+++ b/net/ipv6/addrconf.c	2010-04-11 12:26:52.715246164 -0700
@@ -675,7 +675,6 @@ ipv6_add_addr(struct inet6_dev *idev, co
 	hash = ipv6_addr_hash(addr);
 
 	hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
-	in6_ifa_hold(ifa);
 	spin_unlock(&addrconf_hash_lock);
 
 	write_lock(&idev->lock);
@@ -723,7 +722,6 @@ static void ipv6_del_addr(struct inet6_i
 
 	spin_lock_bh(&addrconf_hash_lock);
 	hlist_del_init_rcu(&ifp->addr_lst);
-	__in6_ifa_put(ifp);
 	spin_unlock_bh(&addrconf_hash_lock);
 
 	write_lock_bh(&idev->lock);
@@ -2712,7 +2710,6 @@ static int addrconf_ifdown(struct net_de
 			/* clear hash table */
 			spin_lock_bh(&addrconf_hash_lock);
 			hlist_del_init_rcu(&ifa->addr_lst);
-			__in6_ifa_put(ifa);
 			spin_unlock_bh(&addrconf_hash_lock);
 		}
 

-- 


^ permalink raw reply

* [PATCH 4/4] IPv6: only notify protocols if address is completely gone
From: Stephen Hemminger @ 2010-04-12 15:41 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <20100412154130.397252857@vyatta.com>

[-- Attachment #1: ipv6-addrconf4.patch --]
[-- Type: text/plain, Size: 793 bytes --]

The notifier for address down should only be called if the address is completely
gone, not just being marked as tentative on a link transition. The code
in net-next would cause bonding/sctp/s390 to see the address disappear on link
down, but they would never see it reappear on link up.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv6/addrconf.c	2010-04-11 14:34:36.919767724 -0700
+++ b/net/ipv6/addrconf.c	2010-04-11 14:35:00.533967946 -0700
@@ -2714,7 +2714,9 @@ static int addrconf_ifdown(struct net_de
 		}
 
 		__ipv6_ifa_notify(RTM_DELADDR, ifa);
-		atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa);
+		if (ifa->dead)
+			atomic_notifier_call_chain(&inet6addr_chain,
+						   NETDEV_DOWN, ifa);
 		in6_ifa_put(ifa);
 
 		write_lock_bh(&idev->lock);

-- 


^ permalink raw reply

* Re: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 16:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <1271085862.16881.38.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le lundi 12 avril 2010 à 14:19 +0100, stephen mulcahy a écrit :
> 
> Do you have some netfilters rules ?
> 

Hi Eric,

I don't have any netfilters rules:

root@node34:~# for table in filter nat mangle raw; do iptables -t $table 
-L; done
Chain INPUT (policy ACCEPT)
target     prot opt source               destination

Chain FORWARD (policy ACCEPT)
target     prot opt source               destination

Chain OUTPUT (policy ACCEPT)
target     prot opt source               destination
Chain PREROUTING (policy ACCEPT)
target     prot opt source               destination

Chain POSTROUTING (policy ACCEPT)
target     prot opt source               destination

Chain OUTPUT (policy ACCEPT)
target     prot opt source               destination
Chain PREROUTING (policy ACCEPT)
target     prot opt source               destination

Chain INPUT (policy ACCEPT)
target     prot opt source               destination

Chain FORWARD (policy ACCEPT)
target     prot opt source               destination

Chain OUTPUT (policy ACCEPT)
target     prot opt source               destination

Chain POSTROUTING (policy ACCEPT)
target     prot opt source               destination
Chain PREROUTING (policy ACCEPT)
target     prot opt source               destination

Chain OUTPUT (policy ACCEPT)
target     prot opt source               destination


I re-ran this on the 2.6.32 kernel (with the 2.6.32 forcedeth module) 
just in case that was screwing something up.

node33 is in the unresponsive state this time. I'm running tcpdump on 
node34. On node33 I try to ssh to node34 (using the IP address of node34). I 
note that I can ping between node33 and node34.

root@node34:~# tcpdump -v host node34 and node33
tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 96 
bytes
17:05:19.622384 IP (tos 0x0, ttl 64, id 21435, offset 0, flags [DF], 
proto TCP (6), length 60)
     node33.webstar.cnet.43653 > node34.ssh: Flags [S], cksum 0xb994 
(correct), seq 1675314077, win 5840, options [mss 1460,sackOK,TS val 
331814 ecr 0,nop,wscale 7], length 0
17:05:19.622754 IP (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto 
TCP (6), length 60)
     node34.ssh > node33.webstar.cnet.43653: Flags [S.], cksum 0x9d81 
(correct), seq 1669769379, ack 1675314078, win 5792, options [mss 
1460,sackOK,TS val 331779 ecr 331814,nop,wscale 7], length 0
17:05:19.622813 IP (tos 0x0, ttl 64, id 21436, offset 0, flags [DF], 
proto TCP (6), length 52)
     node33.webstar.cnet.43653 > node34.ssh: Flags [.], cksum 0xe2bf 
(correct), ack 1, win 46, options [nop,nop,TS val 331814 ecr 331779], 
length 0
17:05:19.627666 IP (tos 0x0, ttl 64, id 47271, offset 0, flags [DF], 
proto TCP (6), length 84)
     node34.ssh > node33.webstar.cnet.43653: Flags [P.], seq 1:33, ack 
1, win 46, options [nop,nop,TS val 331780 ecr 331814], length 32
17:05:19.627748 IP (tos 0x0, ttl 64, id 21437, offset 0, flags [DF], 
proto TCP (6), length 52)
     node33.webstar.cnet.43653 > node34.ssh: Flags [.], cksum 0xe29c 
(correct), ack 33, win 46, options [nop,nop,TS val 331816 ecr 331780], 
length 0
17:05:19.627833 IP (tos 0x0, ttl 64, id 21438, offset 0, flags [DF], 
proto TCP (6), length 84, bad cksum 1f8a (->d189)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [P.], seq 
23413:23445, ack 2749038625, win 46, options [nop,nop,TS val 331816 ecr 
331780], length 32
17:05:19.831634 IP (tos 0x0, ttl 64, id 21439, offset 0, flags [DF], 
proto TCP (6), length 84, bad cksum d189 (->d188)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [P.], seq 1:33, ack 
33, win 46, options [nop,nop,TS val 331867 ecr 331780], length 32
17:05:20.239603 IP (tos 0x0, ttl 64, id 21440, offset 0, flags [DF], 
proto TCP (6), length 84, bad cksum 15c6 (->d187)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [P.], seq 
30492:30524, ack 809893921, win 46, options [nop,nop,TS val 331969 ecr 
331780], length 32
17:05:21.055534 IP (tos 0x0, ttl 64, id 21441, offset 0, flags [DF], 
proto TCP (6), length 84, bad cksum d187 (->d186)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [P.], seq 1:33, ack 
33, win 46, options [nop,nop,TS val 332173 ecr 331780], length 32
17:05:22.687386 IP (tos 0x0, ttl 64, id 21442, offset 0, flags [DF], 
proto TCP (6), length 84, bad cksum d186 (->d185)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [P.], seq 1:33, ack 
33, win 46, options [nop,nop,TS val 332581 ecr 331780], length 32
17:05:25.950935 IP (tos 0x0, ttl 64, id 21443, offset 0, flags [DF], 
proto TCP (6), length 84, bad cksum 15c4 (->d184)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [P.], seq 
30492:30524, ack 809893921, win 46, options [nop,nop,TS val 333397 ecr 
331780], length 32
17:05:32.478527 IP (tos 0x0, ttl 64, id 21444, offset 0, flags [DF], 
proto TCP (6), length 84, bad cksum c01 (->d183)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [P.], seq 
43997:44029, ack 1311047713, win 46, options [nop,nop,TS val 335029 ecr 
331780], length 32
17:05:45.533370 IP (tos 0x0, ttl 64, id 21445, offset 0, flags [DF], 
proto TCP (6), length 84, bad cksum 23d (->d182)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [P.], seq 3348:3380, 
ack 4054450209, win 46, options [nop,nop,TS val 338293 ecr 331780], 
length 32
17:06:08.719187 IP (tos 0x0, ttl 64, id 27660, offset 0, flags [DF], 
proto TCP (6), length 1500, bad cksum 5360 (->b3b3)!)
     node33.webstar.cnet.50060 > node34.35725: Flags [.], seq 
1203473738:1203475186, ack 1191452767, win 54, options [nop,nop,TS val 
344089 ecr 256770], length 1448
17:06:11.643080 IP (tos 0x0, ttl 64, id 21446, offset 0, flags [DF], 
proto TCP (6), length 84, bad cksum e4f2 (->d181)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [P.], seq 
47331:47363, ack 4110811169, win 46, options [nop,nop,TS val 344821 ecr 
331780], length 32
17:06:13.715233 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 
node34 tell node33.webstar.cnet, length 46
17:06:13.715257 ARP, Ethernet (len 6), IPv4 (len 4), Reply node34 is-at 
00:30:48:f0:06:72 (oui Unknown), length 28
17:07:03.866492 IP (tos 0x0, ttl 64, id 21447, offset 0, flags [DF], 
proto TCP (6), length 84, bad cksum b413 (->d180)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [P.], seq 
28939:28971, ack 1913782305, win 46, options [nop,nop,TS val 357877 ecr 
331780], length 32
17:07:08.862055 ARP, Ethernet (len 6), IPv4 (len 4), Request who-has 
node34 tell node33.webstar.cnet, length 46
17:07:08.862370 ARP, Ethernet (len 6), IPv4 (len 4), Reply node34 is-at 
00:30:48:f0:06:72 (oui Unknown), length 28
17:07:19.627910 IP (tos 0x0, ttl 64, id 47272, offset 0, flags [DF], 
proto TCP (6), length 52)
     node34.ssh > node33.webstar.cnet.43653: Flags [F.], cksum 0x6d6b 
(correct), seq 33, ack 1, win 46, options [nop,nop,TS val 361780 ecr 
331816], length 0
17:07:19.628403 IP (tos 0x0, ttl 64, id 21448, offset 0, flags [DF], 
proto TCP (6), length 844, bad cksum aa4d (->ce87)!)
     node33.webstar.cnet.43653 > node34.ssh: Flags [FP.], seq 
20399:21191, ack 2356871202, win 46, options [nop,nop,TS val 361818 ecr 
361780], length 792
17:07:19.833456 IP (tos 0x0, ttl 64, id 47273, offset 0, flags [DF], 
proto TCP (6), length 52)
     node34.ssh > node33.webstar.cnet.43653: Flags [F.], cksum 0x6d37 
(correct), seq 33, ack 1, win 46, options [nop,nop,TS val 361832 ecr 
331816], length 0
17:07:19.833517 IP (tos 0x0, ttl 64, id 21449, offset 0, flags [DF], 
proto TCP (6), length 64)
     node33.webstar.cnet.43653 > node34.ssh: Flags [.], cksum 0xa5e9 
(correct), ack 34, win 46, options [nop,nop,TS val 361870 ecr 
361832,nop,nop,sack 1 {33:34}], length 0

At this point, I see a "Connection closed by 10.141.0.34" message on 
node33 (from where I am attempting to ssh).

Again, if I ifdown on node33 and ifup again - I can then see from node33 
to node34 without problems.

-stephen

^ permalink raw reply

* Re: NULL pointer dereference panic in stable (2.6.33.2), amd64
From: Denys Fedorysychenko @ 2010-04-12 16:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Krishna Kumar2, David Miller, netdev
In-Reply-To: <1271064703.16881.16.camel@edumazet-laptop>

[-- Attachment #1: Type: Text/Plain, Size: 163 bytes --]

On Monday 12 April 2010 12:31:43 Eric Dumazet wrote:
.
It seems the problem still remains. I patched the kernel, but it has panicked again.
Btw, I don't have any multiqueue card, I think.

[-- Attachment #2: x.txt --]
[-- Type: text/plain, Size: 12174 bytes --]

Apr 12 18:46:58 80.83.17.1 dropbear[4843]: exit before auth: Disconnect received
Apr 12 18:46:59 80.83.17.1 dropbear[4845]: Child connection from 82.113.44.186:48692
Apr 12 18:46:59 80.83.17.1 dropbear[4844]: exit before auth: Disconnect received
Apr 12 18:46:59 80.83.17.1 kernel: [12598.956375] BUG: unable to handle kernel NULL pointer dereference at (null)
Apr 12 18:46:59 80.83.17.1 kernel: [12598.956571] IP: [<ffffffff811e587f>] dev_queue_xmit+0x28c/0x46d
Apr 12 18:46:59 80.83.17.1 kernel: [12598.956762] PGD 21debc067 PUD 21c881067 PMD 0 
Apr 12 18:46:59 80.83.17.1 kernel: [12598.956947] Oops: 0000 [#1] SMP 
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957126] last sysfs file: /sys/devices/virtual/vc/vcs3/dev
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957311] CPU 0 
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] Pid: 0, comm: swapper Not tainted 2.6.33.2-build-0052test-64 #2         /        
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] RIP: 0010:[<ffffffff811e587f>]  [<ffffffff811e587f>] dev_queue_xmit+0x28c/0x46d
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] RSP: 0000:ffff880028203a30  EFLAGS: 00010202
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] RAX: 0000000000002000 RBX: 0000000000000000 RCX: ffff880209d8a900
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] RDX: ffff88021d870000 RSI: 0000000000000000 RDI: ffff88020a7b48e8
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] RBP: ffff880028203a60 R08: ffff88021c8be89c R09: ffff88021c8bec00
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] R10: dead000000200200 R11: dead000000100100 R12: ffff88021f98a880
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] R13: ffff88021d5c0900 R14: ffff88020a7b48e8 R15: ffff88021cbad000
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] FS:  0000000000000000(0000) GS:ffff880028200000(0000) knlGS:0000000000000000
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] CR2: 0000000000000000 CR3: 000000021c9d8000 CR4: 00000000000006f0
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] Process swapper (pid: 0, threadinfo ffffffff81392000, task ffffffff813a1020)
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] Stack:
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  ffff88021d870000 ffff88021d5c0900 0000000000000042 ffff88021d5c0900
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] <0> ffff88021cbad000 ffff88021cbad000 ffff880028203a80 ffffffffa01c12a9
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] <0> 0000000000000000 ffff88020a7b48e8 ffff880028203ad0 ffffffff811e540e
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] Call Trace:
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  <IRQ> 
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffffa01c12a9>] vlan_dev_hwaccel_hard_start_xmit+0x68/0x86 [8021q]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff811e540e>] dev_hard_start_xmit+0x232/0x304
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff811f648a>] sch_direct_xmit+0x5d/0x16b
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff811f6654>] __qdisc_run+0xbc/0xdc
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff811e5939>] dev_queue_xmit+0x346/0x46d
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff8120a384>] ip_finish_output2+0x1c2/0x206
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff8120a430>] ip_finish_output+0x68/0x6a
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff8120a4d2>] ip_output+0xa0/0xa5
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff81206d2e>] ip_forward_finish+0x2e/0x32
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff81206ff4>] ip_forward+0x2c2/0x322
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff81205ae0>] ip_rcv_finish+0x2f0/0x30a
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff81205d77>] ip_rcv+0x27d/0x2a4
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff8124ad48>] ? vlan_hwaccel_do_receive+0x2b/0xda
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff811e47b6>] netif_receive_skb+0x450/0x475
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff811e4909>] napi_skb_finish+0x24/0x3b
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff8124b01b>] vlan_gro_receive+0x7c/0x81
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffffa015d6c5>] e1000_receive_skb+0x4a/0x65 [e1000e]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffffa015d8cb>] e1000_clean_rx_irq+0x1eb/0x29c [e1000e]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffffa015ebfb>] e1000_clean+0x75/0x22e [e1000e]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffffa0234d6c>] ? hfsc_dequeue+0x171/0x2a6 [sch_hfsc]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff811e4e56>] net_rx_action+0xa7/0x17a
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff81039670>] __do_softirq+0x96/0x11a
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff810037cc>] call_softirq+0x1c/0x28
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff81005543>] do_softirq+0x33/0x68
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff81039407>] irq_exit+0x36/0x75
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff81004c3e>] do_IRQ+0xaa/0xc1
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff8125ba93>] ret_from_intr+0x0/0xa
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  <EOI> 
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff8100a0c7>] ? mwait_idle+0x66/0x6b
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff81001d24>] ? enter_idle+0x20/0x22
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff81001d7b>] cpu_idle+0x55/0x8d
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff8124bba5>] rest_init+0x79/0x7b
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff813fca70>] start_kernel+0x362/0x36d
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff813fc0a8>] x86_64_start_reservations+0xa5/0xa9
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  [<ffffffff813fc189>] x86_64_start_kernel+0xdd/0xe4
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] Code: e2 48 8b 55 d0 49 c1 e4 07 66 41 8b 86 a6 00 00 00 4c 03 a2 00 03 00 00 80 e4 cf 80 cc 20 49 8b 5c 24 08 66 41 89 86 a6 00 00 00 <48> 83 3b 00 0f 84 bb 00 00 00 4c 8d ab 9c 00 00 00 4c 89 ef e8 
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] RIP  [<ffffffff811e587f>] dev_queue_xmit+0x28c/0x46d
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342]  RSP <ffff880028203a30>
Apr 12 18:46:59 80.83.17.1 kernel: [12598.957342] CR2: 0000000000000000
Apr 12 18:46:59 80.83.17.1 kernel: [12598.974856] ---[ end trace 739e5480c8ab784f ]---
Apr 12 18:46:59 80.83.17.1 kernel: [12598.975082] Kernel panic - not syncing: Fatal exception in interrupt
Apr 12 18:46:59 80.83.17.1 kernel: [12598.975311] Pid: 0, comm: swapper Tainted: G      D    2.6.33.2-build-0052test-64 #2
Apr 12 18:46:59 80.83.17.1 kernel: [12598.975706] Call Trace:
Apr 12 18:46:59 80.83.17.1 kernel: [12598.975920]  <IRQ>  [<ffffffff81259753>] panic+0xa0/0x161
Apr 12 18:46:59 80.83.17.1 kernel: [12598.976200]  [<ffffffff81003293>] ? apic_timer_interrupt+0x13/0x20
Apr 12 18:46:59 80.83.17.1 kernel: [12598.976431]  [<ffffffff81035673>] ? kmsg_dump+0x112/0x12c
Apr 12 18:46:59 80.83.17.1 kernel: [12598.976657]  [<ffffffff81006651>] oops_end+0xaa/0xba
Apr 12 18:46:59 80.83.17.1 kernel: [12598.976882]  [<ffffffff8101e653>] no_context+0x1f3/0x202
Apr 12 18:46:59 80.83.17.1 kernel: [12598.977113]  [<ffffffff8101e81c>] __bad_area_nosemaphore+0x1ba/0x1e0
Apr 12 18:46:59 80.83.17.1 kernel: [12598.977347]  [<ffffffff8113f8b3>] ? swiotlb_map_page+0x0/0xd5
Apr 12 18:46:59 80.83.17.1 kernel: [12598.977577]  [<ffffffffa015c55a>] ? pci_map_single+0x8a/0x99 [e1000e]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.977806]  [<ffffffff8113f0c0>] ? swiotlb_dma_mapping_error+0x18/0x25
Apr 12 18:46:59 80.83.17.1 kernel: [12598.978045]  [<ffffffffa015a2e0>] ? pci_dma_mapping_error+0x31/0x3d [e1000e]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.978282]  [<ffffffffa015cc37>] ? e1000_xmit_frame+0x6ce/0xa43 [e1000e]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.978513]  [<ffffffff8101e850>] bad_area_nosemaphore+0xe/0x10
Apr 12 18:46:59 80.83.17.1 kernel: [12598.978741]  [<ffffffff8101eb32>] do_page_fault+0x114/0x24a
Apr 12 18:46:59 80.83.17.1 kernel: [12598.978967]  [<ffffffff8125bc9f>] page_fault+0x1f/0x30
Apr 12 18:46:59 80.83.17.1 kernel: [12598.979196]  [<ffffffff811e587f>] ? dev_queue_xmit+0x28c/0x46d
Apr 12 18:46:59 80.83.17.1 kernel: [12598.979426]  [<ffffffffa01c12a9>] vlan_dev_hwaccel_hard_start_xmit+0x68/0x86 [8021q]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.979821]  [<ffffffff811e540e>] dev_hard_start_xmit+0x232/0x304
Apr 12 18:46:59 80.83.17.1 kernel: [12598.980055]  [<ffffffff811f648a>] sch_direct_xmit+0x5d/0x16b
Apr 12 18:46:59 80.83.17.1 kernel: [12598.980284]  [<ffffffff811f6654>] __qdisc_run+0xbc/0xdc
Apr 12 18:46:59 80.83.17.1 kernel: [12598.980514]  [<ffffffff811e5939>] dev_queue_xmit+0x346/0x46d
Apr 12 18:46:59 80.83.17.1 kernel: [12598.980740]  [<ffffffff8120a384>] ip_finish_output2+0x1c2/0x206
Apr 12 18:46:59 80.83.17.1 kernel: [12598.980966]  [<ffffffff8120a430>] ip_finish_output+0x68/0x6a
Apr 12 18:46:59 80.83.17.1 kernel: [12598.981197]  [<ffffffff8120a4d2>] ip_output+0xa0/0xa5
Apr 12 18:46:59 80.83.17.1 kernel: [12598.981427]  [<ffffffff81206d2e>] ip_forward_finish+0x2e/0x32
Apr 12 18:46:59 80.83.17.1 kernel: [12598.981654]  [<ffffffff81206ff4>] ip_forward+0x2c2/0x322
Apr 12 18:46:59 80.83.17.1 kernel: [12598.981880]  [<ffffffff81205ae0>] ip_rcv_finish+0x2f0/0x30a
Apr 12 18:46:59 80.83.17.1 kernel: [12598.982111]  [<ffffffff81205d77>] ip_rcv+0x27d/0x2a4
Apr 12 18:46:59 80.83.17.1 kernel: [12598.982337]  [<ffffffff8124ad48>] ? vlan_hwaccel_do_receive+0x2b/0xda
Apr 12 18:46:59 80.83.17.1 kernel: [12598.982566]  [<ffffffff811e47b6>] netif_receive_skb+0x450/0x475
Apr 12 18:46:59 80.83.17.1 kernel: [12598.982793]  [<ffffffff811e4909>] napi_skb_finish+0x24/0x3b
Apr 12 18:46:59 80.83.17.1 kernel: [12598.983025]  [<ffffffff8124b01b>] vlan_gro_receive+0x7c/0x81
Apr 12 18:46:59 80.83.17.1 kernel: [12598.983260]  [<ffffffffa015d6c5>] e1000_receive_skb+0x4a/0x65 [e1000e]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.983492]  [<ffffffffa015d8cb>] e1000_clean_rx_irq+0x1eb/0x29c [e1000e]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.983727]  [<ffffffffa015ebfb>] e1000_clean+0x75/0x22e [e1000e]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.983955]  [<ffffffffa0234d6c>] ? hfsc_dequeue+0x171/0x2a6 [sch_hfsc]
Apr 12 18:46:59 80.83.17.1 kernel: [12598.984190]  [<ffffffff811e4e56>] net_rx_action+0xa7/0x17a
Apr 12 18:46:59 80.83.17.1 kernel: [12598.984416]  [<ffffffff81039670>] __do_softirq+0x96/0x11a
Apr 12 18:46:59 80.83.17.1 kernel: [12598.984642]  [<ffffffff810037cc>] call_softirq+0x1c/0x28
Apr 12 18:46:59 80.83.17.1 kernel: [12598.984866]  [<ffffffff81005543>] do_softirq+0x33/0x68
Apr 12 18:46:59 80.83.17.1 kernel: [12598.985097]  [<ffffffff81039407>] irq_exit+0x36/0x75
Apr 12 18:46:59 80.83.17.1 kernel: [12598.985323]  [<ffffffff81004c3e>] do_IRQ+0xaa/0xc1
Apr 12 18:46:59 80.83.17.1 kernel: [12598.985546]  [<ffffffff8125ba93>] ret_from_intr+0x0/0xa
Apr 12 18:46:59 80.83.17.1 kernel: [12598.985770]  <EOI>  [<ffffffff8100a0c7>] ? mwait_idle+0x66/0x6b
Apr 12 18:46:59 80.83.17.1 kernel: [12598.986048]  [<ffffffff81001d24>] ? enter_idle+0x20/0x22
Apr 12 18:46:59 80.83.17.1 kernel: [12598.986284]  [<ffffffff81001d7b>] cpu_idle+0x55/0x8d
Apr 12 18:46:59 80.83.17.1 kernel: [12598.986507]  [<ffffffff8124bba5>] rest_init+0x79/0x7b
Apr 12 18:46:59 80.83.17.1 kernel: [12598.986730]  [<ffffffff813fca70>] start_kernel+0x362/0x36d
Apr 12 18:46:59 80.83.17.1 kernel: [12598.986955]  [<ffffffff813fc0a8>] x86_64_start_reservations+0xa5/0xa9
Apr 12 18:46:59 80.83.17.1 kernel: [12598.987189]  [<ffffffff813fc189>] x86_64_start_kernel+0xdd/0xe4

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox