Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH] sysctl: fix min/max handling in __do_proc_doulongvec_minmax()
From: Américo Wang @ 2010-10-07  7:18 UTC (permalink / raw)
  To: Américo Wang
  Cc: Eric Dumazet, Robin Holt, Andrew Morton, linux-kernel,
	Willy Tarreau, David S. Miller, netdev, James Morris,
	Pekka Savola (ipv6), Patrick McHardy, Alexey Kuznetsov, ebiederm
In-Reply-To: <20101005130117.GK5170@cr0.nay.redhat.com>

On Tue, Oct 05, 2010 at 09:01:17PM +0800, Américo Wang wrote:
>On Mon, Oct 04, 2010 at 12:38:21PM +0200, Eric Dumazet wrote:
>>Le lundi 04 octobre 2010 à 18:35 +0800, Américo Wang a écrit :
>>
>>> Your patch does fix the problem, but seems not a good solution,
>>> we should skip all min/max checking if ->extra(1|2) is NULL,
>>> instead of checking it every time within the loop.
>>
>>Please do submit a patch, we'll see if you come to a better solution,
>>with no added code size (this is slow path, I dont care for checking it
>>'every time winthin the loop')
>>
>>
>
>I have one, but just did compile test. :)
>I will test it tomorrow.
>

Here is the final one.

--------------->

Eric D. noticed that we may trigger an OOPS if we leave ->extra{1,2}
to NULL when we use proc_doulongvec_minmax().

Actually, we don't need to store min/max values in a vector,
because all the elements in the vector should share the same min/max
value, like what proc_dointvec_minmax() does.

This also shrinks the binary size a little bit.

   text    data     bss     dec     hex filename
  12419    8744    4016   25179    625b kernel/sysctl.o.BEFORE
  12395    8744    4024   25163    624b kernel/sysctl.o.AFTER

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Eric W. <ebiederm@xmission.com>
Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>

---
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f88552c..ba5e511 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2448,86 +2448,119 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
 				do_proc_dointvec_minmax_conv, &param);
 }
 
-static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
-				     void __user *buffer,
+static int __doulongvec_minmax_read(void *data, void __user *buffer,
 				     size_t *lenp, loff_t *ppos,
 				     unsigned long convmul,
 				     unsigned long convdiv)
 {
-	unsigned long *i, *min, *max;
-	int vleft, first = 1, err = 0;
-	unsigned long page = 0;
-	size_t left;
-	char *kbuf;
+	unsigned long *i = (unsigned long *) data;
+	int err = 0;
+	bool first = true;
+	size_t left = *lenp;
 
-	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
-		*lenp = 0;
-		return 0;
+	for (; left; i++, first=false) {
+		unsigned long val;
+
+		val = convdiv * (*i) / convmul;
+		if (!first)
+			err = proc_put_char(&buffer, &left, '\t');
+		err = proc_put_long(&buffer, &left, val, false);
+		if (err)
+			break;
 	}
 
-	i = (unsigned long *) data;
-	min = (unsigned long *) table->extra1;
-	max = (unsigned long *) table->extra2;
-	vleft = table->maxlen / sizeof(unsigned long);
-	left = *lenp;
+	if (!first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
 
-	if (write) {
-		if (left > PAGE_SIZE - 1)
-			left = PAGE_SIZE - 1;
-		page = __get_free_page(GFP_TEMPORARY);
-		kbuf = (char *) page;
-		if (!kbuf)
-			return -ENOMEM;
-		if (copy_from_user(kbuf, buffer, left)) {
-			err = -EFAULT;
-			goto free;
-		}
-		kbuf[left] = 0;
+	*lenp -= left;
+	*ppos += *lenp;
+	return err;
+}
+
+static int __doulongvec_minmax_write(void *data, void __user *buffer,
+				     size_t *lenp, loff_t *ppos, int vleft,
+				     unsigned long min, unsigned long max)
+{
+	char *kbuf;
+	size_t left = *lenp;
+	unsigned long page = 0;
+	unsigned long *i = (unsigned long *) data;
+	int err = 0;
+	bool first = true;
+
+	if (left > PAGE_SIZE - 1)
+		left = PAGE_SIZE - 1;
+	page = __get_free_page(GFP_TEMPORARY);
+	kbuf = (char *) page;
+	if (!kbuf)
+		return -ENOMEM;
+	if (copy_from_user(kbuf, buffer, left)) {
+		err = -EFAULT;
+		goto free;
 	}
+	kbuf[left] = 0;
 
-	for (; left && vleft--; i++, min++, max++, first=0) {
+	for (; left && vleft--; i++, first=false) {
 		unsigned long val;
+		bool neg;
 
-		if (write) {
-			bool neg;
-
-			left -= proc_skip_spaces(&kbuf);
+		left -= proc_skip_spaces(&kbuf);
 
-			err = proc_get_long(&kbuf, &left, &val, &neg,
-					     proc_wspace_sep,
-					     sizeof(proc_wspace_sep), NULL);
-			if (err)
-				break;
-			if (neg)
-				continue;
-			if ((min && val < *min) || (max && val > *max))
-				continue;
-			*i = val;
-		} else {
-			val = convdiv * (*i) / convmul;
-			if (!first)
-				err = proc_put_char(&buffer, &left, '\t');
-			err = proc_put_long(&buffer, &left, val, false);
-			if (err)
-				break;
-		}
+		err = proc_get_long(&kbuf, &left, &val, &neg,
+				     proc_wspace_sep,
+				     sizeof(proc_wspace_sep), NULL);
+		if (err)
+			break;
+		if (neg)
+			continue;
+		if (val < min || val > max)
+			continue;
+		*i = val;
 	}
 
-	if (!write && !first && left && !err)
-		err = proc_put_char(&buffer, &left, '\n');
-	if (write && !err)
+	if (!err)
 		left -= proc_skip_spaces(&kbuf);
 free:
-	if (write) {
-		free_page(page);
-		if (first)
-			return err ? : -EINVAL;
-	}
+	free_page(page);
+	if (first)
+		return err ? : -EINVAL;
+
 	*lenp -= left;
 	*ppos += *lenp;
 	return err;
 }
 
+static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos,
+				     unsigned long convmul,
+				     unsigned long convdiv)
+{
+	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (write) {
+		unsigned long min, max;
+		int vleft;
+
+		vleft = table->maxlen / sizeof(unsigned long);
+		if (table->extra1)
+			min = *(unsigned long *) table->extra1;
+		else
+			min = 0;
+		if (table->extra2)
+			max = *(unsigned long *) table->extra2;
+		else
+			max = ULONG_MAX;
+		return __doulongvec_minmax_write(data, buffer, lenp,
+						  ppos, vleft, min, max);
+	} else
+		return __doulongvec_minmax_read(data, buffer, lenp,
+						 ppos, convmul, convdiv);
+}
+
 static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
 				     void __user *buffer,
 				     size_t *lenp, loff_t *ppos,

^ permalink raw reply related

* Re: pull-request: bluetooth-2.6 2010-10-05
From: David Miller @ 2010-10-07  8:02 UTC (permalink / raw)
  To: padovan; +Cc: linville, marcel, linux-bluetooth, netdev
In-Reply-To: <20101005171349.GA16520@vigoh>

From: "Gustavo F. Padovan" <padovan@profusion.mobi>
Date: Tue, 5 Oct 2010 14:13:49 -0300

> In this patch set we have two fixes for regressions in L2CAP due to ERTM code
> we added in L2CAP for 2.6.36, a bugfix in the L2CAP Streaming Mode that was
> making the kernel crash. And a fix for a deadlock issue between the sk_sndbuf
> and the backlog queue in ERTM. The rest are also needed bug fixes.

Pulled, thanks.

^ permalink raw reply

* BUG ? ipip unregister_netdevice_many()
From: Hans Schillstrom @ 2010-10-07  8:48 UTC (permalink / raw)
  To: netdev@vger.kernel.org; +Cc: Eric W. Biederman, Daniel Lezcano

Hello
I'm trying to exit a network name space and it doesn't work (or am I doing something wrong?)
The only netdevices left are lo and the tunnels ip6tnl0, sit0 and tunl0 when exiting netns.

A netns is created by lxc-execute with two interfaces eth0 eth1 (macvlan)
(see conf file at the end)

Kernel: net-next-2.6 top from 4 october 2010

I added some printk's inn ipip.c  ipip_exit_net()
...
        rtnl_lock();
        printk(KERN_ERR "ipip_exit_net(enter)\n");
        ipip_destroy_tunnels(ipn, &list);
        printk(KERN_ERR "ipip_exit_net(1)\n");
        unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
        printk(KERN_ERR "ipip_exit_net(2)\n");
        unregister_netdevice_many(&list);
        printk(KERN_ERR "ipip_exit_net(3)\n");
        rtnl_unlock();
        printk(KERN_ERR "ipip_exit_net(exit)\n");


Exit steps:
===== Screen dump =====

 # ifconfig eth0  0.0.0.0  down
 # ifconfig eth1  0.0.0.0  down
 # ifconfig lo  0.0.0.0  down
 # ip li de eth0
 # ip li de eth1
 # ifconfig -a
ip6tnl0   Link encap:UNSPEC  HWaddr 00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00  
          NOARP  MTU:1460  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:0 
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)

lo        Link encap:Local Loopback  
          inet addr:127.0.0.1  Mask:255.0.0.0
          LOOPBACK  MTU:16436  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:0 
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)

sit0      Link encap:IPv6-in-IPv4  
          NOARP  MTU:1480  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:0 
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)

tunl0     Link encap:UNSPEC  HWaddr 00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00  
          NOARP  MTU:1480  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:0 
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)

 # ps
  PID USER       VSZ STAT COMMAND
    1 root     12412 S    /usr/lib64/lxc/lxc-init -- /var/bin/init
    2 root      4540 S    /bin/ash /var/bin/init
    7 root      6640 S    inetd
    8 root      4544 S    /bin/ash
   26 root      4544 R    ps
 # lsmod 
Module                  Size  Used by    Not tainted
macvlan                 8709  0 
pcnet32                29549  0 
tg3                   112093  0 
libphy                 21043  1 tg3
 # kill 7 2
 # ps
  PID USER       VSZ STAT COMMAND
    1 root     12412 S    /usr/lib64/lxc/lxc-init -- /var/bin/init
    8 root      4544 S    /bin/ash
   28 root      4544 R    ps
 # exit  ( here is the exit from netns  )
 # ipip_exit_net(enter)
ipip_exit_net(1)
ipip_exit_net(2)
------------[ cut here ]------------
WARNING: at /home/hans/evip/kvm/net-next-2.6/kernel/sysctl.c:1953 unregister_sysctl_table+0xc7/0xf9()
Hardware name: Bochs
Modules linked in: macvlan pcnet32 tg3 libphy
Pid: 5, comm: kworker/u:0 Not tainted 2.6.36-rc3+ #7
Call Trace:
 [<ffffffff8103e281>] warn_slowpath_common+0x85/0x9d
 [<ffffffff8103e2b3>] warn_slowpath_null+0x1a/0x1c
 [<ffffffff81045e64>] unregister_sysctl_table+0xc7/0xf9
 [<ffffffff812c86a5>] neigh_sysctl_unregister+0x27/0x3f
 [<ffffffff81342108>] addrconf_ifdown+0x415/0x45e
 [<ffffffff81342b98>] addrconf_notify+0x756/0x7fe
 [<ffffffff812cacfb>] ? neigh_ifdown+0xc3/0xd4
 [<ffffffff813622b3>] ? ip6mr_device_event+0x8d/0x9e
 [<ffffffff8105eddb>] notifier_call_chain+0x37/0x63
 [<ffffffff8105ee8b>] raw_notifier_call_chain+0x14/0x16
 [<ffffffff812c15c7>] call_netdevice_notifiers+0x4a/0x4f
 [<ffffffff812c1c1b>] rollback_registered_many+0x121/0x208
 [<ffffffff812c1d1d>] unregister_netdevice_many+0x1b/0x71
 [<ffffffff81324209>] ipip_exit_net+0xea/0x11a
 [<ffffffff812bc941>] ? cleanup_net+0x0/0x198
 [<ffffffff812bc2cf>] ops_exit_list+0x2a/0x5b
 [<ffffffff812bca39>] cleanup_net+0xf8/0x198
 [<ffffffff810568c7>] process_one_work+0x2a2/0x44d
 [<ffffffff81056e35>] worker_thread+0x1db/0x34e
 [<ffffffff81056c5a>] ? worker_thread+0x0/0x34e
 [<ffffffff8105a030>] kthread+0x82/0x8a
 [<ffffffff81003954>] kernel_thread_helper+0x4/0x10
 [<ffffffff81059fae>] ? kthread+0x0/0x8a
 [<ffffffff81003950>] ? kernel_thread_helper+0x0/0x10
---[ end trace 939b5185219f32e7 ]---
ipip_exit_net(3)
ipip_exit_net(exit)
unregister_netdevice: waiting for lo to become free. Usage count = 4
unregister_netdevice: waiting for lo to become free. Usage count = 4
unregister_netdevice: waiting for lo to become free. Usage count = 4
....
...
===== End of screen dump =====

lxc conf file:
# Container with network virtualized using the vlan device driver
# Local eth0 uplink
lxc.utsname = fee_0
lxc.network.type = macvlan
lxc.network.flags = up
lxc.network.link = eth1
lxc.network.hwaddr = 00:00:04:01:01:01
lxc.network.ipv4 = 192.168.1.21/24
lxc.network.ipv6 = 2003::2:1:1/96
# local eth1 downlink - to the RS farm
lxc.network.type = macvlan
lxc.network.flags = up
lxc.network.link = eth0
lxc.network.hwaddr = 00:00:03:01:01:01
lxc.network.ipv4 = 192.168.0.21/24
lxc.network.ipv6 = 2003::1:1:1/96
lxc.mount.entry = /var/lib/lxc/fee_0/var /var none rw,bind 0 0



Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply

* [PATCH] sysctl: fix min/max handling in __do_proc_doulongvec_minmax()
From: Américo Wang @ 2010-10-07  9:25 UTC (permalink / raw)
  To: Américo Wang
  Cc: Eric Dumazet, Robin Holt, Andrew Morton, linux-kernel,
	Willy Tarreau, David S. Miller, netdev, James Morris,
	Pekka Savola (ipv6), Patrick McHardy, Alexey Kuznetsov, ebiederm
In-Reply-To: <20101007071859.GD5471@cr0.nay.redhat.com>

>>
>
>Here is the final one.

Oops, that one is not correct. Hopefully this one
is correct.

--------------->

Eric D. noticed that we may trigger an OOPS if we leave ->extra{1,2}
to NULL when we use proc_doulongvec_minmax().

Actually, we don't need to store min/max values in a vector,
because all the elements in the vector should share the same min/max
value, like what proc_dointvec_minmax() does.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Eric W. <ebiederm@xmission.com>
Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>

---
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f88552c..fad9208 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2448,86 +2448,119 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
 				do_proc_dointvec_minmax_conv, &param);
 }
 
-static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
-				     void __user *buffer,
-				     size_t *lenp, loff_t *ppos,
+static int __doulongvec_minmax_read(void *data, void __user *buffer,
+				     size_t *lenp, loff_t *ppos, int vleft,
 				     unsigned long convmul,
 				     unsigned long convdiv)
 {
-	unsigned long *i, *min, *max;
-	int vleft, first = 1, err = 0;
-	unsigned long page = 0;
-	size_t left;
-	char *kbuf;
+	unsigned long *i = data;
+	int err = 0;
+	bool first = true;
+	size_t left = *lenp;
 
-	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
-		*lenp = 0;
-		return 0;
+	for (; left && vleft--; i++, first=false) {
+		unsigned long val;
+
+		val = convdiv * (*i) / convmul;
+		if (!first)
+			err = proc_put_char(&buffer, &left, '\t');
+		err = proc_put_long(&buffer, &left, val, false);
+		if (err)
+			break;
 	}
 
-	i = (unsigned long *) data;
-	min = (unsigned long *) table->extra1;
-	max = (unsigned long *) table->extra2;
-	vleft = table->maxlen / sizeof(unsigned long);
-	left = *lenp;
+	if (!first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
 
-	if (write) {
-		if (left > PAGE_SIZE - 1)
-			left = PAGE_SIZE - 1;
-		page = __get_free_page(GFP_TEMPORARY);
-		kbuf = (char *) page;
-		if (!kbuf)
-			return -ENOMEM;
-		if (copy_from_user(kbuf, buffer, left)) {
-			err = -EFAULT;
-			goto free;
-		}
-		kbuf[left] = 0;
+	*lenp -= left;
+	*ppos += *lenp;
+	return err;
+}
+
+static int __doulongvec_minmax_write(void *data, void __user *buffer,
+				     size_t *lenp, loff_t *ppos, int vleft,
+				     unsigned long min, unsigned long max)
+{
+	char *kbuf;
+	size_t left = *lenp;
+	unsigned long page = 0;
+	unsigned long *i = (unsigned long *) data;
+	int err = 0;
+	bool first = true;
+
+	if (left > PAGE_SIZE - 1)
+		left = PAGE_SIZE - 1;
+	page = __get_free_page(GFP_TEMPORARY);
+	kbuf = (char *) page;
+	if (!kbuf)
+		return -ENOMEM;
+	if (copy_from_user(kbuf, buffer, left)) {
+		err = -EFAULT;
+		goto free;
 	}
+	kbuf[left] = 0;
 
-	for (; left && vleft--; i++, min++, max++, first=0) {
+	for (; left && vleft--; i++, first=false) {
 		unsigned long val;
+		bool neg;
 
-		if (write) {
-			bool neg;
-
-			left -= proc_skip_spaces(&kbuf);
+		left -= proc_skip_spaces(&kbuf);
 
-			err = proc_get_long(&kbuf, &left, &val, &neg,
-					     proc_wspace_sep,
-					     sizeof(proc_wspace_sep), NULL);
-			if (err)
-				break;
-			if (neg)
-				continue;
-			if ((min && val < *min) || (max && val > *max))
-				continue;
-			*i = val;
-		} else {
-			val = convdiv * (*i) / convmul;
-			if (!first)
-				err = proc_put_char(&buffer, &left, '\t');
-			err = proc_put_long(&buffer, &left, val, false);
-			if (err)
-				break;
-		}
+		err = proc_get_long(&kbuf, &left, &val, &neg,
+				     proc_wspace_sep,
+				     sizeof(proc_wspace_sep), NULL);
+		if (err)
+			break;
+		if (neg)
+			continue;
+		if (val < min || val > max)
+			continue;
+		*i = val;
 	}
 
-	if (!write && !first && left && !err)
-		err = proc_put_char(&buffer, &left, '\n');
-	if (write && !err)
+	if (!err)
 		left -= proc_skip_spaces(&kbuf);
 free:
-	if (write) {
-		free_page(page);
-		if (first)
-			return err ? : -EINVAL;
-	}
+	free_page(page);
+	if (first)
+		return err ? : -EINVAL;
+
 	*lenp -= left;
 	*ppos += *lenp;
 	return err;
 }
 
+static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos,
+				     unsigned long convmul,
+				     unsigned long convdiv)
+{
+	int vleft;
+	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	vleft = table->maxlen / sizeof(unsigned long);
+	if (write) {
+		unsigned long min, max;
+
+		if (table->extra1)
+			min = *(unsigned long *) table->extra1;
+		else
+			min = 0;
+		if (table->extra2)
+			max = *(unsigned long *) table->extra2;
+		else
+			max = ULONG_MAX;
+		return __doulongvec_minmax_write(data, buffer, lenp,
+						  ppos, vleft, min, max);
+	} else
+		return __doulongvec_minmax_read(data, buffer, lenp,
+						 ppos, vleft, convmul, convdiv);
+}
+
 static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
 				     void __user *buffer,
 				     size_t *lenp, loff_t *ppos,

^ permalink raw reply related

* Re: [PATCH] SIW: Queue pair
From: Bernard Metzler @ 2010-10-07  9:21 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: linux-rdma, linux-rdma-owner, netdev
In-Reply-To: <AANLkTimM3Lpmt3JLgbCtuCjqFjNb8AkvBpdzpN7Sx978@mail.gmail.com>



linux-rdma-owner@vger.kernel.org wrote on 10/06/2010 07:57:29 PM:

> On Tue, Oct 5, 2010 at 8:55 AM, Bernard Metzler <bmt@zurich.ibm.com>
wrote:
> >
> > [ ... ]
> > +int
> > +siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
> > +             enum siw_qp_attr_mask mask)
> > +{
> > [ ... ]
> > +               case SIW_QP_STATE_ERROR:
> > +                       siw_rq_flush(qp);
> > +                       qp->attrs.state = SIW_QP_STATE_ERROR;
> > +                       drop_conn = 1;
> > +                       break;
>
> At least for InfiniBand when the state of a queue pair is changed to
> Error, work requests must be completed in error. The above code just
> flushes pending work requests. That doesn't look correct to me.
>
> A quote from the InfiniBand Architecture Specification, table 91, QP
> State Transition Properties:
>
> Any state to Reset: QP attributes are reset to the same values after
> the QP was created. Outstanding Work Requests are removed from the
> queues without notifying the Consumer.
>
right. i was trying to follow the verbs specification, which makes the
same statement - see sec. 6.2.4 of the Hilland vers:

The following is done on entry into the Error state:

   *   The RI MUST flush any incomplete WRs on the SQ or RQ. All WQEs
       on the SQ and RQ, except for the WQE that caused the error (if
       any), MUST be returned with the Flushed Error Completion Status
       through the Completion Queue associated with the WQ.

following that, siw_rq_flush() flushes to the CQ. so, i think, we are
in sync here (the code flushes only the RQ to the CQ here, since we come
from IDLE or RTR, which must not have anything on the SQ).


thanks,
bernard.


^ permalink raw reply

* Re: [PATCH] sysctl: fix min/max handling in __do_proc_doulongvec_minmax()
From: Eric Dumazet @ 2010-10-07  9:51 UTC (permalink / raw)
  To: Américo Wang
  Cc: Robin Holt, Andrew Morton, linux-kernel, Willy Tarreau,
	David S. Miller, netdev, James Morris, Pekka Savola (ipv6),
	Patrick McHardy, Alexey Kuznetsov, ebiederm
In-Reply-To: <20101007092538.GE5471@cr0.nay.redhat.com>

Le jeudi 07 octobre 2010 à 17:25 +0800, Américo Wang a écrit :
> >>
> >
> >Here is the final one.
> 
> Oops, that one is not correct. Hopefully this one
> is correct.
> 
> --------------->
> 
> Eric D. noticed that we may trigger an OOPS if we leave ->extra{1,2}
> to NULL when we use proc_doulongvec_minmax().
> 
> Actually, we don't need to store min/max values in a vector,
> because all the elements in the vector should share the same min/max
> value, like what proc_dointvec_minmax() does.
> 

If we assert same min/max limits are to be applied to all elements,
a much simpler fix than yours would be :

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f88552c..8e45451 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2485,7 +2485,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
 		kbuf[left] = 0;
 	}
 
-	for (; left && vleft--; i++, min++, max++, first=0) {
+	for (; left && vleft--; i++, first=0) {
 		unsigned long val;
 
 		if (write) {


Please dont send huge patches like this to 'fix' a bug,
especially on slow path.

First we fix the bug, _then_ we can try to make code more 
efficient or more pretty or shorter.

So the _real_ question is :

Should the min/max limits should be a single pair,
shared by all elements, or a vector of limits.



^ permalink raw reply related

* Re: [net-next PATCH] igb: update adapter stats when reading /proc/net/dev.
From: Jesper Dangaard Brouer @ 2010-10-07 10:06 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, netdev, jeffrey.t.kirsher
In-Reply-To: <20101006.234611.191419337.davem@davemloft.net>

On Wed, 2010-10-06 at 23:46 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 07 Oct 2010 08:44:08 +0200
> 
> > Le mercredi 06 octobre 2010 à 23:36 -0700, David Miller a écrit :
> >> From: Jesper Dangaard Brouer <hawk@comx.dk>
> >> Date: Tue, 05 Oct 2010 16:18:33 +0200
> >> 
> >> > Network driver igb: Improve the accuracy of stats in /proc/net/dev, by
> >> > updating the adapter stats when reading /proc/net/dev.  Currently the
> >> > stats are updated by the watchdog timer every 2 sec, or when getting
> >> > stats via ethtool -S.
> >> > 
> >> > A number of userspace apps read these /proc/net/dev stats every second,
> >> > e.g. ifstat, which then gives a perceived very bursty traffic pattern,
> >> > which is actually false.
> >> > 
> >> > Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
> >> 
> >> I assume that the Intel folks will take care of integrating this
> >> now that the locking is fixed.
> > 
> > I integrated Jesper patch into my cumulative patch.
> 
> Ok.

I'm fine with this, as long as the commit message describe this change
of accuracy of stats in /proc/net/dev.

Something like:

 This patch also increase the accuracy of stats in /proc/net/dev, by
 updating the adapter stats when reading /proc/net/dev.  Previously
 the stats were only updated by the watchdog timer every 2 sec, which
 resulted in false observations from userspace.


-- 
Med venlig hilsen / Best regards
  Jesper Brouer
  ComX Networks A/S
  Linux Network Kernel Developer
  Cand. Scient Datalog / MSc.CS
  Author of http://adsl-optimizer.dk
  LinkedIn: http://www.linkedin.com/in/brouer



^ permalink raw reply

* Re: [net-next PATCH] igb: update adapter stats when reading /proc/net/dev.
From: Eric Dumazet @ 2010-10-07 10:24 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: David Miller, netdev, jeffrey.t.kirsher
In-Reply-To: <1286446004.6992.7.camel@firesoul.comx.local>

Le jeudi 07 octobre 2010 à 12:06 +0200, Jesper Dangaard Brouer a écrit :
> On Wed, 2010-10-06 at 23:46 -0700, David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Thu, 07 Oct 2010 08:44:08 +0200
> > 
> > > Le mercredi 06 octobre 2010 à 23:36 -0700, David Miller a écrit :
> > >> From: Jesper Dangaard Brouer <hawk@comx.dk>
> > >> Date: Tue, 05 Oct 2010 16:18:33 +0200
> > >> 
> > >> > Network driver igb: Improve the accuracy of stats in /proc/net/dev, by
> > >> > updating the adapter stats when reading /proc/net/dev.  Currently the
> > >> > stats are updated by the watchdog timer every 2 sec, or when getting
> > >> > stats via ethtool -S.
> > >> > 
> > >> > A number of userspace apps read these /proc/net/dev stats every second,
> > >> > e.g. ifstat, which then gives a perceived very bursty traffic pattern,
> > >> > which is actually false.
> > >> > 
> > >> > Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
> > >> 
> > >> I assume that the Intel folks will take care of integrating this
> > >> now that the locking is fixed.
> > > 
> > > I integrated Jesper patch into my cumulative patch.
> > 
> > Ok.
> 
> I'm fine with this, as long as the commit message describe this change
> of accuracy of stats in /proc/net/dev.
> 
> Something like:
> 
>  This patch also increase the accuracy of stats in /proc/net/dev, by
>  updating the adapter stats when reading /proc/net/dev.  Previously
>  the stats were only updated by the watchdog timer every 2 sec, which
>  resulted in false observations from userspace.
> 
> 

Well, its not exactly true :)

Previously, igb stats were updated :
- By watchdog timer, every 2 secs
- Every time an "ethtool -S ethX" was done

There is no general guarantee netdev stats are immediately available to
user.

ndo_get_stats() is not allowed to sleep, (because of bonding...), so
drivers can not always provide accurate stats, if they need to make a
long transaction with hardware.

Other drivers do the same (provide hardware statistics), with about one
second resolution.

So the "resulted in false observations from userspace." is something
that might upset admins, but is not a hard requirement of netdev stats.




^ permalink raw reply

* Re: [net-next PATCH] igb: update adapter stats when reading /proc/net/dev.
From: Jesper Dangaard Brouer @ 2010-10-07 11:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, jeffrey.t.kirsher
In-Reply-To: <1286447084.2912.24.camel@edumazet-laptop>

On Thu, 2010-10-07 at 12:24 +0200, Eric Dumazet wrote:
> Le jeudi 07 octobre 2010 à 12:06 +0200, Jesper Dangaard Brouer a écrit :
> > On Wed, 2010-10-06 at 23:46 -0700, David Miller wrote:
> > > From: Eric Dumazet <eric.dumazet@gmail.com>
> > > Date: Thu, 07 Oct 2010 08:44:08 +0200
> > > 
> > > > Le mercredi 06 octobre 2010 à 23:36 -0700, David Miller a écrit :
> > > >> From: Jesper Dangaard Brouer <hawk@comx.dk>
> > > >> Date: Tue, 05 Oct 2010 16:18:33 +0200
> > > >> 
> > > >> > Network driver igb: Improve the accuracy of stats in /proc/net/dev, by
> > > >> > updating the adapter stats when reading /proc/net/dev.  Currently the
> > > >> > stats are updated by the watchdog timer every 2 sec, or when getting
> > > >> > stats via ethtool -S.
> > > >> > 
> > > >> > A number of userspace apps read these /proc/net/dev stats every second,
> > > >> > e.g. ifstat, which then gives a perceived very bursty traffic pattern,
> > > >> > which is actually false.
> > > >> > 
> > > >> > Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
> > > >> 
> > > >> I assume that the Intel folks will take care of integrating this
> > > >> now that the locking is fixed.
> > > > 
> > > > I integrated Jesper patch into my cumulative patch.
> > > 
> > > Ok.
> > 
> > I'm fine with this, as long as the commit message describe this change
> > of accuracy of stats in /proc/net/dev.
> > 
> > Something like:
> > 
> >  This patch also increase the accuracy of stats in /proc/net/dev, by
> >  updating the adapter stats when reading /proc/net/dev.  Previously
> >  the stats were only updated by the watchdog timer every 2 sec, which
> >  resulted in false observations from userspace.
> > 
> > 
> 
> Well, its not exactly true :)
> 
> Previously, igb stats were updated :
> - By watchdog timer, every 2 secs
> - Every time an "ethtool -S ethX" was done
> 
> There is no general guarantee netdev stats are immediately available to
> user.

I'm not talking general, just for this driver.
And I'm not giving any guarantees, that is why I choose the wording
"increase the accuracy" and not "provide accurate stats".

> ndo_get_stats() is not allowed to sleep, (because of bonding...), so
> drivers can not always provide accurate stats, if they need to make a
> long transaction with hardware.
> 
> Other drivers do the same (provide hardware statistics), with about one
> second resolution.

Yes, a lot of drivers don't provide accurate states. And one second
resolution, as most drivers use, will usually be good enough for most
admins.

Our usage pattern is a bit different, as we are very interested in
measuring bursty traffic.  I'm explain you why during the Netfilter
Workshop, if I get my talk about IPTV accepted ;-)

I also use the increased accuracy, when running my pktgen testing
scripts.


> So the "resulted in false observations from userspace." is something
> that might upset admins, but is not a hard requirement of netdev stats.

Its definitly not a hard requirement, but lets fix it where we can.

Then change the text:
 "resulted in false observations from userspace"
to:
 "resulted in inaccurate observations from userspace"

-- 
Med venlig hilsen / Best regards
  Jesper Brouer
  ComX Networks A/S
  Linux Network Kernel Developer
  Cand. Scient Datalog / MSc.CS
  Author of http://adsl-optimizer.dk
  LinkedIn: http://www.linkedin.com/in/brouer



^ permalink raw reply

* [RESEND] [PATCH stable-2.6.32] Phonet: disable network namespace support
From: Rémi Denis-Courmont @ 2010-10-07 12:17 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: netdev, Rémi Denis-Courmont, Ben Hutchings

From: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>

From: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>

Network namespace in the Phonet socket stack causes an OOPS when a
namespace is destroyed. This occurs as the loopback exit_net handler is
called after the Phonet exit_net handler, and re-enters the Phonet
stack. I cannot think of any nice way to fix this in kernel <= 2.6.32.

For lack of a better solution, disable namespace support completely.
If you need that, upgrade to a newer kernel.

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Acked-by: David S. Miller <davem@davemloft>
Cc: Ben Hutchings <ben@decadent.org.uk>
---
 net/phonet/af_phonet.c  |    4 ++++
 net/phonet/pn_dev.c     |   12 ++++++++++--
 net/phonet/pn_netlink.c |    9 ++++++++-
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index f60c0c2..519ff9d 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -67,6 +67,8 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol)
 	struct phonet_protocol *pnp;
 	int err;
 
+	if (!net_eq(net, &init_net))
+		return -EAFNOSUPPORT;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
@@ -353,6 +355,8 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
 	struct sockaddr_pn sa;
 	u16 len;
 
+	if (!net_eq(net, &init_net))
+		goto out;
 	/* check we have at least a full Phonet header */
 	if (!pskb_pull(skb, sizeof(struct phonethdr)))
 		goto out;
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index 5f42f30..5a2275c 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -246,7 +246,11 @@ static struct notifier_block phonet_device_notifier = {
 /* Per-namespace Phonet devices handling */
 static int phonet_init_net(struct net *net)
 {
-	struct phonet_net *pnn = kmalloc(sizeof(*pnn), GFP_KERNEL);
+	struct phonet_net *pnn;
+
+	if (!net_eq(net, &init_net))
+		return 0;
+	pnn = kmalloc(sizeof(*pnn), GFP_KERNEL);
 	if (!pnn)
 		return -ENOMEM;
 
@@ -263,9 +267,13 @@ static int phonet_init_net(struct net *net)
 
 static void phonet_exit_net(struct net *net)
 {
-	struct phonet_net *pnn = net_generic(net, phonet_net_id);
+	struct phonet_net *pnn;
 	struct net_device *dev;
 
+	if (!net_eq(net, &init_net))
+		return;
+	pnn = net_generic(net, phonet_net_id);
+
 	rtnl_lock();
 	for_each_netdev(net, dev)
 		phonet_device_destroy(dev);
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index d21fd35..7acab1e 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -68,6 +68,8 @@ static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *attr)
 	int err;
 	u8 pnaddr;
 
+	if (!net_eq(net, &init_net))
+		return -EOPNOTSUPP;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
@@ -124,12 +126,16 @@ nla_put_failure:
 
 static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = sock_net(skb->sk);
 	struct phonet_device_list *pndevs;
 	struct phonet_device *pnd;
 	int dev_idx = 0, dev_start_idx = cb->args[0];
 	int addr_idx = 0, addr_start_idx = cb->args[1];
 
-	pndevs = phonet_device_list(sock_net(skb->sk));
+	if (!net_eq(net, &init_net))
+		goto skip;
+
+	pndevs = phonet_device_list(net);
 	spin_lock_bh(&pndevs->lock);
 	list_for_each_entry(pnd, &pndevs->list, list) {
 		u8 addr;
@@ -154,6 +160,7 @@ static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 
 out:
 	spin_unlock_bh(&pndevs->lock);
+skip:
 	cb->args[0] = dev_idx;
 	cb->args[1] = addr_idx;
 
-- 
1.7.0.4


^ permalink raw reply related

* Segmentation fault when using ping6
From: Rogerio Pimentel @ 2010-10-07 12:26 UTC (permalink / raw)
  To: linux-arm-kernel, netdev

Hi,

I'm having a "segmentation fault" problem when using ping6.

When testing on ARM9 platforms (i.MX25 and i.MX27), it returns the error:

root@freescale ~$ ping6 ::1
PING ::1 (::1): 56 data bytes
Segmentation fault

When testing on ARM11 and ARM Cortex A8 platforms (i.MX31 and i.MX51), 
it works:

root@freescale ~$ ping6 ::1
PING ::1 (::1): 56 data bytes
64 bytes from ::1: seq=0 ttl=64 time=0.331 ms
64 bytes from ::1: seq=1 ttl=64 time=0.214 ms
64 bytes from ::1: seq=2 ttl=64 time=0.176 ms
64 bytes from ::1: seq=3 ttl=64 time=0.155 ms

On all cases, I'm using Linux Kernel 2.6.35 (from Mainline) and Busybox 
1.15.0

Does anybody have any idea?

Thanks in advance,

Rogerio Pimentel


^ permalink raw reply

* [PATCH] net/fec: carrier off initially to avoid root mount failure
From: Oskar Schirmer @ 2010-10-07 12:30 UTC (permalink / raw)
  To: netdev; +Cc: Dan Malek, Sebastian Siewior, Hans J. Koch, Oskar Schirmer

with hardware slow in negotiation, the system did freeze
while trying to mount root on nfs at boot time.

the link state has not been initialised so network stack
tried to start transmission right away. this caused instant
retries, as the driver solely stated business upon link down,
rendering the system unusable.

notify carrier off initially to prevent transmission until
phylib will report link up.

Signed-off-by: Oskar Schirmer <oskar@linutronix.de>
---
 drivers/net/fec.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/drivers/net/fec.c b/drivers/net/fec.c
index 768b840..e83f67d 100644
--- a/drivers/net/fec.c
+++ b/drivers/net/fec.c
@@ -1311,6 +1311,9 @@ fec_probe(struct platform_device *pdev)
 	if (ret)
 		goto failed_mii_init;
 
+	/* Carrier starts down, phylib will bring it up */
+	netif_carrier_off(ndev);
+
 	ret = register_netdev(ndev);
 	if (ret)
 		goto failed_register;
-- 
1.7.0.4


^ permalink raw reply related

* Re: Segmentation fault when using ping6
From: Eric Dumazet @ 2010-10-07 12:36 UTC (permalink / raw)
  To: Rogerio Pimentel; +Cc: linux-arm-kernel, netdev
In-Reply-To: <4CADBC7A.6060908@freescale.com>

Le jeudi 07 octobre 2010 à 09:26 -0300, Rogerio Pimentel a écrit :
> Hi,
> 
> I'm having a "segmentation fault" problem when using ping6.
> 
> When testing on ARM9 platforms (i.MX25 and i.MX27), it returns the error:
> 
> root@freescale ~$ ping6 ::1
> PING ::1 (::1): 56 data bytes
> Segmentation fault
> 
> When testing on ARM11 and ARM Cortex A8 platforms (i.MX31 and i.MX51), 
> it works:
> 
> root@freescale ~$ ping6 ::1
> PING ::1 (::1): 56 data bytes
> 64 bytes from ::1: seq=0 ttl=64 time=0.331 ms
> 64 bytes from ::1: seq=1 ttl=64 time=0.214 ms
> 64 bytes from ::1: seq=2 ttl=64 time=0.176 ms
> 64 bytes from ::1: seq=3 ttl=64 time=0.155 ms
> 
> On all cases, I'm using Linux Kernel 2.6.35 (from Mainline) and Busybox 
> 1.15.0
> 
> Does anybody have any idea?

Do you have something displayed on console ? ( dmesg output )

gdb could tell you where segmentation takes place.



^ permalink raw reply

* [PATCH net-next] neigh: speedup neigh_hh_init()
From: Eric Dumazet @ 2010-10-07 12:53 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

When a new dst is used to send a frame, neigh_resolve_output() tries to
associate an struct hh_cache to this dst, calling neigh_hh_init() with
the neigh rwlock write locked.

Most of the time, hh_cache is already known and linked into neighbour,
so we find it and increment its refcount.

This patch changes the logic so that we call neigh_hh_init() with
neighbour lock read locked only, so that fast path can be run in
parallel by concurrent cpus.

This brings part of the speedup we got with commit c7d4426a98a5f
(introduce DST_NOCACHE flag) for non cached dsts, even for cached ones,
removing one of the contention point that routers hit on multiqueue
enabled machines.

Further improvements would need to use a seqlock instead of an rwlock to
protect neigh->ha[], to not dirty neigh too often and remove two atomic
ops.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netdevice.h |    6 ++
 net/core/dst.c            |    4 -
 net/core/neighbour.c      |   99 ++++++++++++++++++++++--------------
 3 files changed, 69 insertions(+), 40 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6abcef6..4160db3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -281,6 +281,12 @@ struct hh_cache {
 	unsigned long	hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)];
 };
 
+static inline void hh_cache_put(struct hh_cache *hh)
+{
+	if (atomic_dec_and_test(&hh->hh_refcnt))
+		kfree(hh);
+}
+
 /* Reserve HH_DATA_MOD byte aligned hard_header_len, but at least that much.
  * Alternative is:
  *   dev->hard_header_len ? (dev->hard_header_len +
diff --git a/net/core/dst.c b/net/core/dst.c
index 6c41b1f..978a1ee 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -228,8 +228,8 @@ again:
 	child = dst->child;
 
 	dst->hh = NULL;
-	if (hh && atomic_dec_and_test(&hh->hh_refcnt))
-		kfree(hh);
+	if (hh)
+		hh_cache_put(hh);
 
 	if (neigh) {
 		dst->neighbour = NULL;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 3ffafaa..53cc548 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -709,8 +709,7 @@ void neigh_destroy(struct neighbour *neigh)
 		write_seqlock_bh(&hh->hh_lock);
 		hh->hh_output = neigh_blackhole;
 		write_sequnlock_bh(&hh->hh_lock);
-		if (atomic_dec_and_test(&hh->hh_refcnt))
-			kfree(hh);
+		hh_cache_put(hh);
 	}
 
 	skb_queue_purge(&neigh->arp_queue);
@@ -1210,39 +1209,67 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
 }
 EXPORT_SYMBOL(neigh_event_ns);
 
+static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst,
+				   __be16 protocol)
+{
+	struct hh_cache *hh;
+
+	for (hh = n->hh; hh; hh = hh->hh_next) {
+		if (hh->hh_type == protocol) {
+			atomic_inc(&hh->hh_refcnt);
+			if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+				hh_cache_put(hh);
+			return true;
+		}
+	}
+	return false;
+}
+
+/* called with read_lock_bh(&n->lock); */
 static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
 			  __be16 protocol)
 {
 	struct hh_cache	*hh;
 	struct net_device *dev = dst->dev;
 
-	for (hh = n->hh; hh; hh = hh->hh_next)
-		if (hh->hh_type == protocol)
-			break;
+	if (likely(neigh_hh_lookup(n, dst, protocol)))
+		return;
 
-	if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
-		seqlock_init(&hh->hh_lock);
-		hh->hh_type = protocol;
-		atomic_set(&hh->hh_refcnt, 0);
-		hh->hh_next = NULL;
+	/* slow path */
+	hh = kzalloc(sizeof(*hh), GFP_ATOMIC);
+	if (!hh)
+		return;
 
-		if (dev->header_ops->cache(n, hh)) {
-			kfree(hh);
-			hh = NULL;
-		} else {
-			atomic_inc(&hh->hh_refcnt);
-			hh->hh_next = n->hh;
-			n->hh	    = hh;
-			if (n->nud_state & NUD_CONNECTED)
-				hh->hh_output = n->ops->hh_output;
-			else
-				hh->hh_output = n->ops->output;
-		}
+	seqlock_init(&hh->hh_lock);
+	hh->hh_type = protocol;
+	atomic_set(&hh->hh_refcnt, 2);
+
+	if (dev->header_ops->cache(n, hh)) {
+		kfree(hh);
+		return;
 	}
-	if (hh)	{
-		atomic_inc(&hh->hh_refcnt);
-		dst->hh = hh;
+	read_unlock(&n->lock);
+	write_lock(&n->lock);
+	
+	/* must check if another thread already did the insert */
+	if (neigh_hh_lookup(n, dst, protocol)) {
+		kfree(hh);
+		goto end;
 	}
+
+	if (n->nud_state & NUD_CONNECTED)
+		hh->hh_output = n->ops->hh_output;
+	else
+		hh->hh_output = n->ops->output;
+
+	hh->hh_next = n->hh;
+	n->hh	    = hh;
+
+	if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
+		hh_cache_put(hh);
+end:
+	write_unlock(&n->lock);
+	read_lock(&n->lock);
 }
 
 /* This function can be used in contexts, where only old dev_queue_xmit
@@ -1281,21 +1308,17 @@ int neigh_resolve_output(struct sk_buff *skb)
 	if (!neigh_event_send(neigh, skb)) {
 		int err;
 		struct net_device *dev = neigh->dev;
+
+		read_lock_bh(&neigh->lock);
 		if (dev->header_ops->cache &&
 		    !dst->hh &&
-		    !(dst->flags & DST_NOCACHE)) {
-			write_lock_bh(&neigh->lock);
-			if (!dst->hh)
-				neigh_hh_init(neigh, dst, dst->ops->protocol);
-			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
-					      neigh->ha, NULL, skb->len);
-			write_unlock_bh(&neigh->lock);
-		} else {
-			read_lock_bh(&neigh->lock);
-			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
-					      neigh->ha, NULL, skb->len);
-			read_unlock_bh(&neigh->lock);
-		}
+		    !(dst->flags & DST_NOCACHE))
+			neigh_hh_init(neigh, dst, dst->ops->protocol);
+
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+				      neigh->ha, NULL, skb->len);
+		read_unlock_bh(&neigh->lock);
+
 		if (err >= 0)
 			rc = neigh->ops->queue_xmit(skb);
 		else



^ permalink raw reply related

* Re: Segmentation fault when using ping6
From: George G. Davis @ 2010-10-07 13:04 UTC (permalink / raw)
  To: Rogerio Pimentel; +Cc: linux-arm-kernel, netdev
In-Reply-To: <4CADBC7A.6060908@freescale.com>

On Thu, Oct 07, 2010 at 09:26:34AM -0300, Rogerio Pimentel wrote:
> Hi,
> 
> I'm having a "segmentation fault" problem when using ping6.
> 
> When testing on ARM9 platforms (i.MX25 and i.MX27), it returns the error:
> 
> root@freescale ~$ ping6 ::1
> PING ::1 (::1): 56 data bytes
> Segmentation fault
> 
> When testing on ARM11 and ARM Cortex A8 platforms (i.MX31 and
> i.MX51), it works:
> 
> root@freescale ~$ ping6 ::1
> PING ::1 (::1): 56 data bytes
> 64 bytes from ::1: seq=0 ttl=64 time=0.331 ms
> 64 bytes from ::1: seq=1 ttl=64 time=0.214 ms
> 64 bytes from ::1: seq=2 ttl=64 time=0.176 ms
> 64 bytes from ::1: seq=3 ttl=64 time=0.155 ms
> 
> On all cases, I'm using Linux Kernel 2.6.35 (from Mainline) and
> Busybox 1.15.0
> 
> Does anybody have any idea?

Random guess is that you may be using a userland tuned for ARMv6 or later
but trying to run that on <ARMv6 machines.

You can enable this to see what's going wrong:

config DEBUG_USER
        bool "Verbose user fault messages"
        help
          When a user program crashes due to an exception, the kernel can
          print a brief message explaining what the problem was. This is
          sometimes helpful for debugging but serves no purpose on a
          production system. Most people should say N here.

          In addition, you need to pass user_debug=N on the kernel command
          line to enable this feature.  N consists of the sum of:

              1 - undefined instruction events
              2 - system calls
              4 - invalid data aborts
              8 - SIGSEGV faults
             16 - SIGBUS faults

--
Regards,
George

> 
> Thanks in advance,
> 
> Rogerio Pimentel
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply

* [PATCH net-next-2.6] net: Update kernel-doc for netif_set_real_num_rx_queues()
From: Ben Hutchings @ 2010-10-07 13:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, john.r.fastabend, therbert, Eric Dumazet

Synchronise the comment with the preceding implementation change.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 net/core/dev.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index f78d996..599a1fa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1575,8 +1575,8 @@ EXPORT_SYMBOL(netif_set_real_num_tx_queues);
  *
  *	This must be called either with the rtnl_lock held or before
  *	registration of the net device.  Returns 0 on success, or a
- *	negative error code.  If called before registration, it also
- *	sets the maximum number of queues, and always succeeds.
+ *	negative error code.  If called before registration, it always
+ *	succeeds.
  */
 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
 {
-- 
1.7.2.1


-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* Re: [RFC] net: allow FEC driver to not have attached PHY
From: Ben Hutchings @ 2010-10-07 13:11 UTC (permalink / raw)
  To: Greg Ungerer; +Cc: netdev, gerg
In-Reply-To: <201010070350.o973oGFE026910@goober.internal.moreton.com.au>

On Thu, 2010-10-07 at 13:50 +1000, Greg Ungerer wrote:
> Hi All,
> 
> I have a board with a ColdFire SoC on it with the built-in FEC
> ethernet module. On this hardware the FEC eth output is directly
> attached to a RTL8305 4-port 10/100 switch. There is no conventional
> PHY, the FEC output is direct into the uplink port of the switch
> chip.
> 
> This setup doesn't work after the FEC code was switch to using
> phylib. The driver used to have code to bypass phy detection/setup
> for this particular board. The phylib probe finds nothing, and of
> course sets a no-link condition.
> 
> So, what is the cleanest way to support this?
> 
> The attached patch adds a config option to do this sort of generically
> for the FEC driver. But I am wondering if there isn't a better way?
[...]

Perhaps there could be a null PHY driver which does no MDIO and always
reports link-up at the expected speed and duplex.  We used something
like that in the sfc driver for some PHY-less boards (though we use our
own PHY abstraction, not phylib).

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH] ethtool: add the stmmac support
From: Ben Hutchings @ 2010-10-07 13:13 UTC (permalink / raw)
  To: Peppe CAVALLARO; +Cc: netdev@vger.kernel.org
In-Reply-To: <4CAC0AE8.2060808@st.com>

On Wed, 2010-10-06 at 07:36 +0200, Peppe CAVALLARO wrote:
> Hello,
> 
> On 09/28/2010 11:51 AM, Giuseppe CAVALLARO wrote:
> > Add the stmmac support into the ethtool to
> > dump both the Mac Core and Dma registers.
> 
> Any news for this patch?
> 
> The stmmac is now working on several platforms (not only on STM ST40
> based boxes). I think it's worth having the ethtool support for the driver.
> 
> Welcome review and advice as usual.
[...]

You need to send ethtool patches to the ethtool maintainer, Jeff Garzik
<jgarzik@pobox.com>.  I expect he'll make an ethtool release shortly
after Linux 2.6.36, and will apply patches then.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [RFC] net: allow FEC driver to not have attached PHY
From: Florian Fainelli @ 2010-10-07 13:24 UTC (permalink / raw)
  To: Greg Ungerer; +Cc: netdev, gerg
In-Reply-To: <201010070350.o973oGFE026910@goober.internal.moreton.com.au>

Hi,

On Thursday 07 October 2010 05:50:16 Greg Ungerer wrote:
> Hi All,
> 
> I have a board with a ColdFire SoC on it with the built-in FEC
> ethernet module. On this hardware the FEC eth output is directly
> attached to a RTL8305 4-port 10/100 switch. There is no conventional
> PHY, the FEC output is direct into the uplink port of the switch
> chip.
> 
> This setup doesn't work after the FEC code was switch to using
> phylib. The driver used to have code to bypass phy detection/setup
> for this particular board. The phylib probe finds nothing, and of
> course sets a no-link condition.
> 
> So, what is the cleanest way to support this?

If phy detection fails and you cannot attach to a known PHY driver, you could 
register a fixed-PHY driver wich will report the link to be up. I had to do 
something like this for the cpmac driver[1] where various switches and 
external PHY configurations are supported.

[1]: 
https://dev.openwrt.org/browser/trunk/target/linux/ar7/patches-2.6.35/972-
cpmac_multi_probe.patch

> 
> The attached patch adds a config option to do this sort of generically
> for the FEC driver. But I am wondering if there isn't a better way?
> 
> Regards
> Greg
> 
> 
> ---
> 
> [RFC] net: allow FEC driver to not have attached PHY
> 
> At least one board using the FEC driver does not have a conventional
> PHY attached to it, it is directly connected to a somewhat simple
> ethernet switch (the board is the SnapGear/LITE, and the attached
> 4-port ethernet switch is a RealTek RTL8305). This switch does not
> present the usual register interface of a PHY, it presents nothing.
> So a PHY scan will find nothing.
> 
> After the FEC driver was changed to use phylib for supporting phys
> it no longer works on this particular board/switch setup.
> 
> Add a config option to allow configuring the FEC driver to not expect
> a PHY to be present.
> 
> Signed-off-by: Greg Ungerer <gerg@uclinux.org>
> ---
>  drivers/net/Kconfig |    9 +++++++++
>  drivers/net/fec.c   |    7 +++++++
>  2 files changed, 16 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
> index 93494e2..ee44728 100644
> --- a/drivers/net/Kconfig
> +++ b/drivers/net/Kconfig
> @@ -1934,6 +1934,15 @@ config FEC2
>  	  Say Y here if you want to use the second built-in 10/100 Fast
>  	  ethernet controller on some Motorola ColdFire processors.
> 
> +config FEC_NOPHY
> +	bool "FEC has no attached PHY"
> +	depends on FEC
> +	help
> +	  Some boards using the FEC driver may not have a PHY directly
> +	  attached to it. Typically in this scenario the FEC output is
> +	  directly connected to the input of an ethernet switch or hub.
> +	  Say Y here if your hardware is like this.
> +
>  config FEC_MPC52xx
>  	tristate "MPC52xx FEC driver"
>  	depends on PPC_MPC52xx && PPC_BESTCOMM
> diff --git a/drivers/net/fec.c b/drivers/net/fec.c
> index 768b840..3637f89 100644
> --- a/drivers/net/fec.c
> +++ b/drivers/net/fec.c
> @@ -910,6 +910,11 @@ fec_enet_open(struct net_device *dev)
>  	if (ret)
>  		return ret;
> 
> +#ifdef CONFIG_FEC_NOPHY
> +	/* No PHY connected, assume link is always up */
> +	fep->link = 1;
> +	fec_restart(dev, 0);
> +#else
>  	/* Probe and connect to PHY when open the interface */
>  	ret = fec_enet_mii_probe(dev);
>  	if (ret) {
> @@ -917,6 +922,8 @@ fec_enet_open(struct net_device *dev)
>  		return ret;
>  	}
>  	phy_start(fep->phy_dev);
> +#endif
> +
>  	netif_start_queue(dev);
>  	fep->opened = 1;
>  	return 0;
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC] net: allow FEC driver to not have attached PHY
From: Simon Farnsworth @ 2010-10-07 13:28 UTC (permalink / raw)
  To: netdev
In-Reply-To: <201010070350.o973oGFE026910@goober.internal.moreton.com.au>

Greg Ungerer wrote:

> 
> Hi All,
> 
> I have a board with a ColdFire SoC on it with the built-in FEC
> ethernet module. On this hardware the FEC eth output is directly
> attached to a RTL8305 4-port 10/100 switch. There is no conventional
> PHY, the FEC output is direct into the uplink port of the switch
> chip.
> 
> This setup doesn't work after the FEC code was switch to using
> phylib. The driver used to have code to bypass phy detection/setup
> for this particular board. The phylib probe finds nothing, and of
> course sets a no-link condition.
> 
> So, what is the cleanest way to support this?
> 
Is there anything that stops you using the fixed MDIO support in phylib 
(CONFIG_FIXED_PHY)? It seems to me to be intended for this sort of 
situation.

-- 
Simon


^ permalink raw reply

* Re: [PATCH] Use firmware provided index to register a network interface
From: Narendra_K @ 2010-10-07 14:14 UTC (permalink / raw)
  To: greg
  Cc: netdev, linux-hotplug, linux-pci, Matt_Domsch, Charles_Rose,
	Jordan_Hargrave, Vijay_Nijhawan
In-Reply-To: <20100923163336.GA15603@kroah.com>

On Thu, Sep 23, 2010 at 10:03:36PM +0530, Greg KH wrote:
>    On Thu, Sep 23, 2010 at 09:20:57PM +0530, Narendra_K@Dell.com wrote:
>    > > Now trying to change the kernel namespace itself seems like a bad hack
>    > > around this fact.
>    > >
>    >
>    > The patch does not change the existing kernel name space.
> 
>    You are "reordering it", right?
> 
>    > It adheres to the existing ethN name space with IFNAMSIZ length
>    > requirements. The patch only makes sure that eth0 always corresponds
>    > to what is labeled as 'Gb1' on server chassis, on systems where SMBIOS
>    > type 41 record is available. And removes the need for any renames for
>    > the interfaces which have a firmware index assigned by system
>    > firmware, as the very first name assigned by the kernel will be as
>    > expected and deterministic.
> 
>    And on systems with buggy firmware, what is going to happen here?  And
>    yes, there will be buggy BIOS tables, we can guarantee that, as this is
>    not something that Windows supports, right?
> 

It took some time to find out the details asked above. Right, windows
does not use SMBIOS type 41 record to derive names. But as a datapoint,
windows also has the same problem.

Yes, firmware and BIOS tables can be buggy. How about a command line
parameter 'no_netfwindex', passing which firmware index will not be
used to derive ethN names ? That would handle the scenario of buggy
firmware and names will be derived in the now existing way.

I will submit a patch shortly implementing this.

-- 
With regards,
Narendra K

^ permalink raw reply

* Re: [PATCH] Use firmware provided index to register a network interface
From: Tim Small @ 2010-10-07 14:27 UTC (permalink / raw)
  To: Narendra_K
  Cc: greg, netdev, linux-hotplug, linux-pci, Matt_Domsch, Charles_Rose,
	Jordan_Hargrave, Vijay_Nijhawan
In-Reply-To: <20101007141433.GA2641@libnet-test.oslab.blr.amer.dell.com>

On 07/10/10 15:14, Narendra_K@Dell.com wrote:
> Yes, firmware and BIOS tables can be buggy. How about a command line
> parameter 'no_netfwindex', passing which firmware index will not be
> used to derive ethN names ? That would handle the scenario of buggy
> firmware and names will be derived in the now existing way.
>
> I will submit a patch shortly implementing this.
>    

What was the reason for not doing this in user space again?  You stated 
that you got races when doing renames like eth0 -> eth2, but the 
solution looked like renaming into a different name space such as eth0 
-> ethlom2  etc. so that it wouldn't race with the names handed out by 
the kernel?

Tim.

^ permalink raw reply

* [PATCH V2] Use firmware provided index to register a network interface
From: Narendra_K @ 2010-10-07 14:23 UTC (permalink / raw)
  To: netdev, linux-hotplug, linux-pci
  Cc: Matt_Domsch, Jordan_Hargrave, Vijay_Nijhawan, Charles_Rose

Hello,

V1 -> V2:

This patch addresses the scenario of buggy firmware/BIOS tables. The
patch introduces a command line parameter 'no_netfwindex', passing which
firmware provided index will not be used to derive 'eth' names. By
default, firmware index will be used and the parameter can be used to
work around buggy firmware/BIOS tables.

Please find the patch below.

From: Narendra K <narendra_k@dell.com>
Subject: [PATCH] Use firmware provided index to register a network device

This patch uses the firmware provided index to derive the ethN name.
If the firmware provides an index for the corresponding pdev, the N
is derived from the index.

As an example, consider a PowerEdge R710 which has 4 BCM5709
Lan-On-Motherboard ports,1 Intel 82572EI port and 4 82575GB ports.
The system firmware communicates the order of the 4 Lan-On-Motherboard
ports by assigning indexes to each one of them. This is available to
the OS as the SMBIOS type 41 record(for onboard devices), in the field
'device type index'. It looks like below -

Handle 0x2900, DMI type 41, 11 bytes
Onboard Device
	Reference Designation: Embedded NIC 1
	Type: Ethernet
	Status: Enabled
	Type Instance: 1
	Bus Address: 0000:01:00.0

Handle 0x2901, DMI type 41, 11 bytes
Onboard Device
	Reference Designation: Embedded NIC 2
	Type: Ethernet
	Status: Enabled
	Type Instance: 2
	Bus Address: 0000:01:00.1

Handle 0x2902, DMI type 41, 11 bytes
Onboard Device
	Reference Designation: Embedded NIC 3
	Type: Ethernet
	Status: Enabled
	Type Instance: 3
	Bus Address: 0000:02:00.0

Handle 0x2903, DMI type 41, 11 bytes
Onboard Device
	Reference Designation: Embedded NIC 4
	Type: Ethernet
	Status: Enabled
	Type Instance: 4
	Bus Address: 0000:02:00.1

The OS can use this index to name the network interfaces as below.

Onboard devices -

Interface		Fwindex		Driver
Name
eth[fwindex - 1] =eth0	1		bnx2
eth[fwindex - 1] =eth1	2		bnx2
eth[fwindex - 1] =eth2	3		bnx2
eth[fwindex - 1] =eth3	4		bnx2

The add-in devices do not get any index and they will get names from
eth4 onwards.

Add-in interfaces -

eth4			e1000e
eth5			igb
eth6			igb
eth7			igb
eth8			igb

With this patch,

1. This patch adheres to the established ABI of ethN namespace with
IFNAMSIZ length and ensures that onboard network interfaces get
expected names at the first instance itself and avoids any renaming
later.

2. The 'eth0' of the OS always corresponds to the 'Gb1' as labeled on
the system chassis. There is determinism in the way Lan-On-Motherboard
ports get named.

3. The add-in devices will always be named from beyond what the
Lan-On-Motherboard names as show above. But there is no determinism
as to which add-in interface gets what ethN name.

Passing 'no_netfwindex' command line parameter would result in
firmware index not being used to derive the names as described above.

Signed-off-by: Narendra K <narendra_k@dell.com>
---
 Documentation/kernel-parameters.txt |    6 +++++
 drivers/pci/pci-label.c             |    1 +
 drivers/pci/pci-sysfs.c             |    5 ++++
 include/linux/netdevice.h           |    2 +
 include/linux/pci.h                 |    1 +
 net/core/dev.c                      |   42 ++++++++++++++++++++++++++++++-----
 6 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 3cdb4d8..73edbc0 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1529,6 +1529,12 @@ and is between 256 and 4096 characters. It is defined in the file
 			This usage is only documented in each driver source
 			file if at all.
 
+	no_netfwindex	[NET] Do not use firmware index to derive ethN names
+			Names for onboard network interfaces are derived from
+			the firmware provided index for these devices. Using
+			this parameter would result in firmware index not being
+			used to derive ethN names.
+
 	nf_conntrack.acct=
 			[NETFILTER] Enable connection tracking flow accounting
 			0 to disable accounting
diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c
index 90c0a72..8086268 100644
--- a/drivers/pci/pci-label.c
+++ b/drivers/pci/pci-label.c
@@ -55,6 +55,7 @@ find_smbios_instance_string(struct pci_dev *pdev, char *buf,
 							 "%s\n",
 							 dmi->name);
 			}
+			pdev->firmware_index = donboard->instance;
 			return strlen(dmi->name);
 		}
 	}
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index b5a7d9b..448ed9d 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -28,6 +28,7 @@
 #include "pci.h"
 
 static int sysfs_initialized;	/* = 0 */
+int pci_netdevs_with_fwindex;
 
 /* show configuration fields */
 #define pci_config_attr(field, format_string)				\
@@ -1167,6 +1168,10 @@ int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev)
 
 	pci_create_firmware_label_files(pdev);
 
+	if (pdev->firmware_index && (pdev->class >> 16) ==
+						PCI_BASE_CLASS_NETWORK)
+		pci_netdevs_with_fwindex++;
+
 	return 0;
 
 err_vga_file:
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 46c36ff..4398dcf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1080,6 +1080,8 @@ struct net_device {
 
 #define	NETDEV_ALIGN		32
 
+extern int pci_netdevs_with_fwindex;
+
 static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 					 unsigned int index)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b1d1795..90113bb 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -243,6 +243,7 @@ struct pci_dev {
 	unsigned short	subsystem_vendor;
 	unsigned short	subsystem_device;
 	unsigned int	class;		/* 3 bytes: (base,sub,prog-if) */
+	unsigned int	firmware_index; /* Firmware provided index */
 	u8		revision;	/* PCI revision, low byte of class word */
 	u8		hdr_type;	/* PCI header type (`multi' flag masked out) */
 	u8		pcie_cap;	/* PCI-E capability offset */
diff --git a/net/core/dev.c b/net/core/dev.c
index 1ae6543..f7982c4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -853,9 +853,18 @@ int dev_valid_name(const char *name)
 }
 EXPORT_SYMBOL(dev_valid_name);
 
+int netdev_use_fwindex = 1;
+
+static int __init netdev_use_fwindex_to_register(char *str)
+{
+	netdev_use_fwindex = 0;
+	return 0;
+}
+early_param("no_netfwindex", netdev_use_fwindex_to_register);
+
 /**
  *	__dev_alloc_name - allocate a name for a device
- *	@net: network namespace to allocate the device name in
+ *	@dev:  device
  *	@name: name format string
  *	@buf:  scratch buffer and result name string
  *
@@ -868,13 +877,15 @@ EXPORT_SYMBOL(dev_valid_name);
  *	Returns the number of the unit assigned or a negative errno code.
  */
 
-static int __dev_alloc_name(struct net *net, const char *name, char *buf)
+static int __dev_alloc_name(struct net_device *dev, const char *name, char *buf)
 {
 	int i = 0;
 	const char *p;
 	const int max_netdevices = 8*PAGE_SIZE;
 	unsigned long *inuse;
 	struct net_device *d;
+	struct net *net;
+	struct pci_dev *pdev;
 
 	p = strnchr(name, IFNAMSIZ-1, '%');
 	if (p) {
@@ -886,15 +897,36 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 		if (p[1] != 'd' || strchr(p + 2, '%'))
 			return -EINVAL;
 
+		if (likely(netdev_use_fwindex)) {
+			pdev = to_pci_dev(dev->dev.parent);
+			if (pdev && pdev->firmware_index) {
+				snprintf(buf, IFNAMSIZ, name,
+					 pdev->firmware_index - 1);
+				return pdev->firmware_index - 1;
+			}
+		}
+
 		/* Use one page as a bit array of possible slots */
 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 		if (!inuse)
 			return -ENOMEM;
 
+		/* Reserve 0 to < pci_netdevs_with_fwindex for integrated
+		 * ports with fwindex and allocate from pci_netdevs_with_fwindex
+		 * onwards for add-in devices
+		 */
+		if (likely(netdev_use_fwindex)) {
+			for (i = 0; i < pci_netdevs_with_fwindex; i++)
+				set_bit(i, inuse);
+		} else
+			pci_netdevs_with_fwindex = 0;
+
+		net = dev_net(dev);
+
 		for_each_netdev(net, d) {
 			if (!sscanf(d->name, name, &i))
 				continue;
-			if (i < 0 || i >= max_netdevices)
+			if (i < pci_netdevs_with_fwindex || i >= max_netdevices)
 				continue;
 
 			/*  avoid cases where sscanf is not exact inverse of printf */
@@ -936,12 +968,10 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 int dev_alloc_name(struct net_device *dev, const char *name)
 {
 	char buf[IFNAMSIZ];
-	struct net *net;
 	int ret;
 
 	BUG_ON(!dev_net(dev));
-	net = dev_net(dev);
-	ret = __dev_alloc_name(net, name, buf);
+	ret = __dev_alloc_name(dev, name, buf);
 	if (ret >= 0)
 		strlcpy(dev->name, buf, IFNAMSIZ);
 	return ret;
-- 
1.7.0.1

-- 
With regards,
Narendra K

^ permalink raw reply related

* Re: [PATCH] Use firmware provided index to register a network interface
From: Greg KH @ 2010-10-07 14:31 UTC (permalink / raw)
  To: Narendra_K
  Cc: netdev, linux-hotplug, linux-pci, Matt_Domsch, Charles_Rose,
	Jordan_Hargrave, Vijay_Nijhawan
In-Reply-To: <20101007141433.GA2641@libnet-test.oslab.blr.amer.dell.com>

On Thu, Oct 07, 2010 at 07:44:57PM +0530, Narendra_K@Dell.com wrote:
> On Thu, Sep 23, 2010 at 10:03:36PM +0530, Greg KH wrote:
> >    On Thu, Sep 23, 2010 at 09:20:57PM +0530, Narendra_K@Dell.com wrote:
> >    > > Now trying to change the kernel namespace itself seems like a bad hack
> >    > > around this fact.
> >    > >
> >    >
> >    > The patch does not change the existing kernel name space.
> > 
> >    You are "reordering it", right?
> > 
> >    > It adheres to the existing ethN name space with IFNAMSIZ length
> >    > requirements. The patch only makes sure that eth0 always corresponds
> >    > to what is labeled as 'Gb1' on server chassis, on systems where SMBIOS
> >    > type 41 record is available. And removes the need for any renames for
> >    > the interfaces which have a firmware index assigned by system
> >    > firmware, as the very first name assigned by the kernel will be as
> >    > expected and deterministic.
> > 
> >    And on systems with buggy firmware, what is going to happen here?  And
> >    yes, there will be buggy BIOS tables, we can guarantee that, as this is
> >    not something that Windows supports, right?
> > 
> 
> It took some time to find out the details asked above. Right, windows
> does not use SMBIOS type 41 record to derive names. But as a datapoint,
> windows also has the same problem.

If windows does not use this field, then you can guarantee it will show
up incorrectly in BIOSes and never be tested by manufacturers before
they ship their machines.

> Yes, firmware and BIOS tables can be buggy. How about a command line
> parameter 'no_netfwindex', passing which firmware index will not be
> used to derive ethN names ? That would handle the scenario of buggy
> firmware and names will be derived in the now existing way.

I'd prefer the option never be added in the first place as you are
placing a big burden on people whose machines were working properly on
old kernels, to suddenly have to add a command line option to get their
box to now work again.

And again, as this is a "simple" rename type operation, I still don't
see why you can't just do this from a udev rule, especially if the
firmware information is exported to userspace.  That is where such
"policy" should be added, not within the kernel itself.

thanks,

greg k-h

^ permalink raw reply

* Re: [PATCH] SIW: iWARP Protocol headers
From: Bernard Metzler @ 2010-10-07 14:55 UTC (permalink / raw)
  To: David Dillow
  Cc: Bart Van Assche, Jason Gunthorpe,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1286393504.26136.31.camel-FqX9LgGZnHWDB2HL1qBt2PIbXMQ5te18@public.gmane.org>



David Dillow <dave-i1Mk8JYDVaaSihdK6806/g@public.gmane.org> wrote on 10/06/2010 09:31:44 PM:

> On Wed, 2010-10-06 at 12:37 -0600, Jason Gunthorpe wrote:
> > On Wed, Oct 06, 2010 at 02:22:55PM -0400, David Dillow wrote:
> > > On Wed, 2010-10-06 at 11:25 -0600, Jason Gunthorpe wrote:
> > > > On Wed, Oct 06, 2010 at 02:52:46PM +0200, Bernard Metzler wrote:
> > > > It is actually a little more complicated than just this. I assume
you
> > > > are casting the structures over packet payloads? In this case you
> > > > have to guarentee alignment (or used packed everywhere). Does iwarp
> > > > have provisions for alignment? If so you can construct your
bitfields
> > > > using the alignment type, ie if iWarp guarantees 4 bytes then the
> > > > biggest type you can use is u32 - then you can avoid using packed.
> > > >
> > > > Mind you, I'm not sure how to guarentee alignment when you consider
> > > > all the possible sources of unalignment in the stack: TCP, IP,L2
stack ?
> > >
> > > You don't have to. The TCP stack, IIRC, requires the architecture to
fix
> > > up misaligned accesses, or at least it used to. It will try to keep
> > > things aligned if possible -- see the comment above NET_IP_ALIGN in
> > > include/linux/skbuff.h.
> >
> > I was under the impression that this was just to align the IP
> > header.
>
> It is, and it assumes an ethernet header of 14 bytes. As you note, VLANs
> and everything else can mess with that, which is why the TCP stack
> requires the architecture to handle unaligned accesses. It's a bonus if
> iWarp can guarantee an alignment WRT the IP or TCP header, but not
> required.
>
> > > The structures in Bernard's header have all fields naturally
> > > aligned, so no need for the packed attribute and its baggage.
> >
> > No, they arent:
> >
> > > +struct iwarp_rdma_write {
> > > +   struct iwarp_ctrl   ctrl;
> > > +   __u32         sink_stag;
> > > +   __u64         sink_to;
> > > +} __attribute__((__packed__));
> >
> > For instance has sink_to unaligned if the maximum aligment of the
> > payload is only guarenteed to be 4. The proper thing to do in this
> > case is probably __attribute__((__packed__, __aligned__(4))) if that
> > works the way I think.. :)
>
> struct iwarp_ctrl consists of 2 u16s, so sink_stag is aligned to 4 bytes
> from the start of the struct iwarp_rdma_write, and sink_to is at 8 bytes
> -- so they are naturally aligned with respect to the structure. The
> compiler will not add padding, so no need for the packed attribute.
>
> Note I'm not talking about alignment in memory -- that'll depend on
> where the data starts, and as I mentioned, the stack doesn't make any
> guarantees about that, relying on the architecture to fixup any
> misaligned accesses.

agreed. all n-bytes are starting on a multipe of n-byte offset
relative to the start of the structure.

many thanks,
bernard.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox