Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [MeeGo-Dev][PATCH v3] Topcliff: Update PCH_CAN driver to 2.6.35
From: Masayuki Ohtake @ 2010-10-05 12:09 UTC (permalink / raw)
  To: Marc Kleine-Budde
  Cc: andrew.chih.howe.khor-ral2JQCrhuEAvxtiuMwx3w, Samuel Ortiz,
	margie.foster-ral2JQCrhuEAvxtiuMwx3w,
	netdev-u79uwXL29TY76Z2rM5mHXA, Wolfgang Grandegger,
	yong.y.wang-ral2JQCrhuEAvxtiuMwx3w,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	kok.howg.ewe-ral2JQCrhuEAvxtiuMwx3w, Christian Pellegrin,
	Tomoya MORINAGA, meego-dev-WXzIur8shnEAvxtiuMwx3w,
	David S. Miller, joel.clark-ral2JQCrhuEAvxtiuMwx3w,
	qi.wang-ral2JQCrhuEAvxtiuMwx3w
In-Reply-To: <4CAB0732.3010400@pengutronix.de>

Hi Marc,

On Tuesday, October 05, 2010 8:08 PM, Marc Kleine-Budde wrote:
> If FIFO is working you might also think about NAPI.

I think NAPI isn't necessary for our CAN driver.
NAPI is for high-speed networking.
CAN is NOT high-speed.

In fact, some accepted CAN drivers don't have NAPI.

Thanks, Ohtake(OKISemi)

^ permalink raw reply

* Re: [PATCH] sysctl: fix min/max handling in __do_proc_doulongvec_minmax()
From: Américo Wang @ 2010-10-05 13:01 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Américo Wang, Robin Holt, Andrew Morton, linux-kernel,
	Willy Tarreau, David S. Miller, netdev, James Morris,
	Hideaki YOSHIFUJI, Pekka Savola (ipv6), Patrick McHardy,
	Alexey Kuznetsov
In-Reply-To: <1286188701.18293.57.camel@edumazet-laptop>

On Mon, Oct 04, 2010 at 12:38:21PM +0200, Eric Dumazet wrote:
>Le lundi 04 octobre 2010 à 18:35 +0800, Américo Wang a écrit :
>
>> Your patch does fix the problem, but seems not a good solution,
>> we should skip all min/max checking if ->extra(1|2) is NULL,
>> instead of checking it every time within the loop.
>
>Please do submit a patch, we'll see if you come to a better solution,
>with no added code size (this is slow path, I dont care for checking it
>'every time winthin the loop')
>
>

I have one, but just did compile test. :)
I will test it tomorrow.

NOT-Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>

---

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f88552c..345a193 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2448,86 +2448,119 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
 				do_proc_dointvec_minmax_conv, &param);
 }
 
-static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
-				     void __user *buffer,
+static int __doulongvec_minmax_read(void *data, void __user *buffer,
 				     size_t *lenp, loff_t *ppos,
 				     unsigned long convmul,
 				     unsigned long convdiv)
 {
-	unsigned long *i, *min, *max;
-	int vleft, first = 1, err = 0;
-	unsigned long page = 0;
-	size_t left;
-	char *kbuf;
+	unsigned long *i = (unsigned long *) data;
+	int err = 0;
+	bool first = true;
+	size_t left = *lenp;
 
-	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
-		*lenp = 0;
-		return 0;
+	for (; left; i++, first=false) {
+		unsigned long val;
+
+		val = convdiv * (*i) / convmul;
+		if (!first)
+			err = proc_put_char(&buffer, &left, '\t');
+		err = proc_put_long(&buffer, &left, val, false);
+		if (err)
+			break;
 	}
 
-	i = (unsigned long *) data;
-	min = (unsigned long *) table->extra1;
-	max = (unsigned long *) table->extra2;
-	vleft = table->maxlen / sizeof(unsigned long);
-	left = *lenp;
+	if (!first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
 
-	if (write) {
-		if (left > PAGE_SIZE - 1)
-			left = PAGE_SIZE - 1;
-		page = __get_free_page(GFP_TEMPORARY);
-		kbuf = (char *) page;
-		if (!kbuf)
-			return -ENOMEM;
-		if (copy_from_user(kbuf, buffer, left)) {
-			err = -EFAULT;
-			goto free;
-		}
-		kbuf[left] = 0;
+	*lenp -= left;
+	*ppos += *lenp;
+	return err;
+}
+
+static int __doulongvec_minmax_write(void *data, void __user *buffer,
+				     size_t *lenp, loff_t *ppos, int vleft,
+				     unsigned long min, unsigned long max)
+{
+	char *kbuf;
+	size_t left = *lenp;
+	unsigned long page = 0;
+	unsigned long *i = (unsigned long *) data;
+	int err = 0;
+	bool first = true;
+
+	if (left > PAGE_SIZE - 1)
+		left = PAGE_SIZE - 1;
+	page = __get_free_page(GFP_TEMPORARY);
+	kbuf = (char *) page;
+	if (!kbuf)
+		return -ENOMEM;
+	if (copy_from_user(kbuf, buffer, left)) {
+		err = -EFAULT;
+		goto free;
 	}
+	kbuf[left] = 0;
 
-	for (; left && vleft--; i++, min++, max++, first=0) {
+	for (; left && vleft--; i++, min++, max++, first=false) {
 		unsigned long val;
+		bool neg;
 
-		if (write) {
-			bool neg;
-
-			left -= proc_skip_spaces(&kbuf);
+		left -= proc_skip_spaces(&kbuf);
 
-			err = proc_get_long(&kbuf, &left, &val, &neg,
-					     proc_wspace_sep,
-					     sizeof(proc_wspace_sep), NULL);
-			if (err)
-				break;
-			if (neg)
-				continue;
-			if ((min && val < *min) || (max && val > *max))
-				continue;
-			*i = val;
-		} else {
-			val = convdiv * (*i) / convmul;
-			if (!first)
-				err = proc_put_char(&buffer, &left, '\t');
-			err = proc_put_long(&buffer, &left, val, false);
-			if (err)
-				break;
-		}
+		err = proc_get_long(&kbuf, &left, &val, &neg,
+				     proc_wspace_sep,
+				     sizeof(proc_wspace_sep), NULL);
+		if (err)
+			break;
+		if (neg)
+			continue;
+		if (val < min || val > max)
+			continue;
+		*i = val;
 	}
 
-	if (!write && !first && left && !err)
-		err = proc_put_char(&buffer, &left, '\n');
-	if (write && !err)
+	if (!err)
 		left -= proc_skip_spaces(&kbuf);
 free:
-	if (write) {
-		free_page(page);
-		if (first)
-			return err ? : -EINVAL;
-	}
+	free_page(page);
+	if (first)
+		return err ? : -EINVAL;
+
 	*lenp -= left;
 	*ppos += *lenp;
 	return err;
 }
 
+static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
+				     void __user *buffer,
+				     size_t *lenp, loff_t *ppos,
+				     unsigned long convmul,
+				     unsigned long convdiv)
+{
+	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (write) {
+		unsigned long min, max;
+		int vleft;
+
+		vleft = table->maxlen / sizeof(unsigned long);
+		if (table->extra1)
+			min = *(unsigned long *) table->extra1;
+		else
+			min = 0;
+		if (table->extra2)
+			max = *(unsigned long *) table->extra2;
+		else
+			max = ULONG_MAX;
+		return __doulongvec_minmax_write(data, buffer, lenp,
+						  ppos, vleft, min, max);
+	} else
+		return __doulongvec_minmax_read(data, buffer, lenp,
+						 ppos, convmul, convdiv);
+}
+
 static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
 				     void __user *buffer,
 				     size_t *lenp, loff_t *ppos,

^ permalink raw reply related

* Re: [PATCH] SIW: Module initialization
From: Bernard Metzler @ 2010-10-05 13:12 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: bart.vanassche-Re5JQEeQqe8AvxtiuMwx3w,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <AANLkTik5=i+A5_OpU0rVyfYi=ibgS9UWMu82vMCmM=PN-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

bart.vanassche-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org wrote on 10/05/2010 12:57:21 PM:

> Bart Van Assche <bvanassche-HInyCGIudOg@public.gmane.org> 
> Sent by: bart.vanassche-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org
> 
> 10/05/2010 12:57 PM 
> 
> To 
> 
> Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org> 
> 
> cc 
> 
> netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org 
> 
> Subject 
> 
> Re: [PATCH] SIW: Module initialization 
> 
> On Tue, Oct 5, 2010 at 8:54 AM, Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org> 
wrote:
> > +static int loopback_enabled;
> > +module_param(loopback_enabled, int, 0644);
> > +MODULE_PARM_DESC(loopback_enabled, "enable_loopback");
> 
> A minor comment: since kernel 2.6.31 the type "bool" can be used for
> boolean kernel module parameters.
> 
oh, thanks. there are currently two more occurrences (MPA crc, 0copy 
transmit). 
it will be changed accordingly. 

> > + * TODO: Dynamic device management (network device 
registration/removal).
> 
> The current implementation is such that one siw device is created for
> each network device found at kernel module load time. That means that
> you force the user to load the siw kernel module after all other
> kernel modules that register a network device. I'm not sure that's a
> good idea.
> 
good point. do you have a suggestion here - would you like to see 
siw to be enabled more selectively? 
iwarp is a protocol on top of TCP, explicitly defining the 
semantics of data fetching and placement. end-to-end connectivity 
and efficient data shipping is provided by TCP/IP. not taking into 
account dedicated RDMA hardware, any TCP stream may carry iwarp 
traffic. from that point of view, binding a software based 
rdma stack to dedicated devices is a concession to the 
given environment, in particular to the given rdma 
connection management. therefore, we started with binding to all 
available network devices. 


> > +                       if (!siw_device) {
> > +                               siw_device = siw_p;
> > +                               siw_p->next = NULL;
> > +                       } else {
> > +                               siw_p->next = siw_device->next;
> > +                               siw_device->next = siw_p;
> > +                       }
> 
> Why a custom linked list implementation instead of using <linux/list.h> 
?
> 
i agree. will be changed. 

Bernard.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] bonding: fix WARN_ON when writing to bond_master sysfs file (v2)
From: Neil Horman @ 2010-10-05 13:39 UTC (permalink / raw)
  To: netdev; +Cc: bonding-devel, fubar, davem, shemminger
In-Reply-To: <20101004202112.GA27897@hmsreliant.think-freely.org>

Ok, V2 of this patch, taking Stephens notes into account.  Switched to using
__dev_get_by_name to avoid reference count inc/dec.


Fix a WARN_ON failure in bond_masters sysfs file

Got a report of this warning recently

bonding: bond0 is being created...
------------[ cut here ]------------
WARNING: at fs/proc/generic.c:590 proc_register+0x14d/0x185()
Hardware name: ProLiant BL465c G1
proc_dir_entry 'bonding/bond0' already registered
Modules linked in: bonding ipv6 tg3 bnx2 shpchp amd64_edac_mod edac_core
ipmi_si
ipmi_msghandler serio_raw i2c_piix4 k8temp edac_mce_amd hpwdt microcode hpsa
cc
iss radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core [last unloaded:
scsi_wai
t_scan]
Pid: 935, comm: ifup-eth Not tainted 2.6.33.5-124.fc13.x86_64 #1
Call Trace:
[<ffffffff8104b54c>] warn_slowpath_common+0x77/0x8f
[<ffffffff8104b5b1>] warn_slowpath_fmt+0x3c/0x3e
[<ffffffff8114bf0b>] proc_register+0x14d/0x185
[<ffffffff8114c20c>] proc_create_data+0x87/0xa1
[<ffffffffa0211e9b>] bond_create_proc_entry+0x55/0x95 [bonding]
[<ffffffffa0215e5d>] bond_init+0x95/0xd0 [bonding]
[<ffffffff8138cd97>] register_netdevice+0xdd/0x29e
[<ffffffffa021240b>] bond_create+0x8e/0xb8 [bonding]
[<ffffffffa021c4be>] bonding_store_bonds+0xb3/0x1c1 [bonding]
[<ffffffff812aec85>] class_attr_store+0x27/0x29
[<ffffffff8115423d>] sysfs_write_file+0x10f/0x14b
[<ffffffff81101acf>] vfs_write+0xa9/0x106
[<ffffffff81101be2>] sys_write+0x45/0x69
[<ffffffff81009b02>] system_call_fastpath+0x16/0x1b
---[ end trace a677c3f7f8b16b1e ]---
bonding: Bond creation failed.

It happens because a user space writer to bond_master can try to register and
already existing bond interface name.  Fix it by teaching bond_create to check
for the existance of devices with that name first in cases where a non-NULL name
parameter has been passed in

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>


 bond_main.c |    9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index fb70c3e..985cbc1 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -5168,6 +5168,15 @@ int bond_create(struct net *net, const char *name)
 		res = dev_alloc_name(bond_dev, "bond%d");
 		if (res < 0)
 			goto out;
+	} else {
+		/*
+		 * If we're given a name to register
+		 * we need to ensure that its not already
+		 * registered
+		 */
+		res = -EEXIST;
+		if (__dev_get_by_name(net, name) != NULL)
+			goto out;
 	}
 
 	res = register_netdevice(bond_dev);

^ permalink raw reply related

* Re: [PATCH] SIW: iWARP Protocol headers
From: Steve Wise @ 2010-10-05 13:53 UTC (permalink / raw)
  To: Bernard Metzler
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1286261630-5085-1-git-send-email-bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>

On 10/05/2010 01:53 AM, Bernard Metzler wrote:
> ---
>   drivers/infiniband/hw/siw/iwarp.h |  324 +++++++++++++++++++++++++++++++++++++
>   1 files changed, 324 insertions(+), 0 deletions(-)
>   create mode 100644 drivers/infiniband/hw/siw/iwarp.h
>
> diff --git a/drivers/infiniband/hw/siw/iwarp.h b/drivers/infiniband/hw/siw/iwarp.h
> new file mode 100644
> index 0000000..762c1d3
> --- /dev/null
> +++ b/drivers/infiniband/hw/siw/iwarp.h
> @@ -0,0 +1,324 @@
> +/*
> + * Software iWARP device driver for Linux
> + *
> + * Authors: Bernard Metzler<bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
> + *          Fredy Neeser<nfd-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
> + *
> + * Copyright (c) 2008-2010, IBM Corporation
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * BSD license below:
> + *
> + *   Redistribution and use in source and binary forms, with or
> + *   without modification, are permitted provided that the following
> + *   conditions are met:
> + *
> + *   - Redistributions of source code must retain the above copyright notice,
> + *     this list of conditions and the following disclaimer.
> + *
> + *   - Redistributions in binary form must reproduce the above copyright
> + *     notice, this list of conditions and the following disclaimer in the
> + *     documentation and/or other materials provided with the distribution.
> + *
> + *   - Neither the name of IBM nor the names of its contributors may be
> + *     used to endorse or promote products derived from this software without
> + *     specific prior written permission.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#ifndef _IWARP_H
> +#define _IWARP_H
> +
> +#include<rdma/rdma_user_cm.h>	/* RDMA_MAX_PRIVATE_DATA */
> +#include<linux/types.h>
> +#include<asm/byteorder.h>
> +
> +
> +#define RDMAP_VERSION		1
> +#define DDP_VERSION		1
> +#define MPA_REVISION_1		1
> +#define MPA_MAX_PRIVDATA	RDMA_MAX_PRIVATE_DATA
> +#define MPA_KEY_REQ		"MPA ID Req Frame"
> +#define MPA_KEY_REP		"MPA ID Rep Frame"
> +
> +struct mpa_rr_params {
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +	__u16	res:5,
> +		r:1,
> +		c:1,
> +		m:1,
> +		rev:8;
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +	__u16	m:1,
> +		c:1,
> +		r:1,
> +		res:5,
> +		rev:8;
> +#else
> +#error "Adjust your<asm/byteorder.h>  defines"
> +#endif
> +	__u16	pd_len;
> +};
> +
> +/*
> + * MPA request/reply header
> + */
> +struct mpa_rr {
> +	__u8	key[16];
> +	struct mpa_rr_params params;
> +};
> +
> +/*
> + * Don't change the layout/size of this struct!
> + */
> +struct mpa_marker {
> +	__u16	rsvd;
> +	__u16	fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */
> +};
> +
> +#define MPA_MARKER_SPACING	512
> +#define MPA_HDR_SIZE		2
> +
> +/*
> + * MPA marker size:
> + * - Standards-compliant marker insertion: Use sizeof(struct mpa_marker)
> + * - "Invisible markers" for testing sender's marker insertion
> + *   without affecting receiver: Use 0
> + */
> +#define MPA_MARKER_SIZE		sizeof(struct mpa_marker)
> +
> +
> +/*
> + * maximum MPA trailer
> + */
> +struct mpa_trailer {
> +	char	pad[4];
> +	__u32	crc;
> +};
> +
> +#define MPA_CRC_SIZE	4
> +
> +
> +/*
> + * Common portion of iWARP headers (MPA, DDP, RDMAP)
> + * for any FPDU
> + */
> +struct iwarp_ctrl {
> +	__u16	mpa_len;
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +	__u16	dv:2,		/* DDP Version */
> +		rsvd:4,		/* DDP reserved, MBZ */
> +		l:1,		/* DDP Last flag */
> +		t:1,		/* DDP Tagged flag */
> +		opcode:4,	/* RDMAP opcode */
> +		rsv:2,		/* RDMAP reserved, MBZ */
> +		rv:2;		/* RDMAP Version, 01 for IETF, 00 for RDMAC */
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +	__u16	t:1,		/* DDP Tagged flag */
> +		l:1,		/* DDP Last flag */
> +		rsvd:4,		/* DDP reserved, MBZ */
> +		dv:2,		/* DDP Version */
> +		rv:2,		/* RDMAP Version, 01 for IETF, 00 for RDMAC */
> +		rsv:2,		/* RDMAP reserved, MBZ */
> +		opcode:4;	/* RDMAP opcode */
> +#else
> +#error "Adjust your<asm/byteorder.h>  defines"
> +#endif
> +};
> +
> +
> +struct rdmap_terminate_ctrl {
> +#if defined(__LITTLE_ENDIAN_BITFIELD)
> +	__u32	etype:4,
> +		layer:4,
> +		ecode:8,
> +		rsvd1:5,
> +		r:1,
> +		d:1,
> +		m:1,
> +		rsvd2:8;
> +#elif defined(__BIG_ENDIAN_BITFIELD)
> +	__u32	layer:4,
> +		etype:4,
> +		ecode:8,
> +		m:1,
> +		d:1,
> +		r:1,
> +		rsvd1:5,
> +		rsvd2:8;
> +#else
> +#error "Adjust your<asm/byteorder.h>  defines"
> +#endif
> +};
> +
> +
> +struct iwarp_rdma_write {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			sink_stag;
> +	__u64			sink_to;
> +} __attribute__((__packed__));
> +
> +struct iwarp_rdma_rreq {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			rsvd;
> +	__u32			ddp_qn;
> +	__u32			ddp_msn;
> +	__u32			ddp_mo;
> +	__u32			sink_stag;
> +	__u64			sink_to;
> +	__u32			read_size;
> +	__u32			source_stag;
> +	__u64			source_to;
> +} __attribute__((__packed__));
> +
> +struct iwarp_rdma_rresp {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			sink_stag;
> +	__u64			sink_to;
> +} __attribute__((__packed__));
> +
> +struct iwarp_send {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			rsvd;
> +	__u32			ddp_qn;
> +	__u32			ddp_msn;
> +	__u32			ddp_mo;
> +} __attribute__((__packed__));
> +
> +struct iwarp_send_inv {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			inval_stag;
> +	__u32			ddp_qn;
> +	__u32			ddp_msn;
> +	__u32			ddp_mo;
> +} __attribute__((__packed__));
> +
> +struct iwarp_terminate {
> +	struct iwarp_ctrl	ctrl;
> +	__u32				rsvd;
> +	__u32				ddp_qn;
> +	__u32				ddp_msn;
> +	__u32				ddp_mo;
> +	struct rdmap_terminate_ctrl	term_ctrl;
> +} __attribute__((__packed__));
> +
> +
> +/*
> + * Common portion of iWARP headers (MPA, DDP, RDMAP)
> + * for an FPDU carrying an untagged DDP segment
> + */
> +struct iwarp_ctrl_untagged {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			rsvd;
> +	__u32			ddp_qn;
> +	__u32			ddp_msn;
> +	__u32			ddp_mo;
> +} __attribute__((__packed__));
> +
> +/*
> + * Common portion of iWARP headers (MPA, DDP, RDMAP)
> + * for an FPDU carrying a tagged DDP segment
> + */
> +struct iwarp_ctrl_tagged {
> +	struct iwarp_ctrl	ctrl;
> +	__u32			ddp_stag;
> +	__u64			ddp_to;
> +} __attribute__((__packed__));
> +
>    

All of the above header structures should use __beXX types since the 
fields are all in Network Byte Order.

Also, did you run sparse on the patches (Documentation/sparse.txt)?


> +union iwarp_hdrs {
> +	struct iwarp_ctrl		ctrl;
> +	struct iwarp_ctrl_untagged	c_untagged;
> +	struct iwarp_ctrl_tagged	c_tagged;
> +	struct iwarp_rdma_write		rwrite;
> +	struct iwarp_rdma_rreq		rreq;
> +	struct iwarp_rdma_rresp		rresp;
> +	struct iwarp_terminate		terminate;
> +	struct iwarp_send		send;
> +	struct iwarp_send_inv		send_inv;
> +};
> +
> +
> +#define MPA_MIN_FRAG ((sizeof(union iwarp_hdrs) + MPA_CRC_SIZE))
> +
> +enum ddp_etype {
> +	DDP_ETYPE_CATASTROPHIC	= 0x0,
> +	DDP_ETYPE_TAGGED_BUF	= 0x1,
> +	DDP_ETYPE_UNTAGGED_BUF	= 0x2,
> +	DDP_ETYPE_RSVD		= 0x3
> +};
> +
> +enum ddp_ecode {
> +	DDP_ECODE_CATASTROPHIC		= 0x00,
> +	/* Tagged Buffer Errors */
> +	DDP_ECODE_T_INVALID_STAG	= 0x00,
> +	DDP_ECODE_T_BASE_BOUNDS		= 0x01,
> +	DDP_ECODE_T_STAG_NOT_ASSOC	= 0x02,
> +	DDP_ECODE_T_TO_WRAP		= 0x03,
> +	DDP_ECODE_T_DDP_VERSION		= 0x04,
> +	/* Untagged Buffer Errors */
> +	DDP_ECODE_UT_INVALID_QN		= 0x01,
> +	DDP_ECODE_UT_INVALID_MSN_NOBUF	= 0x02,
> +	DDP_ECODE_UT_INVALID_MSN_RANGE	= 0x03,
> +	DDP_ECODE_UT_INVALID_MO		= 0x04,
> +	DDP_ECODE_UT_MSG_TOOLONG	= 0x05,
> +	DDP_ECODE_UT_DDP_VERSION	= 0x06
> +};
> +
> +
> +enum rdmap_untagged_qn {
> +	RDMAP_UNTAGGED_QN_SEND		= 0,
> +	RDMAP_UNTAGGED_QN_RDMA_READ	= 1,
> +	RDMAP_UNTAGGED_QN_TERMINATE	= 2,
> +	RDMAP_UNTAGGED_QN_COUNT		= 3
> +};
> +
> +enum rdmap_etype {
> +	RDMAP_ETYPE_CATASTROPHIC	= 0x0,
> +	RDMAP_ETYPE_REMOTE_PROTECTION	= 0x1,
> +	RDMAP_ETYPE_REMOTE_OPERATION	= 0x2
> +};
> +
> +enum rdmap_ecode {
> +	RDMAP_ECODE_INVALID_STAG	= 0x00,
> +	RDMAP_ECODE_BASE_BOUNDS		= 0x01,
> +	RDMAP_ECODE_ACCESS_RIGHTS	= 0x02,
> +	RDMAP_ECODE_STAG_NOT_ASSOC	= 0x03,
> +	RDMAP_ECODE_TO_WRAP		= 0x04,
> +	RDMAP_ECODE_RDMAP_VERSION	= 0x05,
> +	RDMAP_ECODE_UNEXPECTED_OPCODE	= 0x06,
> +	RDMAP_ECODE_CATASTROPHIC_STREAM	= 0x07,
> +	RDMAP_ECODE_CATASTROPHIC_GLOBAL	= 0x08,
> +	RDMAP_ECODE_STAG_NOT_INVALIDATE	= 0x09,
> +	RDMAP_ECODE_UNSPECIFIED		= 0xff
> +};
> +
> +enum rdmap_elayer {
> +	RDMAP_ERROR_LAYER_RDMA	= 0x00,
> +	RDMAP_ERROR_LAYER_DDP	= 0x01,
> +	RDMAP_ERROR_LAYER_LLP	= 0x02	/* eg., MPA */
> +};
> +
> +enum rdma_opcode {
> +	RDMAP_RDMA_WRITE	= 0x0,
> +	RDMAP_RDMA_READ_REQ	= 0x1,
> +	RDMAP_RDMA_READ_RESP	= 0x2,
> +	RDMAP_SEND		= 0x3,
> +	RDMAP_SEND_INVAL	= 0x4,
> +	RDMAP_SEND_SE		= 0x5,
> +	RDMAP_SEND_SE_INVAL	= 0x6,
> +	RDMAP_TERMINATE		= 0x7,
> +	RDMAP_NOT_SUPPORTED	= RDMAP_TERMINATE + 1
> +};
> +
> +#endif
>    

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] SIW: User interface
From: Steve Wise @ 2010-10-05 14:17 UTC (permalink / raw)
  To: Bernard Metzler
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1286261647-5139-1-git-send-email-bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>

On 10/05/2010 01:54 AM, Bernard Metzler wrote:


<snip>

> +
> +/*
> + * siw_post_send()
> + *
> + * Post a list of S-WR's to a SQ.
> + *
> + * @ofa_qp:	OFA QP contained in siw QP
> + * @wr:		Null terminated list of user WR's
> + * @bad_wr:	Points to failing WR in case of synchronous failure.
> + */
> +int siw_post_send(struct ib_qp *ofa_qp, struct ib_send_wr *wr,
> +		  struct ib_send_wr **bad_wr)
> +{
> +	struct siw_wqe	*wqe = NULL;
> +	struct siw_qp	*qp = siw_qp_ofa2siw(ofa_qp);
> +
> +	unsigned long flags;
> +	int rv = 0;
> +
> +	dprint(DBG_WR|DBG_TX, "(QP%d): state=%d\n",
> +		QP_ID(qp), qp->attrs.state);
> +
> +	/*
> +	 * Acquire QP state lock for reading. The idea is that a
> +	 * user cannot move the QP out of RTS during TX/RX processing.
> +	 */
> +	down_read(&qp->state_lock);
> +
>    

I don't think you can use a rw_semaphore here because it potentially can 
block.  You cannot block/sleep in the post_send/post_recv (and some 
other) RDMA provider functions.  See 
Documentation/infiniband/core_locking.txt.


Steve.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH net-next] dccp:
From: Stephen Hemminger @ 2010-10-05 14:24 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, David Miller; +Cc: dccp, netdev

Remove dead code and make some functions static.
Compile tested only.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 net/dccp/dccp.h    |    2 --
 net/dccp/feat.c    |   10 ----------
 net/dccp/feat.h    |    1 -
 net/dccp/options.c |    4 +---
 net/dccp/proto.c   |   48 ++++++++++++++++++++++++------------------------
 5 files changed, 25 insertions(+), 40 deletions(-)

--- a/net/dccp/dccp.h	2010-10-05 23:09:10.000117419 +0900
+++ b/net/dccp/dccp.h	2010-10-05 23:22:37.879120065 +0900
@@ -246,7 +246,6 @@ static inline void dccp_clear_xmit_timer
 extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
 
 extern const char *dccp_packet_name(const int type);
-extern const char *dccp_state_name(const int state);
 
 extern void dccp_set_state(struct sock *sk, const int state);
 extern void dccp_done(struct sock *sk);
@@ -449,7 +448,6 @@ extern int dccp_insert_options_rsk(struc
 extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed);
 extern u32 dccp_timestamp(void);
 extern void dccp_timestamping_init(void);
-extern int dccp_insert_option_timestamp(struct sk_buff *skb);
 extern int dccp_insert_option(struct sk_buff *skb, unsigned char option,
 			      const void *value, unsigned char len);
 
--- a/net/dccp/feat.c	2010-10-05 23:09:09.828117827 +0900
+++ b/net/dccp/feat.c	2010-10-05 23:14:58.315118201 +0900
@@ -730,16 +730,6 @@ int dccp_feat_register_sp(struct sock *s
 				  0, list, len);
 }
 
-/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */
-int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val)
-{
-	/* any changes must be registered before establishing the connection */
-	if (sk->sk_state != DCCP_CLOSED)
-		return -EISCONN;
-	if (dccp_feat_type(feat) != FEAT_NN)
-		return -EINVAL;
-	return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val);
-}
 
 /*
  *	Tracking features whose value depend on the choice of CCID
--- a/net/dccp/options.c	2010-10-05 23:09:09.884117478 +0900
+++ b/net/dccp/options.c	2010-10-05 23:14:27.375117940 +0900
@@ -369,7 +369,7 @@ int dccp_insert_option_elapsed_time(stru
 
 EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
 
-int dccp_insert_option_timestamp(struct sk_buff *skb)
+static int dccp_insert_option_timestamp(struct sk_buff *skb)
 {
 	__be32 now = htonl(dccp_timestamp());
 	/* yes this will overflow but that is the point as we want a
@@ -378,8 +378,6 @@ int dccp_insert_option_timestamp(struct 
 	return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
 }
 
-EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
-
 static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
 					     struct dccp_request_sock *dreq,
 					     struct sk_buff *skb)
--- a/net/dccp/proto.c	2010-10-05 23:09:09.956117344 +0900
+++ b/net/dccp/proto.c	2010-10-05 23:22:09.171119892 +0900
@@ -50,6 +50,30 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo);
 /* the maximum queue length for tx in packets. 0 is no limit */
 int sysctl_dccp_tx_qlen __read_mostly = 5;
 
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_state_name(const int state)
+{
+	static const char *const dccp_state_names[] = {
+	[DCCP_OPEN]		= "OPEN",
+	[DCCP_REQUESTING]	= "REQUESTING",
+	[DCCP_PARTOPEN]		= "PARTOPEN",
+	[DCCP_LISTEN]		= "LISTEN",
+	[DCCP_RESPOND]		= "RESPOND",
+	[DCCP_CLOSING]		= "CLOSING",
+	[DCCP_ACTIVE_CLOSEREQ]	= "CLOSEREQ",
+	[DCCP_PASSIVE_CLOSE]	= "PASSIVE_CLOSE",
+	[DCCP_PASSIVE_CLOSEREQ]	= "PASSIVE_CLOSEREQ",
+	[DCCP_TIME_WAIT]	= "TIME_WAIT",
+	[DCCP_CLOSED]		= "CLOSED",
+	};
+
+	if (state >= DCCP_MAX_STATES)
+		return "INVALID STATE!";
+	else
+		return dccp_state_names[state];
+}
+#endif
+
 void dccp_set_state(struct sock *sk, const int state)
 {
 	const int oldstate = sk->sk_state;
@@ -146,30 +170,6 @@ const char *dccp_packet_name(const int t
 
 EXPORT_SYMBOL_GPL(dccp_packet_name);
 
-const char *dccp_state_name(const int state)
-{
-	static const char *const dccp_state_names[] = {
-	[DCCP_OPEN]		= "OPEN",
-	[DCCP_REQUESTING]	= "REQUESTING",
-	[DCCP_PARTOPEN]		= "PARTOPEN",
-	[DCCP_LISTEN]		= "LISTEN",
-	[DCCP_RESPOND]		= "RESPOND",
-	[DCCP_CLOSING]		= "CLOSING",
-	[DCCP_ACTIVE_CLOSEREQ]	= "CLOSEREQ",
-	[DCCP_PASSIVE_CLOSE]	= "PASSIVE_CLOSE",
-	[DCCP_PASSIVE_CLOSEREQ]	= "PASSIVE_CLOSEREQ",
-	[DCCP_TIME_WAIT]	= "TIME_WAIT",
-	[DCCP_CLOSED]		= "CLOSED",
-	};
-
-	if (state >= DCCP_MAX_STATES)
-		return "INVALID STATE!";
-	else
-		return dccp_state_names[state];
-}
-
-EXPORT_SYMBOL_GPL(dccp_state_name);
-
 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
--- a/net/dccp/feat.h	2010-10-05 23:12:45.999118130 +0900
+++ b/net/dccp/feat.h	2010-10-05 23:13:10.347118219 +0900
@@ -111,7 +111,6 @@ extern int  dccp_feat_init(struct sock *
 extern void dccp_feat_initialise_sysctls(void);
 extern int  dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
 				  u8 const *list, u8 len);
-extern int  dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val);
 extern int  dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
 				    u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
 extern int  dccp_feat_clone_list(struct list_head const *, struct list_head *);

^ permalink raw reply

* [net-next PATCH] igb: update adapter stats when reading /proc/net/dev.
From: Jesper Dangaard Brouer @ 2010-10-05 14:18 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Jeff Kirsher

Network driver igb: Improve the accuracy of stats in /proc/net/dev, by
updating the adapter stats when reading /proc/net/dev.  Currently the
stats are updated by the watchdog timer every 2 sec, or when getting
stats via ethtool -S.

A number of userspace apps read these /proc/net/dev stats every second,
e.g. ifstat, which then gives a perceived very bursty traffic pattern,
which is actually false.

Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
---
 drivers/net/igb/igb_main.c |   12 +++++++++---
 1 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 55edcb7..6cec297 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -4218,11 +4218,17 @@ static void igb_reset_task(struct work_struct *work)
  * @netdev: network interface device structure
  *
  * Returns the address of the device statistics structure.
- * The statistics are actually updated from the timer callback.
+ * The statistics are also updated from the timer callback
+ * igb_watchdog_task().
  **/
 static struct net_device_stats *igb_get_stats(struct net_device *netdev)
 {
-	/* only return the current stats */
+	struct igb_adapter *adapter = netdev_priv(netdev);
+
+	/* update stats */
+	igb_update_stats(adapter);
+
+	/* return the current stats */
 	return &netdev->stats;
 }

@@ -4307,7 +4313,7 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu)

 void igb_update_stats(struct igb_adapter *adapter)
 {
-	struct net_device_stats *net_stats = igb_get_stats(adapter->netdev);
+	struct net_device_stats *net_stats = &adapter->netdev->stats;
 	struct e1000_hw *hw = &adapter->hw;
 	struct pci_dev *pdev = adapter->pdev;
 	u32 reg, mpc;

^ permalink raw reply related

* Re: [PATCH] SIW: Object management
From: Steve Wise @ 2010-10-05 14:26 UTC (permalink / raw)
  To: Bernard Metzler; +Cc: netdev, linux-rdma
In-Reply-To: <1286261665-5175-1-git-send-email-bmt@zurich.ibm.com>

On 10/05/2010 01:54 AM, Bernard Metzler wrote:

<snip>+
> +
> +/***** routines for WQE handling ***/
> +
> +/*
> + * siw_wqe_get()
> + *
> + * Get new WQE. For READ RESPONSE, take it from the free list which
> + * has a maximum size of maximum inbound READs. All other WQE are
> + * malloc'ed which creates some overhead. Consider change to
> + *
> + * 1. malloc WR only if it cannot be synchonously completed, or
> + * 2. operate own cache of reuseable WQE's.
> + *
> + * Current code trusts on malloc efficiency.
> + */
> +inline struct siw_wqe *siw_wqe_get(struct siw_qp *qp, enum siw_wr_opcode op)
> +{
> +	struct siw_wqe *wqe;
> +
> +	if (op == SIW_WR_RDMA_READ_RESP) {
> +		spin_lock(&qp->freelist_lock);
> +		if (!(list_empty(&qp->wqe_freelist))) {
> +			wqe = list_entry(qp->wqe_freelist.next,
> +					 struct siw_wqe, list);
> +			list_del(&wqe->list);
> +			spin_unlock(&qp->freelist_lock);
> +			wqe->processed = 0;
> +			dprint(DBG_OBJ|DBG_WR,
> +				"(QP%d): WQE from FreeList p: %p\n",
> +				QP_ID(qp), wqe);
> +		} else {
> +			spin_unlock(&qp->freelist_lock);
> +			wqe = NULL;
> +			dprint(DBG_ON|DBG_OBJ|DBG_WR,
> +				"(QP%d): FreeList empty!\n", QP_ID(qp));
> +		}
> +	} else {
> +		wqe = kzalloc(sizeof(struct siw_wqe), GFP_KERNEL);
> +		dprint(DBG_OBJ|DBG_WR, "(QP%d): New WQE p: %p\n",
> +			QP_ID(qp), wqe);
> +	}
>    

I think you can't allocate at GFP_KERNEL here if this is called from the 
post_ functions.  I think you might want to pre-allocate these when you 
create the QP...


Steve.

^ permalink raw reply

* Re: [PATCH] SIW: User interface
From: Bernard Metzler @ 2010-10-05 14:29 UTC (permalink / raw)
  To: Steve Wise; +Cc: linux-rdma, netdev
In-Reply-To: <4CAB337E.1040205@opengridcomputing.com>

Steve Wise <swise@opengridcomputing.com> wrote on 10/05/2010 04:17:34 PM:

> Steve Wise <swise@opengridcomputing.com> 
> 10/05/2010 04:17 PM
> 
> To
> 
> Bernard Metzler <bmt@zurich.ibm.com>
> 
> cc
> 
> netdev@vger.kernel.org, linux-rdma@vger.kernel.org
> 
> Subject
> 
> Re: [PATCH] SIW: User interface
> 
> On 10/05/2010 01:54 AM, Bernard Metzler wrote:
> 
> 
> <snip>
> 
> > +
> > +/*
> > + * siw_post_send()
> > + *
> > + * Post a list of S-WR's to a SQ.
> > + *
> > + * @ofa_qp:   OFA QP contained in siw QP
> > + * @wr:      Null terminated list of user WR's
> > + * @bad_wr:   Points to failing WR in case of synchronous failure.
> > + */
> > +int siw_post_send(struct ib_qp *ofa_qp, struct ib_send_wr *wr,
> > +        struct ib_send_wr **bad_wr)
> > +{
> > +   struct siw_wqe   *wqe = NULL;
> > +   struct siw_qp   *qp = siw_qp_ofa2siw(ofa_qp);
> > +
> > +   unsigned long flags;
> > +   int rv = 0;
> > +
> > +   dprint(DBG_WR|DBG_TX, "(QP%d): state=%d\n",
> > +      QP_ID(qp), qp->attrs.state);
> > +
> > +   /*
> > +    * Acquire QP state lock for reading. The idea is that a
> > +    * user cannot move the QP out of RTS during TX/RX processing.
> > +    */
> > +   down_read(&qp->state_lock);
> > +
> > 
> 
> I don't think you can use a rw_semaphore here because it potentially can 

> block.  You cannot block/sleep in the post_send/post_recv (and some 
> other) RDMA provider functions.  See 
> Documentation/infiniband/core_locking.txt.
> 
> 
ah, ok.
with that, a down_read_trylock() would solve the issue...?
given the limited set of errno values - what would you suggest
as a meaningful return value? EBUSY, EINVAL, ...?

thanks!
bernard.

> Steve.


^ permalink raw reply

* Re: [PATCH] SIW: User interface
From: Steve Wise @ 2010-10-05 14:32 UTC (permalink / raw)
  To: Bernard Metzler; +Cc: linux-rdma, netdev
In-Reply-To: <OF07B553EF.B746D69E-ONC12577B3.004EAD6A-C12577B3.004F92B0@ch.ibm.com>

On 10/05/2010 09:29 AM, Bernard Metzler wrote:
> Steve Wise<swise@opengridcomputing.com>  wrote on 10/05/2010 04:17:34 PM:
>
>    
>> Steve Wise<swise@opengridcomputing.com>
>> 10/05/2010 04:17 PM
>>
>> To
>>
>> Bernard Metzler<bmt@zurich.ibm.com>
>>
>> cc
>>
>> netdev@vger.kernel.org, linux-rdma@vger.kernel.org
>>
>> Subject
>>
>> Re: [PATCH] SIW: User interface
>>
>> On 10/05/2010 01:54 AM, Bernard Metzler wrote:
>>
>>
>> <snip>
>>
>>      
>>> +
>>> +/*
>>> + * siw_post_send()
>>> + *
>>> + * Post a list of S-WR's to a SQ.
>>> + *
>>> + * @ofa_qp:   OFA QP contained in siw QP
>>> + * @wr:      Null terminated list of user WR's
>>> + * @bad_wr:   Points to failing WR in case of synchronous failure.
>>> + */
>>> +int siw_post_send(struct ib_qp *ofa_qp, struct ib_send_wr *wr,
>>> +        struct ib_send_wr **bad_wr)
>>> +{
>>> +   struct siw_wqe   *wqe = NULL;
>>> +   struct siw_qp   *qp = siw_qp_ofa2siw(ofa_qp);
>>> +
>>> +   unsigned long flags;
>>> +   int rv = 0;
>>> +
>>> +   dprint(DBG_WR|DBG_TX, "(QP%d): state=%d\n",
>>> +      QP_ID(qp), qp->attrs.state);
>>> +
>>> +   /*
>>> +    * Acquire QP state lock for reading. The idea is that a
>>> +    * user cannot move the QP out of RTS during TX/RX processing.
>>> +    */
>>> +   down_read(&qp->state_lock);
>>> +
>>>
>>>        
>> I don't think you can use a rw_semaphore here because it potentially can
>>      
>    
>> block.  You cannot block/sleep in the post_send/post_recv (and some
>> other) RDMA provider functions.  See
>> Documentation/infiniband/core_locking.txt.
>>
>>
>>      
> ah, ok.
> with that, a down_read_trylock() would solve the issue...?
> given the limited set of errno values - what would you suggest
> as a meaningful return value? EBUSY, EINVAL, ...?
>
>    

I think it is expected that you should implement this without requiring 
the blocking semaphore.  Returning an error will cause the application 
to bail.


Steve.

^ permalink raw reply

* Re: [PATCH] bonding: fix to rejoin multicast groups immediately
From: Flavio Leitner @ 2010-10-05 14:34 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20101005.001338.52208103.davem@davemloft.net>

On Tue, Oct 05, 2010 at 12:13:38AM -0700, David Miller wrote:
> From: Flavio Leitner <fleitner@redhat.com>
> Date: Wed, 29 Sep 2010 04:12:07 -0300
> 
> > It should rejoin multicast groups immediately when
> > the failover happens to restore the multicast traffic.
> > 
> > Signed-off-by: Flavio Leitner <fleitner@redhat.com>
> 
> I suspect the IGMPv3 handling via a delayed action, as is currently
> implemented, is on purpose and is done so to follow the specification
> of the IGMPv3 RFCs.
> 
> Therefore you have to explain why your new behavior is so desirable
> and in particular why something as undesirable as violating the RFCs
> is therefore warranted.

That patch only changes the behavior for bonding during a link
failure, so if we have a bonding in active-backup or any other mode
with current-active-slave, the initialization will happen just fine
following IGMP specs.

However, neither the backup slave interface nor the backup switch
connected to backup slave knows about mcast. Thus when a link failure
happens, we shouldn't rely on timers to not stay out of the mcast
group losing traffic.

E.g. The V1 specs says that we shouldn't send any membership report
if it has been one in the last minute because that means the switch
is notified and the system will receive mcast traffic for that group. 
Therefore, if it sees one and a link failure happens right after that,
the backup slave will send another membership report only one minute
later. During this time the system loses traffic.

-- 
Flavio

^ permalink raw reply

* Re: [net-next PATCH] igb: update adapter stats when reading /proc/net/dev.
From: Eric Dumazet @ 2010-10-05 14:41 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: David S. Miller, netdev, Jeff Kirsher
In-Reply-To: <20101005141833.20929.10943.stgit@localhost>

Le mardi 05 octobre 2010 à 16:18 +0200, Jesper Dangaard Brouer a écrit :
> Network driver igb: Improve the accuracy of stats in /proc/net/dev, by
> updating the adapter stats when reading /proc/net/dev.  Currently the
> stats are updated by the watchdog timer every 2 sec, or when getting
> stats via ethtool -S.
> 
> A number of userspace apps read these /proc/net/dev stats every second,
> e.g. ifstat, which then gives a perceived very bursty traffic pattern,
> which is actually false.
> 
> Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>

Unfortunately this is racy with igb_watchdog_task()

igb_update_stats() must be called under protection of a lock.

^ permalink raw reply

* Re: [PATCH] SIW: Object management
From: Bernard Metzler @ 2010-10-05 14:56 UTC (permalink / raw)
  To: Steve Wise
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <4CAB35A8.6080906-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>

Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org> wrote on 10/05/2010 04:26:48 PM:

> Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org> 
> 10/05/2010 04:26 PM
> 
> To
> 
> Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
> 
> cc
> 
> netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> 
> Subject
> 
> Re: [PATCH] SIW: Object management
> 
> On 10/05/2010 01:54 AM, Bernard Metzler wrote:
> 
> <snip>+
> > +
> > +/***** routines for WQE handling ***/
> > +
> > +/*
> > + * siw_wqe_get()
> > + *
> > + * Get new WQE. For READ RESPONSE, take it from the free list which
> > + * has a maximum size of maximum inbound READs. All other WQE are
> > + * malloc'ed which creates some overhead. Consider change to
> > + *
> > + * 1. malloc WR only if it cannot be synchonously completed, or
> > + * 2. operate own cache of reuseable WQE's.
> > + *
> > + * Current code trusts on malloc efficiency.
> > + */
> > +inline struct siw_wqe *siw_wqe_get(struct siw_qp *qp, enum 
> siw_wr_opcode op)
> > +{
> > +   struct siw_wqe *wqe;
> > +
> > +   if (op == SIW_WR_RDMA_READ_RESP) {
> > +      spin_lock(&qp->freelist_lock);
> > +      if (!(list_empty(&qp->wqe_freelist))) {
> > +         wqe = list_entry(qp->wqe_freelist.next,
> > +                struct siw_wqe, list);
> > +         list_del(&wqe->list);
> > +         spin_unlock(&qp->freelist_lock);
> > +         wqe->processed = 0;
> > +         dprint(DBG_OBJ|DBG_WR,
> > +            "(QP%d): WQE from FreeList p: %p\n",
> > +            QP_ID(qp), wqe);
> > +      } else {
> > +         spin_unlock(&qp->freelist_lock);
> > +         wqe = NULL;
> > +         dprint(DBG_ON|DBG_OBJ|DBG_WR,
> > +            "(QP%d): FreeList empty!\n", QP_ID(qp));
> > +      }
> > +   } else {
> > +      wqe = kzalloc(sizeof(struct siw_wqe), GFP_KERNEL);
> > +      dprint(DBG_OBJ|DBG_WR, "(QP%d): New WQE p: %p\n",
> > +         QP_ID(qp), wqe);
> > +   }
> > 
> 
> I think you can't allocate at GFP_KERNEL here if this is called from the 

> post_ functions.  I think you might want to pre-allocate these when you 
> create the QP...
> 
the idea was to keep the memory footprint small and flexible
while using the linux/list.h routines to manipulate all queues
(no ring buffers etc, just lists). at the same time we
decided to take the provided uverbs_cmd-syscall path down to
the driver even for the post_-functions (since we would have to ring a 
doorbell on the send path anyway, which in software, is a syscall).
in that path, even ib_uverbs_post_send() does one kmalloc() per wr
(it would be helpful if the provider could keep and reuse that wr of
known size, freeing it later at its own premises. that would avoid
the second kmalloc here.)

currently only work queue elements which are needed to satisfy
inbound read requests are pre-allocated (amount corresponding
to inbound read queue depth), since the read response is
scheduled in network softirq context which must not sleep.

that discussion may relate to the spinlock at the entrance to the
post_ verbs. going down the uverbs_cmd path may sleep anyway...?

thanks,
bernard.
> 
> Steve.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [net-next PATCH] igb: update adapter stats when reading /proc/net/dev.
From: Jesper Dangaard Brouer @ 2010-10-05 14:53 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S. Miller, netdev, Jeff Kirsher
In-Reply-To: <1286289703.2796.292.camel@edumazet-laptop>

On Tue, 2010-10-05 at 16:41 +0200, Eric Dumazet wrote:
> Le mardi 05 octobre 2010 à 16:18 +0200, Jesper Dangaard Brouer a écrit :
> > Network driver igb: Improve the accuracy of stats in /proc/net/dev, by
> > updating the adapter stats when reading /proc/net/dev.  Currently the
> > stats are updated by the watchdog timer every 2 sec, or when getting
> > stats via ethtool -S.
> > 
> > A number of userspace apps read these /proc/net/dev stats every second,
> > e.g. ifstat, which then gives a perceived very bursty traffic pattern,
> > which is actually false.
> > 
> > Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
> 
> Unfortunately this is racy with igb_watchdog_task()
> 
> igb_update_stats() must be called under protection of a lock.

Its already racy, because "ethtool -S" reads out the stats immediately,
and thus races with the timer.

See: igb_ethtool.c
 igb_get_ethtool_stats() invoke igb_update_stats(adapter);

-- 
Med venlig hilsen / Best regards
  Jesper Brouer
  ComX Networks A/S
  Linux Network Kernel Developer
  Cand. Scient Datalog / MSc.CS
  Author of http://adsl-optimizer.dk
  LinkedIn: http://www.linkedin.com/in/brouer



^ permalink raw reply

* Re: [PATCH] SIW: Object management
From: Steve Wise @ 2010-10-05 15:02 UTC (permalink / raw)
  To: Bernard Metzler; +Cc: linux-rdma, netdev
In-Reply-To: <OF706D1D5F.5849676F-ONC12577B3.004FB763-C12577B3.00521572@ch.ibm.com>

On 10/05/2010 09:56 AM, Bernard Metzler wrote:
> Steve Wise<swise@opengridcomputing.com>  wrote on 10/05/2010 04:26:48 PM:
>
>    
>> Steve Wise<swise@opengridcomputing.com>
>> 10/05/2010 04:26 PM
>>
>> To
>>
>> Bernard Metzler<bmt@zurich.ibm.com>
>>
>> cc
>>
>> netdev@vger.kernel.org, linux-rdma@vger.kernel.org
>>
>> Subject
>>
>> Re: [PATCH] SIW: Object management
>>
>> On 10/05/2010 01:54 AM, Bernard Metzler wrote:
>>
>> <snip>+
>>      
>>> +
>>> +/***** routines for WQE handling ***/
>>> +
>>> +/*
>>> + * siw_wqe_get()
>>> + *
>>> + * Get new WQE. For READ RESPONSE, take it from the free list which
>>> + * has a maximum size of maximum inbound READs. All other WQE are
>>> + * malloc'ed which creates some overhead. Consider change to
>>> + *
>>> + * 1. malloc WR only if it cannot be synchonously completed, or
>>> + * 2. operate own cache of reuseable WQE's.
>>> + *
>>> + * Current code trusts on malloc efficiency.
>>> + */
>>> +inline struct siw_wqe *siw_wqe_get(struct siw_qp *qp, enum
>>>        
>> siw_wr_opcode op)
>>      
>>> +{
>>> +   struct siw_wqe *wqe;
>>> +
>>> +   if (op == SIW_WR_RDMA_READ_RESP) {
>>> +      spin_lock(&qp->freelist_lock);
>>> +      if (!(list_empty(&qp->wqe_freelist))) {
>>> +         wqe = list_entry(qp->wqe_freelist.next,
>>> +                struct siw_wqe, list);
>>> +         list_del(&wqe->list);
>>> +         spin_unlock(&qp->freelist_lock);
>>> +         wqe->processed = 0;
>>> +         dprint(DBG_OBJ|DBG_WR,
>>> +            "(QP%d): WQE from FreeList p: %p\n",
>>> +            QP_ID(qp), wqe);
>>> +      } else {
>>> +         spin_unlock(&qp->freelist_lock);
>>> +         wqe = NULL;
>>> +         dprint(DBG_ON|DBG_OBJ|DBG_WR,
>>> +            "(QP%d): FreeList empty!\n", QP_ID(qp));
>>> +      }
>>> +   } else {
>>> +      wqe = kzalloc(sizeof(struct siw_wqe), GFP_KERNEL);
>>> +      dprint(DBG_OBJ|DBG_WR, "(QP%d): New WQE p: %p\n",
>>> +         QP_ID(qp), wqe);
>>> +   }
>>>
>>>        
>> I think you can't allocate at GFP_KERNEL here if this is called from the
>>      
>    
>> post_ functions.  I think you might want to pre-allocate these when you
>> create the QP...
>>
>>      
> the idea was to keep the memory footprint small and flexible
> while using the linux/list.h routines to manipulate all queues
> (no ring buffers etc, just lists). at the same time we
> decided to take the provided uverbs_cmd-syscall path down to
> the driver even for the post_-functions (since we would have to ring a
> doorbell on the send path anyway, which in software, is a syscall).
> in that path, even ib_uverbs_post_send() does one kmalloc() per wr
> (it would be helpful if the provider could keep and reuse that wr of
> known size, freeing it later at its own premises. that would avoid
> the second kmalloc here.)
>
> currently only work queue elements which are needed to satisfy
> inbound read requests are pre-allocated (amount corresponding
> to inbound read queue depth), since the read response is
> scheduled in network softirq context which must not sleep.
>
> that discussion may relate to the spinlock at the entrance to the
> post_ verbs. going down the uverbs_cmd path may sleep anyway...?
>
>    


The uverb calls may sleep, but certain kernel verbs must not.  Remember, 
the post_send/recv and other functions in your driver are called 
directly (almost) by kernel users like NFSRDMA.  These users may be 
calling in an interrupt context and thus you cannot block/sleep.

Steve.

^ permalink raw reply

* Re: [RFC] bonding: fix workqueue re-arming races
From: Narendra_K @ 2010-10-05 15:03 UTC (permalink / raw)
  To: jbohac; +Cc: fubar, bonding-devel, markine, jarkao2, chavey, netdev
In-Reply-To: <20101001182232.GB25971@midget.suse.cz>

On Fri, Oct 01, 2010 at 11:52:32PM +0530, Jiri Bohac wrote:
>    On Fri, Sep 24, 2010 at 06:23:53AM -0500, Narendra K wrote:
>    > On Fri, Sep 17, 2010 at 04:14:33AM +0530, Jay Vosburgh wrote:
>    > > Jay Vosburgh <fubar@us.ibm.com> wrote:
>    > The follwing call trace was seen -
>    >
>    > 2.6.35.with.upstream.patch-next-20100811-0.7-default+
>    > [14602.945876] ------------[ cut here ]------------
>    > [14602.950474] kernel BUG at kernel/workqueue.c:2844!
>    > [14602.955242] invalid opcode: 0000 [#1] SMP
>    > [14602.959341] last sysfs file: /sys/class/net/bonding_masters
>    > [14602.964888] CPU 1
>    > [14602.966714] Modules linked in: af_packet bonding ipv6
>    cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq
>    mperf microcode fuse loop dm_mod joydev usbhid hid bnx2 tpm_tis tpm
>    tpm_bios rtc_cmos iTCO_wdt iTCO_vendor_support sr_mod power_meter cdrom sg
>    serio_raw mptctl pcspkr rtc_core usb_storage dcdbas rtc_lib button
>    uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan
>    processor ide_pci_generic ide_core ata_generic ata_piix libata mptsas
>    mptscsih mptbase scsi_transport_sas scsi_mod thermal thermal_sys hwmon
>    > [14603.015002]
>    > [14603.016524] Pid: 4006, comm: ifdown-bonding Not tainted
>    2.6.35.with.upstream.patch-next-20100811-0.7-default+ #2 0M233H/PowerEdge
>    R710
>    > [14603.028554] RIP: 0010:[<ffffffff81067b50>]  [<ffffffff81067b50>]
>    destroy_workqueue+0x1d0/0x1e0
>    > [14603.037144] RSP: 0018:ffff88022a379d88  EFLAGS: 00010286
>    > [14603.042432] RAX: 000000000000003c RBX: ffff880228674240 RCX:
>    ffff880228f0e800
>    > [14603.049534] RDX: 0000000000001000 RSI: 0000000000000002 RDI:
>    000000000000001a
>    > [14603.056638] RBP: ffff88022a379da8 R08: ffff88022a379cf8 R09:
>    0000000000000000
>    > [14603.063741] R10: 00000000ffffffff R11: 0000000000000000 R12:
>    0000000000000002
>    > [14603.070842] R13: ffffffff817b8560 R14: ffff8802299d1480 R15:
>    ffff8802299d1488
>    > [14603.077944] FS:  00007f8e6a28f700(0000) GS:ffff880001c00000(0000)
>    knlGS:0000000000000000
>    > [14603.085999] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
>    > [14603.091719] CR2: 00007f8e6a2c2000 CR3: 0000000127d1c000 CR4:
>    00000000000006e0
>    > [14603.098822] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
>    0000000000000000
>    > [14603.105924] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
>    0000000000000400
>    > [14603.113026] Process ifdown-bonding (pid: 4006, threadinfo
>    ffff88022a378000, task ffff8802299b0080)
>    > [14603.121944] Stack:
>    > [14603.123944]  ffff88022a379da8 ffff8802299d1000 ffff8802299d1000
>    000000010036b6a4
>    > [14603.131182] <0> ffff88022a379dc8 ffffffffa030a91d ffff8802299d1000
>    000000010036b6a4
>    > [14603.138857] <0> ffff88022a379e28 ffffffff812e0a08 ffff88022a379e38
>    ffff88022a379de8
>    > [14603.146718] Call Trace:
>    > [14603.149158]  [<ffffffffa030a91d>] bond_destructor+0x1d/0x30 [bonding]
>    > [14603.155572]  [<ffffffff812e0a08>] netdev_run_todo+0x1a8/0x270
>    > [14603.161293]  [<ffffffff812ee859>] rtnl_unlock+0x9/0x10
>    > [14603.166411]  [<ffffffffa0317824>] bonding_store_bonds+0x1c4/0x1f0
>    [bonding]
>    > [14603.173342]  [<ffffffff810f26be>] ? alloc_pages_current+0x9e/0x110
>    > [14603.179497]  [<ffffffff81285c9e>] class_attr_store+0x1e/0x20
>    > [14603.185132]  [<ffffffff8116e365>] sysfs_write_file+0xc5/0x140
>    > [14603.190853]  [<ffffffff8110a68f>] vfs_write+0xcf/0x190
>    > [14603.195967]  [<ffffffff8110a840>] sys_write+0x50/0x90
>    > [14603.200996]  [<ffffffff81002ec2>] system_call_fastpath+0x16/0x1b
>    > [14603.206974] Code: 00 7f 14 8b 3b eb 91 3d 00 10 00 00 89 c2 77 10 8b
>    3b e9 07 ff ff ff 3d 00 10 00 00 89 c2 76 f0 8b 3b e9 a9 fe ff ff 0f 0b eb
>    fe <0f> 0b eb fe 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8b 3d 00
>    > [14603.226419] RIP  [<ffffffff81067b50>] destroy_workqueue+0x1d0/0x1e0
>    > [14603.232669]  RSP <ffff88022a379d88>
>    > [    0.000000] Initializing cgroup subsys cpuset
>    > [    0.000000] Initializing cgroup subsys cpu
> 
>    This should be the BUG_ON(cwq->nr_active) in
>    destroy_workqueue()
> 
>    This is really strange. bondng_store_bonds() can do two things:
>    create or delete a bonding device.
> 
>    I checked the delete path, where I would normally expect such a
>    problem, but I can't find a way it could fail in this way.
>    bondng_store_bonds() calls unregister_netdevice(), which
>    - calls rollback_registered() -> bond_close()
>    - puts the device on the net_todo_list.
>    On rtnl_unlock() netdev_run_todo() gets called and that calls
>    bond_destructor().
> 
>    bond_close() now makes sure the rearming work items are not
>    pending, thus, the only work items that may still be pending on
>    the workqueue are the non-rearming "commit" work items.
>    flush_workqueue(), called at the beginning of destroy_workqueue()
>    should have waited for these to finish.
>    If all of the above is correct, this BUG_ON should never trigger.
> 
>    Maybe I am overlooking something, or it may be some kind of
>    failure/race condition in the create path, resulting in
>    bond_destructor() being called as well.
> 
>    Narendra, any chance to capture the dmesg lines preceeding the
>    BUG message? This should show which of the above cases it is.

Jiri, I will try to reproduce the issue with ignore_loglevel to capture
more data on the serial console and share it shortly.

> 
>    I will try to come up with a debug patch that will tell us which
>    work remains active on the work queue.
> 
>    --
>    Jiri Bohac <jbohac@suse.cz>
>    SUSE Labs, SUSE CZ

-- 
With regards,
Narendra K

^ permalink raw reply

* Re: [net-next PATCH] igb: update adapter stats when reading /proc/net/dev.
From: Eric Dumazet @ 2010-10-05 15:19 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: David S. Miller, netdev, Jeff Kirsher
In-Reply-To: <1286290393.7071.38.camel@firesoul.comx.local>

Le mardi 05 octobre 2010 à 16:53 +0200, Jesper Dangaard Brouer a écrit :

> Its already racy, because "ethtool -S" reads out the stats immediately,
> and thus races with the timer.
> 
> See: igb_ethtool.c
>  igb_get_ethtool_stats() invoke igb_update_stats(adapter);
> 

You would be surprised how many bugs are waiting to be found and
fixed ;)

^ permalink raw reply

* Re: [PATCH] SIW: Object management
From: Bernard Metzler @ 2010-10-05 15:25 UTC (permalink / raw)
  To: Steve Wise; +Cc: linux-rdma, netdev
In-Reply-To: <4CAB3E0D.8050609@opengridcomputing.com>

Steve Wise <swise@opengridcomputing.com> wrote on 10/05/2010 05:02:37 PM:

> Steve Wise <swise@opengridcomputing.com> 
> 10/05/2010 05:02 PM
> 
> To
> 
> Bernard Metzler <BMT@zurich.ibm.com>
> 
> cc
> 
> linux-rdma@vger.kernel.org, netdev@vger.kernel.org
> 
> Subject
> 
> Re: [PATCH] SIW: Object management
> 
> On 10/05/2010 09:56 AM, Bernard Metzler wrote:
> > Steve Wise<swise@opengridcomputing.com>  wrote on 10/05/2010 04:26:48 
PM:
> >
> > 
> >> Steve Wise<swise@opengridcomputing.com>
> >> 10/05/2010 04:26 PM
> >>
> >> To
> >>
> >> Bernard Metzler<bmt@zurich.ibm.com>
> >>
> >> cc
> >>
> >> netdev@vger.kernel.org, linux-rdma@vger.kernel.org
> >>
> >> Subject
> >>
> >> Re: [PATCH] SIW: Object management
> >>
> >> On 10/05/2010 01:54 AM, Bernard Metzler wrote:
> >>
> >> <snip>+
> >> 
> >>> +
> >>> +/***** routines for WQE handling ***/
> >>> +
> >>> +/*
> >>> + * siw_wqe_get()
> >>> + *
> >>> + * Get new WQE. For READ RESPONSE, take it from the free list which
> >>> + * has a maximum size of maximum inbound READs. All other WQE are
> >>> + * malloc'ed which creates some overhead. Consider change to
> >>> + *
> >>> + * 1. malloc WR only if it cannot be synchonously completed, or
> >>> + * 2. operate own cache of reuseable WQE's.
> >>> + *
> >>> + * Current code trusts on malloc efficiency.
> >>> + */
> >>> +inline struct siw_wqe *siw_wqe_get(struct siw_qp *qp, enum
> >>> 
> >> siw_wr_opcode op)
> >> 
> >>> +{
> >>> +   struct siw_wqe *wqe;
> >>> +
> >>> +   if (op == SIW_WR_RDMA_READ_RESP) {
> >>> +      spin_lock(&qp->freelist_lock);
> >>> +      if (!(list_empty(&qp->wqe_freelist))) {
> >>> +         wqe = list_entry(qp->wqe_freelist.next,
> >>> +                struct siw_wqe, list);
> >>> +         list_del(&wqe->list);
> >>> +         spin_unlock(&qp->freelist_lock);
> >>> +         wqe->processed = 0;
> >>> +         dprint(DBG_OBJ|DBG_WR,
> >>> +            "(QP%d): WQE from FreeList p: %p\n",
> >>> +            QP_ID(qp), wqe);
> >>> +      } else {
> >>> +         spin_unlock(&qp->freelist_lock);
> >>> +         wqe = NULL;
> >>> +         dprint(DBG_ON|DBG_OBJ|DBG_WR,
> >>> +            "(QP%d): FreeList empty!\n", QP_ID(qp));
> >>> +      }
> >>> +   } else {
> >>> +      wqe = kzalloc(sizeof(struct siw_wqe), GFP_KERNEL);
> >>> +      dprint(DBG_OBJ|DBG_WR, "(QP%d): New WQE p: %p\n",
> >>> +         QP_ID(qp), wqe);
> >>> +   }
> >>>
> >>> 
> >> I think you can't allocate at GFP_KERNEL here if this is called from 
the
> >> 
> > 
> >> post_ functions.  I think you might want to pre-allocate these when 
you
> >> create the QP...
> >>
> >> 
> > the idea was to keep the memory footprint small and flexible
> > while using the linux/list.h routines to manipulate all queues
> > (no ring buffers etc, just lists). at the same time we
> > decided to take the provided uverbs_cmd-syscall path down to
> > the driver even for the post_-functions (since we would have to ring a
> > doorbell on the send path anyway, which in software, is a syscall).
> > in that path, even ib_uverbs_post_send() does one kmalloc() per wr
> > (it would be helpful if the provider could keep and reuse that wr of
> > known size, freeing it later at its own premises. that would avoid
> > the second kmalloc here.)
> >
> > currently only work queue elements which are needed to satisfy
> > inbound read requests are pre-allocated (amount corresponding
> > to inbound read queue depth), since the read response is
> > scheduled in network softirq context which must not sleep.
> >
> > that discussion may relate to the spinlock at the entrance to the
> > post_ verbs. going down the uverbs_cmd path may sleep anyway...?
> >
> > 
> 
> 
> The uverb calls may sleep, but certain kernel verbs must not.  Remember, 

> the post_send/recv and other functions in your driver are called 
> directly (almost) by kernel users like NFSRDMA.  These users may be 
> calling in an interrupt context and thus you cannot block/sleep.
> 
OK, very convincing. not a big change since siw_wqe_get/_put()
already maintain a list of pre-allocated wqe's (currently for
the read.responses).
but, would it be ok if the code distinguishes between user
land and in-kernel consumers? i would be very happy if we could
keep the pre-allocations per user land connection to its very
minimum...

thanks,
bernard.


> Steve.


^ permalink raw reply

* Re: [PATCH] SIW: Object management
From: Steve Wise @ 2010-10-05 15:37 UTC (permalink / raw)
  To: Bernard Metzler; +Cc: linux-rdma, netdev, Roland Dreier
In-Reply-To: <OFFCB10B70.3E1CC39D-ONC12577B3.0053113C-C12577B3.0054C076@ch.ibm.com>

On 10/05/2010 10:25 AM, Bernard Metzler wrote:
> Steve Wise<swise@opengridcomputing.com>  wrote on 10/05/2010 05:02:37 PM:
>
>    
>> Steve Wise<swise@opengridcomputing.com>
>> 10/05/2010 05:02 PM
>>
>> To
>>
>> Bernard Metzler<BMT@zurich.ibm.com>
>>
>> cc
>>
>> linux-rdma@vger.kernel.org, netdev@vger.kernel.org
>>
>> Subject
>>
>> Re: [PATCH] SIW: Object management
>>
>> On 10/05/2010 09:56 AM, Bernard Metzler wrote:
>>      
>>> Steve Wise<swise@opengridcomputing.com>   wrote on 10/05/2010 04:26:48
>>>        
> PM:
>    
>>>
>>>        
>>>> Steve Wise<swise@opengridcomputing.com>
>>>> 10/05/2010 04:26 PM
>>>>
>>>> To
>>>>
>>>> Bernard Metzler<bmt@zurich.ibm.com>
>>>>
>>>> cc
>>>>
>>>> netdev@vger.kernel.org, linux-rdma@vger.kernel.org
>>>>
>>>> Subject
>>>>
>>>> Re: [PATCH] SIW: Object management
>>>>
>>>> On 10/05/2010 01:54 AM, Bernard Metzler wrote:
>>>>
>>>> <snip>+
>>>>
>>>>          
>>>>> +
>>>>> +/***** routines for WQE handling ***/
>>>>> +
>>>>> +/*
>>>>> + * siw_wqe_get()
>>>>> + *
>>>>> + * Get new WQE. For READ RESPONSE, take it from the free list which
>>>>> + * has a maximum size of maximum inbound READs. All other WQE are
>>>>> + * malloc'ed which creates some overhead. Consider change to
>>>>> + *
>>>>> + * 1. malloc WR only if it cannot be synchonously completed, or
>>>>> + * 2. operate own cache of reuseable WQE's.
>>>>> + *
>>>>> + * Current code trusts on malloc efficiency.
>>>>> + */
>>>>> +inline struct siw_wqe *siw_wqe_get(struct siw_qp *qp, enum
>>>>>
>>>>>            
>>>> siw_wr_opcode op)
>>>>
>>>>          
>>>>> +{
>>>>> +   struct siw_wqe *wqe;
>>>>> +
>>>>> +   if (op == SIW_WR_RDMA_READ_RESP) {
>>>>> +      spin_lock(&qp->freelist_lock);
>>>>> +      if (!(list_empty(&qp->wqe_freelist))) {
>>>>> +         wqe = list_entry(qp->wqe_freelist.next,
>>>>> +                struct siw_wqe, list);
>>>>> +         list_del(&wqe->list);
>>>>> +         spin_unlock(&qp->freelist_lock);
>>>>> +         wqe->processed = 0;
>>>>> +         dprint(DBG_OBJ|DBG_WR,
>>>>> +            "(QP%d): WQE from FreeList p: %p\n",
>>>>> +            QP_ID(qp), wqe);
>>>>> +      } else {
>>>>> +         spin_unlock(&qp->freelist_lock);
>>>>> +         wqe = NULL;
>>>>> +         dprint(DBG_ON|DBG_OBJ|DBG_WR,
>>>>> +            "(QP%d): FreeList empty!\n", QP_ID(qp));
>>>>> +      }
>>>>> +   } else {
>>>>> +      wqe = kzalloc(sizeof(struct siw_wqe), GFP_KERNEL);
>>>>> +      dprint(DBG_OBJ|DBG_WR, "(QP%d): New WQE p: %p\n",
>>>>> +         QP_ID(qp), wqe);
>>>>> +   }
>>>>>
>>>>>
>>>>>            
>>>> I think you can't allocate at GFP_KERNEL here if this is called from
>>>>          
> the
>    
>>>>          
>>>        
>>>> post_ functions.  I think you might want to pre-allocate these when
>>>>          
> you
>    
>>>> create the QP...
>>>>
>>>>
>>>>          
>>> the idea was to keep the memory footprint small and flexible
>>> while using the linux/list.h routines to manipulate all queues
>>> (no ring buffers etc, just lists). at the same time we
>>> decided to take the provided uverbs_cmd-syscall path down to
>>> the driver even for the post_-functions (since we would have to ring a
>>> doorbell on the send path anyway, which in software, is a syscall).
>>> in that path, even ib_uverbs_post_send() does one kmalloc() per wr
>>> (it would be helpful if the provider could keep and reuse that wr of
>>> known size, freeing it later at its own premises. that would avoid
>>> the second kmalloc here.)
>>>
>>> currently only work queue elements which are needed to satisfy
>>> inbound read requests are pre-allocated (amount corresponding
>>> to inbound read queue depth), since the read response is
>>> scheduled in network softirq context which must not sleep.
>>>
>>> that discussion may relate to the spinlock at the entrance to the
>>> post_ verbs. going down the uverbs_cmd path may sleep anyway...?
>>>
>>>
>>>        
>>
>> The uverb calls may sleep, but certain kernel verbs must not.  Remember,
>>      
>    
>> the post_send/recv and other functions in your driver are called
>> directly (almost) by kernel users like NFSRDMA.  These users may be
>> calling in an interrupt context and thus you cannot block/sleep.
>>
>>      
> OK, very convincing. not a big change since siw_wqe_get/_put()
> already maintain a list of pre-allocated wqe's (currently for
> the read.responses).
> but, would it be ok if the code distinguishes between user
> land and in-kernel consumers? i would be very happy if we could
> keep the pre-allocations per user land connection to its very
> minimum...
>
>    

I think that's ok, but its bending the core locking rules a little I 
guess.  But the intent is that kernel users can definitely 
send/recv/poll in interrupt context, so possibly blocking for user mode 
QPs in on-kernel-bypass operations is probably ok...

What do you think Roland?

Steve.




^ permalink raw reply

* Re: [PATCH] SIW: iWARP Protocol headers
From: Bernard Metzler @ 2010-10-05 16:06 UTC (permalink / raw)
  To: Steve Wise; +Cc: linux-rdma, linux-rdma-owner, netdev
In-Reply-To: <4CAB2DDA.1020207@opengridcomputing.com>

<snip>
> > + */
> > +struct iwarp_ctrl_tagged {
> > +   struct iwarp_ctrl   ctrl;
> > +   __u32         ddp_stag;
> > +   __u64         ddp_to;
> > +} __attribute__((__packed__));
> > +
> > 
> 
> All of the above header structures should use __beXX types since the 
> fields are all in Network Byte Order.
> 
right. will get fixed.

> Also, did you run sparse on the patches (Documentation/sparse.txt)?
> 
> 
frankly, i was not aware. sorry. will do.

bernard.

^ permalink raw reply

* Re: [net-next-2.6 PATCH] net: netif_set_real_num_rx_queues may cap num_rx_queues at init time
From: John Fastabend @ 2010-10-05 16:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: bhutchings@solarflare.com, netdev@vger.kernel.org,
	therbert@google.com
In-Reply-To: <1286256926.2457.2.camel@edumazet-laptop>

On 10/4/2010 10:35 PM, Eric Dumazet wrote:
> Le lundi 04 octobre 2010 à 15:00 -0700, John Fastabend a écrit :
>> The logic for netif_set_real_num_rx_queues is the following,
>>
>> netif_set_real_num_rx_queues(dev, rxq)
>> {
>> 	...
>> 	if (dev->reg_state == NETREG_REGISTERED) {
>> 		...
>> 	} else {
>> 		dev->num_rx_queues = rxq;
>> 	}
>>
>> 	dev->real_num_rx_queues = rxq;
>> 	return 0;
>> }
>>
>> Some drivers init path looks like the following,
>>
>> alloc_etherdev_mq(priv_sz, max_num_queues_ever);
>> ...
>> netif_set_real_num_rx_queues(dev, queues_to_use_now);
>> ...
>> register_netdev(dev);
>> ...
>>
>> Because netif_set_real_num_rx_queues sets num_rx_queues if the
>> reg state is not NETREG_REGISTERED we end up with the incorrect
>> max number of rx queues. This patch proposes to remove the else
>> clause above so this does not occur.  Also just reading the
>> function set_real_num it seems a bit unexpected that num_rx_queues
>> gets set.
>>
> 
> You dont tell why its "incorrect".
> 

OK that is a poor description.

> Why should we keep num_rx_queues > real_num_rx_queues ?
> 

If we do not ever need them then we should not keep them I agree.
But having netif_set_real_num_rx_queues set something other then
'real_num_rx_queues' does not seem right to me at least. Also
netif_set_real_num_tx_queues and netif_set_real_num_rx_queues have
different behavior. It would be nice if this weren't the case but
they allocate queues in two places.

The drivers Ben modified seem to be split between these two flows

alloc_etherdev_mq(priv_sz, max_num_queues_ever);
register_netdev(dev);
netif_set_real_num_rx_queues(dev, queues_to_use_now);

and

alloc_etherdev_mq(priv_sz, max_num_queues_ever);
netif_set_real_num_rx_queues(dev, queues_to_use_now);
register_netdev(dev);

In the first case we never set 'num_rx_queues' because its after
the register so that leaves the second case. All the remaining
drivers except ixgbe, igb, and myri10ge know or could easily find
out the number of rx queues at alloc_etherdev_mq and are really trying
to explicitly set the num_rx_queues. Adding a num_rx_queues param to
alloc_etherdev_mq might make more sense in this case.

The myri10ge is querying the firmware which seems to be tangled up with
the netdev priv space, but otherwise isn't using any new knowledge. And
ixgbe/igb may end up capping the max number of queues because it is
trying to set the number of queues it intends to use now not the max it
will ever consume.

My point with this patch is I do not see any cases where we really do
not know the max number rx queues until after alloc_etherdev_mq() but before
register_netdev(). The piece I missed is if a driver wants to set rx queues
and tx queues separately they either need to explicitly set rx_num_queues or
use netif_set_real_num_rx_queues before registering the netdev. This syntax
seems error prone to me, and it would be better to set this in alloc_etherdev_mq().
But, maybe you disagree?

So I can either go rework the ixgbe/igb init flow so it does what I want.
Or add a parameter to alloc_etherdev_mq for rx queues, remove the
netif_set_real_num_rx_queues where it is not needed and modify the drivers
to use this interface. I like the second case because it makes the
netif_set_real_num_[rx|tx}_queues() behave the same but could do the first
just as easily.

Thanks,
John 

> It creates /sys files, and Qdisc stuff for nothing...
> 
> 
> 
>> CC: Ben Hutchings <bhutchings@solarflare.com>
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>
>>  net/core/dev.c |    2 --
>>  1 files changed, 0 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index a313bab..f78d996 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -1592,8 +1592,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
>>  						  rxq);
>>  		if (rc)
>>  			return rc;
>> -	} else {
>> -		dev->num_rx_queues = rxq;
>>  	}
>>  
>>  	dev->real_num_rx_queues = rxq;
>>
>> --
> 
> 

^ permalink raw reply

* Re: [PATCH] SIW: iWARP Protocol headers
From: Bart Van Assche @ 2010-10-05 16:10 UTC (permalink / raw)
  To: Bernard Metzler
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1286261630-5085-1-git-send-email-bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>

On Tue, Oct 5, 2010 at 8:53 AM, Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org> wrote:
> +} __attribute__((__packed__));

Apparently you make extensive use of the packed attribute. Please read
this blog entry, in which it is explained why this is harmful:

http://digitalvampire.org/blog/index.php/2006/07/31/why-you-shouldnt-use-__attribute__packed/

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] SIW: Module initialization
From: Jason Gunthorpe @ 2010-10-05 16:24 UTC (permalink / raw)
  To: Bernard Metzler
  Cc: Bart Van Assche, bart.vanassche-Re5JQEeQqe8AvxtiuMwx3w,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <OFFB26426A.EB2C13FD-ONC12577B3.00488392-C12577B3.004895E4-Xeyd2O9EBijQT0dZR+AlfA@public.gmane.org>

On Tue, Oct 05, 2010 at 03:12:48PM +0200, Bernard Metzler wrote:

> good point. do you have a suggestion here - would you like to see 
> siw to be enabled more selectively? 
> iwarp is a protocol on top of TCP, explicitly defining the 
> semantics of data fetching and placement. end-to-end connectivity 
> and efficient data shipping is provided by TCP/IP. not taking into 
> account dedicated RDMA hardware, any TCP stream may carry iwarp 
> traffic. from that point of view, binding a software based 
> rdma stack to dedicated devices is a concession to the 
> given environment, in particular to the given rdma 
> connection management. therefore, we started with binding to all 
> available network devices. 

Considering the possibility for security issues with code like this it
would be much nicer to have an 'opt in' mechanism.. Ie a
 /sys/class/net/eth0/enable_iwarp
type of file.. You can track the add/remove of netdevices weth netevents.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [net-next-2.6 PATCH] net: netif_set_real_num_rx_queues may cap num_rx_queues at init time
From: Ben Hutchings @ 2010-10-05 16:34 UTC (permalink / raw)
  To: John Fastabend; +Cc: Eric Dumazet, netdev@vger.kernel.org, therbert@google.com
In-Reply-To: <4CAB4D8F.8080108@intel.com>

On Tue, 2010-10-05 at 09:08 -0700, John Fastabend wrote:
> On 10/4/2010 10:35 PM, Eric Dumazet wrote:
> > Le lundi 04 octobre 2010 à 15:00 -0700, John Fastabend a écrit :
> >> The logic for netif_set_real_num_rx_queues is the following,
> >>
> >> netif_set_real_num_rx_queues(dev, rxq)
> >> {
> >> 	...
> >> 	if (dev->reg_state == NETREG_REGISTERED) {
> >> 		...
> >> 	} else {
> >> 		dev->num_rx_queues = rxq;
> >> 	}
> >>
> >> 	dev->real_num_rx_queues = rxq;
> >> 	return 0;
> >> }
> >>
> >> Some drivers init path looks like the following,
> >>
> >> alloc_etherdev_mq(priv_sz, max_num_queues_ever);
> >> ...
> >> netif_set_real_num_rx_queues(dev, queues_to_use_now);
> >> ...
> >> register_netdev(dev);
> >> ...
> >>
> >> Because netif_set_real_num_rx_queues sets num_rx_queues if the
> >> reg state is not NETREG_REGISTERED we end up with the incorrect
> >> max number of rx queues. This patch proposes to remove the else
> >> clause above so this does not occur.  Also just reading the
> >> function set_real_num it seems a bit unexpected that num_rx_queues
> >> gets set.
> >>
> > 
> > You dont tell why its "incorrect".
> > 
> 
> OK that is a poor description.
> 
> > Why should we keep num_rx_queues > real_num_rx_queues ?
> > 
> 
> If we do not ever need them then we should not keep them I agree.
> But having netif_set_real_num_rx_queues set something other then
> 'real_num_rx_queues' does not seem right to me at least. Also
> netif_set_real_num_tx_queues and netif_set_real_num_rx_queues have
> different behavior. It would be nice if this weren't the case but
> they allocate queues in two places.
[...]

I only did this to satisfy Eric's desire to reduce memory usage.
However, I believe that there are currently no drivers that dynamically
increase numbers of RX or TX queues.  Until there are, there is not much
point in removing this assignment to num_rx_queues.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox