Netdev List
 help / color / mirror / Atom feed
* [net-next-2.6 PATCH 3/3] net_dcb: add application notifiers
From: John Fastabend @ 2010-12-20 20:50 UTC (permalink / raw)
  To: davem; +Cc: netdev, tgraf
In-Reply-To: <20101220205013.24232.3980.stgit@jf-dev1-dcblab>

DCBx applications priorities can be changed dynamically. If
application stacks are expected to keep the skb priority
consistent with the dcbx priority the stack will need to
be notified when these changes occur.

This patch adds application notifiers for the stack to register
with.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/net/dcbevent.h |   31 +++++++++++++++++++++++++++++++
 net/dcb/Makefile       |    2 +-
 net/dcb/dcbevent.c     |   40 ++++++++++++++++++++++++++++++++++++++++
 net/dcb/dcbnl.c        |    2 ++
 4 files changed, 74 insertions(+), 1 deletions(-)
 create mode 100644 include/net/dcbevent.h
 create mode 100644 net/dcb/dcbevent.c

diff --git a/include/net/dcbevent.h b/include/net/dcbevent.h
new file mode 100644
index 0000000..bc1e7ef
--- /dev/null
+++ b/include/net/dcbevent.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: John Fastabend <john.r.fastabend@intel.com>
+ */
+
+#ifndef _DCB_EVENT_H
+#define _DCB_EVENT_H
+
+enum dcbevent_notif_type {
+	DCB_APP_EVENT = 1,
+};
+
+extern int register_dcbevent_notifier(struct notifier_block *nb);
+extern int unregister_dcbevent_notifier(struct notifier_block *nb);
+extern int call_dcbevent_notifiers(unsigned long val, void *v);
+
+#endif
diff --git a/net/dcb/Makefile b/net/dcb/Makefile
index 9930f4c..c1282c9 100644
--- a/net/dcb/Makefile
+++ b/net/dcb/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_DCB) += dcbnl.o
+obj-$(CONFIG_DCB) += dcbnl.o dcbevent.o
diff --git a/net/dcb/dcbevent.c b/net/dcb/dcbevent.c
new file mode 100644
index 0000000..665a880
--- /dev/null
+++ b/net/dcb/dcbevent.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: John Fastabend <john.r.fastabend@intel.com>
+ */
+
+#include <linux/rtnetlink.h>
+#include <linux/notifier.h>
+
+static ATOMIC_NOTIFIER_HEAD(dcbevent_notif_chain);
+
+int register_dcbevent_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_register(&dcbevent_notif_chain, nb);
+}
+EXPORT_SYMBOL(register_dcbevent_notifier);
+
+int unregister_dcbevent_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&dcbevent_notif_chain, nb);
+}
+EXPORT_SYMBOL(unregister_dcbevent_notifier);
+
+int call_dcbevent_notifiers(unsigned long val, void *v)
+{
+	return atomic_notifier_call_chain(&dcbevent_notif_chain, val, v);
+}
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index adc8c8d..5f1d0ee 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -23,6 +23,7 @@
 #include <net/netlink.h>
 #include <net/rtnetlink.h>
 #include <linux/dcbnl.h>
+#include <net/dcbevent.h>
 #include <linux/rtnetlink.h>
 #include <net/sock.h>
 
@@ -1458,6 +1459,7 @@ u8 dcb_setapp(struct net_device *dev, struct dcb_app *new)
 	}
 out:
 	spin_unlock(&dcb_lock);
+	call_dcbevent_notifiers(DCB_APP_EVENT, new);
 	return 0;
 }
 EXPORT_SYMBOL(dcb_setapp);


^ permalink raw reply related

* [net-next-2.6 PATCH 1/3] dcbnl: add support for ieee8021Qaz attributes
From: John Fastabend @ 2010-12-20 20:50 UTC (permalink / raw)
  To: davem; +Cc: netdev, tgraf

The IEEE8021Qaz is the IEEE standard version of CEE. The
standard has had enough significant changes from the CEE
version that many of the CEE attributes have no meaning
in the new spec or do not easily map to IEEE standards.

Rather then attempt to create a complicated mapping
between CEE and IEEE standards this patch adds a nested
IEEE attribute to the list of DCB attributes. The policy
is,

	[DCB_ATTR_IFNAME]
	[DCB_ATTR_STATE]
	...
	[DCB_ATTR_IEEE]
		[DCB_ATTR_IEEE_ETS]
		[DCB_ATTR_IEEE_PFC]
		[DCB_ATTR_IEEE_APP_TABLE]
			[DCB_ATTR_IEEE_APP]
			...

The following dcbnl_rtnl_ops routines were added to handle
the IEEE standard,

	int (*ieee_getets) (struct net_device *, struct ieee_ets *);
	int (*ieee_setets) (struct net_device *, struct ieee_ets *);
	int (*ieee_getpfc) (struct net_device *, struct ieee_pfc *);
	int (*ieee_setpfc) (struct net_device *, struct ieee_pfc *);
	int (*ieee_getapp) (struct net_device *, struct dcb_app *);
	int (*ieee_setapp) (struct net_device *, struct dcb_app *);

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/linux/dcbnl.h |   95 ++++++++++++++++++++++++++++++++++++
 include/net/dcbnl.h   |   11 ++++
 net/dcb/dcbnl.c       |  131 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 237 insertions(+), 0 deletions(-)

diff --git a/include/linux/dcbnl.h b/include/linux/dcbnl.h
index 8723491..3706a13 100644
--- a/include/linux/dcbnl.h
+++ b/include/linux/dcbnl.h
@@ -22,6 +22,76 @@
 
 #include <linux/types.h>
 
+/* IEEE 802.1Qaz std supported values */
+#define IEEE_8021QAZ_MAX_TCS	8
+
+/* This structure contains the IEEE 802.1Qaz ETS managed object
+ *
+ * @ets_cap: indicates supported capacity of ets feature
+ * @cbs: credit based shaper ets algorithm supported
+ * @tc_tx_bw: tc tx bandwidth indexed by traffic class
+ * @tc_rx_bw: tc rx bandwidth indexed by traffic class
+ * @tc_tsa: TSA Assignment table, indexed by traffic class
+ * @prio_tc: priority assignment table mapping 8021Qp to traffic class
+ *
+ * ----
+ *  TSA Assignment 8 bit identifiers
+ *	0	strict priority
+ *	1	credit-based shaper
+ *	2	enhanced transmission selection
+ *	3-254	reserved
+ *	255	vendor specific
+ */
+struct ieee_ets {
+	__u8	ets_cap;
+	__u8	cbs;
+	__u8	tc_tx_bw[IEEE_8021QAZ_MAX_TCS];
+	__u8	tc_rx_bw[IEEE_8021QAZ_MAX_TCS];
+	__u8	tc_tsa[IEEE_8021QAZ_MAX_TCS];
+	__u8	prio_tc[IEEE_8021QAZ_MAX_TCS];
+};
+
+/* This structure contains the IEEE 802.1Qaz PFC managed object
+ *
+ * @pfc_cap: Indicates the number of traffic classes on the local device
+ *	     that may simultaneously have PFC enabled.
+ * @pfc_en: bitmap indicating pfc enabled traffic classes
+ * @mbc: enable macsec bypass capability
+ * @delay: the allowance made for a round-trip propagation delay of the
+ *	   link in bits.
+ * @requests: count of the sent pfc frames
+ * @indications: count of the received pfc frames
+ */
+struct ieee_pfc {
+	__u8	pfc_cap;
+	__u8	pfc_en;
+	__u8	mbc;
+	__u16	delay;
+	__u64	requests[IEEE_8021QAZ_MAX_TCS];
+	__u64	indications[IEEE_8021QAZ_MAX_TCS];
+};
+
+/* This structure contains the IEEE 802.1Qaz APP managed object
+ *
+ * @selector: protocol identifier type
+ * @protocol: protocol of type indicated
+ * @priority: 3-bit unsigned integer indicating priority
+ *
+ * ----
+ *  Selector field values
+ *	0	Reserved
+ *	1	Ethertype
+ *	2	Well known port number over TCP or SCTP
+ *	3	Well known port number over UDP or DCCP
+ *	4	Well known port number over TCP, SCTP, UDP, or DCCP
+ *	5-7	Reserved
+ */
+struct dcb_app {
+	__u8	selector;
+	__u32	protocol;
+	__u8	priority;
+};
+
 struct dcbmsg {
 	__u8               dcb_family;
 	__u8               cmd;
@@ -50,6 +120,8 @@ struct dcbmsg {
  * @DCB_CMD_SBCN: get backward congestion notification configration.
  * @DCB_CMD_GAPP: get application protocol configuration
  * @DCB_CMD_SAPP: set application protocol configuration
+ * @DCB_CMD_IEEE_SET: set IEEE 802.1Qaz configuration
+ * @DCB_CMD_IEEE_GET: get IEEE 802.1Qaz configuration
  */
 enum dcbnl_commands {
 	DCB_CMD_UNDEFINED,
@@ -83,6 +155,9 @@ enum dcbnl_commands {
 	DCB_CMD_GAPP,
 	DCB_CMD_SAPP,
 
+	DCB_CMD_IEEE_SET,
+	DCB_CMD_IEEE_GET,
+
 	__DCB_CMD_ENUM_MAX,
 	DCB_CMD_MAX = __DCB_CMD_ENUM_MAX - 1,
 };
@@ -102,6 +177,7 @@ enum dcbnl_commands {
  * @DCB_ATTR_CAP: DCB capabilities of the device (NLA_NESTED)
  * @DCB_ATTR_NUMTCS: number of traffic classes supported (NLA_NESTED)
  * @DCB_ATTR_BCN: backward congestion notification configuration (NLA_NESTED)
+ * @DCB_ATTR_IEEE: IEEE 802.1Qaz supported attributes (NLA_NESTED)
  */
 enum dcbnl_attrs {
 	DCB_ATTR_UNDEFINED,
@@ -119,10 +195,29 @@ enum dcbnl_attrs {
 	DCB_ATTR_BCN,
 	DCB_ATTR_APP,
 
+	/* IEEE std attributes */
+	DCB_ATTR_IEEE,
+
 	__DCB_ATTR_ENUM_MAX,
 	DCB_ATTR_MAX = __DCB_ATTR_ENUM_MAX - 1,
 };
 
+enum ieee_attrs {
+	DCB_ATTR_IEEE_UNSPEC,
+	DCB_ATTR_IEEE_ETS,
+	DCB_ATTR_IEEE_PFC,
+	DCB_ATTR_IEEE_APP_TABLE,
+	__DCB_ATTR_IEEE_MAX
+};
+#define DCB_ATTR_IEEE_MAX (__DCB_ATTR_IEEE_MAX - 1)
+
+enum ieee_attrs_app {
+	DCB_ATTR_IEEE_APP_UNSPEC,
+	DCB_ATTR_IEEE_APP,
+	__DCB_ATTR_IEEE_APP_MAX
+};
+#define DCB_ATTR_IEEE_APP_MAX (__DCB_ATTR_IEEE_APP_MAX - 1)
+
 /**
  * enum dcbnl_pfc_attrs - DCB Priority Flow Control user priority nested attrs
  *
diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h
index b36ac7e..e2d841e 100644
--- a/include/net/dcbnl.h
+++ b/include/net/dcbnl.h
@@ -20,11 +20,22 @@
 #ifndef __NET_DCBNL_H__
 #define __NET_DCBNL_H__
 
+#include <linux/dcbnl.h>
+
 /*
  * Ops struct for the netlink callbacks.  Used by DCB-enabled drivers through
  * the netdevice struct.
  */
 struct dcbnl_rtnl_ops {
+	/* IEEE 802.1Qaz std */
+	int (*ieee_getets) (struct net_device *, struct ieee_ets *);
+	int (*ieee_setets) (struct net_device *, struct ieee_ets *);
+	int (*ieee_getpfc) (struct net_device *, struct ieee_pfc *);
+	int (*ieee_setpfc) (struct net_device *, struct ieee_pfc *);
+	int (*ieee_getapp) (struct net_device *, struct dcb_app *);
+	int (*ieee_setapp) (struct net_device *, struct dcb_app *);
+
+	/* CEE std */
 	u8   (*getstate)(struct net_device *);
 	u8   (*setstate)(struct net_device *, u8);
 	void (*getpermhwaddr)(struct net_device *, u8 *);
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 19ac2b9..2ff9084 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -66,6 +66,7 @@ static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
 	[DCB_ATTR_PFC_STATE]   = {.type = NLA_U8},
 	[DCB_ATTR_BCN]         = {.type = NLA_NESTED},
 	[DCB_ATTR_APP]         = {.type = NLA_NESTED},
+	[DCB_ATTR_IEEE]	       = {.type = NLA_NESTED},
 };
 
 /* DCB priority flow control to User Priority nested attributes */
@@ -167,6 +168,17 @@ static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = {
 	[DCB_APP_ATTR_PRIORITY]     = {.type = NLA_U8},
 };
 
+/* IEEE 802.1Qaz nested attributes. */
+static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
+	[DCB_ATTR_IEEE_ETS]	    = {.len = sizeof(struct ieee_ets)},
+	[DCB_ATTR_IEEE_PFC]	    = {.len = sizeof(struct ieee_pfc)},
+	[DCB_ATTR_IEEE_APP_TABLE]   = {.type = NLA_NESTED},
+};
+
+static const struct nla_policy dcbnl_ieee_app[DCB_ATTR_IEEE_APP_MAX + 1] = {
+	[DCB_ATTR_IEEE_APP]	    = {.len = sizeof(struct dcb_app)},
+};
+
 /* standard netlink reply call */
 static int dcbnl_reply(u8 value, u8 event, u8 cmd, u8 attr, u32 pid,
                        u32 seq, u16 flags)
@@ -1118,6 +1130,117 @@ err:
 	return ret;
 }
 
+/* Handle IEEE 802.1Qaz SET commands. If any requested operation can not
+ * be completed the entire msg is aborted and error value is returned.
+ * No attempt is made to reconcile the case where only part of the
+ * cmd can be completed.
+ */
+static int dcbnl_ieee_set(struct net_device *netdev, struct nlattr **tb,
+			  u32 pid, u32 seq, u16 flags)
+{
+	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+	struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
+	int err = -EOPNOTSUPP;
+
+	if (!ops)
+		goto err;
+
+	err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX,
+			       tb[DCB_ATTR_IEEE], dcbnl_ieee_policy);
+	if (err)
+		goto err;
+
+	if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) {
+		struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]);
+		err = ops->ieee_setets(netdev, ets);
+		if (err)
+			goto err;
+	}
+
+	if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setets) {
+		struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]);
+		err = ops->ieee_setpfc(netdev, pfc);
+		if (err)
+			goto err;
+	}
+
+	if (ieee[DCB_ATTR_IEEE_APP_TABLE] && ops->ieee_setapp) {
+		struct nlattr *attr;
+		int rem;
+
+		nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
+			struct dcb_app *app_data;
+			if (nla_type(attr) != DCB_ATTR_IEEE_APP)
+				continue;
+			app_data = nla_data(attr);
+			err = ops->ieee_setapp(netdev, app_data);
+			if (err)
+				goto err;
+		}
+	}
+
+err:
+	dcbnl_reply(err, RTM_SETDCB, DCB_CMD_IEEE_SET, DCB_ATTR_IEEE,
+		    pid, seq, flags);
+	return err;
+}
+
+
+/* Handle IEEE 802.1Qaz GET commands. */
+static int dcbnl_ieee_get(struct net_device *netdev, struct nlattr **tb,
+			  u32 pid, u32 seq, u16 flags)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	struct dcbmsg *dcb;
+	struct nlattr *ieee;
+	const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+	int err;
+
+	if (!ops)
+		return -EOPNOTSUPP;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	nlh = NLMSG_NEW(skb, pid, seq, RTM_GETDCB, sizeof(*dcb), flags);
+
+	dcb = NLMSG_DATA(nlh);
+	dcb->dcb_family = AF_UNSPEC;
+	dcb->cmd = DCB_CMD_IEEE_GET;
+
+	NLA_PUT_STRING(skb, DCB_ATTR_IFNAME, netdev->name);
+
+	ieee = nla_nest_start(skb, DCB_ATTR_IEEE);
+	if (!ieee)
+		goto nla_put_failure;
+
+	if (ops->ieee_getets) {
+		struct ieee_ets ets;
+		err = ops->ieee_getets(netdev, &ets);
+		if (!err)
+			NLA_PUT(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets);
+	}
+
+	if (ops->ieee_getpfc) {
+		struct ieee_pfc pfc;
+		err = ops->ieee_getpfc(netdev, &pfc);
+		if (!err)
+			NLA_PUT(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc);
+	}
+
+	nla_nest_end(skb, ieee);
+	nlmsg_end(skb, nlh);
+
+	return rtnl_unicast(skb, &init_net, pid);
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
 static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 {
 	struct net *net = sock_net(skb->sk);
@@ -1223,6 +1346,14 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 		ret = dcbnl_setapp(netdev, tb, pid, nlh->nlmsg_seq,
 		                   nlh->nlmsg_flags);
 		goto out;
+	case DCB_CMD_IEEE_SET:
+		ret = dcbnl_ieee_set(netdev, tb, pid, nlh->nlmsg_seq,
+				 nlh->nlmsg_flags);
+		goto out;
+	case DCB_CMD_IEEE_GET:
+		ret = dcbnl_ieee_get(netdev, tb, pid, nlh->nlmsg_seq,
+				 nlh->nlmsg_flags);
+		goto out;
 	default:
 		goto errout;
 	}


^ permalink raw reply related

* Re: [PATCH net-next-2.6 v2 1/1] can: c_can: Added support for Bosch C_CAN controller
From: Wolfgang Grandegger @ 2010-12-20 20:39 UTC (permalink / raw)
  To: Bhupesh SHARMA
  Cc: Socketcan-core-0fE9KPoRgkgATYTw5x5z8w@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Marc Kleine-Budde
In-Reply-To: <D5ECB3C7A6F99444980976A8C6D896384DEAE47E1A-8vAmw3ZAcdzhJTuQ9jeba9BPR1lH4CV8@public.gmane.org>

Hi Bhupesh,

On 12/20/2010 05:29 AM, Bhupesh SHARMA wrote:
> Hi Wolfgang,
> 
> Thanks for the review.
> Please see my replies in-line:
> 
>> here comes my first quick preview.
>> On 12/15/2010 10:58 AM, Bhupesh Sharma wrote:
>>> Bosch C_CAN controller is a full-CAN implementation which is
>> compliant
>>> to CAN protocol version 2.0 part A and B. Bosch C_CAN user manual can
>> be
>>> obtained from:
>>> http://www.semiconductors.bosch.de/pdf/Users_Manual_C_CAN.pdf
>>>
>>> This patch adds the support for this controller.
>>> The following are the design choices made while writing the
>> controller driver:
>>> 1. Interface Register set IF1 has be used only in the current design.
>>> 2. Out of the 32 Message objects available, 16 are kept aside for RX
>> purposes
>>>    and the rest for TX purposes.
>>> 3. NAPI implementation is such that both the TX and RX paths function
>> in
>>>    polling mode.
>>>
>>> Changes since V1:
>>> 1. Implemented C_CAN as a platform driver with means of providing the
>>>    platform details and register offsets which may vary for different
>> SoCs
>>>    through platform data struct.
>>> 2. Implemented NAPI.
>>> 3. Removed memcpy calls globally.
>>> 4. Implemented CAN_CTRLMODE_*
>>> 5. Implemented and used priv->can.do_get_berr_counter.
>>> 6. Implemented c_can registers as a struct instead of enum.
>>> 7. Improved the TX path by implementing routines to get next Tx and
>> echo msg
>>>    objects.
>>>
>>> Signed-off-by: Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>
>>> ---
>>>  drivers/net/can/Kconfig  |    7 +
>>>  drivers/net/can/Makefile |    1 +
>>>  drivers/net/can/c_can.c  | 1217
>> ++++++++++++++++++++++++++++++++++++++++++++++
>>>  3 files changed, 1225 insertions(+), 0 deletions(-)
>>>  create mode 100644 drivers/net/can/c_can.c
>>>
>>> diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig
>>> index 9d9e453..25d9d2e 100644
>>> --- a/drivers/net/can/Kconfig
>>> +++ b/drivers/net/can/Kconfig
>>> @@ -41,6 +41,13 @@ config CAN_AT91
>>>     ---help---
>>>       This is a driver for the SoC CAN controller in Atmel's
>> AT91SAM9263.
>>>
>>> +config CAN_C_CAN
>>> +   tristate "Bosch C_CAN controller"
>>> +   depends on CAN_DEV
>>> +   ---help---
>>> +     If you say yes to this option, support will be included for the
>>> +     Bosch C_CAN controller.
>>> +
>>>  config CAN_TI_HECC
>>>     depends on CAN_DEV && ARCH_OMAP3
>>>     tristate "TI High End CAN Controller"
>>> diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile
>>> index 0057537..b6cbe74 100644
>>> --- a/drivers/net/can/Makefile
>>> +++ b/drivers/net/can/Makefile
>>> @@ -12,6 +12,7 @@ obj-y                             += usb/
>>>  obj-$(CONFIG_CAN_SJA1000)  += sja1000/
>>>  obj-$(CONFIG_CAN_MSCAN)            += mscan/
>>>  obj-$(CONFIG_CAN_AT91)             += at91_can.o
>>> +obj-$(CONFIG_CAN_C_CAN)            += c_can.o
>>>  obj-$(CONFIG_CAN_TI_HECC)  += ti_hecc.o
>>>  obj-$(CONFIG_CAN_MCP251X)  += mcp251x.o
>>>  obj-$(CONFIG_CAN_BFIN)             += bfin_can.o
>>> diff --git a/drivers/net/can/c_can.c b/drivers/net/can/c_can.c
>>> new file mode 100644
>>> index 0000000..c281c17
>>> --- /dev/null
>>> +++ b/drivers/net/can/c_can.c
>>> @@ -0,0 +1,1217 @@
>>> +/*
>>> + * CAN bus driver for Bosch C_CAN controller
>>> + *
>>> + * Copyright (C) 2010 ST Microelectronics
>>> + * Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>
>>> + *
>>> + * Borrowed heavily from the C_CAN driver originally written by:
>>> + * Copyright (C) 2007
>>> + * - Sascha Hauer, Marc Kleine-Budde, Pengutronix
>> <s.hauer-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
>>> + * - Simon Kallweit, intefo AG <simon.kallweit-+G9qxTFKJT/tRgLqZ5aouw@public.gmane.org>
>>> + *
>>> + * Bosch C_CAN controller is compliant to CAN protocol version 2.0
>> part A and B.
>>> + * Bosch C_CAN user manual can be obtained from:
>>> + * http://www.semiconductors.bosch.de/pdf/Users_Manual_C_CAN.pdf
>>> + *
>>> + * This file is licensed under the terms of the GNU General Public
>>> + * License version 2. This program is licensed "as is" without any
>>> + * warranty of any kind, whether express or implied.
>>> + */
>>> +
>>> +#include <linux/kernel.h>
>>> +#include <linux/version.h>
>>> +#include <linux/module.h>
>>> +#include <linux/interrupt.h>
>>> +#include <linux/delay.h>
>>> +#include <linux/netdevice.h>
>>> +#include <linux/if_arp.h>
>>> +#include <linux/if_ether.h>
>>> +#include <linux/list.h>
>>> +#include <linux/delay.h>
>>> +#include <linux/workqueue.h>
>>> +#include <linux/io.h>
>>> +#include <linux/platform_device.h>
>>> +#include <linux/clk.h>
>>> +
>>> +#include <linux/can.h>
>>> +#include <linux/can/dev.h>
>>> +#include <linux/can/error.h>
>>> +
>>> +#define DRV_NAME "c_can"
>>> +
>>> +/* control register */
>>> +#define CONTROL_TEST               (1 << 7)
>>> +#define CONTROL_CCE                (1 << 6)
>>> +#define CONTROL_DISABLE_AR (1 << 5)
>>> +#define CONTROL_ENABLE_AR  (0 << 5)
>>> +#define CONTROL_EIE                (1 << 3)
>>> +#define CONTROL_SIE                (1 << 2)
>>> +#define CONTROL_IE         (1 << 1)
>>> +#define CONTROL_INIT               (1 << 0)
>>> +
>>> +/* test register */
>>> +#define TEST_RX                    (1 << 7)
>>> +#define TEST_TX1           (1 << 6)
>>> +#define TEST_TX2           (1 << 5)
>>> +#define TEST_LBACK         (1 << 4)
>>> +#define TEST_SILENT                (1 << 3)
>>> +#define TEST_BASIC         (1 << 2)
>>> +
>>> +/* status register */
>>> +#define STATUS_BOFF                (1 << 7)
>>> +#define STATUS_EWARN               (1 << 6)
>>> +#define STATUS_EPASS               (1 << 5)
>>> +#define STATUS_RXOK                (1 << 4)
>>> +#define STATUS_TXOK                (1 << 3)
>>> +#define STATUS_LEC_MASK            0x07
>>> +#define LEC_STUFF_ERROR            1
>>> +#define LEC_FORM_ERROR             2
>>> +#define LEC_ACK_ERROR              3
>>> +#define LEC_BIT1_ERROR             4
>>> +#define LEC_BIT0_ERROR             5
>>> +#define LEC_CRC_ERROR              6
>>
>> Could be an enum!?
> 
> Yes LEC error types can be defined as enum, but #define also
> seems fine.

If you use it with switch, an anonymous enumeration seems more
appropriate for me.

...
>>> +static inline int c_can_object_put(struct net_device *dev,
>>> +                                   int iface, int objno, int mask)
>>> +{
>>> +   struct c_can_priv *priv = netdev_priv(dev);
>>> +   int timeout = (6 / priv->can.clock.freq);
>>
>> Hm, "timeout = 0" does not look resonable.
> 
> Let me see if I get your point here.
> You mean use something like:
> 
>         count = 6 /* non-zero count at start */
>         /* write message object no in IF COMM_REQ reg */
>         while (count) {
>                 udelay(timeout);
>                 count--;
>         }
>         /* read BUSY status from IF COM reg */
>         if (busy)
>                 return -ETIMEDOUT;

Yes, using a larger repeat count with a shorter delay is more efficient,
I think. And an error message would be nice as well, especially if you
don't handle the return code.

>>> +           return -ETIMEDOUT;
>>> +   }
>>
>> Is the timeout really needed? If yes, re-trying various times would
>> more
>> more safe.
> 
> Yes timeout is needed as per specs. Please see the approach given above.

OK, I just asked because it did work with timeout=0!!!

> If you agree the same can be added in V3.

See above.

>>> +   return 0;
>>> +}
...

>>> +   stats->rx_packets++;
>>> +   stats->rx_bytes += frame->can_dlc;
>>> +
>>> +   return 0;
>>
>> The return values are not handled anywhere!
> 
> Hmm. This is the tricky part. To be honest, a
> lot of driver's don't handle all the return values.
> This function is called from an isr / poll-event.
> Do you think it's useful to handle the return values
> there?

Well, if you don't handle the return code just use a *void* function and
print an error message instead, if appropriate.

...
>>> +
>>> +/*
>>> + * Configure C_CAN message objects for Tx and Rx purposes:
>>> + * C_CAN provides a total of 32 message objects that can be
>> configured
>>> + * either for Tx or Rx purposes. Here the first 16 message objects
>> are used as
>>> + * a reception FIFO. The end of reception FIFO is signified by the
>> EoB bit
>>> + * being SET. The remaining 16 message objects are kept aside for Tx
>> purposes.
>>> + * See user guide document for further details on configuring
>> message
>>> + * objects.
>>> + */
>>
>> Did you verify *in-order* transmisson and reception? You could use the
>> canfdtest program from the can-utils.
> 
> I will check V3 for the same.
> I also checked Marc's at91 driver and the
> approach implemented there for in-order rx
> object reception seems fine to me. If you and Marc agree I can
> use the same here. Also I need to add credits for the same :)

In-order-transmission and reception is *mandatory*. You can also have a
look to the pch_can driver. As already mentioned above, the "canfdtest"
program from the "can-utils" is useful to validate that requirement.

...
>>> +/*
>>> + * Configure C_CAN chip:
>>> + * - enable/disable auto-retransmission
>>> + * - set operating mode
>>> + * - configure message objects
>>> + */
>>> +static int c_can_chip_config(struct net_device *dev)
>>> +{
>>> +   struct c_can_priv *priv = netdev_priv(dev);
>>> +
>>> +   if (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT)
>>> +           /* disable automatic retransmission */
>>> +           priv->write_reg(priv, &priv->reg_base->control,
>>> +                           CONTROL_DISABLE_AR);
>>> +   else
>>> +           /* enable automatic retransmission */
>>> +           priv->write_reg(priv, &priv->reg_base->control,
>>> +                           CONTROL_ENABLE_AR);
>>> +
>>> +   if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) {
>>> +           /* loopback mode : useful for self-test function */
>>> +           priv->write_reg(priv, &priv->reg_base->control,
>> (CONTROL_EIE |
>>> +                           CONTROL_SIE | CONTROL_IE | CONTROL_TEST));
>>> +           priv->write_reg(priv, &priv->reg_base->test, TEST_LBACK);
>>> +   } else if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) {
>>> +           /* silent mode : bus-monitoring mode */
>>> +           priv->write_reg(priv, &priv->reg_base->control,
>> (CONTROL_EIE |
>>> +                           CONTROL_SIE | CONTROL_IE | CONTROL_TEST));
>>> +           priv->write_reg(priv, &priv->reg_base->test, TEST_SILENT);
>>> +   } else if (priv->can.ctrlmode & (CAN_CTRLMODE_LISTENONLY &
>>> +                                   CAN_CTRLMODE_LOOPBACK)) {
>>
>> As I see it, this case is never entered.
> 
> You are right. But as we discussed during the review of V1,
> as the c_can core supports this mode (loopback + listen-only)
> we should support the same in the driver as well.

Yes, and therefore you need to check the bits first, otherwise it will
not work.

> 
>>> +           /* loopback + silent mode : useful for hot self-test */
>>> +           priv->write_reg(priv, &priv->reg_base->control,
>> (CONTROL_EIE |
>>> +                           CONTROL_SIE | CONTROL_IE | CONTROL_TEST));
>>> +           priv->write_reg(priv, &priv->reg_base->test,
>>> +                           (TEST_LBACK | TEST_SILENT));
>>> +   } else
>>> +           /* normal mode*/
>>> +           priv->write_reg(priv, &priv->reg_base->control,
>>> +                           (CONTROL_EIE | CONTROL_SIE | CONTROL_IE));
>>> +
>>> +   /* configure message objects */
>>> +   c_can_configure_msg_objects(dev);
>>> +
>>> +   return 0;
>>> +}

...

>>> +/*
>>> + * c_can_do_rx_poll - read multiple CAN messages from message
>> objects
>>> + */
>>> +static int c_can_do_rx_poll(struct net_device *dev, int quota)
>>> +{
>>> +   u32 num_rx_pkts = 0;
>>> +   unsigned int msg_obj;
>>> +   struct c_can_priv *priv = netdev_priv(dev);
>>> +   u32 val = c_can_read_reg32(priv, &priv->reg_base->newdat1);
>>> +
>>> +   while (val & RECEIVE_OBJECT_BITS) {
>>> +           for (msg_obj = C_CAN_MSG_OBJ_RX_FIRST;
>>> +                           msg_obj <= C_CAN_MSG_OBJ_RX_LAST; msg_obj++) {
>>> +                   if (val & (1 << msg_obj)) {
>>> +                           c_can_read_msg_object(dev, 0, msg_obj);
>>> +                           num_rx_pkts++;
>>> +                           quota--;
>>
>> Where do you handle quota?
> 
> Sorry but I didn't get your meaning here.
> Everytime the rx_poll function is called quota is
> decremented and num of rx packets received is incremented.
> Am I missing something here?

You should not handle more than "quota" messages in this function. This
means that it shoud return if quota==0 is reached.

...

>>> +   /* check for 'last error code' which tells us the
>>> +    * type of the last error to occur on the CAN bus
>>> +    */
>>> +   if (lec_type) {
>>> +           /* common for all type of bus errors */
>>> +           priv->can.can_stats.bus_error++;
>>> +           stats->rx_errors++;
>>> +           cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR;
>>> +           cf->data[2] |= CAN_ERR_PROT_UNSPEC;
>>> +
>>> +           if (lec_type & LEC_STUFF_ERROR) {
>>> +                   dev_info(dev->dev.parent, "stuff error\n");
>>> +                   cf->data[2] |= CAN_ERR_PROT_STUFF;
>>> +           }
>>> +           if (lec_type & LEC_FORM_ERROR) {
>>> +                   dev_info(dev->dev.parent, "form error\n");
>>> +                   cf->data[2] |= CAN_ERR_PROT_FORM;
>>> +           }
>>> +           if (lec_type & LEC_ACK_ERROR) {
>>> +                   dev_info(dev->dev.parent, "ack error\n");
>>> +                   cf->data[2] |= (CAN_ERR_PROT_LOC_ACK |
>>> +                                   CAN_ERR_PROT_LOC_ACK_DEL);
>>> +           }
>>> +           if (lec_type & LEC_BIT1_ERROR) {
>>> +                   dev_info(dev->dev.parent, "bit1 error\n");
>>> +                   cf->data[2] |= CAN_ERR_PROT_BIT1;
>>> +           }
>>> +           if (lec_type & LEC_BIT0_ERROR) {
>>> +                   dev_info(dev->dev.parent, "bit0 error\n");
>>> +                   cf->data[2] |= CAN_ERR_PROT_BIT0;
>>> +           }
>>> +           if (lec_type & LEC_CRC_ERROR) {
>>> +                   dev_info(dev->dev.parent, "CRC error\n");
>>
>> Please use dev_dbg() here and above
> 
> Ok.
> 
>>> +                   cf->data[2] |= (CAN_ERR_PROT_LOC_CRC_SEQ |
>>> +                                   CAN_ERR_PROT_LOC_CRC_DEL);
>>> +           }
>>> +   }
>>
>> The lec should be handled by a switch statement. Also, please use
>> dev_dbg in favor of dev_info.
> 
> But as I have seen on the board, there can be multiple lec bits
> set at a time (e.g. shorting CAN TX and RX lines). In such cases the
> multiple-if structure handles the same. Do you agree?

No. Testing the individual bits of an enumeration value does not make
sense. I just speak about the last error code (lec_type).

>>> +   netif_receive_skb(skb);
>>> +   stats->rx_packets++;
>>> +   stats->rx_bytes += cf->can_dlc;
>>> +
>>> +   return 1;
>>> +}
>>> +
...
>>> +   priv->can.ctrlmode_supported = CAN_CTRLMODE_ONE_SHOT |
>>> +                                   CAN_CTRLMODE_LOOPBACK |
>>> +                                   CAN_CTRLMODE_LISTENONLY |
>>> +                                   CAN_CTRLMODE_BERR_REPORTING;
>>
>> Where is CAN_CTRLMODE_BERR_REPORTING implemented? Note that it has
>> nothing to do with do_get_berr_counter. Please check the SJA1000
>> driver:
>>
>> http://lxr.linux.no/#linux+v2.6.36/drivers/net/can/sja1000/sja1000.c#L1
>> 46
>>
>> Bus error handling can be requested by the user via netlink interface.
>>
>>  # ip link set canX type can ... berr-reporting on
>>
>> The driver then usually enables the bus error interrupts. I just
>> realize
>> that Documentation/networking/can.txt is not up-to-date. I will provide
>> a patch a.s.a.p.
> 
> Yes, I have seen the sja1000 implementation before preparing V2.
> But unfortunately the c_can core does not also only the bus-error-reporting
> to be masked/unmasked. There are three interrupt masks available in the
> Control register:
> 
> a) Error Interrupt Enable
> If Enabled - A change in the bits BOff or EWarn in the Status Register will
> generate an interrupt.
> b) Status Change Interrupt Enable
> If Enabled - An interrupt will be generated when a message transfer is
> successfully completed or a CAN bus error is detected.
> c) Module Interrupt Enable
> If Enabled - Interrupts will set IRQ_B to LOW.

OK, then you should handle it like the flexcan driver. See:

http://lxr.linux.no/#linux+v2.6.36/drivers/net/can/flexcan.c#L533

In the meantime I compared the CAN chapter of the PCH manual with the
C_CAN manual. The paragraphs I checked are *identical*. This makes
clear, that the "pch_can" is a clone of the  C_CAN CAN controller, with
a few extensions, though. Therefore it would make sense, to implement a
bus sensitive interface like for the SJA1000 allowing to handle both CAN
controllers with one driver sooner than later. Therefore, could you
please implement:

  drivers/net/can/c_can/c_can.c
                       /c_can_platform.c

Then an interface to the PCI based PCH CAN controller could be added
easily, e.g. as "pch_pci.c". You already had something similar in your
RFC version of the patch, IIRC.

Thanks,

Wolfgang.

^ permalink raw reply

* Re: [PATCH net-next] bnx2x: Add Nic partitioning mode (57712 devices)
From: Dimitris Michailidis @ 2010-12-20 19:44 UTC (permalink / raw)
  To: Matt Domsch
  Cc: Eilon Greenstein, Dmitry Kravkov, davem@davemloft.net,
	netdev@vger.kernel.org, narendra_k@dell.com,
	jordan_hargrave@dell.com
In-Reply-To: <20101219054953.GC5854@auslistsprd01.us.dell.com>

Matt Domsch wrote:
>> You can have several interfaces with same device link and different dev_id. 
>>  While the current driver doesn't do it you could also have several 
>> interfaces with different device links but same dev_id (NPAR situation, 
>> notice again that dev_ids are not per PCI function), or interfaces with 
>> different device and dev_id, or even interfaces with same device and dev_id.
> 
> What is the scope of dev_id then?  It's not per PCI device like I
> thought.

I don't think it could be that way because for these cards you can't 
statically tell which ports are controlled by a PCI function.  So knowing 
that an interface is say port 0 of a function would help little.

> It sounds like it's per card, but how can I know the card
> boundary?

Yes, it's per card and covers the PFs and VFs of the card.

> If I have 2 cards driven by cxgb4 in the system, each with say 4
> ports.  I could see a minimum of 8 PCI devices (fine), but the dev_id
> values would be?  0,1,2,3; 0,1,2,3 ?

Correct.

> How can I tell that these are
> two different cards, with two different sets of dev_id values, rather
> than one card with 4 ports, 8 (NPAR or SR-IOV) interfaces, with each 2
> interfaces mapping to the same port?

Doesn't the information in /sys/devices distinguish them?  For example, 
something like

/sys/devices/pci0000:00/0000:00:07.0/0000:04:00.0/net/eth2/dev_id == 0
/sys/devices/pci0000:00/0000:00:07.0/0000:04:00.1/net/eth3/dev_id == 0
/sys/devices/pci0000:00/0000:00:07.0/0000:04:01.0/net/eth5/dev_id == 0
/sys/devices/pci0000:00/0000:00:1c.0/0000:05:00.0/net/eth4/dev_id == 0

tells me there are two cards, one has eth4 on port 0, the other has eth2, 
eth3, and eth5 on its port 0 with eth5 being on a VF.

> dev_id is not system-wide unique.  It's not even slot unique best as I
> can tell.  If I had a PCI slot extender, with 2 PCI slots, and I put
> two of the above cards in, I would see 0,1,2,3; 0,1,2,3.  To be fair,
> my naming scheme doesn't really account for such an extender, though
> currently it would go pci<slot>#<12345678>.

Can you give an example of what /sys/devices looks like in the case you're 
considering?

^ permalink raw reply

* [PATCH] ehea: Avoid changing vlan flags
From: leitao @ 2010-12-20 19:02 UTC (permalink / raw)
  To: davem; +Cc: shemminger, netdev, jesse, Breno Leitao
In-Reply-To: <AANLkTimODzrAWS+05+FChtme_3um_9Ky7cwa3gWHFaXp@mail.gmail.com>

This patch avoids disabling the vlan flags using ethtool.

Signed-off-by: Breno Leitao <leitao@linux.vnet.ibm.com>
---
 drivers/net/ehea/ehea_ethtool.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ehea/ehea_ethtool.c b/drivers/net/ehea/ehea_ethtool.c
index afebf20..3e2e734 100644
--- a/drivers/net/ehea/ehea_ethtool.c
+++ b/drivers/net/ehea/ehea_ethtool.c
@@ -265,6 +265,13 @@ static void ehea_get_ethtool_stats(struct net_device *dev,
 
 static int ehea_set_flags(struct net_device *dev, u32 data)
 {
+	/* Avoid changing the VLAN flags */
+	if ((data & (ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN)) !=
+	    (ethtool_op_get_flags(dev) & (ETH_FLAG_RXVLAN |
+					  ETH_FLAG_TXVLAN))){
+		return -EINVAL;
+	}
+
 	return ethtool_op_set_flags(dev, data, ETH_FLAG_LRO
 					| ETH_FLAG_TXVLAN
 					| ETH_FLAG_RXVLAN);
-- 
1.7.3.3


^ permalink raw reply related

* Re: [PATCH 1/1] TCP: increase default initial receive window.
From: David Miller @ 2010-12-20 18:48 UTC (permalink / raw)
  To: rick.jones2; +Cc: shemminger, nanditad, netdev, therbert, chavey, ycheng
In-Reply-To: <4D0F9FCC.7010108@hp.com>

From: Rick Jones <rick.jones2@hp.com>
Date: Mon, 20 Dec 2010 10:26:20 -0800

>> Agree this is a good idea, but some further notes:
>>   * The control of receive window is a local function not covered by
>>     RFC.
>>   * Linux manipulates receive window automatically, unlike some other
>>     implementations.
>> But any change to TCP risks breaking other broken implementations
>> and users need a good way to recover. 
> 
> Always good to be careful, but break in what way?  Many stacks have
> been advertising an initial receive window of well above 10 segments
> going back literally decades.

Agreed.

^ permalink raw reply

* Re: [patch -next] vmxnet3: locking problems in xmit
From: David Miller @ 2010-12-20 18:44 UTC (permalink / raw)
  To: error27; +Cc: sbhatewara, pv-drivers, netdev, kernel-janitors
In-Reply-To: <20101220130315.GY1936@bicker>

From: Dan Carpenter <error27@gmail.com>
Date: Mon, 20 Dec 2010 16:03:15 +0300

> There were several paths that didn't release their locks.
> 
> Signed-off-by: Dan Carpenter <error27@gmail.com>

Applied.

^ permalink raw reply

* Re: [patch -next] typhoon: memory corruption in typhoon_get_drvinfo()
From: David Miller @ 2010-12-20 18:43 UTC (permalink / raw)
  To: error27; +Cc: dave, netdev, kernel-janitors
In-Reply-To: <20101220130018.GX1936@bicker>

From: Dan Carpenter <error27@gmail.com>
Date: Mon, 20 Dec 2010 16:00:18 +0300

> info->version only has space for 32 characters but my UTS_RELEASE is
> "2.6.37-rc6-next-20101217-05817-ge935fc8-dirty" so it doesn't fit.
> This is supposed to be the version of the driver, not the kernel
> version.  This driver doesn't have a version so lets just leave it
> blank.
> 
> Signed-off-by: Dan Carpenter <error27@gmail.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH] net: Add USB PID for new MOSCHIP USB ethernet controller MCS7832 variant
From: David Miller @ 2010-12-20 18:39 UTC (permalink / raw)
  To: andi; +Cc: arnd, dhollis, pchang23, netdev, linux-kernel
In-Reply-To: <20101219154257.GA29086@rhlx01.hs-esslingen.de>

From: Andreas Mohr <andi@lisas.de>
Date: Sun, 19 Dec 2010 16:42:57 +0100

> Due to active notification of the new MCS7832 version by the manufacturer
> (Mr. Milton; thanks!) -- quote: "functionality same as MCS7830",
> I'm now submitting this patch (on -rc6), intended for networking.git and -stable.
> 
> - add MCS7832 USB PID to be able to support this new device variant, too
> - add related descriptions
> 
> Signed-off-by: Andreas Mohr <andi@lisas.de>

Applied.

^ permalink raw reply

* Re: [PATCH] ipv4: Flush per-ns routing cache more sanely.
From: David Miller @ 2010-12-20 18:36 UTC (permalink / raw)
  To: eric.dumazet; +Cc: ebiederm, netdev, daniel.lezcano
In-Reply-To: <1292865233.2800.192.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 20 Dec 2010 18:13:53 +0100

> Le dimanche 19 décembre 2010 à 21:14 -0800, David Miller a écrit :
>> ipv4: Flush per-ns routing cache more sanely.
>> 
>> Flush the routing cache only of entries that match the
>> network namespace in which the purge event occurred.
>> 
>> Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> 
> Seems fine to me, thanks !
> 
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>

Thanks for reviewing.

^ permalink raw reply

* Re: [PATCH] ehea: Fixing some message level
From: David Miller @ 2010-12-20 18:35 UTC (permalink / raw)
  To: leitao; +Cc: netdev
In-Reply-To: <1292526792-28901-1-git-send-email-leitao@linux.vnet.ibm.com>

From: leitao@linux.vnet.ibm.com
Date: Thu, 16 Dec 2010 17:13:12 -0200

> Currently there are some info message that is set as error, and an error
> message that is set as debug. This patch just fixes it.
> 
> Signed-off-by: Breno Leitao <leitao@linux.vnet.ibm.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH] batman-adv: Return hna count on local buffer fill
From: David Miller @ 2010-12-20 18:32 UTC (permalink / raw)
  To: sven; +Cc: netdev
In-Reply-To: <1292786890-17755-1-git-send-email-sven@narfation.org>

From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 19 Dec 2010 20:28:10 +0100

> hna_local_fill_buffer must return the number of added hna entries and
> not the last checked hash bucket.
> 
> Signed-off-by: Sven Eckelmann <sven@narfation.org>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] pktgen: Remove unnecessary prefix from pr_<level>
From: David Miller @ 2010-12-20 18:30 UTC (permalink / raw)
  To: joe; +Cc: kristian, netdev
In-Reply-To: <1292526153.29894.37.camel@Joe-Laptop>

From: Joe Perches <joe@perches.com>
Date: Thu, 16 Dec 2010 11:02:33 -0800

> Remove "pktgen: " prefix string from one pr_info.
> pr_fmt adds it, so this is a duplicate.
> 
> Signed-off-by: Joe Perches <joe@perches.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-2.6] net_sched: always clone skbs
From: David Miller @ 2010-12-20 18:27 UTC (permalink / raw)
  To: eric.dumazet; +Cc: xiaosuo, jarkao2, netdev, hadi, pstaszewski
In-Reply-To: <1292855876.2800.46.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 20 Dec 2010 15:37:56 +0100

> Le lundi 20 décembre 2010 à 22:35 +0800, Changli Gao a écrit :
>> Pawel reported a panic related to handling shared skbs in ixgbe
>> incorrectly. So we need to revert my previous patch to work around
>> this bug. Instead of reverting the patch completely, I just revert
>> the essential lines, so we can add the previous optimization
>> back more easily in future.
>> 
>>     commit 3511c9132f8b1e1b5634e41a3331c44b0c13be70
>>     Author: Changli Gao <xiaosuo@gmail.com>
>>     Date:   Sat Oct 16 13:04:08 2010 +0000
>>     
>>         net_sched: remove the unused parameter of qdisc_create_dflt()
>> 
>> Reported-by: Pawel Staszewski <pstaszewski@itcare.pl>
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> 
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied, thanks everyone.

^ permalink raw reply

* Re: [PATCH 1/1] TCP: increase default initial receive window.
From: Rick Jones @ 2010-12-20 18:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Nandita Dukkipati, David Miller, netdev, therbert, chavey, ycheng
In-Reply-To: <20101220090352.3ea7ade2@nehalam>

> Agree this is a good idea, but some further notes:
>   * The control of receive window is a local function not covered by
>     RFC.
>   * Linux manipulates receive window automatically, unlike some other
>     implementations.
> 
> But any change to TCP risks breaking other broken implementations
> and users need a good way to recover. 

Always good to be careful, but break in what way?  Many stacks have been 
advertising an initial receive window of well above 10 segments going back 
literally decades.

HP-UX systems have been advertising a default/initial recieve window of 32768 
bytes since the mid 1990s, Solaris systems have been advertising a default 
receive window of 49152 for ages.  I cannot speak to Windows' default advertised 
window.  While that sound a bit like "But MOM! All my friends are doing it." it 
does seem to suggest that advertising an initial receive window of 10 segments 
is unlikely to uncover anything new.

rick jones

^ permalink raw reply

* RE: e1000e crash with 82574L  2.6.37-0.rc5
From: Wyborny, Carolyn @ 2010-12-20 17:24 UTC (permalink / raw)
  To: Brian Neu, netdev@vger.kernel.org; +Cc: e1000-devel@lists.sourceforge.net
In-Reply-To: <128120.41188.qm@web39307.mail.mud.yahoo.com>



>-----Original Message-----
>From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
>On Behalf Of Brian Neu
>Sent: Saturday, December 18, 2010 10:13 AM
>To: netdev@vger.kernel.org
>Subject: e1000e crash with 82574L 2.6.37-0.rc5
>
>Is this a known bug?
>Any fix out there?
>
>[162536.704103] WARNING: at net/sched/sch_generic.c:258
>dev_watchdog+0x111/0x185()  [162536.704110] Hardware name: H8SGL
>[162536.704116] NETDEV WATCHDOG: eth0 (e1000e): transmit queue 0 timed
>out
>[162536.704122] Modules linked in: ip6table_filter ebtable_nat ebtables
>act_police cls_flow cls_fw cls_u32 sch_htb sch_hfsc sch_ingress sch_sfq
>bridge
>stp llc xt_time xt_connlimit xt_realm iptable_raw xt_comment xt_recent
>xt_policy ipt_ULOG ipt_REDIRECT ipt_NETMAP ipt_MASQUERADE ipt_ECN
>ipt_ecn
>ipt_CLUSTERIP ipt_ah ipt_addrtype nf_nat_tftp nf_nat_snmp_basic
>nf_nat_sip
>nf_nat_pptp nf_nat_proto_gre nf_nat_irc nf_nat_h323 nf_nat_ftp
>nf_nat_amanda
>ts_kmp nf_conntrack_amanda nf_conntrack_sane nf_conntrack_tftp
>nf_conntrack_sip
>nf_conntrack_proto_sctp nf_conntrack_pptp nf_conntrack_proto_gre
>nf_conntrack_netlink nf_conntrack_netbios_ns nf_conntrack_irc
>nf_conntrack_h323
>nf_conntrack_ftp xt_TPROXY nf_tproxy_core ip6_tables nf_defrag_ipv6
>xt_tcpmss
>xt_pkttype xt_physdev xt_owner xt_NFQUEUE xt_NFLOG nfnetlink_log
>xt_multiport
>xt_mark xt_mac xt_limit xt_length xt_iprange xt_helper xt_hashlimit
>xt_DSCP
>xt_dscp xt_dccp xt_connmark xt_CLASSIFY ipt_LOG iptable_nat nf_nat
>iptable_mangle nfnetlink  fuse tun sunrpc cpufreq_ondemand powernow_k8
>freq_table mperf ipv6 kvm_amd kvm  uinput amd64_edac_mod edac_core
>e1000e
>edac_mce_amd i2c_piix4 i2c_core k10temp  ghes hed joydev microcode btrfs
>zlib_deflate libcrc32c raid1 pata_acpi  ata_generic sata_promise
>pata_atiixp
>sata_sil24 uas usb_storage [last unloaded:  scsi_wait_scan]
>[162536.704368]
>Pid: 0, comm: kworker/0:0 Not tainted  2.6.37-0.rc5.git2.1.fc13.x86_64
>#1
>[162536.704375] Call Trace:  [162536.704380]  <IRQ>
>[<ffffffff81052e29>]
>warn_slowpath_common+0x85/0x9d  [162536.704406]  [<ffffffff81052ee4>]
>warn_slowpath_fmt+0x46/0x48  [162536.704418]  [<ffffffff81419f70>]
>dev_watchdog+0x111/0x185  [162536.704430]  [<ffffffff8105fdcf>]
>run_timer_softirq+0x246/0x33f  [162536.704441]  [<ffffffff8105fd29>] ?
>run_timer_softirq+0x1a0/0x33f  [162536.704452]  [<ffffffff81419e5f>] ?
>dev_watchdog+0x0/0x185  [162536.704463]  [<ffffffff81059746>]
>__do_softirq+0x101/0x20c  [162536.704474]  [<ffffffff81011aa6>] ?
>native_sched_clock+0x2d/0x5f  [162536.704484]  [<ffffffff8100bc1c>]
>call_softirq+0x1c/0x30  [162536.704493]  [<ffffffff8100d29f>]
>do_softirq+0x4b/0xa3  [162536.704502]  [<ffffffff81059480>]
>irq_exit+0x57/0x9b
>[162536.704514]  [<ffffffff814bf93c>] smp_apic_timer_interrupt+0x8d/0x9b
>[162536.704523]  [<ffffffff8100b6d3>] apic_timer_interrupt+0x13/0x20
>[162536.704529]  <EOI>  [<ffffffff81012686>] ? default_idle+0x3e/0x65
>[162536.704549]  [<ffffffff8102d159>] ? native_safe_halt+0xb/0xd
>[162536.704560]  [<ffffffff810816ee>] ? trace_hardirqs_on+0xd/0xf
>[162536.704571]  [<ffffffff8101268b>] default_idle+0x43/0x65
>[162536.704581]
>[<ffffffff81009c81>] cpu_idle+0xbe/0x132  [162536.704592]
>[<ffffffff814af9ea>]
>start_secondary+0x242/0x244
>
>--
>To unsubscribe from this list: send the line "unsubscribe netdev" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html

Hello,

We have some known issues with 82574L but most have been resolved in the latest versions, so I'll need some more information.

What version of e1000e are you using exactly?  Are you able to download and test the latest version of the driver from SourceForge?
  
Please open an issue at SourceForge.net for easier tracking of debugging information.

Please provide an output of lspci -vvv.

What hw platform is this happening on?  How often does it happen and how long does it take to happen after reset or reboot?  Is ASPM enabled or disabled on your system.  Its possible to disable this in the BIOS, but not all BIOS provide the option.  If its enabled for some reason, please disable it and try the driver again.

Thanks,

Carolyn

Carolyn Wyborny
Linux Development
LAN Access Division
Intel Corporation



^ permalink raw reply

* Re: ip rule and/or route problem in 2.6.37-rc5+
From: Tom Herbert @ 2010-12-20 17:22 UTC (permalink / raw)
  To: David Miller; +Cc: greearb, netdev
In-Reply-To: <20101219.214230.39170893.davem@davemloft.net>

> Tom, please acknowledge this regression you've added to the tree.

Acknowledged.  Looking at it.

Tom

^ permalink raw reply

* Re: [PATCH] ipv4: Flush per-ns routing cache more sanely.
From: Eric Dumazet @ 2010-12-20 17:13 UTC (permalink / raw)
  To: David Miller; +Cc: ebiederm, netdev, daniel.lezcano
In-Reply-To: <20101219.211453.226783693.davem@davemloft.net>

Le dimanche 19 décembre 2010 à 21:14 -0800, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 26 Oct 2010 21:30:22 +0200
> 
> > Le mardi 26 octobre 2010 à 12:20 -0700, David Miller a écrit :
> >> From: ebiederm@xmission.com (Eric W. Biederman)
> >> Date: Tue, 26 Oct 2010 12:05:39 -0700
> >> 
> >> >> @@ -999,7 +999,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
> >> >>  		rt_cache_flush(dev_net(dev), 0);
> >> >>  		break;
> >> >>  	case NETDEV_UNREGISTER_BATCH:
> >> >> -		rt_cache_flush_batch();
> >> >> +		rt_cache_flush_batch(dev_net(dev));
> >> > 
> >> > It still has this incorrect conversion in it.
> >> 
> >> Sorry I missed that, what's the exact problem with it?
> > 
> > Because the way _BATCH operation is performed, we call it once...
> > 
> > rollback_registered_many() calls it for the first dev queued in the
> > list.
> > 
> > So it should be net independant.
> 
> Thanks Eric.  I finally got back to fixing this issue and respinning
> the patch.
> 
> Please review, in particular how I handled the RCU bits.
> 
> --------------------
> ipv4: Flush per-ns routing cache more sanely.
> 
> Flush the routing cache only of entries that match the
> network namespace in which the purge event occurred.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>


Seems fine to me, thanks !

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>



^ permalink raw reply

* RE: [Pv-drivers] [patch -next] vmxnet3: locking problems in xmit
From: Bhavesh Davda @ 2010-12-20 17:09 UTC (permalink / raw)
  To: Dan Carpenter, Shreyas Bhatewara
  Cc: VMware, Inc., netdev@vger.kernel.org,
	kernel-janitors@vger.kernel.org
In-Reply-To: <20101220130315.GY1936@bicker>

Indeed, indeed. Thanks for catching that and fixing it!

> 
> There were several paths that didn't release their locks.
> 
> Signed-off-by: Dan Carpenter <error27@gmail.com>

Signed-off-by: Bhavesh Davda <bhavesh@vmware.com>

> 
> diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c
> b/drivers/net/vmxnet3/vmxnet3_drv.c
> index 23154cf..939e546 100644
> --- a/drivers/net/vmxnet3/vmxnet3_drv.c
> +++ b/drivers/net/vmxnet3/vmxnet3_drv.c
> @@ -980,7 +980,7 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct
> vmxnet3_tx_queue *tq,
>  		}
>  	} else {
>  		tq->stats.drop_hdr_inspect_err++;
> -		goto drop_pkt;
> +		goto unlock_drop_pkt;
>  	}
> 
>  	/* fill tx descs related to addr & len */
> @@ -1052,6 +1052,8 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct
> vmxnet3_tx_queue *tq,
> 
>  hdr_too_big:
>  	tq->stats.drop_oversized_hdr++;
> +unlock_drop_pkt:
> +	spin_unlock_irqrestore(&tq->tx_lock, flags);
>  drop_pkt:
>  	tq->stats.drop_total++;
>  	dev_kfree_skb(skb);
> _______________________________________________
> Pv-drivers mailing list
> Pv-drivers@vmware.com
> http://mailman2.vmware.com/mailman/listinfo/pv-drivers

^ permalink raw reply

* Re: [PATCH 1/1] TCP: increase default initial receive window.
From: Stephen Hemminger @ 2010-12-20 17:03 UTC (permalink / raw)
  To: Nandita Dukkipati; +Cc: David Miller, netdev, therbert, chavey, ycheng
In-Reply-To: <AANLkTimSH_NfX2qno7sO3A4CHLsONmRUDHzWXY4reD28@mail.gmail.com>

On Sat, 18 Dec 2010 01:08:33 -0800
Nandita Dukkipati <nanditad@google.com> wrote:

> resend in plain text.
> 
> On Fri, Dec 17, 2010 at 9:13 PM, David Miller <davem@davemloft.net> wrote:
> > From: Nandita Dukkipati <nanditad@google.com>
> > Date: Fri, 17 Dec 2010 19:20:51 -0800
> >
> >> This patch changes the default initial receive window to 10 mss
> >> (defined constant). The default window is limited to the maximum
> >> of 10*1460 and 2*mss (when mss > 1460).
> >>
> >> Signed-off-by: Nandita Dukkipati <nanditad@google.com>
> >
> > That's an incredibly terse explanation for a very non-trivial
> > change with very non-trivial implications.
> >
> > What analysis have you performed to lead you to decide that this
> > was a reasonable change to make?  Where can people see that
> > analysis and look over it to see if they agree with your
> > assesment of the data?
> 
> Apologies for the terse comment. Here's a longer explanation.
> 
> Background:
> A recent proposal to the IETF [Ref: 5] recommends increasing TCP's
> initial congestion window to 10 mss or about 15KB. This proposal,
> backed with data from several large-scale live experiments as well as
> controlled testbed experiments, is under active discussion in the TCPM
> working group.
> 
> Analysis performed:
> Leading up to this proposal were several large-scale Internet
> experiments [Ref: 2] with an initial congestion window of 10 mss
> (IW10), where we showed that the average latency of HTTP responses
> improved by approximately 10%. This was accompanied by a slight
> increase in retransmission rate (0.5%), most of which is coming from
> applications opening multiple simultaneous connections. To understand
> the extreme
> worst case scenarios, as well as fairness issues with IW10 versus IW3
> traffic, we further conducted controlled testbed experiments
> (end-hosts are all Linux based). We came away finding minimal negative
> impact even under low link bandwidths (dial-ups) and small buffers
> [Ref: 3]. These results are extremely encouraging to adopting IW10.
> 
> But obviously, an initial congestion window of 10 mss is useless
> unless a TCP receiver advertises an initial receive window of at least
> 10 mss. Fortunately, in the large-scale Internet experiments we found
> that most of the operating systems advertised a large enough initial
> receive window, allowing us to experiment with various values of
> initial congestion windows. Linux systems were among the few
> exceptions that advertised a small receive window. This patch intends
> to fix that.
> 
> References:
> 
> 1. This site has a comprehensive list of all IW10 references to date.
> http://code.google.com/speed/protocols/tcpm-IW10.html
> 
> 2. Paper describing results from large-scale Internet experiments with IW10.
> http://ccr.sigcomm.org/drupal/?q=node/621
> 
> 3. Controlled testbed experiments with IW10 under worst case scenarios
> http://www.ietf.org/proceedings/79/slides/tcpm-0.pdf
> 
> 4. Raw test data from testbed experiments (Linux senders/receivers)
> with initial congestion window and initial receive window of 10 mss.
> http://research.csc.ncsu.edu/netsrv/?q=content/iw10
> 
> 5. Internet-Draft. Increasing TCP's Initial Window.
> https://datatracker.ietf.org/doc/draft-ietf-tcpm-initcwnd/

Agree this is a good idea, but some further notes:
  * The control of receive window is a local function not covered by
    RFC.
  * Linux manipulates receive window automatically, unlike some other
    implementations.

But any change to TCP risks breaking other broken implementations
and users need a good way to recover. Therefore I recommend this
be made a sysctl to allow for quick workaround for the user who has
to connect to some Elbonian printer.  Doing it per route is okay,
but for the worst case, it needs to be a sysctl.

The default value of the sysctl should be your new value (10),
and it should allow the old rfc usage if zero.

^ permalink raw reply

* [PATCH v2] net_sched: sch_sfq: better struct layouts
From: Eric Dumazet @ 2010-12-20 17:02 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Patrick McHardy, David Miller, netdev
In-Reply-To: <20101219212234.GA2323@del.dom.local>

Le dimanche 19 décembre 2010 à 22:22 +0100, Jarek Poplawski a écrit :

> I think open coding sk_buff_head is a wrong idea. Otherwise, this
> patch looks OK to me, only a few cosmetic suggestions below.
> 

I completely agree with you but this should be temporary, because David
really wants to use list_head for skbs, I believe this will be done ;)

I chose to name the list skblist to make clear where we want to plug a
real list_head once done.

Also, not using sk_buff_head saves at least 8 bytes per slot.

>   
> > -#define SFQ_DEPTH		128
> > +#define SFQ_DEPTH		128 /* max number of packets per slot (per flow) */
> > +#define SFQ_SLOTS		128 /* max number of flows */
> > +#define EMPTY_SLOT		255
> 
> SFQ_EMPTY_SLOT?

OK done


> >  
> > +struct sfq_slot {
> > +	struct sk_buff	*skblist_next;
> > +	struct sk_buff	*skblist_prev;
> > +	sfq_index	qlen; /* number of skbs in skblist */
> > +	sfq_index	next; /* next slot in sfq chain */
> > +	unsigned short	hash; /* hash value (index in ht[]) */
> > +	short		allot; /* credit for this slot */
> > +	struct sfq_head anchor; /* anchor in dep[] chains */
> 
> struct sfq_head dep?

OK

> 
> > +};
> > +
> >  struct sfq_sched_data
> >  {
> >  /* Parameters */
> > @@ -99,17 +114,24 @@ struct sfq_sched_data
> >  	struct tcf_proto *filter_list;
> >  	struct timer_list perturb_timer;
> >  	u32		perturbation;
> > -	sfq_index	tail;		/* Index of current slot in round */
> > -	sfq_index	max_depth;	/* Maximal depth */
> > +	sfq_index	max_depth;	/* depth of longest slot */
> 
> depth and/or length? (One dimension should be enough.)

maybe cur_depth ? Its not the maximal possible depth, but depth of
longest slot, or current max depth...

> 
> >  
> > +	struct sfq_slot *tail;		/* current slot in round */
> >  	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
> > -	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
> > -	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
> > -	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
> > -	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
> > -	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
> > +	struct sfq_slot	slots[SFQ_SLOTS];
> > +	struct sfq_head	dep[SFQ_DEPTH];	/* Linked list of slots, indexed by depth */
> >  };
> >  
> > +/*
> > + * sfq_head are either in a sfq_slot or in dep[] array
> > + */
> > +static inline struct sfq_head *get_head(struct sfq_sched_data *q, sfq_index val)
> 
> static inline struct sfq_head *sfq_dep_head()?
> 

OK


> > -	/* If selected queue has length q->limit, this means that
> > -	 * all another queues are empty and that we do simple tail drop,
> 
> No reason to remove this line.

Well, the reason we drop this packet is not because other queues are
empty, but because we reach max depth for this queue. (I have the idea
to extend SFQ to allow more packets to be queued, still with a 127 limit
per queue, and 127 flows). With 10Gbs links, a global limit of 127
packets is short.


> If you really have to do this, all these: __skb_queue_tail(),
> __skb_dequeue(), skb_queue_head_init(), skb_peek() etc. used here
> should stay as (local) inline functions to remain readability.
> 

OK done, thanks a lot for reviewing and very useful comments !

We should address the problem of allot being 16bit, GRO makes allot
overflow so fast, that SFQ is not fair at all...

allot could use 17 bits and hash only 15 (10 really needed for current
divisor)

[PATCH v2] net_sched: sch_sfq: better struct layouts

This patch shrinks sizeof(struct sfq_sched_data)
from 0x14f8 (or more if spinlocks are bigger) to 0x1180 bytes, and
reduce text size as well.

   text    data     bss     dec     hex filename
   4821     152       0    4973    136d old/net/sched/sch_sfq.o
   4627     136       0    4763    129b new/net/sched/sch_sfq.o


All data for a slot/flow is now grouped in a compact and cache friendly
structure, instead of being spreaded in many different points.

struct sfq_slot {
        struct sk_buff  *skblist_next;
        struct sk_buff  *skblist_prev;
        sfq_index       qlen; /* number of skbs in skblist */
        sfq_index       next; /* next slot in sfq chain */
        unsigned short  hash; /* hash value (index in ht[]) */
        short           allot; /* credit for this slot */
        struct sfq_head dep; /* anchor in dep[] chains */
};

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jarek Poplawski <jarkao2@gmail.com>
---
v2: address Jarek comments

 net/sched/sch_sfq.c |  263 +++++++++++++++++++++++++-----------------
 1 files changed, 160 insertions(+), 103 deletions(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 3cf478d..ef94f3d 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -67,27 +67,42 @@
 
 	IMPLEMENTATION:
 	This implementation limits maximal queue length to 128;
-	maximal mtu to 2^15-1; number of hash buckets to 1024.
+	maximal mtu to 2^15-1; max 128 flows, number of hash buckets to 1024.
 	The only goal of this restrictions was that all data
-	fit into one 4K page :-). Struct sfq_sched_data is
-	organized in anti-cache manner: all the data for a bucket
-	are scattered over different locations. This is not good,
-	but it allowed me to put it into 4K.
+	fit into one 4K page on 32bit arches.
 
 	It is easy to increase these values, but not in flight.  */
 
-#define SFQ_DEPTH		128
+#define SFQ_DEPTH		128 /* max number of packets per flow */
+#define SFQ_SLOTS		128 /* max number of flows */
+#define SFQ_EMPTY_SLOT		255
 #define SFQ_HASH_DIVISOR	1024
 
-/* This type should contain at least SFQ_DEPTH*2 values */
+/* This type should contain at least SFQ_DEPTH + SFQ_SLOTS values */
 typedef unsigned char sfq_index;
 
+/*
+ * We dont use pointers to save space.
+ * Small indexes [0 ... SFQ_SLOTS - 1] are 'pointers' to slots[] array
+ * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1]
+ * are 'pointers' to dep[] array
+ */
 struct sfq_head
 {
 	sfq_index	next;
 	sfq_index	prev;
 };
 
+struct sfq_slot {
+	struct sk_buff	*skblist_next;
+	struct sk_buff	*skblist_prev;
+	sfq_index	qlen; /* number of skbs in skblist */
+	sfq_index	next; /* next slot in sfq chain */
+	unsigned short	hash; /* hash value (index in ht[]) */
+	short		allot; /* credit for this slot */
+	struct sfq_head dep; /* anchor in dep[] chains */
+};
+
 struct sfq_sched_data
 {
 /* Parameters */
@@ -99,17 +114,24 @@ struct sfq_sched_data
 	struct tcf_proto *filter_list;
 	struct timer_list perturb_timer;
 	u32		perturbation;
-	sfq_index	tail;		/* Index of current slot in round */
-	sfq_index	max_depth;	/* Maximal depth */
+	sfq_index	cur_depth;	/* depth of longest slot */
 
+	struct sfq_slot *tail;		/* current slot in round */
 	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
-	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
-	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
-	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
-	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
-	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
+	struct sfq_slot	slots[SFQ_SLOTS];
+	struct sfq_head	dep[SFQ_DEPTH];	/* Linked list of slots, indexed by depth */
 };
 
+/*
+ * sfq_head are either in a sfq_slot or in dep[] array
+ */
+static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
+{
+	if (val < SFQ_SLOTS)
+		return &q->slots[val].dep;
+	return &q->dep[val - SFQ_SLOTS];
+}
+
 static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
 {
 	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
@@ -200,30 +222,41 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 	return 0;
 }
 
+/*
+ * x : slot number [0 .. SFQ_SLOTS - 1]
+ */
 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 {
 	sfq_index p, n;
-	int d = q->qs[x].qlen + SFQ_DEPTH;
+	int qlen = q->slots[x].qlen;
 
-	p = d;
-	n = q->dep[d].next;
-	q->dep[x].next = n;
-	q->dep[x].prev = p;
-	q->dep[p].next = q->dep[n].prev = x;
+	p = qlen + SFQ_SLOTS;
+	n = q->dep[qlen].next;
+
+	q->slots[x].dep.next = n;
+	q->slots[x].dep.prev = p;
+
+	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
+	sfq_dep_head(q, n)->prev = x;
 }
 
+#define sfq_unlink(q, x, n, p)			\
+	n = q->slots[x].dep.next;		\
+	p = q->slots[x].dep.prev;		\
+	sfq_dep_head(q, p)->next = n;		\
+	sfq_dep_head(q, n)->prev = p
+	
+
 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 {
 	sfq_index p, n;
+	int d;
 
-	n = q->dep[x].next;
-	p = q->dep[x].prev;
-	q->dep[p].next = n;
-	q->dep[n].prev = p;
-
-	if (n == p && q->max_depth == q->qs[x].qlen + 1)
-		q->max_depth--;
+	sfq_unlink(q, x, n, p);
 
+	d = q->slots[x].qlen--;
+	if (n == p && q->cur_depth == d)
+		q->cur_depth--;
 	sfq_link(q, x);
 }
 
@@ -232,34 +265,68 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 	sfq_index p, n;
 	int d;
 
-	n = q->dep[x].next;
-	p = q->dep[x].prev;
-	q->dep[p].next = n;
-	q->dep[n].prev = p;
-	d = q->qs[x].qlen;
-	if (q->max_depth < d)
-		q->max_depth = d;
+	sfq_unlink(q, x, n, p);
 
+	d = ++q->slots[x].qlen;
+	if (q->cur_depth < d)
+		q->cur_depth = d;
 	sfq_link(q, x);
 }
 
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* remove one skb from tail of slot queue */
+static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
+{
+	struct sk_buff *skb = slot->skblist_prev;
+
+	slot->skblist_prev = skb->prev;
+	skb->next = skb->prev = NULL;
+	return skb;
+}
+
+/* remove one skb from head of slot queue */
+static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
+{
+	struct sk_buff *skb = slot->skblist_next;
+
+	slot->skblist_next = skb->next;
+	skb->next = skb->prev = NULL;
+	return skb;
+}
+
+static inline void slot_queue_init(struct sfq_slot *slot)
+{
+	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
+}
+
+/* add skb to slot queue (tail add) */
+static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
+{
+	skb->prev = slot->skblist_prev;
+	skb->next = (struct sk_buff *)slot;
+	slot->skblist_prev->next = skb;
+	slot->skblist_prev = skb;
+}
+
+
 static unsigned int sfq_drop(struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
-	sfq_index d = q->max_depth;
+	sfq_index x, d = q->cur_depth;
 	struct sk_buff *skb;
 	unsigned int len;
+	struct sfq_slot *slot;
 
-	/* Queue is full! Find the longest slot and
-	   drop a packet from it */
-
+	/* Queue is full! Find the longest slot and drop tail packet from it */
 	if (d > 1) {
-		sfq_index x = q->dep[d + SFQ_DEPTH].next;
-		skb = q->qs[x].prev;
+		x = q->dep[d].next;
+		slot = &q->slots[x];
+drop:
+		skb = slot_dequeue_tail(slot);
 		len = qdisc_pkt_len(skb);
-		__skb_unlink(skb, &q->qs[x]);
-		kfree_skb(skb);
 		sfq_dec(q, x);
+		kfree_skb(skb);
 		sch->q.qlen--;
 		sch->qstats.drops++;
 		sch->qstats.backlog -= len;
@@ -268,19 +335,11 @@ static unsigned int sfq_drop(struct Qdisc *sch)
 
 	if (d == 1) {
 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
-		d = q->next[q->tail];
-		q->next[q->tail] = q->next[d];
-		q->allot[q->next[d]] += q->quantum;
-		skb = q->qs[d].prev;
-		len = qdisc_pkt_len(skb);
-		__skb_unlink(skb, &q->qs[d]);
-		kfree_skb(skb);
-		sfq_dec(q, d);
-		sch->q.qlen--;
-		q->ht[q->hash[d]] = SFQ_DEPTH;
-		sch->qstats.drops++;
-		sch->qstats.backlog -= len;
-		return len;
+		x = q->tail->next;
+		slot = &q->slots[x];
+		q->tail->next = slot->next;
+		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+		goto drop;
 	}
 
 	return 0;
@@ -292,6 +351,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	unsigned int hash;
 	sfq_index x;
+	struct sfq_slot *slot;
 	int uninitialized_var(ret);
 
 	hash = sfq_classify(skb, sch, &ret);
@@ -304,31 +364,33 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	hash--;
 
 	x = q->ht[hash];
-	if (x == SFQ_DEPTH) {
-		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
-		q->hash[x] = hash;
+	slot = &q->slots[x];
+	if (x == SFQ_EMPTY_SLOT) {
+		x = q->dep[0].next; /* get a free slot */
+		q->ht[hash] = x;
+		slot = &q->slots[x];
+		slot->hash = hash;
+		slot_queue_init(slot);
 	}
 
-	/* If selected queue has length q->limit, this means that
-	 * all another queues are empty and that we do simple tail drop,
+	/* If selected queue has length q->limit, do simple tail drop,
 	 * i.e. drop _this_ packet.
 	 */
-	if (q->qs[x].qlen >= q->limit)
+	if (slot->qlen >= q->limit)
 		return qdisc_drop(skb, sch);
 
 	sch->qstats.backlog += qdisc_pkt_len(skb);
-	__skb_queue_tail(&q->qs[x], skb);
+	slot_queue_add(slot, skb);
 	sfq_inc(q, x);
-	if (q->qs[x].qlen == 1) {		/* The flow is new */
-		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
-			q->tail = x;
-			q->next[x] = x;
-			q->allot[x] = q->quantum;
+	if (slot->qlen == 1) {		/* The flow is new */
+		if (q->tail == NULL) {	/* It is the first flow */
+			slot->next = x;
 		} else {
-			q->next[x] = q->next[q->tail];
-			q->next[q->tail] = x;
-			q->tail = x;
+			slot->next = q->tail->next;
+			q->tail->next = x;
 		}
+		q->tail = slot;
+		slot->allot = q->quantum;
 	}
 	if (++sch->q.qlen <= q->limit) {
 		sch->bstats.bytes += qdisc_pkt_len(skb);
@@ -344,14 +406,12 @@ static struct sk_buff *
 sfq_peek(struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
-	sfq_index a;
 
 	/* No active slots */
-	if (q->tail == SFQ_DEPTH)
+	if (q->tail == NULL)
 		return NULL;
 
-	a = q->next[q->tail];
-	return skb_peek(&q->qs[a]);
+	return q->slots[q->tail->next].skblist_next;
 }
 
 static struct sk_buff *
@@ -359,34 +419,32 @@ sfq_dequeue(struct Qdisc *sch)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
-	sfq_index a, old_a;
+	sfq_index a, next_a;
+	struct sfq_slot *slot;
 
 	/* No active slots */
-	if (q->tail == SFQ_DEPTH)
+	if (q->tail == NULL)
 		return NULL;
 
-	a = old_a = q->next[q->tail];
-
-	/* Grab packet */
-	skb = __skb_dequeue(&q->qs[a]);
+	a = q->tail->next;
+	slot = &q->slots[a];
+	skb = slot_dequeue_head(slot);
 	sfq_dec(q, a);
 	sch->q.qlen--;
 	sch->qstats.backlog -= qdisc_pkt_len(skb);
 
-	/* Is the slot empty? */
-	if (q->qs[a].qlen == 0) {
-		q->ht[q->hash[a]] = SFQ_DEPTH;
-		a = q->next[a];
-		if (a == old_a) {
-			q->tail = SFQ_DEPTH;
+	/* Is the slot now empty? */
+	if (slot->qlen == 0) {
+		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+		next_a = slot->next;
+		if (a == next_a) {
+			q->tail = NULL; /* no more active slots */
 			return skb;
 		}
-		q->next[q->tail] = a;
-		q->allot[a] += q->quantum;
-	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
-		q->tail = a;
-		a = q->next[a];
-		q->allot[a] += q->quantum;
+		q->tail->next = next_a;
+	} else if ((slot->allot -= qdisc_pkt_len(skb)) <= 0) {
+		q->tail = slot;
+		slot->allot += q->quantum;
 	}
 	return skb;
 }
@@ -450,17 +508,16 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 	init_timer_deferrable(&q->perturb_timer);
 
 	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
-		q->ht[i] = SFQ_DEPTH;
+		q->ht[i] = SFQ_EMPTY_SLOT;
 
 	for (i = 0; i < SFQ_DEPTH; i++) {
-		skb_queue_head_init(&q->qs[i]);
-		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
-		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
+		q->dep[i].next = i + SFQ_SLOTS;
+		q->dep[i].prev = i + SFQ_SLOTS;
 	}
 
 	q->limit = SFQ_DEPTH - 1;
-	q->max_depth = 0;
-	q->tail = SFQ_DEPTH;
+	q->cur_depth = 0;
+	q->tail = NULL;
 	if (opt == NULL) {
 		q->quantum = psched_mtu(qdisc_dev(sch));
 		q->perturb_period = 0;
@@ -471,7 +528,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 			return err;
 	}
 
-	for (i = 0; i < SFQ_DEPTH; i++)
+	for (i = 0; i < SFQ_SLOTS; i++)
 		sfq_link(q, i);
 	return 0;
 }
@@ -547,9 +604,9 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 				struct gnet_dump *d)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
-	sfq_index idx = q->ht[cl-1];
-	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
-	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };
+	const struct sfq_slot *slot = &q->slots[q->ht[cl - 1]];
+	struct gnet_stats_queue qs = { .qlen = slot->qlen };
+	struct tc_sfq_xstats xstats = { .allot = slot->allot };
 
 	if (gnet_stats_copy_queue(d, &qs) < 0)
 		return -1;
@@ -565,7 +622,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 		return;
 
 	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
-		if (q->ht[i] == SFQ_DEPTH ||
+		if (q->ht[i] == SFQ_EMPTY_SLOT ||
 		    arg->count < arg->skip) {
 			arg->count++;
 			continue;



^ permalink raw reply related

* Re: [PATCH net-next-2.6] ifb: add performance flags to dev->features
From: Changli Gao @ 2010-12-20 14:43 UTC (permalink / raw)
  To: Eric Dumazet, Herbert Xu
  Cc: David Miller, Jarek Poplawski, Paweł Staszewski,
	Linux Network Development list
In-Reply-To: <1292855120.2800.45.camel@edumazet-laptop>

On Mon, Dec 20, 2010 at 10:25 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>  drivers/net/ifb.c |    3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
> index 124dac4..c761551 100644
> --- a/drivers/net/ifb.c
> +++ b/drivers/net/ifb.c
> @@ -136,6 +136,9 @@ static void ifb_setup(struct net_device *dev)
>        ether_setup(dev);
>        dev->tx_queue_len = TX_Q_LIMIT;
>
> +       dev->features |= NETIF_F_NO_CSUM | NETIF_F_SG |
> +                        NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
> +                        NETIF_F_TSO;
>        dev->flags |= IFF_NOARP;
>        dev->flags &= ~IFF_MULTICAST;
>        dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
>
>
>

I think we can assign these features to vlan_features.

dev->vlan_features |= dev->features.

Although I don't know all the features related to GSO,  I just added
all of them. For pseudo NIC, I think it is safe to do so.

+       dev->features |= NETIF_F_SG | NETIF_F_NO_CSUM | NETIF_F_TSO |
+                        NETIF_F_UFO | NETIF_F_GSO_ROBUST | NETIF_F_TSO_ECN |
+                        NETIF_F_TSO6 | NETIF_F_FSO;

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [RFC PATCH 02/12] net: Introduce new feature setting ops
From: Ben Hutchings @ 2010-12-20 16:41 UTC (permalink / raw)
  To: Michał Mirosław; +Cc: netdev
In-Reply-To: <20101219234343.GA15644@rere.qmqm.pl>

On Mon, 2010-12-20 at 00:43 +0100, Michał Mirosław wrote:
> On Sun, Dec 19, 2010 at 09:22:39PM +0000, Ben Hutchings wrote:
> > On Sun, 2010-12-19 at 01:49 +0100, Michał Mirosław wrote:
> > > On Thu, Dec 16, 2010 at 11:13:06PM +0000, Ben Hutchings wrote:
> > > > On Wed, 2010-12-15 at 23:24 +0100, Michał Mirosław wrote:
> > [...]
> > > > > +static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
> > > > > +{
> > > > > +	struct ethtool_features cmd;
> > > > > +	struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
> > > > > +
> > > > > +	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
> > > > > +		return -EFAULT;
> > > > > +	useraddr += sizeof(cmd);
> > > > > +
> > > > > +	if (cmd.count > ETHTOOL_DEV_FEATURE_WORDS)
> > > > > +		cmd.count = ETHTOOL_DEV_FEATURE_WORDS;
> > > > So additional feature words will be silently ignored...
> > > > > +	if (copy_from_user(features, useraddr, sizeof(*features) * cmd.count))
> > > > > +		return -EFAULT;
> > > > > +	memset(features + cmd.count, 0,
> > > > > +		sizeof(features) - sizeof(*features) * cmd.count);
> > > > > +
> > > > > +	features[0].valid &= dev->hw_features | NETIF_F_SOFT_FEATURES;
> > > > [...]
> > > > 
> > > > ...as will any other unsupported features.  This is not a good idea.
> > > > (However, remembering which features are wanted does seem like a good
> > > > idea.)
> > > 
> > > That's intentional. Unsupported features can't be enabled anyway.
> > > hw_features is supposed to contain all features that the device can support
> > > and is able to enable/disable. This set should be constant and anything that
> > > is in the wanted_features set but is not supported because of other conditions
> > > will be stripped by ndo_fix_features() call.
> > > 
> > > Other way would be to return EINVAL when bits not changeable are present in
> > > the valid mask. I don't want to do that, since then your example of changing
> > > a feature without GFEATURES first will not work.
> > That's right, it shouldn't work.
> 
> A user says "enable any TSO available". This means ethtool could issue
> a request with .valid = NETIF_F_ALL_TSO, .requested = NETIF_F_ALL_TSO.
> If the device supports only TSOv4 this should enable it and leave others
> alone as whatever the user wants they can't be enabled.

I see your point, but...

> This is 1-1 conversion of the semantics current ethtool ops have - set_tso
> corresponds exactly to the request described above. This behaviour also
> allows to execute a command like "enable as many as you can of ..." that
> is usual goal of user enabling hardware offloads - to get best possible
> performance.

The current behaviour is that if TSO is not supported at all then any
attempt to control it fails with error EOPNOTSUPP.  Your proposed change
would make it return success.

So I think we have to expect ethtool (or other userland tool) to query
the supported feature flags if it is commanded to change some offload
feature that is represented by multiple feature flags in
net_device::features.  Alternately, we maintain a separate set of
feature flags for ethtool that doesn't make these distinctions.  But I
think it would be useful for ethtool to be able to query and report
exactly what features the device supports.

> Nevertheless, what problem is generated by ignoring unsupported bits here?

User confusion when an ethtool command 'succeeds' but has no effect.

> I can see the point of returning -EINVAL on bits that are not defined, though.
> Is that a good direction?

Any attempt to change undefined flags should definitely result in an
error.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH] e1000e: workaround missing power down mii control bit on 82571
From: Arthur Jones @ 2010-12-20 15:33 UTC (permalink / raw)
  To: Allan, Bruce W; +Cc: Ben Hutchings, Kirsher, Jeffrey T, netdev@vger.kernel.org
In-Reply-To: <8DD2590731AB5D4C9DBF71A877482A9001773F7BFC@orsmsx509.amr.corp.intel.com>

Hi Bruce, ...

On Fri, Dec 17, 2010 at 07:53:08AM -0800, Allan, Bruce W wrote:
> >-----Original Message-----
> >From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On
> >Behalf Of Arthur Jones
> >Sent: Friday, December 17, 2010 6:05 AM
> >To: Allan, Bruce W
> >Cc: Ben Hutchings; Kirsher, Jeffrey T; netdev@vger.kernel.org
> >Subject: Re: [PATCH] e1000e: workaround missing power down mii control bit on
> >82571
> >
> >That's not what I saw.  I saw the bit always
> >read back zero, even right after I set it.
> >
> >Have you tried writing the power down bit and
> >reading it back?
> >
> >Arthur
> 
> Yes, and it worked as expected.

Interesting!  So then you're not able to
repro the issue that I describe in the
patch then either?

This was an 82571?  Can you send along
the firmware revision and the full lspci
for the NIC where it works to read back
the power down bit?  Very strange...

Arthur


^ permalink raw reply

* Re: [PATCH 00/12] make rpc_pipefs be mountable multiple times
From: J. Bruce Fields @ 2010-12-20 14:46 UTC (permalink / raw)
  To: Kirill A. Shutsemov
  Cc: Trond Myklebust, Neil Brown, Pavel Emelyanov, linux-nfs,
	David S. Miller, netdev, linux-kernel
In-Reply-To: <1292846078-31793-1-git-send-email-kirill@shutemov.name>

On Mon, Dec 20, 2010 at 01:54:26PM +0200, Kirill A. Shutsemov wrote:
> From: Kirill A. Shutemov <kirill@shutemov.name>
> 
> Prepare nfs/sunrpc stack to use multiple instances of rpc_pipefs.
> Only for client for now.
> 
> Only quick sanity check was made. BTW, is there any check list for NFS
> contributors?

Nothing formal.

Chuck may have some nlm patches that conflict; I'm not sure what their
status is.

For testing rpc_pipefs, maybe a few mounts (simultaneous mounts might be
good), possibly over different security flavors, and with some of them
nfsv4, might be a good idea.

By the way, was there ever a resolution to Trond's question?:

	http://marc.info/?l=linux-nfs&m=128655758712817&w=2

	"The keyring upcalls are currently initiated through the same
	mechanism as module_request and therefore get started with the
	init_nsproxy namespace. We'd really like them to run inside the
	same container as the process.  As part of the same problem,
	there is the issue of what to do with the dns resolver and
	Bryan's new keyring based idmapper code."

--b.

> 
> Kirill A. Shutemov (12):
>   sunrpc: mount rpc_pipefs on initialization
>   sunrpc: introduce init_rpc_pipefs
>   sunrpc: push init_rpc_pipefs up to rpc_create() callers
>   sunrpc: tag svc_serv with rpc_pipefs mount point
>   sunrpc: get rpc_pipefs mount point for svc_serv from callers
>   lockd: get rpc_pipefs mount point from callers
>   sunrpc: get rpc_pipefs mount point for rpcb_create_local from callers
>   sunrpc: tag pipefs field of cache_detail with rpc_pipefs mount point
>   nfs: per-rpc_pipefs dns cache
>   sunrpc: introduce get_rpc_pipefs()
>   nfs: introduce mount option 'rpcmount'
>   sunrpc: make rpc_pipefs be mountable multiple times
> 
>  fs/lockd/clntlock.c                |    8 +-
>  fs/lockd/host.c                    |   12 +++-
>  fs/lockd/mon.c                     |   13 ++-
>  fs/lockd/svc.c                     |    4 +-
>  fs/nfs/cache_lib.c                 |   18 +----
>  fs/nfs/cache_lib.h                 |    3 +-
>  fs/nfs/callback.c                  |    6 +-
>  fs/nfs/callback.h                  |    3 +-
>  fs/nfs/client.c                    |   45 ++++++++++--
>  fs/nfs/dns_resolve.c               |  128 ++++++++++++++++++++++++++-------
>  fs/nfs/dns_resolve.h               |    8 +--
>  fs/nfs/inode.c                     |    8 +--
>  fs/nfs/internal.h                  |   10 ++-
>  fs/nfs/mount_clnt.c                |    1 +
>  fs/nfs/namespace.c                 |    3 +-
>  fs/nfs/nfs4namespace.c             |   20 +++--
>  fs/nfs/super.c                     |   20 +++++
>  fs/nfsd/nfs4callback.c             |    2 +
>  fs/nfsd/nfssvc.c                   |    8 +-
>  include/linux/lockd/bind.h         |    3 +-
>  include/linux/lockd/lockd.h        |    4 +-
>  include/linux/nfs_fs_sb.h          |    1 +
>  include/linux/sunrpc/cache.h       |    9 +--
>  include/linux/sunrpc/clnt.h        |    5 +-
>  include/linux/sunrpc/rpc_pipe_fs.h |    6 +-
>  include/linux/sunrpc/svc.h         |    9 +-
>  net/sunrpc/cache.c                 |   16 +++--
>  net/sunrpc/clnt.c                  |   19 +++--
>  net/sunrpc/rpc_pipe.c              |  142 ++++++++++++++++++++++++++++++------
>  net/sunrpc/rpcb_clnt.c             |   13 ++-
>  net/sunrpc/svc.c                   |   52 ++++++++-----
>  31 files changed, 430 insertions(+), 169 deletions(-)
> 
> -- 
> 1.7.3.4
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox