Netdev List
 help / color / mirror / Atom feed
* Re: tip: origin tree build failure, [patch] fix isdn/gigaset build failure
From: Ingo Molnar @ 2010-04-19 19:54 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: David Miller, tilman, akpm, netdev, linux-kernel
In-Reply-To: <alpine.LFD.2.00.1004191200050.14500@i5.linux-foundation.org>


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, 19 Apr 2010, Linus Torvalds wrote:
> > 
> > Maybe add the #include <linux/sched.h> into gigaset.h, instead of 
> > common.c?
> 
> This compiled for me, although the only thing I tried was just turning all 
> the gigaset options to 'y'. Maybe some other config doesn't work. So I 
> committed it as likely to fix things.

I'll check it tomorrow - but i'd expect it to be enough, plus the all-yes 
thing is what matters most in practice in any case.

Thanks,

	Ingo

^ permalink raw reply

* [net-next PATCH 2/2] add enic iovnl ops for dynamic vnics
From: Scott Feldman @ 2010-04-19 19:18 UTC (permalink / raw)
  To: davem; +Cc: netdev, chrisw
In-Reply-To: <20100419191425.10423.88005.stgit@savbu-pc100.cisco.com>

From: Scott Feldman <scofeldm@cisco.com>

Add enic iovnl ops to support setting port-profile for dynamic vnics.  Enic
dynamic vnics are just like normal enic eth vnics except dynamic vnics require
an extra configuration step to assign a port-profile identifier to the
interface before the interface is useable. Once assigned, link comes up
on the interface and is ready for I/O.  The port-profile is used to configure
the network port assigned to the interface.  The network port configuration
includes VLAN membership, QoS policies, and port security settings typical
of a data center network.

Signed-off-by: Scott Feldman <scofeldm@cisco.com>
Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
---
 drivers/net/enic/Makefile     |    2 -
 drivers/net/enic/enic.h       |    3 +
 drivers/net/enic/enic_iovnl.c |  136 +++++++++++++++++++++++++++++++++++++++++
 drivers/net/enic/enic_main.c  |   67 +++++++++++++++++---
 drivers/net/enic/enic_res.c   |    5 ++
 drivers/net/enic/enic_res.h   |    1 
 drivers/net/enic/vnic_dev.c   |   58 +++++++++++++++++
 drivers/net/enic/vnic_dev.h   |    4 +
 drivers/net/enic/vnic_vic.c   |   73 ++++++++++++++++++++++
 drivers/net/enic/vnic_vic.h   |   58 +++++++++++++++++
 10 files changed, 394 insertions(+), 13 deletions(-)

diff --git a/drivers/net/enic/Makefile b/drivers/net/enic/Makefile
index 391c3bc..311613b 100644
--- a/drivers/net/enic/Makefile
+++ b/drivers/net/enic/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_ENIC) := enic.o
 
 enic-y := enic_main.o vnic_cq.o vnic_intr.o vnic_wq.o \
-	enic_res.o vnic_dev.o vnic_rq.o
+	enic_res.o vnic_dev.o vnic_rq.o vnic_vic.o enic_iovnl.o
 
diff --git a/drivers/net/enic/enic.h b/drivers/net/enic/enic.h
index 5fa56f1..5790655 100644
--- a/drivers/net/enic/enic.h
+++ b/drivers/net/enic/enic.h
@@ -122,4 +122,7 @@ struct enic {
 	unsigned int cq_count;
 };
 
+void enic_set_multicast_list(struct net_device *netdev);
+void enic_set_iovnl_ops(struct net_device *netdev);
+
 #endif /* _ENIC_H_ */
diff --git a/drivers/net/enic/enic_iovnl.c b/drivers/net/enic/enic_iovnl.c
new file mode 100644
index 0000000..37c5d85
--- /dev/null
+++ b/drivers/net/enic/enic_iovnl.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/netdevice.h>
+#include <linux/iovnl.h>
+#include <net/iovnl.h>
+
+#include "enic_res.h"
+#include "enic.h"
+#include "vnic_dev.h"
+#include "vnic_vic.h"
+
+static int enic_provinfo_add_tlv_str(struct vic_provinfo *vp, u16 type,
+	char *str)
+{
+	return str ? vic_provinfo_add_tlv(vp, type, strlen(str) + 1, str) : 0;
+}
+
+static int enic_set_port_profile(struct net_device *dev,
+	struct net_device *vf_dev, char *port_profile,
+	u8 *client_mac_addr, char *client_name, char *host_uuid)
+{
+	struct enic *enic = netdev_priv(dev);
+	struct vic_provinfo *vp;
+	u8 oui[3] = VIC_PROVINFO_CISCO_OUI;
+	int enable = 1;
+	int err;
+
+	if (vf_dev && dev != vf_dev)
+		return -EINVAL;
+
+	if (!port_profile)
+		return -EINVAL;
+
+	vp = vic_provinfo_alloc(GFP_KERNEL, oui, VIC_PROVINFO_LINUX_TYPE);
+	if (!vp)
+		return -ENOMEM;
+
+	enic_provinfo_add_tlv_str(vp,
+		VIC_LINUX_PROV_TLV_PORT_PROFILE_NAME_STR, port_profile);
+	vic_provinfo_add_tlv(vp, VIC_LINUX_PROV_TLV_CLIENT_MAC_ADDR,
+		6, client_mac_addr);
+	enic_provinfo_add_tlv_str(vp,
+		VIC_LINUX_PROV_TLV_CLIENT_NAME_STR, client_name);
+	enic_provinfo_add_tlv_str(vp,
+		VIC_LINUX_PROV_TLV_HOST_UUID_STR, host_uuid);
+
+	spin_lock(&enic->devcmd_lock);
+
+	err = vnic_dev_deinit(enic->vdev);
+	if (err)
+		goto err_out;
+
+	err = vnic_dev_logical_uplink(enic->vdev, enable);
+	if (err)
+		goto err_out;
+
+	err = vnic_dev_init_prov(enic->vdev, (u8 *)vp,
+		vic_provinfo_size(vp));
+
+err_out:
+	spin_unlock(&enic->devcmd_lock);
+
+	vic_provinfo_free(vp);
+
+	enic_set_multicast_list(dev);
+
+	return err;
+}
+
+static int enic_get_init_status(struct net_device *dev,
+	struct net_device *vf_dev)
+{
+	struct enic *enic = netdev_priv(dev);
+	int done, err, error;
+
+	if (vf_dev && dev != vf_dev)
+		return IOV_PORT_PROFILE_STATUS_UNKNOWN;
+
+	spin_lock(&enic->devcmd_lock);
+	err = vnic_dev_init_done(enic->vdev, &done, &error);
+	spin_unlock(&enic->devcmd_lock);
+
+	if (err || error)
+		return IOV_PORT_PROFILE_STATUS_ERROR;
+
+	if (!done)
+		return IOV_PORT_PROFILE_STATUS_INPROGRESS;
+
+	if (!error)
+		return IOV_PORT_PROFILE_STATUS_SUCCESS;
+
+	return IOV_PORT_PROFILE_STATUS_UNKNOWN;
+}
+
+static int enic_unset_port_profile(struct net_device *dev,
+	struct net_device *vf_dev)
+{
+	struct enic *enic = netdev_priv(dev);
+	int err;
+
+	if (vf_dev && dev != vf_dev)
+		return -EINVAL;
+
+	spin_lock(&enic->devcmd_lock);
+	err = vnic_dev_deinit(enic->vdev);
+	spin_unlock(&enic->devcmd_lock);
+
+	return err;
+}
+
+static const struct iovnl_ops enic_iovnl_ops = {
+	.set_port_profile		= enic_set_port_profile,
+	.unset_port_profile		= enic_unset_port_profile,
+	.get_port_profile_status	= enic_get_init_status,
+};
+
+void enic_set_iovnl_ops(struct net_device *netdev)
+{
+	netdev->iovnl_ops = &enic_iovnl_ops;
+}
diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c
index 1232887..2fcc161 100644
--- a/drivers/net/enic/enic_main.c
+++ b/drivers/net/enic/enic_main.c
@@ -49,10 +49,12 @@
 #define ENIC_DESC_MAX_SPLITS		(MAX_TSO / WQ_ENET_MAX_DESC_LEN + 1)
 
 #define PCI_DEVICE_ID_CISCO_VIC_ENET         0x0043  /* ethernet vnic */
+#define PCI_DEVICE_ID_CISCO_VIC_ENET_DYN     0x0044  /* enet dynamic vnic */
 
 /* Supported devices */
 static DEFINE_PCI_DEVICE_TABLE(enic_id_table) = {
 	{ PCI_VDEVICE(CISCO, PCI_DEVICE_ID_CISCO_VIC_ENET) },
+	{ PCI_VDEVICE(CISCO, PCI_DEVICE_ID_CISCO_VIC_ENET_DYN) },
 	{ 0, }	/* end of table */
 };
 
@@ -113,6 +115,11 @@ static const struct enic_stat enic_rx_stats[] = {
 static const unsigned int enic_n_tx_stats = ARRAY_SIZE(enic_tx_stats);
 static const unsigned int enic_n_rx_stats = ARRAY_SIZE(enic_rx_stats);
 
+static int enic_is_dynamic(struct enic *enic)
+{
+	return enic->pdev->device == PCI_DEVICE_ID_CISCO_VIC_ENET_DYN;
+}
+
 static int enic_get_settings(struct net_device *netdev,
 	struct ethtool_cmd *ecmd)
 {
@@ -810,16 +817,44 @@ static void enic_reset_mcaddrs(struct enic *enic)
 
 static int enic_set_mac_addr(struct net_device *netdev, char *addr)
 {
-	if (!is_valid_ether_addr(addr))
-		return -EADDRNOTAVAIL;
+	struct enic *enic = netdev_priv(netdev);
+
+	if (enic_is_dynamic(enic)) {
+		if (!is_zero_ether_addr(addr) && !is_valid_ether_addr(addr))
+			return -EADDRNOTAVAIL;
+	} else {
+		if (!is_valid_ether_addr(addr))
+			return -EADDRNOTAVAIL;
+	}
 
 	memcpy(netdev->dev_addr, addr, netdev->addr_len);
 
 	return 0;
 }
 
+static int enic_set_mac_address(struct net_device *netdev, void *p)
+{
+	struct enic *enic = netdev_priv(netdev);
+	struct sockaddr *saddr = p;
+	char *addr = saddr->sa_data;
+	int err;
+
+	if (!enic_is_dynamic(enic))
+		return 0;
+
+	err = enic_set_mac_addr(netdev, addr);
+	if (!err) {
+		spin_lock(&enic->devcmd_lock);
+		enic_del_station_addr(enic);
+		enic_add_station_addr(enic);
+		spin_unlock(&enic->devcmd_lock);
+	}
+
+	return err;
+}
+
 /* netif_tx_lock held, BHs disabled */
-static void enic_set_multicast_list(struct net_device *netdev)
+void enic_set_multicast_list(struct net_device *netdev)
 {
 	struct enic *enic = netdev_priv(netdev);
 	struct netdev_hw_addr *ha;
@@ -1440,10 +1475,12 @@ static int enic_open(struct net_device *netdev)
 	for (i = 0; i < enic->rq_count; i++)
 		vnic_rq_enable(&enic->rq[i]);
 
-	spin_lock(&enic->devcmd_lock);
-	enic_add_station_addr(enic);
-	spin_unlock(&enic->devcmd_lock);
-	enic_set_multicast_list(netdev);
+	if (!enic_is_dynamic(enic)) {
+		spin_lock(&enic->devcmd_lock);
+		enic_add_station_addr(enic);
+		spin_unlock(&enic->devcmd_lock);
+		enic_set_multicast_list(netdev);
+	}
 
 	netif_wake_queue(netdev);
 	napi_enable(&enic->napi);
@@ -1782,6 +1819,7 @@ static const struct net_device_ops enic_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_set_multicast_list	= enic_set_multicast_list,
+	.ndo_set_mac_address	= enic_set_mac_address,
 	.ndo_change_mtu		= enic_change_mtu,
 	.ndo_vlan_rx_register	= enic_vlan_rx_register,
 	.ndo_vlan_rx_add_vid	= enic_vlan_rx_add_vid,
@@ -2010,11 +2048,13 @@ static int __devinit enic_probe(struct pci_dev *pdev,
 
 	netif_carrier_off(netdev);
 
-	err = vnic_dev_init(enic->vdev, 0);
-	if (err) {
-		printk(KERN_ERR PFX
-			"vNIC dev init failed, aborting.\n");
-		goto err_out_dev_close;
+	if (!enic_is_dynamic(enic)) {
+		err = vnic_dev_init(enic->vdev, 0);
+		if (err) {
+			printk(KERN_ERR PFX
+				"vNIC dev init failed, aborting.\n");
+			goto err_out_dev_close;
+		}
 	}
 
 	err = enic_dev_init(enic);
@@ -2069,6 +2109,9 @@ static int __devinit enic_probe(struct pci_dev *pdev,
 	if (using_dac)
 		netdev->features |= NETIF_F_HIGHDMA;
 
+	if (enic_is_dynamic(enic))
+		enic_set_iovnl_ops(netdev);
+
 	enic->csum_rx_enabled = ENIC_SETTING(enic, RXCSUM);
 
 	enic->lro_mgr.max_aggr = ENIC_LRO_MAX_AGGR;
diff --git a/drivers/net/enic/enic_res.c b/drivers/net/enic/enic_res.c
index 02839bf..9f31f58 100644
--- a/drivers/net/enic/enic_res.c
+++ b/drivers/net/enic/enic_res.c
@@ -108,6 +108,11 @@ void enic_add_station_addr(struct enic *enic)
 	vnic_dev_add_addr(enic->vdev, enic->mac_addr);
 }
 
+void enic_del_station_addr(struct enic *enic)
+{
+	vnic_dev_del_addr(enic->vdev, enic->mac_addr);
+}
+
 void enic_add_multicast_addr(struct enic *enic, u8 *addr)
 {
 	vnic_dev_add_addr(enic->vdev, addr);
diff --git a/drivers/net/enic/enic_res.h b/drivers/net/enic/enic_res.h
index abc1974..0e16ba0 100644
--- a/drivers/net/enic/enic_res.h
+++ b/drivers/net/enic/enic_res.h
@@ -132,6 +132,7 @@ struct enic;
 
 int enic_get_vnic_config(struct enic *);
 void enic_add_station_addr(struct enic *enic);
+void enic_del_station_addr(struct enic *enic);
 void enic_add_multicast_addr(struct enic *enic, u8 *addr);
 void enic_del_multicast_addr(struct enic *enic, u8 *addr);
 void enic_add_vlan(struct enic *enic, u16 vlanid);
diff --git a/drivers/net/enic/vnic_dev.c b/drivers/net/enic/vnic_dev.c
index d43a9d4..44b2e41 100644
--- a/drivers/net/enic/vnic_dev.c
+++ b/drivers/net/enic/vnic_dev.c
@@ -682,6 +682,64 @@ int vnic_dev_init(struct vnic_dev *vdev, int arg)
 	return r;
 }
 
+int vnic_dev_init_done(struct vnic_dev *vdev, int *done, int *err)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+	int ret;
+
+	*done = 0;
+
+	ret = vnic_dev_cmd(vdev, CMD_INIT_STATUS, &a0, &a1, wait);
+	if (ret)
+		return ret;
+
+	*done = (a0 == 0);
+
+	*err = (a0 == 0) ? a1 : 0;
+
+	return 0;
+}
+
+int vnic_dev_init_prov(struct vnic_dev *vdev, u8 *buf, u32 len)
+{
+	u64 a0, a1 = len;
+	int wait = 1000;
+	u64 prov_pa;
+	void *prov_buf;
+	int ret;
+
+	prov_buf = pci_alloc_consistent(vdev->pdev, len, &prov_pa);
+	if (!prov_buf)
+		return -ENOMEM;
+
+	memcpy(prov_buf, buf, len);
+
+	a0 = prov_pa;
+
+	ret = vnic_dev_cmd(vdev, CMD_INIT_PROV_INFO, &a0, &a1, wait);
+
+	pci_free_consistent(vdev->pdev, len, prov_buf, prov_pa);
+
+	return ret;
+}
+
+int vnic_dev_logical_uplink(struct vnic_dev *vdev, int enable)
+{
+	u64 a0 = enable, a1 = 0;
+	int wait = 1000;
+
+	return vnic_dev_cmd(vdev, CMD_LOGICAL_UPLINK, &a0, &a1, wait);
+}
+
+int vnic_dev_deinit(struct vnic_dev *vdev)
+{
+	u64 a0 = 0, a1 = 0;
+	int wait = 1000;
+
+	return vnic_dev_cmd(vdev, CMD_DEINIT, &a0, &a1, wait);
+}
+
 int vnic_dev_link_status(struct vnic_dev *vdev)
 {
 	if (vdev->linkstatus)
diff --git a/drivers/net/enic/vnic_dev.h b/drivers/net/enic/vnic_dev.h
index f5be640..bd40045 100644
--- a/drivers/net/enic/vnic_dev.h
+++ b/drivers/net/enic/vnic_dev.h
@@ -124,6 +124,10 @@ int vnic_dev_disable(struct vnic_dev *vdev);
 int vnic_dev_open(struct vnic_dev *vdev, int arg);
 int vnic_dev_open_done(struct vnic_dev *vdev, int *done);
 int vnic_dev_init(struct vnic_dev *vdev, int arg);
+int vnic_dev_init_done(struct vnic_dev *vdev, int *done, int *err);
+int vnic_dev_init_prov(struct vnic_dev *vdev, u8 *buf, u32 len);
+int vnic_dev_logical_uplink(struct vnic_dev *vdev, int enable);
+int vnic_dev_deinit(struct vnic_dev *vdev);
 int vnic_dev_soft_reset(struct vnic_dev *vdev, int arg);
 int vnic_dev_soft_reset_done(struct vnic_dev *vdev, int *done);
 void vnic_dev_set_intr_mode(struct vnic_dev *vdev,
diff --git a/drivers/net/enic/vnic_vic.c b/drivers/net/enic/vnic_vic.c
new file mode 100644
index 0000000..d769772
--- /dev/null
+++ b/drivers/net/enic/vnic_vic.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include "vnic_vic.h"
+
+struct vic_provinfo *vic_provinfo_alloc(gfp_t flags, u8 *oui, u8 type)
+{
+	struct vic_provinfo *vp = kzalloc(VIC_PROVINFO_MAX_DATA, flags);
+
+	if (!vp || !oui)
+		return NULL;
+
+	memcpy(vp->oui, oui, sizeof(vp->oui));
+	vp->type = type;
+	vp->length = htonl(sizeof(vp->num_tlvs));
+
+	return vp;
+}
+
+void vic_provinfo_free(struct vic_provinfo *vp)
+{
+	kfree(vp);
+}
+
+int vic_provinfo_add_tlv(struct vic_provinfo *vp, u16 type, u16 length,
+	void *value)
+{
+	struct vic_provinfo_tlv *tlv;
+
+	if (!vp || !value)
+		return -EINVAL;
+
+	if (ntohl(vp->length) + sizeof(*tlv) + length >
+		VIC_PROVINFO_MAX_TLV_DATA)
+		return -ENOMEM;
+
+	tlv = (struct vic_provinfo_tlv *)((u8 *)vp->tlv +
+		ntohl(vp->length) - sizeof(vp->num_tlvs));
+
+	tlv->type = htons(type);
+	tlv->length = htons(length);
+	memcpy(tlv->value, value, length);
+
+	vp->num_tlvs = htonl(ntohl(vp->num_tlvs) + 1);
+	vp->length = htonl(ntohl(vp->length) + sizeof(*tlv) + length);
+
+	return 0;
+}
+
+size_t vic_provinfo_size(struct vic_provinfo *vp)
+{
+	return vp ?  ntohl(vp->length) + sizeof(*vp) - sizeof(vp->num_tlvs) : 0;
+}
diff --git a/drivers/net/enic/vnic_vic.h b/drivers/net/enic/vnic_vic.h
new file mode 100644
index 0000000..a7899fb
--- /dev/null
+++ b/drivers/net/enic/vnic_vic.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef _VNIC_VIC_H_
+#define _VNIC_VIC_H_
+
+/* Note: All integer fields in NETWORK byte order */
+
+/* Note: String field lengths include null char */
+
+#define VIC_PROVINFO_CISCO_OUI		{ 0x00, 0x00, 0x0c }
+#define VIC_PROVINFO_LINUX_TYPE		0x2
+
+enum vic_linux_prov_tlv_type {
+	VIC_LINUX_PROV_TLV_PORT_PROFILE_NAME_STR = 0,
+	VIC_LINUX_PROV_TLV_CLIENT_MAC_ADDR = 1,			/* u8[6] */
+	VIC_LINUX_PROV_TLV_CLIENT_NAME_STR = 2,
+	VIC_LINUX_PROV_TLV_HOST_UUID_STR = 8,
+};
+
+struct vic_provinfo {
+	u8 oui[3];		/* OUI of data provider */
+	u8 type;		/* provider-specific type */
+	u32 length;		/* length of data below */
+	u32 num_tlvs;		/* number of tlvs */
+	struct vic_provinfo_tlv {
+		u16 type;
+		u16 length;
+		u8 value[0];
+	} tlv[0];
+} __attribute__ ((packed));
+
+#define VIC_PROVINFO_MAX_DATA		1385
+#define VIC_PROVINFO_MAX_TLV_DATA (VIC_PROVINFO_MAX_DATA - \
+	sizeof(struct vic_provinfo))
+
+struct vic_provinfo *vic_provinfo_alloc(gfp_t flags, u8 *oui, u8 type);
+void vic_provinfo_free(struct vic_provinfo *vp);
+int vic_provinfo_add_tlv(struct vic_provinfo *vp, u16 type, u16 length,
+	void *value);
+size_t vic_provinfo_size(struct vic_provinfo *vp);
+
+#endif	/* _VNIC_VIC_H_ */


^ permalink raw reply related

* [net-next PATCH 1/2] add iovnl netlink support
From: Scott Feldman @ 2010-04-19 19:18 UTC (permalink / raw)
  To: davem; +Cc: netdev, chrisw
In-Reply-To: <20100419191425.10423.88005.stgit@savbu-pc100.cisco.com>

From: Scott Feldman <scofeldm@cisco.com>

IOV netlink (IOVNL) adds I/O Virtualization control support to a master
device (MD) netdev interface.  The MD (e.g. SR-IOV PF) will set/get
control settings on behalf of a slave netdevice (e.g. SR-IOV VF).  The
design allows for the case where master and slave are the
same netdev interface.

One control setting example is MAC/VLAN settings for a VF.  Another
example control setting is a port-profile for a VF.  A port-profile is an
identifier that defines policy-based settings on the network port
backing the VF.  The network port settings examples are VLAN membership,
QoS settings, and L2 security settings, typical of a data center network.

This patch adds the iovnl interface definitions and an iovnl module.

Signed-off-by: Scott Feldman <scofeldm@cisco.com>
Signed-off-by: Roopa Prabhu <roprabhu@cisco.com>
---
 include/linux/iovnl.h     |  124 +++++++++++++++++++++
 include/linux/netdevice.h |    4 +
 include/linux/rtnetlink.h |    5 +
 include/net/iovnl.h       |   36 ++++++
 net/Kconfig               |    1 
 net/Makefile              |    3 +
 net/iovnl/Kconfig         |   10 ++
 net/iovnl/Makefile        |    1 
 net/iovnl/iovnl.c         |  260 +++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 444 insertions(+), 0 deletions(-)

diff --git a/include/linux/iovnl.h b/include/linux/iovnl.h
new file mode 100644
index 0000000..ac5fcd3
--- /dev/null
+++ b/include/linux/iovnl.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __LINUX_IOVNL_H__
+#define __LINUX_IOVNL_H__
+
+#include <linux/types.h>
+
+#define IOVNL_PROTO_VERSION 1
+
+/**
+ * IOV netlink (IOVNL) adds I/O Virtualization control support to a master
+ * device (MD) netdev interface.  The MD (e.g. SR-IOV PF) will set/get
+ * control settings on behalf of a slave netdevice (e.g. SR-IOV VF).  The
+ * design allows for the degenerate case where master and slave are the
+ * same netdev interface.
+ *
+ * One control setting example is MAC/VLAN settings for a VF.  Another
+ * example control setting is a port-profile for a VF.  A port-profile is an
+ * identifier that defines policy-based settings on the network port
+ * backing the VF.  The network port settings examples are VLAN membership,
+ * QoS settings, and L2 security settings, typical of a data center network.
+ *
+ * This file defines an rtnetlink interface to allow setting of IOVNL
+ * on capable netdev devices.
+ */
+
+struct iovnlmsg {
+	__u8	family;
+	__u8	cmd;
+	__u16	pad;
+};
+
+/**
+ * enum iovnl_cmds - supported IOV commands
+ *
+ * @IOV_CMD_UNDEFINED: unspecified command to catch errors
+ * @IOV_CMD_SET_PORT_PROFILE: set the port-profile on the device
+ * @IOV_CMD_UNSET_PORT_PROFILE: clear port-profile on the device
+ * @IOV_CMD_GET_PORT_PROFILE_STATUS: return status of last
+ *   IOV_CMD_SET_PORT_PROFILE command
+ * @IOV_CMD_SET_MAC_VLAN: Set the MAC address and VLAN on the device
+ */
+enum iovnl_cmds {
+	IOV_CMD_UNDEFINED,
+
+	IOV_CMD_SET_PORT_PROFILE,
+	IOV_CMD_UNSET_PORT_PROFILE,
+	IOV_CMD_GET_PORT_PROFILE_STATUS,
+
+	IOV_CMD_SET_MAC_VLAN,
+
+	__IOV_CMD_ENUM_MAX,
+	IOV_CMD_MAX = __IOV_CMD_ENUM_MAX - 1,
+};
+
+/**
+ * enum iovnl_attrs - IOV top-level netlink attributes
+ *
+ * @IOV_ATTR_UNDEFINED: unspecified attribute to catch errors
+ * @IOV_ATTR_IFNAME: interface name of master (PF) net device (NLA_NUL_STRING)
+ * @IOV_ATTR_VF_IFNAME: interface name of target VF device (NLA_NUL_STRING)
+ * @IOV_ATTR_PORT_PROFILE: port-profile name to assign to device
+ *   (NLA_NUL_STRING)
+ * @IOV_ATTR_CLIENT_NAME: client name (NLA_NUL_STRING)
+ * @IOV_ATTR_HOST_UUID: host UUID (NLA_NUL_STRING)
+ * @IOV_ATTR_PORT_PROFILE_STATUS: status of last IOV_CMD_SET_PORT_PROFILE
+ *   command (NLA_U8)
+ * @IOV_ATTR_MAC_ADDR: device station MAC address (NLA_U8[6])
+ * @IOV_ATTR_VLAN: device 8021q VLAN ID (NLA_U16)
+ * @IOV_ATTR_STATUS: cmd return status code
+ */
+enum iovnl_attrs {
+	IOV_ATTR_UNDEFINED,
+
+	IOV_ATTR_IFNAME,
+	IOV_ATTR_VF_IFNAME,
+
+	IOV_ATTR_PORT_PROFILE,
+	IOV_ATTR_CLIENT_NAME,
+	IOV_ATTR_HOST_UUID,
+	IOV_ATTR_PORT_PROFILE_STATUS,
+
+	IOV_ATTR_MAC_ADDR,
+	IOV_ATTR_VLAN,
+
+	IOV_ATTR_STATUS,
+
+	__IOV_ATTR_ENUM_MAX,
+	IOV_ATTR_MAX = __IOV_ATTR_ENUM_MAX - 1,
+};
+
+/**
+ * enum iovnl_port_profile_status - IOV_ATTR_PORT_PROFILE_STATUS status
+ * return codes
+ *
+ * @IOV_PORT_PROFILE_STATUS_UNKNOWN: unspecified to catch errors
+ * @IOV_PORT_PROFILE_STATUS_SUCCESS: port-profile applied successfully
+ * @IOV_PORT_PROFILE_STATUS_ERROR: port-profile setting had error
+ * @IOV_PORT_PROFILE_STATUS_INPROGRESS: port-profile setting in-progress
+ */
+enum iovnl_port_profile_status {
+	IOV_PORT_PROFILE_STATUS_UNKNOWN,
+	IOV_PORT_PROFILE_STATUS_SUCCESS,
+	IOV_PORT_PROFILE_STATUS_ERROR,
+	IOV_PORT_PROFILE_STATUS_INPROGRESS,
+};
+
+#endif /* __LINUX_IOVNL_H__ */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 649a025..b531b0d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -50,6 +50,7 @@
 #ifdef CONFIG_DCB
 #include <net/dcbnl.h>
 #endif
+#include <net/iovnl.h>
 
 struct vlan_group;
 struct netpoll_info;
@@ -1048,6 +1049,9 @@ struct net_device {
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
 
+	/* IOV netlink ops */
+	const struct iovnl_ops *iovnl_ops;
+
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	/* max exchange id for FCoE LRO by ddp */
 	unsigned int		fcoe_ddp_xid;
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index d1c7c90..aafadf7 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -113,6 +113,11 @@ enum {
 	RTM_SETDCB,
 #define RTM_SETDCB RTM_SETDCB
 
+	RTM_GETIOV = 82,
+#define RTM_GETIOV RTM_GETIOV
+	RTM_SETIOV,
+#define RTM_SETIOV RTM_SETIOV
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
diff --git a/include/net/iovnl.h b/include/net/iovnl.h
new file mode 100644
index 0000000..c353eee
--- /dev/null
+++ b/include/net/iovnl.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __NET_IOVNL_H__
+#define __NET_IOVNL_H__
+
+/*
+ * Ops struct for the netlink callbacks.  Used by IOVNL-enabled drivers through
+ * the netdevice struct.
+ */
+struct iovnl_ops {
+	int (*set_port_profile)(struct net_device *, struct net_device *,
+		char *, u8 *, char *, char *);
+	int (*unset_port_profile)(struct net_device *, struct net_device *);
+	int (*get_port_profile_status)(struct net_device *,
+		struct net_device *);
+	int (*set_mac_vlan)(struct net_device *, struct net_device *,
+		u8 *, u16);
+};
+
+#endif /* __NET_IOVNL_H__ */
diff --git a/net/Kconfig b/net/Kconfig
index 0d68b40..aca5de0 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -203,6 +203,7 @@ source "net/phonet/Kconfig"
 source "net/ieee802154/Kconfig"
 source "net/sched/Kconfig"
 source "net/dcb/Kconfig"
+source "net/iovnl/Kconfig"
 
 config RPS
 	boolean
diff --git a/net/Makefile b/net/Makefile
index cb7bdc1..23589e9 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -61,6 +61,9 @@ obj-$(CONFIG_CAIF)		+= caif/
 ifneq ($(CONFIG_DCB),)
 obj-y				+= dcb/
 endif
+ifneq ($(CONFIG_IOVNL),)
+obj-y				+= iovnl/
+endif
 obj-y				+= ieee802154/
 
 ifeq ($(CONFIG_NET),y)
diff --git a/net/iovnl/Kconfig b/net/iovnl/Kconfig
new file mode 100644
index 0000000..4548417
--- /dev/null
+++ b/net/iovnl/Kconfig
@@ -0,0 +1,10 @@
+config IOVNL
+	tristate "IOV rtnetlink support"
+	default n
+	---help---
+	  This enables support for configuring IOV
+	  on Ethernet adapters via rtnetlink.  Say 'Y'
+	  if you have an Ethernet adapter which supports network
+	  configuration using IOV rtnetlink.
+
+	  If unsure, say N.
diff --git a/net/iovnl/Makefile b/net/iovnl/Makefile
new file mode 100644
index 0000000..9256d01
--- /dev/null
+++ b/net/iovnl/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_IOVNL) += iovnl.o
diff --git a/net/iovnl/iovnl.c b/net/iovnl/iovnl.c
new file mode 100644
index 0000000..ce9db50
--- /dev/null
+++ b/net/iovnl/iovnl.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/iovnl.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
+#include <net/iovnl.h>
+#include <net/sock.h>
+
+MODULE_AUTHOR("Roopa Prabhu <roprabhu@cisco.com, "
+	"Scott Feldman <scofeldm@cisco.com>");
+MODULE_DESCRIPTION("IOV netlink");
+MODULE_LICENSE("GPL");
+
+/* IOVNL netlink attributes policy */
+static const struct nla_policy iovnl_rtnl_policy[IOV_ATTR_MAX + 1] = {
+	[IOV_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+	[IOV_ATTR_VF_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },
+	[IOV_ATTR_PORT_PROFILE] =  { .type = NLA_NUL_STRING, .len = 32 },
+	[IOV_ATTR_CLIENT_NAME] = { .type = NLA_NUL_STRING, .len = 32 },
+	[IOV_ATTR_HOST_UUID] = { .type = NLA_NUL_STRING, .len = 64 },
+	[IOV_ATTR_PORT_PROFILE_STATUS] = { .type = NLA_U8 },
+	[IOV_ATTR_MAC_ADDR] = { .len = 6 },
+	[IOV_ATTR_VLAN] = { .type = NLA_U16 },
+	[IOV_ATTR_STATUS] = { .type = NLA_U8 },
+};
+
+/* standard netlink reply call */
+static int iovnl_reply(u8 value, u8 event, u8 cmd, u8 attr, u32 pid,
+	u32 seq, u16 flags)
+{
+	struct sk_buff *skb;
+	struct iovnlmsg *iov;
+	struct nlmsghdr *nlh;
+	int ret = -EINVAL;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ret;
+
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*iov), flags);
+
+	iov = NLMSG_DATA(nlh);
+	iov->family = AF_UNSPEC;
+	iov->cmd = cmd;
+	iov->pad = 0;
+
+	ret = nla_put_u8(skb, attr, value);
+	if (ret)
+		goto err;
+
+	/* end the message, assign the nlmsg_len. */
+	nlmsg_end(skb, nlh);
+	ret = rtnl_unicast(skb, &init_net, pid);
+	if (ret)
+		return -EINVAL;
+
+	return 0;
+nlmsg_failure:
+err:
+	kfree_skb(skb);
+	return ret;
+}
+
+static int iovnl_get_port_profile_status(struct net_device *dev,
+	struct net_device *vf_dev, u32 pid, u32 seq, u16 flags)
+{
+	int ret;
+
+	if (!dev->iovnl_ops->get_port_profile_status)
+		return -EINVAL;
+
+	ret = dev->iovnl_ops->get_port_profile_status(dev, vf_dev);
+
+	return  iovnl_reply(ret, RTM_GETIOV,
+		IOV_CMD_GET_PORT_PROFILE_STATUS, IOV_ATTR_PORT_PROFILE_STATUS,
+		pid, seq, flags);
+}
+
+
+static int iovnl_set_port_profile(struct net_device *dev,
+	struct net_device *vf_dev, struct nlattr **tb,
+	u32 pid, u32 seq, u16 flags)
+{
+	int i, ret;
+	char *port_profile = NULL;
+	u8 *mac_addr = NULL;
+	char *client_name = NULL;
+	char *host_uuid = NULL;
+
+	if (!tb[IOV_ATTR_PORT_PROFILE] || !dev->iovnl_ops->set_port_profile)
+		return -EINVAL;
+
+	for (i = 0; i <= IOV_ATTR_MAX; i++) {
+		if (!tb[i])
+			continue;
+		switch (tb[i]->nla_type) {
+		case IOV_ATTR_PORT_PROFILE:
+			port_profile = nla_data(tb[i]);
+			break;
+		case IOV_ATTR_MAC_ADDR:
+			mac_addr = nla_data(tb[i]);
+			break;
+		case IOV_ATTR_CLIENT_NAME:
+			client_name = nla_data(tb[i]);
+			break;
+		case IOV_ATTR_HOST_UUID:
+			host_uuid = nla_data(tb[i]);
+			break;
+		}
+	}
+
+	ret = dev->iovnl_ops->set_port_profile(dev, vf_dev,
+		port_profile, mac_addr, client_name, host_uuid);
+
+	return iovnl_reply(ret, RTM_SETIOV, IOV_CMD_SET_PORT_PROFILE,
+		IOV_ATTR_STATUS, pid, seq, flags);
+}
+
+static int iovnl_set_mac_vlan(struct net_device *dev,
+	struct net_device *vf_dev, struct nlattr **tb,
+	u32 pid, u32 seq, u16 flags)
+{
+	int i, ret;
+	u8 *mac_addr = NULL;
+	u16 vlan = 0;
+
+	if (!dev->iovnl_ops->set_mac_vlan)
+		return -EINVAL;
+
+	for (i = 0; i <= IOV_ATTR_MAX; i++) {
+		if (!tb[i])
+			continue;
+		switch (tb[i]->nla_type) {
+		case IOV_ATTR_MAC_ADDR:
+			mac_addr = nla_data(tb[i]);
+			break;
+		case IOV_ATTR_VLAN:
+			vlan = nla_get_u16(tb[i]);
+			break;
+		}
+	}
+
+	ret = dev->iovnl_ops->set_mac_vlan(dev, vf_dev,
+		mac_addr, vlan);
+
+	return iovnl_reply(ret, RTM_SETIOV, IOV_CMD_SET_MAC_VLAN,
+		IOV_ATTR_STATUS, pid, seq, flags);
+}
+
+static int iovnl_unset_port_profile(struct net_device *dev,
+	struct net_device *vf_dev, struct nlattr **tb,
+	u32 pid, u32 seq, u16 flags)
+{
+	int ret;
+
+	if (!dev->iovnl_ops->unset_port_profile)
+		return -EINVAL;
+
+	ret = dev->iovnl_ops->unset_port_profile(dev, vf_dev);
+
+	return iovnl_reply(ret, RTM_SETIOV, IOV_CMD_UNSET_PORT_PROFILE,
+		IOV_ATTR_STATUS, pid, seq, flags);
+}
+
+static int iovnl_doit(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct net_device *dev;
+	struct net_device *vf_dev = NULL;
+	struct iovnlmsg  *iov = (struct iovnlmsg *)NLMSG_DATA(nlh);
+	struct nlattr *tb[IOV_ATTR_MAX + 1];
+	u32 pid = skb ? NETLINK_CB(skb).pid : 0;
+	int ret;
+
+	if (!net_eq(net, &init_net))
+		return -EINVAL;
+
+	ret = nlmsg_parse(nlh, sizeof(*iov), tb, IOV_ATTR_MAX,
+		iovnl_rtnl_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[IOV_ATTR_IFNAME])
+		return -EINVAL;
+
+	dev = dev_get_by_name(&init_net, nla_data(tb[IOV_ATTR_IFNAME]));
+	if (!dev)
+		return -EINVAL;
+
+	if (tb[IOV_ATTR_VF_IFNAME])
+		vf_dev = dev_get_by_name(&init_net,
+			nla_data(tb[IOV_ATTR_VF_IFNAME]));
+
+	if (!dev->iovnl_ops)
+		goto errout;
+
+	switch (iov->cmd) {
+	case IOV_CMD_SET_PORT_PROFILE:
+		ret = iovnl_set_port_profile(dev, vf_dev,
+			tb, pid, nlh->nlmsg_seq, nlh->nlmsg_flags);
+		goto out;
+	case IOV_CMD_UNSET_PORT_PROFILE:
+		ret = iovnl_unset_port_profile(dev, vf_dev,
+			tb, pid, nlh->nlmsg_seq, nlh->nlmsg_flags);
+		goto out;
+	case IOV_CMD_GET_PORT_PROFILE_STATUS:
+		ret = iovnl_get_port_profile_status(dev, vf_dev,
+			pid, nlh->nlmsg_seq, nlh->nlmsg_flags);
+		goto out;
+	case IOV_CMD_SET_MAC_VLAN:
+		ret = iovnl_set_mac_vlan(dev, vf_dev,
+			tb, pid, nlh->nlmsg_seq, nlh->nlmsg_flags);
+		goto out;
+	default:
+		goto errout;
+	}
+errout:
+	ret = -EINVAL;
+out:
+	dev_put(dev);
+	if (vf_dev)
+		dev_put(vf_dev);
+
+	return ret;
+}
+
+static int __init iovnl_init(void)
+{
+	rtnl_register(PF_UNSPEC, RTM_GETIOV, iovnl_doit, NULL);
+	rtnl_register(PF_UNSPEC, RTM_SETIOV, iovnl_doit, NULL);
+
+	return 0;
+}
+module_init(iovnl_init);
+
+static void __exit iovnl_exit(void)
+{
+	rtnl_unregister(PF_UNSPEC, RTM_GETIOV);
+	rtnl_unregister(PF_UNSPEC, RTM_SETIOV);
+}
+module_exit(iovnl_exit);


^ permalink raw reply related

* [net-next PATCH 0/2] iovnl netlink ops + enic dynamic vnics
From: Scott Feldman @ 2010-04-19 19:18 UTC (permalink / raw)
  To: davem; +Cc: netdev, chrisw

Patch 1/2 adds new I/O Virtualization netlink ops:

  IOV netlink (IOVNL) adds I/O Virtualization control support to a master
  device (MD) netdev interface.  The MD (e.g. SR-IOV PF) will set/get
  control settings on behalf of a slave netdevice (e.g. SR-IOV VF).  The
  design allows for the case where master and slave are the
  same netdev interface.

  The ops currently defined are:
	set_mac_vlan: set mac+vlan on VF
	set_port_profile: set port-profile on VF
	unset_port_profile: unset port-profile on VF
	get_port_profile_status: get port-profile status on VF

Patch 2/2 adds IOV netlink ops support to enic dynamic vnics:

  Add enic iovnl ops to support setting port-profile for dynamic vnics.  Enic
  dynamic vnics are just like normal enic eth vnics except dynamic vnics require
  an extra configuration step to assign a port-profile identifier to the
  interface before the interface is useable. Once assigned, link comes up
  on the interface and is ready for I/O.  The port-profile is used to configure
  the network port assigned to the interface.  The network port configuration
  includes VLAN membership, QoS policies, and port security settings typical
  of a data center network.

Signed-off-by: Scott Feldman <scofeldm@cisco.com>
Signed-off-by: Roopa Prabhu<roprabhu@cisco.com>

^ permalink raw reply

* Re: tip: origin tree build failure, [patch] fix isdn/gigaset build failure
From: David Miller @ 2010-04-19 19:06 UTC (permalink / raw)
  To: torvalds; +Cc: mingo, tilman, akpm, netdev, linux-kernel
In-Reply-To: <alpine.LFD.2.00.1004191200050.14500@i5.linux-foundation.org>

From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 19 Apr 2010 12:00:49 -0700 (PDT)

> On Mon, 19 Apr 2010, Linus Torvalds wrote:
>> 
>> Maybe add the #include <linux/sched.h> into gigaset.h, instead of 
>> common.c?
> 
> This compiled for me, although the only thing I tried was just turning all 
> the gigaset options to 'y'. Maybe some other config doesn't work. So I 
> committed it as likely to fix things.

Thanks Linus.

Ingo, let us know if there are still any problems in your
build tests.

^ permalink raw reply

* Re: tip: origin tree build failure, [patch] fix isdn/gigaset build failure
From: Linus Torvalds @ 2010-04-19 19:00 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: David Miller, tilman, akpm, netdev, linux-kernel
In-Reply-To: <alpine.LFD.2.00.1004191112550.14500@i5.linux-foundation.org>



On Mon, 19 Apr 2010, Linus Torvalds wrote:
> 
> Maybe add the #include <linux/sched.h> into gigaset.h, instead of 
> common.c?

This compiled for me, although the only thing I tried was just turning all 
the gigaset options to 'y'. Maybe some other config doesn't work. So I 
committed it as likely to fix things.

		Linus

^ permalink raw reply

* Re: tip: origin tree build failure, [patch] fix isdn/gigaset build failure
From: Linus Torvalds @ 2010-04-19 18:14 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: David Miller, tilman, akpm, netdev, linux-kernel
In-Reply-To: <20100419180520.GA9244@elte.hu>



On Mon, 19 Apr 2010, Ingo Molnar wrote:
> 
> Note, i just found that my patch is not enough as we fail the build elsewhere as well:

Maybe add the #include <linux/sched.h> into gigaset.h, instead of 
common.c?

IOW..

		Linus
---
 drivers/isdn/gigaset/gigaset.h |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/isdn/gigaset/gigaset.h b/drivers/isdn/gigaset/gigaset.h
index d32efb6..05947f9 100644
--- a/drivers/isdn/gigaset/gigaset.h
+++ b/drivers/isdn/gigaset/gigaset.h
@@ -20,6 +20,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/ctype.h>

^ permalink raw reply related

* Re: tip: origin tree build failure, [patch] fix isdn/gigaset build failure
From: Ingo Molnar @ 2010-04-19 18:05 UTC (permalink / raw)
  To: David Miller; +Cc: tilman, torvalds, akpm, netdev, linux-kernel
In-Reply-To: <20100419.103304.35342954.davem@davemloft.net>


* David Miller <davem@davemloft.net> wrote:

> From: Ingo Molnar <mingo@elte.hu>
> Date: Mon, 19 Apr 2010 19:27:31 +0200
> 
> > Introduced by commit b91ecb00 that got pushed out today. That change removed 
> > an implicit sched.h inclusion that came in via slab.h.
> > 
> > The patch below fixes it by adding the sched.h dependency.
> 
> Thanks Ingo, Linus please apply:
> 
> Acked-by: David S. Miller <davem@davemloft.net>

Note, i just found that my patch is not enough as we fail the build elsewhere as well:

drivers/isdn/gigaset/proc.c:52: error: 'TASK_UNINTERRUPTIBLE' undeclared (first use in this function)
drivers/isdn/gigaset/proc.c:52: error: (Each undeclared identifier is reported only once
drivers/isdn/gigaset/proc.c:52: error: for each function it appears in.)
drivers/isdn/gigaset/proc.c:52: error: implicit declaration of function 'schedule'
drivers/isdn/gigaset/interface.c:49: error: 'TASK_UNINTERRUPTIBLE' undeclared (first use in this function)
drivers/isdn/gigaset/interface.c:49: error: (Each undeclared identifier is reported only once
drivers/isdn/gigaset/interface.c:49: error: for each function it appears in.)
drivers/isdn/gigaset/interface.c:49: error: implicit declaration of function 'schedule'
drivers/isdn/gigaset/interface.c:83: error: 'TASK_UNINTERRUPTIBLE' undeclared (first use in this function)
drivers/isdn/gigaset/ev-layer.c:981: error: 'TASK_NORMAL' undeclared (first use in this function)
drivers/isdn/gigaset/ev-layer.c:981: error: (Each undeclared identifier is reported only once
drivers/isdn/gigaset/ev-layer.c:981: error: for each function it appears in.)
drivers/isdn/gigaset/ev-layer.c:1001: error: 'TASK_NORMAL' undeclared (first use in this function)
drivers/isdn/gigaset/ev-layer.c:1495: error: 'TASK_NORMAL' undeclared (first use in this function)

i'd suggest a revert of b91ecb00 instead.

	Ingo

^ permalink raw reply

* Re: tip: origin tree build failure, [patch] fix isdn/gigaset build failure
From: David Miller @ 2010-04-19 17:33 UTC (permalink / raw)
  To: mingo; +Cc: tilman, torvalds, akpm, netdev, linux-kernel
In-Reply-To: <20100419172731.GA4358@elte.hu>

From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 19 Apr 2010 19:27:31 +0200

> Introduced by commit b91ecb00 that got pushed out today. That change removed 
> an implicit sched.h inclusion that came in via slab.h.
> 
> The patch below fixes it by adding the sched.h dependency.

Thanks Ingo, Linus please apply:

Acked-by: David S. Miller <davem@davemloft.net>

> 	Ingo
> 
> diff --git a/drivers/isdn/gigaset/common.c b/drivers/isdn/gigaset/common.c
> index f6f45f2..a3aa17f 100644
> --- a/drivers/isdn/gigaset/common.c
> +++ b/drivers/isdn/gigaset/common.c
> @@ -16,6 +16,7 @@
>  #include "gigaset.h"
>  #include <linux/module.h>
>  #include <linux/moduleparam.h>
> +#include <linux/sched.h>
>  
>  /* Version Information */
>  #define DRIVER_AUTHOR "Hansjoerg Lipp <hjlipp@web.de>, Tilman Schmidt <tilman@imap.cc>, Stefan Eilers"

^ permalink raw reply

* tip: origin tree build failure, [patch] fix isdn/gigaset build failure
From: Ingo Molnar @ 2010-04-19 17:27 UTC (permalink / raw)
  To: David Miller, Tilman Schmidt; +Cc: torvalds, akpm, netdev, linux-kernel
In-Reply-To: <20100419.003857.260094283.davem@davemloft.net>


* David Miller <davem@davemloft.net> wrote:

> Tilman Schmidt (1):
>       gigaset: include cleanup cleanup
> 
>  drivers/isdn/gigaset/bas-gigaset.c       |    5 -----
>  drivers/isdn/gigaset/capi.c              |    2 --
>  drivers/isdn/gigaset/common.c            |    2 --
>  drivers/isdn/gigaset/gigaset.h           |    2 +-
>  drivers/isdn/gigaset/i4l.c               |    1 -
>  drivers/isdn/gigaset/interface.c         |    1 -
>  drivers/isdn/gigaset/proc.c              |    1 -
>  drivers/isdn/gigaset/ser-gigaset.c       |    3 ---
>  drivers/isdn/gigaset/usb-gigaset.c       |    4 ----

-tip testing triggered the following build failure (x86 allyesconfig):

drivers/isdn/gigaset/common.c: In function 'setflags':
drivers/isdn/gigaset/common.c:99: error: implicit declaration of function 'set_current_state'
drivers/isdn/gigaset/common.c:99: error: 'TASK_INTERRUPTIBLE' undeclared (first use in this function)
drivers/isdn/gigaset/common.c:99: error: (Each undeclared identifier is reported only once
drivers/isdn/gigaset/common.c:99: error: for each function it appears in.)
drivers/isdn/gigaset/common.c:100: error: implicit declaration of function 'schedule_timeout'
drivers/isdn/gigaset/common.c: In function 'cleanup_cs':
drivers/isdn/gigaset/common.c:900: error: 'TASK_INTERRUPTIBLE' undeclared (first use in this function)
drivers/isdn/gigaset/common.c: In function 'gigaset_start':
drivers/isdn/gigaset/common.c:942: error: 'TASK_UNINTERRUPTIBLE' undeclared (first use in this function)
drivers/isdn/gigaset/common.c:942: error: implicit declaration of function 'schedule'
drivers/isdn/gigaset/common.c: In function 'gigaset_shutdown':
drivers/isdn/gigaset/common.c:978: error: 'TASK_UNINTERRUPTIBLE' undeclared (first use in this function)
drivers/isdn/gigaset/common.c: In function 'gigaset_stop':
drivers/isdn/gigaset/common.c:1005: error: 'TASK_UNINTERRUPTIBLE' undeclared (first use in this function)
make[1]: *** [drivers/isdn/gigaset/common.o] Error 1
make: *** [drivers/isdn/gigaset/common.o] Error 2

Introduced by commit b91ecb00 that got pushed out today. That change removed 
an implicit sched.h inclusion that came in via slab.h.

The patch below fixes it by adding the sched.h dependency.

	Ingo

diff --git a/drivers/isdn/gigaset/common.c b/drivers/isdn/gigaset/common.c
index f6f45f2..a3aa17f 100644
--- a/drivers/isdn/gigaset/common.c
+++ b/drivers/isdn/gigaset/common.c
@@ -16,6 +16,7 @@
 #include "gigaset.h"
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/sched.h>
 
 /* Version Information */
 #define DRIVER_AUTHOR "Hansjoerg Lipp <hjlipp@web.de>, Tilman Schmidt <tilman@imap.cc>, Stefan Eilers"

^ permalink raw reply related

* Re: [PATCH RFC]: soreuseport: Bind multiple sockets to same port
From: Eric Dumazet @ 2010-04-19 17:16 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, netdev
In-Reply-To: <m2z65634d661004190838hd16ab01dpa074ce6aeb3572e8@mail.gmail.com>

Le lundi 19 avril 2010 à 08:38 -0700, Tom Herbert a écrit :

> Calling it a nightmare may be a little strong.  It is true that this
> could create long chains that need to be walked, but this might be
> done with good cache locality of the structures.  In any case, the
> lock contention seems to overshadow the cost of this; we were able to
> increase max number of DNS queries/sec by about 60% (I will try to
> publish some numbers this week).
> 

I have no doubt this patch increases performance, but I think it's not a
long-term solution. We can do better ;)


> >
> I agree that CPU awareness is desirable, but I'm really hesitant to
> resort to pinning; this can become pretty tangled on a shared server
> running a bunch of different applications-- would be nice if the
> kernel can just figure out the right thing to do :-)
> 

OK, I can understand this, but please use an array of sockets bound to the
same tuple, so that the lookup stays constant, regardless of the number of
sockets. The UDP fast path is a sensitive area for financial applications.

Once the anchor is found in the normal UDP hash table, the choice of a random
target in its array is O(1) too (you could use skb->rxhash if not null)

Hmm, maybe we could even use the same mechanism for multicast, since we
currently perform a very expensive loop.




^ permalink raw reply

* Re: [PATCH net-next-2.6] rps: shortcut net_rps_action()
From: Tom Herbert @ 2010-04-19 16:02 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Changli Gao, David Miller, netdev
In-Reply-To: <1271689653.3845.73.camel@edumazet-laptop>

>
> [PATCH net-next-2.6] rps: shortcut net_rps_action()
>
> net_rps_action() is a bit expensive on NR_CPUS=64..4096 kernels, even if
> RPS is not active.
>
> Tom Herbert used two bitmasks to hold information needed to send IPI,
> but a single LIFO list seems more appropriate.
>
Yes, this patch is an improvement over that.

> Move all RPS logic into net_rps_action() to cleanup net_rx_action() code
> (remove two ifdefs)
>
> Move rps_remote_softirq_cpus into softnet_data to share its first cache
> line, filling an existing hole.
>
> In a future patch, we could call net_rps_action() from process_backlog()
> to make sure we send IPI before handling this cpu backlog.
>
Yes.  I did some quick experiments last night and there does seem to
be some gains in doing this.

> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  include/linux/netdevice.h |    9 ++--
>  net/core/dev.c            |   79 ++++++++++++++----------------------
>  2 files changed, 38 insertions(+), 50 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 649a025..83ab3da 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1381,17 +1381,20 @@ static inline int unregister_gifconf(unsigned int family)
>  }
>
>  /*
> - * Incoming packets are placed on per-cpu queues so that
> - * no locking is needed.
> + * Incoming packets are placed on per-cpu queues
>  */
>  struct softnet_data {
>        struct Qdisc            *output_queue;
>        struct list_head        poll_list;
>        struct sk_buff          *completion_queue;
>
> -       /* Elements below can be accessed between CPUs for RPS */
>  #ifdef CONFIG_RPS
> +       struct softnet_data     *rps_ipi_list;
> +
> +       /* Elements below can be accessed between CPUs for RPS */
>        struct call_single_data csd ____cacheline_aligned_in_smp;
> +       struct softnet_data     *rps_ipi_next;
> +       unsigned int            cpu;
>        unsigned int            input_queue_head;
>  #endif
>        struct sk_buff_head     input_pkt_queue;
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 7abf959..f6ff2cf 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2346,21 +2346,6 @@ done:
>        return cpu;
>  }
>
> -/*
> - * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled
> - * to be sent to kick remote softirq processing.  There are two masks since
> - * the sending of IPIs must be done with interrupts enabled.  The select field
> - * indicates the current mask that enqueue_backlog uses to schedule IPIs.
> - * select is flipped before net_rps_action is called while still under lock,
> - * net_rps_action then uses the non-selected mask to send the IPIs and clears
> - * it without conflicting with enqueue_backlog operation.
> - */
> -struct rps_remote_softirq_cpus {
> -       cpumask_t mask[2];
> -       int select;
> -};
> -static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus);
> -
>  /* Called from hardirq (IPI) context */
>  static void trigger_softirq(void *data)
>  {
> @@ -2403,10 +2388,12 @@ enqueue:
>                if (napi_schedule_prep(&queue->backlog)) {
>  #ifdef CONFIG_RPS
>                        if (cpu != smp_processor_id()) {
> -                               struct rps_remote_softirq_cpus *rcpus =
> -                                   &__get_cpu_var(rps_remote_softirq_cpus);
> +                               struct softnet_data *myqueue;
> +
> +                               myqueue = &__get_cpu_var(softnet_data);
> +                               queue->rps_ipi_next = myqueue->rps_ipi_list;
> +                               myqueue->rps_ipi_list = queue;
>
> -                               cpu_set(cpu, rcpus->mask[rcpus->select]);
>                                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
>                                goto enqueue;
>                        }
> @@ -2911,7 +2898,9 @@ int netif_receive_skb(struct sk_buff *skb)
>  }
>  EXPORT_SYMBOL(netif_receive_skb);
>
> -/* Network device is going away, flush any packets still pending  */
> +/* Network device is going away, flush any packets still pending
> + * Called with irqs disabled.
> + */
>  static void flush_backlog(void *arg)
>  {
>        struct net_device *dev = arg;
> @@ -3340,24 +3329,33 @@ void netif_napi_del(struct napi_struct *napi)
>  }
>  EXPORT_SYMBOL(netif_napi_del);
>
> -#ifdef CONFIG_RPS
>  /*
> - * net_rps_action sends any pending IPI's for rps.  This is only called from
> - * softirq and interrupts must be enabled.
> + * net_rps_action sends any pending IPI's for rps.
> + * Note: called with local irq disabled, but exits with local irq enabled.
>  */
> -static void net_rps_action(cpumask_t *mask)
> +static void net_rps_action(void)
>  {
> -       int cpu;
> +#ifdef CONFIG_RPS
> +       struct softnet_data *locqueue = &__get_cpu_var(softnet_data);
> +       struct softnet_data *remqueue = locqueue->rps_ipi_list;
>
> -       /* Send pending IPI's to kick RPS processing on remote cpus. */
> -       for_each_cpu_mask_nr(cpu, *mask) {
> -               struct softnet_data *queue = &per_cpu(softnet_data, cpu);
> -               if (cpu_online(cpu))
> -                       __smp_call_function_single(cpu, &queue->csd, 0);
> -       }
> -       cpus_clear(*mask);
> -}
> +       if (remqueue) {
> +               locqueue->rps_ipi_list = NULL;
> +
> +               local_irq_enable();
> +
> +               /* Send pending IPI's to kick RPS processing on remote cpus. */
> +               while (remqueue) {
> +                       struct softnet_data *next = remqueue->rps_ipi_next;
> +                       if (cpu_online(remqueue->cpu))
> +                               __smp_call_function_single(remqueue->cpu,
> +                                                          &remqueue->csd, 0);
> +                       remqueue = next;
> +               }
> +       } else
>  #endif
> +               local_irq_enable();
> +}
>
>  static void net_rx_action(struct softirq_action *h)
>  {
> @@ -3365,10 +3363,6 @@ static void net_rx_action(struct softirq_action *h)
>        unsigned long time_limit = jiffies + 2;
>        int budget = netdev_budget;
>        void *have;
> -#ifdef CONFIG_RPS
> -       int select;
> -       struct rps_remote_softirq_cpus *rcpus;
> -#endif
>
>        local_irq_disable();
>
> @@ -3431,17 +3425,7 @@ static void net_rx_action(struct softirq_action *h)
>                netpoll_poll_unlock(have);
>        }
>  out:
> -#ifdef CONFIG_RPS
> -       rcpus = &__get_cpu_var(rps_remote_softirq_cpus);
> -       select = rcpus->select;
> -       rcpus->select ^= 1;
> -
> -       local_irq_enable();
> -
> -       net_rps_action(&rcpus->mask[select]);
> -#else
> -       local_irq_enable();
> -#endif
> +       net_rps_action();
>
>  #ifdef CONFIG_NET_DMA
>        /*
> @@ -5841,6 +5825,7 @@ static int __init net_dev_init(void)
>                queue->csd.func = trigger_softirq;
>                queue->csd.info = queue;
>                queue->csd.flags = 0;
> +               queue->cpu = i;
>  #endif
>
>                queue->backlog.poll = process_backlog;
>
>
>

^ permalink raw reply

* ep93xx_eth stopps receiving packages
From: Stefan Agner @ 2010-04-19 15:38 UTC (permalink / raw)
  To: netdev; +Cc: buytenh

Hello,

I'm using Linux 2.6.32.9 on a Technologic Systems TS-7250 SBC board, with
the ep93xx_eth driver for networking. On three identical but independent
systems I noticed that the system becomes unreachable after a while. On a serial
terminal I noticed that only the TX counter keeps counting; RX stays where it is,
no matter whether I try to ping from or to the system. Wireshark tells me exactly
that too: I see helpless ARP requests which get answered, but no ICMP. The
system doesn't receive the ARP requests, and just sends another one.
With a simple program which sends small packets at a fast pace I can
reproduce the problem after several seconds (additional CPU load seems to
provoke the problem even more). Removing and replugging the network cable doesn't
solve the problem, but ifup/down does. I don't see any messages in dmesg, and
memory is still available.

Is it a network driver problem or even a network stack problem?

It looks to me like a race condition in the network driver, which can be
triggered by short packets.

What can I do to track the problem further down?

Thanks for hints,
Stefan

----------------------------------------------------------------
This message was sent using IMP, the Internet Messaging Program.


^ permalink raw reply

* Re: [PATCH RFC]: soreuseport: Bind multiple sockets to same port
From: Tom Herbert @ 2010-04-19 15:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev
In-Reply-To: <1271662103.16881.7300.camel@edumazet-laptop>

> High perf DNS server on such machine would have 16 threads, and probably
> 64 threads in two years.
>
> I understand you want 16 UDP sockets to avoid lock contention, but
> __udp4_lib_lookup() becomes a nightmare (It may already be ...)
>
Calling it a nightmare may be a little strong.  It is true that this
could create long chains that need to be walked, but this might be
done with good cache locality of the structures.  In any case, the
lock contention seems to overshadow the cost of this; we were able to
increase max number of DNS queries/sec by about 60% (I will try to
publish some numbers this week).

> My idea was to add a cpu lookup key.
>
> thread0 would use a new setsockopt() option to bind a socket to a
> virtual cpu0. Then do its normal bind( port=53)
>
I agree that CPU awareness is desirable, but I'm really hesitant to
resort to pinning; this can become pretty tangled on a shared server
running a bunch of different applications-- would be nice if the
kernel can just figure out the right thing to do :-)

> ...
>
> threadN would use a new setsockopt() option to bind a socket to a
> virtual cpuN. Then do its normal bind( port=53)
>
> Each thread then do its normal worker loop.
>
> Then, when receiving a frame on cpuN, we would automatically select the
> right socket because its score is higher than others.
>
>
> Another possibility would be to extend socket structure to be able to
> have a dynamically sized queues/locks.
>
>
>
>

^ permalink raw reply

* Re: [PATCH]Add device drivers (GbE, Packet Hub) for Topcliff
From: Ben Hutchings @ 2010-04-19 15:37 UTC (permalink / raw)
  To: Masayuki Ohtake
  Cc: Jonathan Corbet, netdev, andrew.chih.howe.khor, Intel OTC, LKML,
	Wang, Qi, Wang, Yong Y
In-Reply-To: <002301cadfbc$ab2e1490$66f8800a@maildom.okisemi.com>

On Mon, 2010-04-19 at 21:34 +0900, Masayuki Ohtake wrote:
> Hi jon,
> Thanks for your suggestion again.
> I joined the netdev and send my patch to "netdev <netdev@vger.kernel.org>".
> 
> --
> Hello netdev users,
> 
> I developed the device drivers for Linux kernel 2.6.33-1.
> This time, I added the following drivers
>   - GbE device
>   - Packet HUB device
> 
> The GbE and Packet Hub device drivers are related with each other.
> Because I send patch to LKML <linux-kernel@vger.kernel.org> and the netdev
> <netdev@vger.kernel.org> mailing list.
> 
> Would you check them?
> 
> The patch is uploaded to our WEB site in Sourceforge.net,
> Because the patch size was large.
[...]

Patches must be sent directly to the list for review, with few
exceptions.

The size limit for the list is 100 K.  Given that none of your source
files are this large, you should be able to split pch_gbe.patch into 3
or more groups of source files, each within this limit.  The changes to
drivers/net/Kconfig and drivers/net/Makefile should be in the last part.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* [PATCH net-next-2.6] rps: shortcut net_rps_action()
From: Eric Dumazet @ 2010-04-19 15:07 UTC (permalink / raw)
  To: Changli Gao, David Miller, Tom Herbert; +Cc: netdev
In-Reply-To: <1271686957.3845.49.camel@edumazet-laptop>

Le lundi 19 avril 2010 à 16:22 +0200, Eric Dumazet a écrit :

> 
> Hmm, I just read again, and I now remember Tom used a single bitmap,
> then we had to add a second set because of a possible race.
> 
> A list would be enough.
> 

Here is the updated patch, using a single list instead of bitmap

RFC status becomes official patch ;)

Thanks Changli for your array suggestion !


[PATCH net-next-2.6] rps: shortcut net_rps_action()

net_rps_action() is a bit expensive on NR_CPUS=64..4096 kernels, even if
RPS is not active.

Tom Herbert used two bitmasks to hold information needed to send IPI,
but a single LIFO list seems more appropriate.

Move all RPS logic into net_rps_action() to cleanup net_rx_action() code
(remove two ifdefs)

Move rps_remote_softirq_cpus into softnet_data to share its first cache
line, filling an existing hole.

In a future patch, we could call net_rps_action() from process_backlog()
to make sure we send IPI before handling this cpu backlog.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netdevice.h |    9 ++--
 net/core/dev.c            |   79 ++++++++++++++----------------------
 2 files changed, 38 insertions(+), 50 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 649a025..83ab3da 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1381,17 +1381,20 @@ static inline int unregister_gifconf(unsigned int family)
 }
 
 /*
- * Incoming packets are placed on per-cpu queues so that
- * no locking is needed.
+ * Incoming packets are placed on per-cpu queues
  */
 struct softnet_data {
 	struct Qdisc		*output_queue;
 	struct list_head	poll_list;
 	struct sk_buff		*completion_queue;
 
-	/* Elements below can be accessed between CPUs for RPS */
 #ifdef CONFIG_RPS
+	struct softnet_data	*rps_ipi_list;
+
+	/* Elements below can be accessed between CPUs for RPS */
 	struct call_single_data	csd ____cacheline_aligned_in_smp;
+	struct softnet_data	*rps_ipi_next;
+	unsigned int		cpu;
 	unsigned int		input_queue_head;
 #endif
 	struct sk_buff_head	input_pkt_queue;
diff --git a/net/core/dev.c b/net/core/dev.c
index 7abf959..f6ff2cf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2346,21 +2346,6 @@ done:
 	return cpu;
 }
 
-/*
- * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled
- * to be sent to kick remote softirq processing.  There are two masks since
- * the sending of IPIs must be done with interrupts enabled.  The select field
- * indicates the current mask that enqueue_backlog uses to schedule IPIs.
- * select is flipped before net_rps_action is called while still under lock,
- * net_rps_action then uses the non-selected mask to send the IPIs and clears
- * it without conflicting with enqueue_backlog operation.
- */
-struct rps_remote_softirq_cpus {
-	cpumask_t mask[2];
-	int select;
-};
-static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus);
-
 /* Called from hardirq (IPI) context */
 static void trigger_softirq(void *data)
 {
@@ -2403,10 +2388,12 @@ enqueue:
 		if (napi_schedule_prep(&queue->backlog)) {
 #ifdef CONFIG_RPS
 			if (cpu != smp_processor_id()) {
-				struct rps_remote_softirq_cpus *rcpus =
-				    &__get_cpu_var(rps_remote_softirq_cpus);
+				struct softnet_data *myqueue;
+
+				myqueue = &__get_cpu_var(softnet_data);
+				queue->rps_ipi_next = myqueue->rps_ipi_list;
+				myqueue->rps_ipi_list = queue;
 
-				cpu_set(cpu, rcpus->mask[rcpus->select]);
 				__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 				goto enqueue;
 			}
@@ -2911,7 +2898,9 @@ int netif_receive_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb);
 
-/* Network device is going away, flush any packets still pending  */
+/* Network device is going away, flush any packets still pending
+ * Called with irqs disabled.
+ */
 static void flush_backlog(void *arg)
 {
 	struct net_device *dev = arg;
@@ -3340,24 +3329,33 @@ void netif_napi_del(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(netif_napi_del);
 
-#ifdef CONFIG_RPS
 /*
- * net_rps_action sends any pending IPI's for rps.  This is only called from
- * softirq and interrupts must be enabled.
+ * net_rps_action sends any pending IPI's for rps.
+ * Note: called with local irq disabled, but exits with local irq enabled.
  */
-static void net_rps_action(cpumask_t *mask)
+static void net_rps_action(void)
 {
-	int cpu;
+#ifdef CONFIG_RPS
+	struct softnet_data *locqueue = &__get_cpu_var(softnet_data);
+	struct softnet_data *remqueue = locqueue->rps_ipi_list;
 
-	/* Send pending IPI's to kick RPS processing on remote cpus. */
-	for_each_cpu_mask_nr(cpu, *mask) {
-		struct softnet_data *queue = &per_cpu(softnet_data, cpu);
-		if (cpu_online(cpu))
-			__smp_call_function_single(cpu, &queue->csd, 0);
-	}
-	cpus_clear(*mask);
-}
+	if (remqueue) {
+		locqueue->rps_ipi_list = NULL;
+
+		local_irq_enable();
+
+		/* Send pending IPI's to kick RPS processing on remote cpus. */
+		while (remqueue) {
+			struct softnet_data *next = remqueue->rps_ipi_next;
+			if (cpu_online(remqueue->cpu))
+				__smp_call_function_single(remqueue->cpu,
+							   &remqueue->csd, 0);
+			remqueue = next;
+		}
+	} else
 #endif
+		local_irq_enable();
+}
 
 static void net_rx_action(struct softirq_action *h)
 {
@@ -3365,10 +3363,6 @@ static void net_rx_action(struct softirq_action *h)
 	unsigned long time_limit = jiffies + 2;
 	int budget = netdev_budget;
 	void *have;
-#ifdef CONFIG_RPS
-	int select;
-	struct rps_remote_softirq_cpus *rcpus;
-#endif
 
 	local_irq_disable();
 
@@ -3431,17 +3425,7 @@ static void net_rx_action(struct softirq_action *h)
 		netpoll_poll_unlock(have);
 	}
 out:
-#ifdef CONFIG_RPS
-	rcpus = &__get_cpu_var(rps_remote_softirq_cpus);
-	select = rcpus->select;
-	rcpus->select ^= 1;
-
-	local_irq_enable();
-
-	net_rps_action(&rcpus->mask[select]);
-#else
-	local_irq_enable();
-#endif
+	net_rps_action();
 
 #ifdef CONFIG_NET_DMA
 	/*
@@ -5841,6 +5825,7 @@ static int __init net_dev_init(void)
 		queue->csd.func = trigger_softirq;
 		queue->csd.info = queue;
 		queue->csd.flags = 0;
+		queue->cpu = i;
 #endif
 
 		queue->backlog.poll = process_backlog;



^ permalink raw reply related

* Re: [RFC] rps: shortcut net_rps_action()
From: Eric Dumazet @ 2010-04-19 14:22 UTC (permalink / raw)
  To: Changli Gao; +Cc: Tom Herbert, David Miller, netdev
In-Reply-To: <1271683627.3845.44.camel@edumazet-laptop>

Le lundi 19 avril 2010 à 15:27 +0200, Eric Dumazet a écrit :

> This is not true Changli
> 
> Please read again all previous mails about RPS, or the code.
> 

Hmm, I just read again, and I now remember Tom used a single bitmap,
then we had to add a second set because of a possible race.

A list would be enough.




^ permalink raw reply

* Re: [RFC] rps: shortcut net_rps_action()
From: Eric Dumazet @ 2010-04-19 13:27 UTC (permalink / raw)
  To: Changli Gao; +Cc: Tom Herbert, David Miller, netdev
In-Reply-To: <p2u412e6f7f1004190528l65cc6253w8b7f43f2657b551d@mail.gmail.com>

Le lundi 19 avril 2010 à 20:28 +0800, Changli Gao a écrit :
> On Mon, Apr 19, 2010 at 8:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > As several packets can be enqueued for a given cpu, we would need to
> > keep bitmasks.
> > We would have to add one test in enqueue_to_backlog()
> >
> > if (cpu_test_and_set(cpu, mask)) {
> >        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
> >        array[nb++] = cpu;
> > }
> 
>         rps_lock(queue);
>         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
>                 if (queue->input_pkt_queue.qlen) {
> ...
>                 if (napi_schedule_prep(&queue->backlog)) {
> #ifdef CONFIG_RPS
>                         if (cpu != smp_processor_id()) {
>                                 struct rps_remote_softirq_cpus *rcpus =
>                                     &__get_cpu_var(rps_remote_softirq_cpus);
> 
>                                 cpu_set(cpu, rcpus->mask[rcpus->select]);
>                                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
>                                 goto enqueue;
>                         }
> #endif
>                         __napi_schedule(&queue->backlog);
>                 }
> 
> Only the first packet of a softnet.input_pkt_queue may trigger IPI, so
> we don't need to keep bitmasks.
> 

This is not true Changli

Please read again all previous mails about RPS, or the code.




^ permalink raw reply

* Re: rps perfomance WAS(Re: rps: question
From: jamal @ 2010-04-19 12:48 UTC (permalink / raw)
  To: Changli Gao
  Cc: Eric Dumazet, Rick Jones, David Miller, therbert, netdev, robert,
	andi
In-Reply-To: <s2z412e6f7f1004160758l823ea0cah61a409972d4865fe@mail.gmail.com>


Sorry, I didn't respond to you - I was busy setting up before trying
to think a little more about this.

On Fri, 2010-04-16 at 22:58 +0800, Changli Gao wrote:

> >
> > cpu   Total     |rps_recv |rps_ipi
> > -----+----------+---------+---------
> > cpu0 | 002dc7f1 |00000000 |000f4246
> > cpu1 | 002dc804 |000f4240 |00000000
> > -------------------------------------
> >
> > So: cpu0 receive 0x2dc7f1 pkts accummulative over time and
> > redirected to cpu1 (mostly, the extra 5 maybe to leftover since i clear
> > the data) and for the test 0xf4246 times it generated an IPI. It can be
> > seen that total running for CPU1 is 0x2dc804 but in this one run it
> > received 1M packets (0xf4240).
> 
> I remeber you redirected all the traffic from cpu0 to cpu1, and the data shows:
> 
> about 0x2dc7f1 packets are processed, and about 0xf4240 IPI are generated.

If you look at the patch, I am zeroing those stats - so 0xf4240 is only
one test (decimal 1M). I think there is something to what you are
saying; rps_ipi on cpu0 is ambiguous because it counts the number of
times cpu0 softirq was scheduled as well as the number of times cpu0
scheduled other cpus. 
The extra six for cpu0 turn out to be the times an ethernet interrupt
scheduled the cpu0 softirq.

> a single packet is counted twice by CPU0 and CPU1. 

Well, the counts have different meanings; rps_ipi applies to source cpu
activity and rps_recv applies to the destination. For example, if cpu0
found some destination cpu to be empty 6 times in total, and 2 of those
times each were for cpu1, cpu2 and cpu3, then
cpu0: rps_ipi = 6
cpu1: rps_recv = 2
cpu2: rps_recv = 2
cpu3: rps_recv = 2


> If you change RPS setting by:
> 
> echo 1 > ..../rps_cpus
> 
> you will find the total number are doubled.

This is true, but IMO it is deserved and should be double counted.
It is just more fine-grained accounting.
IOW, I am not sure we need your patch because we will lose the
fine-grained accounting - and mine requires more work to be less ambiguous.

cheers,
jamal 


^ permalink raw reply

* Re: [PATCH]Add device drivers (GbE, Packet Hub) for Topcliff
From: Masayuki Ohtake @ 2010-04-19 12:34 UTC (permalink / raw)
  To: Jonathan Corbet, netdev
  Cc: andrew.chih.howe.khor, Intel OTC, LKML, Wang, Qi, Wang, Yong Y
In-Reply-To: <20100416083508.64199657@tpl.lwn.net>

Hi jon,
Thanks for your suggestion again.
I joined the netdev list and am sending my patch to "netdev <netdev@vger.kernel.org>".

--
Hello netdev users,

I developed the device drivers for Linux kernel 2.6.33-1.
This time, I added the following drivers
  - GbE device
  - Packet HUB device

The GbE and Packet Hub device drivers are related to each other.
Therefore I am sending the patch to both LKML <linux-kernel@vger.kernel.org>
and the netdev <netdev@vger.kernel.org> mailing list.

Would you check them?

The patch is uploaded to our web site on Sourceforge.net
because the patch is large.

[Our WEB site in Sourceforge.net.]
http://sourceforge.net/projects/generalembedded/files/
"All Files" -> "Downloads" -> "Dev" -> "kernel 2.6.33-1"
  - pch_gbe.patch  (for GbE device)
  - pch_phub.patch (for Packet HUB device)


[About our development product]
Topcliff is a chip that has many peripherals.
The chip has UART, I2C, SPI, IEEE1588, CAN, Packet HUB, SATA, USB host, USB
device, SDIO, Gigabit Ethernet, GPIO and DMA.

Best regards,
Masayuki Ohtake <masa-korg@dsn.okisemi.com>
----- Original Message ----- 
From: "Jonathan Corbet" <corbet@lwn.net>
To: "Masayuki Ohtake" <masa-korg@dsn.okisemi.com>
Cc: "LKML" <linux-kernel@vger.kernel.org>; "Intel OTC"
<joel.clark@intel.com>; <andrew.chih.howe.khor@intel.com>
Sent: Friday, April 16, 2010 11:35 PM
Subject: Re: [PATCH]Add device drivers (GbE, Packet Hub) for Topcliff


> On Fri, 16 Apr 2010 19:09:04 +0900
> "Masayuki Ohtake" <masa-korg@dsn.okisemi.com> wrote:
>
> > I developed the device drivers for Linux kernel 2.6.33-1.
> > This time, I added the following drivers
> >   - GbE device
> >   - Packet HUB device
> >
> > Would you check them?
>
> Thanks for making your code available.  May I suggest, though, that you
> will get a much better response if you post the patches directly to the
> mailing list?  That is how our process works; it is simply harder to
> review code if you have to do digging through web sites to find it.
>
> Networking-specific patches should be posted to the netdev list as well.
>
> More information on how to post code for review can be found in the
> kernel source tree:
>
> Documentation/HOWTO
> Documentation/development-process
>
> Thanks,
>
> jon
>

^ permalink raw reply

* Re: [PATCH RFC]: soreuseport: Bind multiple sockets to same port
From: jamal @ 2010-04-19 12:31 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tom Herbert, davem, netdev
In-Reply-To: <1271662103.16881.7300.camel@edumazet-laptop>

On Mon, 2010-04-19 at 09:28 +0200, Eric Dumazet wrote:

> High perf DNS server on such machine would have 16 threads, and probably
> 64 threads in two years.

if you dont care about x86, 64 SMT threads is already there
yesterday ;->

> I understand you want 16 UDP sockets to avoid lock contention, but
> __udp4_lib_lookup() becomes a nightmare (It may already be ...)
> 
> My idea was to add a cpu lookup key.

I like this idea better. 
Staring at data i collected over the weekend, I am scratching my head
trying to find some correlation. I see socket flows  bouncing around
CPUs other than what RPS directs to. The scheduler seems to have a mind
of its own. What is clear is if i can localize a flow/socket to a single
cpu i get best performance. RPS, when there is enough load, does better
because of this localization (DaveM made this statement earlier
actually).

I was hoping i could do a connect() + sched_setaffinity() and have RPS
direct that flow to me - but alas even RFS still depends on hashing.
Unless there is an easier way to do this, I was planning to look
at the RPS hashing and manually cook flows which end up on a cpu where 
I do sched_setaffinity()...

> thread0 would use a new setsockopt() option to bind a socket to a
> virtual cpu0. Then do its normal bind( port=53)

So question: Why not tie to sched_setaffinity? i.e at bind time you
lookup what cpu this socket is affined to?

cheers,
jamal


^ permalink raw reply

* Re: [RFC] rps: shortcut net_rps_action()
From: Changli Gao @ 2010-04-19 12:28 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Tom Herbert, David Miller, netdev
In-Reply-To: <1271679244.3845.43.camel@edumazet-laptop>

On Mon, Apr 19, 2010 at 8:14 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> As several packets can be enqueued for a given cpu, we would need to
> keep bitmasks.
> We would have to add one test in enqueue_to_backlog()
>
> if (cpu_test_and_set(cpu, mask)) {
>        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
>        array[nb++] = cpu;
> }

        rps_lock(queue);
        if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
                if (queue->input_pkt_queue.qlen) {
...
                if (napi_schedule_prep(&queue->backlog)) {
#ifdef CONFIG_RPS
                        if (cpu != smp_processor_id()) {
                                struct rps_remote_softirq_cpus *rcpus =
                                    &__get_cpu_var(rps_remote_softirq_cpus);

                                cpu_set(cpu, rcpus->mask[rcpus->select]);
                                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
                                goto enqueue;
                        }
#endif
                        __napi_schedule(&queue->backlog);
                }

Only the first packet of a softnet.input_pkt_queue may trigger IPI, so
we don't need to keep bitmasks.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [RFC] rps: shortcut net_rps_action()
From: Eric Dumazet @ 2010-04-19 12:14 UTC (permalink / raw)
  To: Changli Gao; +Cc: Tom Herbert, David Miller, netdev
In-Reply-To: <k2u412e6f7f1004190248s8ac633beof2475645e799f2ea@mail.gmail.com>

Le lundi 19 avril 2010 à 17:48 +0800, Changli Gao a écrit :
> On Mon, Apr 19, 2010 at 5:37 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > net_rps_action() is a bit expensive on NR_CPUS=64..4096 kernels, even if
> > RPS is not active.
> >
> > I add a flag to scan cpumask only if at least one IPI was scheduled.
> > Even cpumask_weight() might be expensive on some setups, where
> > nr_cpumask_bits could be very big (4096 for example)
> 
> How about using a array to save the cpu IDs. The number of CPUs, to
> which the IPI will be sent, should be small.
> 

Yes it should be small, yet the two arrays would be big enough to make
softnet_data first part use at least two cache lines instead of one,
even in the case we handle one cpu/IPI per net_rps_action()

As several packets can be enqueued for a given cpu, we would need to
keep bitmasks.
We would have to add one test in enqueue_to_backlog()

if (cpu_test_and_set(cpu, mask)) {
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	array[nb++] = cpu;
}




^ permalink raw reply

* [PATCH 2/2] ehea: fix possible DLPAR/mem deadlock
From: Thomas Klein @ 2010-04-19 12:08 UTC (permalink / raw)
  To: David S. Miller
  Cc: Christoph Raisch, Jan-Bernd Themann, linux-kernel, linux-ppc,
	netdev
In-Reply-To: <20080916085746.194c1510@bull.net>

Force serialization of userspace-triggered DLPAR/mem operations

Signed-off-by: Thomas Klein <tklein@de.ibm.com>
---

Patch created against 2.6.34-rc4

diff -Nurp linux-2.6.34-rc4.orig//drivers/net/ehea/ehea.h linux-2.6.34-rc4//drivers/net/ehea/ehea.h
--- linux-2.6.34-rc4.orig//drivers/net/ehea/ehea.h	2010-04-19 11:54:07.000000000 +0200
+++ linux-2.6.34-rc4//drivers/net/ehea/ehea.h	2010-04-19 12:00:14.000000000 +0200
@@ -40,7 +40,7 @@
  #include <asm/io.h>

  #define DRV_NAME	"ehea"
-#define DRV_VERSION	"EHEA_0102"
+#define DRV_VERSION	"EHEA_0103"

  /* eHEA capability flags */
  #define DLPAR_PORT_ADD_REM 1
diff -Nurp linux-2.6.34-rc4.orig//drivers/net/ehea/ehea_main.c linux-2.6.34-rc4//drivers/net/ehea/ehea_main.c
--- linux-2.6.34-rc4.orig//drivers/net/ehea/ehea_main.c	2010-04-19 11:59:11.000000000 +0200
+++ linux-2.6.34-rc4//drivers/net/ehea/ehea_main.c	2010-04-19 11:59:50.000000000 +0200
@@ -2889,7 +2889,6 @@ static void ehea_rereg_mrs(struct work_s
  	int ret, i;
  	struct ehea_adapter *adapter;

-	mutex_lock(&dlpar_mem_lock);
  	ehea_info("LPAR memory changed - re-initializing driver");

  	list_for_each_entry(adapter, &adapter_list, list)
@@ -2959,7 +2958,6 @@ static void ehea_rereg_mrs(struct work_s
  		}
  	ehea_info("re-initializing driver complete");
  out:
-	mutex_unlock(&dlpar_mem_lock);
  	return;
  }

@@ -3542,7 +3540,14 @@ void ehea_crash_handler(void)
  static int ehea_mem_notifier(struct notifier_block *nb,
                               unsigned long action, void *data)
  {
+	int ret = NOTIFY_BAD;
  	struct memory_notify *arg = data;
+
+	if (!mutex_trylock(&dlpar_mem_lock)) {
+		ehea_info("ehea_mem_notifier must not be called parallelized");
+		goto out;
+	}
+
  	switch (action) {
  	case MEM_CANCEL_OFFLINE:
  		ehea_info("memory offlining canceled");
@@ -3551,14 +3556,14 @@ static int ehea_mem_notifier(struct noti
  		ehea_info("memory is going online");
  		set_bit(__EHEA_STOP_XFER, &ehea_driver_flags);
  		if (ehea_add_sect_bmap(arg->start_pfn, arg->nr_pages))
-			return NOTIFY_BAD;
+			goto out_unlock;
  		ehea_rereg_mrs(NULL);
  		break;
  	case MEM_GOING_OFFLINE:
  		ehea_info("memory is going offline");
  		set_bit(__EHEA_STOP_XFER, &ehea_driver_flags);
  		if (ehea_rem_sect_bmap(arg->start_pfn, arg->nr_pages))
-			return NOTIFY_BAD;
+			goto out_unlock;
  		ehea_rereg_mrs(NULL);
  		break;
  	default:
@@ -3566,8 +3571,12 @@ static int ehea_mem_notifier(struct noti
  	}

  	ehea_update_firmware_handles();
+	ret = NOTIFY_OK;

-	return NOTIFY_OK;
+out_unlock:
+	mutex_unlock(&dlpar_mem_lock);
+out:
+	return ret;
  }

  static struct notifier_block ehea_mem_nb = {

^ permalink raw reply

* [PATCH 1/2] ehea: error handling improvement
From: Thomas Klein @ 2010-04-19 12:08 UTC (permalink / raw)
  To: David S. Miller
  Cc: Christoph Raisch, Jan-Bernd Themann, linux-kernel, linux-ppc,
	netdev

Reset a port's resources only if they're actually in an error state

Signed-off-by: Thomas Klein <tklein@de.ibm.com>
---

Patch created against 2.6.34-rc4

diff -Nurp linux-2.6.34-rc4.orig//drivers/net/ehea/ehea_main.c linux-2.6.34-rc4//drivers/net/ehea/ehea_main.c
--- linux-2.6.34-rc4.orig//drivers/net/ehea/ehea_main.c	2010-04-19 11:54:07.000000000 +0200
+++ linux-2.6.34-rc4//drivers/net/ehea/ehea_main.c	2010-04-19 11:55:43.000000000 +0200
@@ -791,11 +791,17 @@ static struct ehea_cqe *ehea_proc_cqes(s
  		cqe_counter++;
  		rmb();
  		if (cqe->status & EHEA_CQE_STAT_ERR_MASK) {
-			ehea_error("Send Completion Error: Resetting port");
+			ehea_error("Bad send completion status=0x%04X",
+				   cqe->status);
+
  			if (netif_msg_tx_err(pr->port))
  				ehea_dump(cqe, sizeof(*cqe), "Send CQE");
-			ehea_schedule_port_reset(pr->port);
-			break;
+
+			if (cqe->status & EHEA_CQE_STAT_RESET_MASK) {
+				ehea_error("Resetting port");
+				ehea_schedule_port_reset(pr->port);
+				break;
+			}
  		}

  		if (netif_msg_tx_done(pr->port))
@@ -901,6 +907,8 @@ static irqreturn_t ehea_qp_aff_irq_handl
  	struct ehea_eqe *eqe;
  	struct ehea_qp *qp;
  	u32 qp_token;
+	u64 resource_type, aer, aerr;
+	int reset_port = 0;

  	eqe = ehea_poll_eq(port->qp_eq);

@@ -910,11 +918,24 @@ static irqreturn_t ehea_qp_aff_irq_handl
  			   eqe->entry, qp_token);

  		qp = port->port_res[qp_token].qp;
-		ehea_error_data(port->adapter, qp->fw_handle);
+
+		resource_type = ehea_error_data(port->adapter, qp->fw_handle,
+						&aer, &aerr);
+
+		if (resource_type == EHEA_AER_RESTYPE_QP) {
+			if ((aer & EHEA_AER_RESET_MASK) ||
+			    (aerr & EHEA_AERR_RESET_MASK))
+				 reset_port = 1;
+		} else
+			reset_port = 1;   /* Reset in case of CQ or EQ error */
+
  		eqe = ehea_poll_eq(port->qp_eq);
  	}

-	ehea_schedule_port_reset(port);
+	if (reset_port) {
+		ehea_error("Resetting port");
+		ehea_schedule_port_reset(port);
+	}

  	return IRQ_HANDLED;
  }
diff -Nurp linux-2.6.34-rc4.orig//drivers/net/ehea/ehea_qmr.c linux-2.6.34-rc4//drivers/net/ehea/ehea_qmr.c
--- linux-2.6.34-rc4.orig//drivers/net/ehea/ehea_qmr.c	2010-04-19 11:54:07.000000000 +0200
+++ linux-2.6.34-rc4//drivers/net/ehea/ehea_qmr.c	2010-04-19 11:56:36.000000000 +0200
@@ -229,14 +229,14 @@ u64 ehea_destroy_cq_res(struct ehea_cq *

  int ehea_destroy_cq(struct ehea_cq *cq)
  {
-	u64 hret;
+	u64 hret, aer, aerr;
  	if (!cq)
  		return 0;

  	hcp_epas_dtor(&cq->epas);
  	hret = ehea_destroy_cq_res(cq, NORMAL_FREE);
  	if (hret == H_R_STATE) {
-		ehea_error_data(cq->adapter, cq->fw_handle);
+		ehea_error_data(cq->adapter, cq->fw_handle, &aer, &aerr);
  		hret = ehea_destroy_cq_res(cq, FORCE_FREE);
  	}

@@ -357,7 +357,7 @@ u64 ehea_destroy_eq_res(struct ehea_eq *

  int ehea_destroy_eq(struct ehea_eq *eq)
  {
-	u64 hret;
+	u64 hret, aer, aerr;
  	if (!eq)
  		return 0;

@@ -365,7 +365,7 @@ int ehea_destroy_eq(struct ehea_eq *eq)

  	hret = ehea_destroy_eq_res(eq, NORMAL_FREE);
  	if (hret == H_R_STATE) {
-		ehea_error_data(eq->adapter, eq->fw_handle);
+		ehea_error_data(eq->adapter, eq->fw_handle, &aer, &aerr);
  		hret = ehea_destroy_eq_res(eq, FORCE_FREE);
  	}

@@ -540,7 +540,7 @@ u64 ehea_destroy_qp_res(struct ehea_qp *

  int ehea_destroy_qp(struct ehea_qp *qp)
  {
-	u64 hret;
+	u64 hret, aer, aerr;
  	if (!qp)
  		return 0;

@@ -548,7 +548,7 @@ int ehea_destroy_qp(struct ehea_qp *qp)

  	hret = ehea_destroy_qp_res(qp, NORMAL_FREE);
  	if (hret == H_R_STATE) {
-		ehea_error_data(qp->adapter, qp->fw_handle);
+		ehea_error_data(qp->adapter, qp->fw_handle, &aer, &aerr);
  		hret = ehea_destroy_qp_res(qp, FORCE_FREE);
  	}

@@ -986,42 +986,45 @@ void print_error_data(u64 *data)
  	if (length > EHEA_PAGESIZE)
  		length = EHEA_PAGESIZE;

-	if (type == 0x8) /* Queue Pair */
+	if (type == EHEA_AER_RESTYPE_QP)
  		ehea_error("QP (resource=%llX) state: AER=0x%llX, AERR=0x%llX, "
  			   "port=%llX", resource, data[6], data[12], data[22]);
-
-	if (type == 0x4) /* Completion Queue */
+	else if (type == EHEA_AER_RESTYPE_CQ)
  		ehea_error("CQ (resource=%llX) state: AER=0x%llX", resource,
  			   data[6]);
-
-	if (type == 0x3) /* Event Queue */
+	else if (type == EHEA_AER_RESTYPE_EQ)
  		ehea_error("EQ (resource=%llX) state: AER=0x%llX", resource,
  			   data[6]);

  	ehea_dump(data, length, "error data");
  }

-void ehea_error_data(struct ehea_adapter *adapter, u64 res_handle)
+u64 ehea_error_data(struct ehea_adapter *adapter, u64 res_handle,
+		    u64 *aer, u64 *aerr)
  {
  	unsigned long ret;
  	u64 *rblock;
+	u64 type = 0;

  	rblock = (void *)get_zeroed_page(GFP_KERNEL);
  	if (!rblock) {
  		ehea_error("Cannot allocate rblock memory.");
-		return;
+		goto out;
  	}

-	ret = ehea_h_error_data(adapter->handle,
-				res_handle,
-				rblock);
+	ret = ehea_h_error_data(adapter->handle, res_handle, rblock);

-	if (ret == H_R_STATE)
-		ehea_error("No error data is available: %llX.", res_handle);
-	else if (ret == H_SUCCESS)
+	if (ret == H_SUCCESS) {
+		type = EHEA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]);
+		*aer = rblock[6];
+		*aerr = rblock[12];
  		print_error_data(rblock);
-	else
+	} else if (ret == H_R_STATE) {
+		ehea_error("No error data available: %llX.", res_handle);
+	} else
  		ehea_error("Error data could not be fetched: %llX", res_handle);

  	free_page((unsigned long)rblock);
+out:
+	return type;
  }
diff -Nurp linux-2.6.34-rc4.orig//drivers/net/ehea/ehea_qmr.h linux-2.6.34-rc4//drivers/net/ehea/ehea_qmr.h
--- linux-2.6.34-rc4.orig//drivers/net/ehea/ehea_qmr.h	2010-04-19 11:54:07.000000000 +0200
+++ linux-2.6.34-rc4//drivers/net/ehea/ehea_qmr.h	2010-04-19 11:57:12.000000000 +0200
@@ -154,6 +154,9 @@ struct ehea_rwqe {
  #define EHEA_CQE_STAT_ERR_IP       0x2000
  #define EHEA_CQE_STAT_ERR_CRC      0x1000

+/* Defines which bad send cqe stati lead to a port reset */
+#define EHEA_CQE_STAT_RESET_MASK   0x0002
+
  struct ehea_cqe {
  	u64 wr_id;		/* work request ID from WQE */
  	u8 type;
@@ -187,6 +190,14 @@ struct ehea_cqe {
  #define EHEA_EQE_SM_MECH_NUMBER  EHEA_BMASK_IBM(48, 55)
  #define EHEA_EQE_SM_PORT_NUMBER  EHEA_BMASK_IBM(56, 63)

+#define EHEA_AER_RESTYPE_QP  0x8
+#define EHEA_AER_RESTYPE_CQ  0x4
+#define EHEA_AER_RESTYPE_EQ  0x3
+
+/* Defines which affiliated errors lead to a port reset */
+#define EHEA_AER_RESET_MASK   0xFFFFFFFFFEFFFFFFULL
+#define EHEA_AERR_RESET_MASK  0xFFFFFFFFFFFFFFFFULL
+
  struct ehea_eqe {
  	u64 entry;
  };
@@ -379,7 +390,8 @@ int ehea_gen_smr(struct ehea_adapter *ad

  int ehea_rem_mr(struct ehea_mr *mr);

-void ehea_error_data(struct ehea_adapter *adapter, u64 res_handle);
+u64 ehea_error_data(struct ehea_adapter *adapter, u64 res_handle,
+		    u64 *aer, u64 *aerr);

  int ehea_add_sect_bmap(unsigned long pfn, unsigned long nr_pages);
  int ehea_rem_sect_bmap(unsigned long pfn, unsigned long nr_pages);

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox