Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH 3/4] net: VSI: Add virtual station interface support
From: John Fastabend @ 2013-09-11 18:47 UTC (permalink / raw)
  To: stephen, bhutchings, ogerlitz
  Cc: vfalico, john.ronciak, netdev, shannon.nelson
In-Reply-To: <20130911184441.26914.10336.stgit@nitbit.x32>

This patch adds support for a new device type, VSI (virtual station
interface). This device type exposes additional net devices, complete
with queues and a MAC/VLAN pair, to the host OS. These devices are
logically stacked on top of a switching/routing component, with the
physical link acting as the downlink to the peer switch.

The hardware on receive path will forward packets to the new VSI
net device using the forwarding database (FDB) already exposed via
the ndo ops ndo_fdb_{add|del|dump}. On transmit the hardware may
use either a VEB or VEPA. In the VEB case traffic may be "switched"
between VSI net devices by the hardware and in VEPA case all traffic
is sent to the adjacent switch. The hardware _should_ expose this
functionality via the ndo_bridge_{set|get}link ndo operations.

This net device should be functionally analogous to an offloaded
macvlan device with the ebridge component offloaded into hardware.

Also notice that for now the ixgbe implementation accompanying this
patch set only supports L2 forwarding. The fdb interfaces could push
L3/L4 forwarding to the hardware for more advanced usages, including
vxlan and other tunnel schemes.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/net/Kconfig       |    9 +++
 drivers/net/Makefile      |    1 
 drivers/net/vsi.c         |  124 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netdevice.h |   27 ++++++++++
 include/uapi/linux/if.h   |    1 
 5 files changed, 162 insertions(+)
 create mode 100644 drivers/net/vsi.c

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index b45b240..19be0fb 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -362,4 +362,13 @@ config VMXNET3
 
 source "drivers/net/hyperv/Kconfig"
 
+config VSI
+	tristate "Virtual Station Interfaces (VSI)"
+	help
+	  This supports chip sets with embedded switching components
+	  and allows creating additional net devices that are
+	  logically slaves of a master net device typically the net
+	  device associated with the physical function. For these
+	  child devices switching occurs in the hardware component.
+
 endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 3fef8a8..3ef1d66 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_VETH) += veth.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
 obj-$(CONFIG_VXLAN) += vxlan.o
 obj-$(CONFIG_NLMON) += nlmon.o
+obj-$(CONFIG_VSI) += vsi.o
 
 #
 # Networking Drivers
diff --git a/drivers/net/vsi.c b/drivers/net/vsi.c
new file mode 100644
index 0000000..e9d39da
--- /dev/null
+++ b/drivers/net/vsi.c
@@ -0,0 +1,124 @@
+/*
+ * VSI - Virtual Station Interface
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Contact Information:
+ * John Fastabend <john.r.fastabend@intel.com>
+ */
+#include <linux/module.h>
+#include <net/rtnetlink.h>
+#include <linux/etherdevice.h>
+
+/* vsi_priv_size - private area size for a VSI net device being created
+ * @src_net: namespace in which the IFLA_LINK ifindex is looked up
+ * @tb: netlink attributes of the newlink request
+ *
+ * Returns the size reported by the ndo_vsi_size op of the device named by
+ * IFLA_LINK, or 0 when IFLA_LINK is absent, the device cannot be found, or
+ * the device does not implement ndo_vsi_size.
+ *
+ * The return type is size_t, so an errno cannot be returned from here: a
+ * negative value would be converted into an enormous allocation size.
+ * vsi_newlink() performs the same lookup and returns -EINVAL/-ENODEV for a
+ * missing or unknown IFLA_LINK.
+ */
+size_t vsi_priv_size(struct net *src_net, struct nlattr *tb[])
+{
+	struct net_device *dev;
+
+	if (!tb[IFLA_LINK])
+		return 0;
+
+	dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
+	if (!dev || !dev->netdev_ops->ndo_vsi_size)
+		return 0;
+
+	return dev->netdev_ops->ndo_vsi_size(dev);
+}
+
+/* vsi_newlink - rtnl newlink handler for the "vsi" link type
+ * @src_net: namespace in which the IFLA_LINK ifindex is looked up
+ * @dev: the VSI net device being created
+ * @tb: generic link attributes; IFLA_LINK is required, IFLA_MTU is optional
+ * @data: link-type specific attributes (unused)
+ *
+ * Hands @dev to the lower device's ndo_vsi_add op, links it as an upper
+ * device of the lower device and registers it. Returns 0 on success or a
+ * negative errno; on failure the steps already taken are undone.
+ */
+static int vsi_newlink(struct net *src_net, struct net_device *dev,
+		       struct nlattr *tb[], struct nlattr *data[])
+{
+	struct net_device *lower;
+	int err;
+
+	if (!tb[IFLA_LINK])
+		return -EINVAL;
+
+	lower = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
+	if (!lower)
+		return -ENODEV;
+
+	/* Any net device can be named by IFLA_LINK; refuse the ones that
+	 * do not implement the VSI add op instead of calling a NULL pointer.
+	 */
+	if (!lower->netdev_ops->ndo_vsi_add)
+		return -EOPNOTSUPP;
+
+	/* Inherit the lower device's MTU by default; a requested MTU must
+	 * not exceed the lower device's MTU.
+	 * NOTE(review): assumes dev->mtu already holds the IFLA_MTU value
+	 * at this point, as the original comparison did.
+	 */
+	if (!tb[IFLA_MTU])
+		dev->mtu = lower->mtu;
+	else if (dev->mtu > lower->mtu)
+		return -EINVAL;
+
+	dev->priv_flags |= IFF_VSI_PORT;
+	err = lower->netdev_ops->ndo_vsi_add(lower, dev);
+	if (err < 0)
+		return err;
+
+	err = netdev_upper_dev_link(lower, dev);
+	if (err)
+		goto destroy_port;
+
+	err = register_netdevice(dev);
+	if (err < 0)
+		goto upper_dev_unlink;
+
+	netif_stacked_transfer_operstate(lower, dev);
+	return 0;
+
+upper_dev_unlink:
+	netdev_upper_dev_unlink(lower, dev);
+destroy_port:
+	/* ndo_vsi_del is optional; skip the teardown call when absent. */
+	if (lower->netdev_ops->ndo_vsi_del)
+		lower->netdev_ops->ndo_vsi_del(dev);
+	return err;
+}
+
+/* vsi_dellink - rtnl dellink handler for the "vsi" link type
+ * @dev: the VSI net device being removed
+ * @head: passed through to unregister_netdevice_queue()
+ *
+ * For every lower device of @dev, calls that device's ndo_vsi_del op (when
+ * implemented) and removes the upper/lower link, then queues @dev for
+ * unregistration.
+ *
+ * NOTE(review): netdev_lower_get_next_dev_rcu() warns when the RCU read lock
+ * is not held, and no rcu_read_lock() is taken here - confirm the locking
+ * context of dellink callers.
+ * NOTE(review): the link being iterated is removed inside the loop body;
+ * confirm the iterator tolerates unlinking the current entry.
+ */
+void vsi_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct net_device *lower;
+	struct list_head *iter;
+
+	netdev_for_each_lower_dev_rcu(dev, lower, iter) {
+		/* ndo_vsi_del is optional on the lower device */
+		if (lower->netdev_ops->ndo_vsi_del)
+			lower->netdev_ops->ndo_vsi_del(dev);
+		netdev_upper_dev_unlink(lower, dev);
+	}
+
+	unregister_netdevice_queue(dev, head);
+}
+
+/* rtnl link operations for the "vsi" link kind. priv_size is a callback so
+ * the private area can be sized per request by vsi_priv_size().
+ */
+static struct rtnl_link_ops vsi_link_ops __read_mostly = {
+	.kind		= "vsi",
+	.priv_size	= vsi_priv_size,
+	.setup		= ether_setup,
+	.newlink	= vsi_newlink,
+	.dellink	= vsi_dellink,
+};
+
+/* Module entry point: register the "vsi" rtnl link type. */
+static int __init vsi_init_module(void)
+{
+	return rtnl_link_register(&vsi_link_ops);
+}
+
+/* Module exit point: unregister the "vsi" rtnl link type. */
+static void __exit vsi_cleanup_module(void)
+{
+	rtnl_link_unregister(&vsi_link_ops);
+}
+
+module_init(vsi_init_module);
+module_exit(vsi_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("John Fastabend <john.r.fastabend@intel.com>");
+MODULE_DESCRIPTION("Virtual Station Interfaces (VSI)");
+MODULE_ALIAS_RTNL_LINK("vsi");
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4d24b38..9817745 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -961,6 +961,24 @@ struct netdev_phys_port_id {
  *	Called by vxlan to notify the driver about a UDP port and socket
  *	address family that vxlan is not listening to anymore. The operation
  *	is protected by the vxlan_net->sock_lock.
+ *
+ * int (*ndo_vsi_add)(struct net_device *lower, struct net_device *dev)
+ *	Called by the virtual station interface (VSI) link type to add a new
+ *	net device 'dev' to an embedded switch where the embedded switch
+ *	management net device is identified by 'lower'. This should return
+ *	0 on success or may return negative error codes. Error codes should
+ *	be used here to signify resource constraints, unsupportable attributes,
+ *	or any other condition which caused the creation to fail.
+ * void (*ndo_vsi_del)(struct net_device *dev)
+ *	Called by the virtual station interface (VSI) link type to remove the
+ *	net device 'dev' from an embedded switch. Drivers may not fail this
+ *	command.
+ * size_t (*ndo_vsi_size)(struct net_device *dev)
+ *	Called by the virtual station interface (VSI) link type to add the
+ *	required private size to a VSI interface that is being created. If
+ *	this routine is not implemented size_t 0 is used. The 'dev' argument
+ *	indicates the embedded switch management interface where the new
+ *	net device is being attached.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1097,6 +1115,10 @@ struct net_device_ops {
 	void			(*ndo_del_vxlan_port)(struct  net_device *dev,
 						      sa_family_t sa_family,
 						      __u16 port);
+	int			(*ndo_vsi_add)(struct net_device *lower,
+					       struct net_device *dev);
+	void			(*ndo_vsi_del)(struct net_device *dev);
+	size_t			(*ndo_vsi_size)(struct net_device *dev);
 };
 
 /*
@@ -2967,6 +2989,11 @@ static inline bool netif_supports_nofcs(struct net_device *dev)
 	return dev->priv_flags & IFF_SUPP_NOFCS;
 }
 
+/* Report whether @dev has the IFF_VSI_PORT private flag set. */
+static inline bool netif_is_vsi_port(struct net_device *dev)
+{
+	return (dev->priv_flags & IFF_VSI_PORT) != 0;
+}
+
 extern struct pernet_operations __net_initdata loopback_net_ops;
 
 /* Logging, debugging and troubleshooting/diagnostic helpers. */
diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
index 1ec407b..9b8d6a0 100644
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -83,6 +83,7 @@
 #define IFF_SUPP_NOFCS	0x80000		/* device supports sending custom FCS */
 #define IFF_LIVE_ADDR_CHANGE 0x100000	/* device supports hardware address
 					 * change when it's running */
+#define IFF_VSI_PORT 0x200000		/* Virtual Station Interface port */
 
 
 #define IF_GET_IFACE	0x0001		/* for querying only */

^ permalink raw reply related

* [RFC PATCH 2/4] net: Add lower dev list helpers
From: John Fastabend @ 2013-09-11 18:46 UTC (permalink / raw)
  To: stephen, bhutchings, ogerlitz
  Cc: vfalico, john.ronciak, netdev, shannon.nelson
In-Reply-To: <20130911184441.26914.10336.stgit@nitbit.x32>

This patch adds helpers to traverse the lower dev lists; these
helpers match the upper dev list implementation.

VSI implementers may use these to track a list of connected netdevs.
This is easier than having drivers do their own accounting.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 include/linux/netdevice.h |    8 ++++++++
 net/core/dev.c            |   25 +++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 041b42a..4d24b38 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2813,6 +2813,8 @@ extern int		bpf_jit_enable;
 extern bool netdev_has_upper_dev(struct net_device *dev,
 				 struct net_device *upper_dev);
 extern bool netdev_has_any_upper_dev(struct net_device *dev);
+extern struct net_device *netdev_lower_get_next_dev_rcu(struct net_device *dev,
+							struct list_head **iter);
 extern struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 							struct list_head **iter);
 
@@ -2823,6 +2825,12 @@ extern struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 	     upper; \
 	     upper = netdev_upper_get_next_dev_rcu(dev, &(iter)))
 
+/* iterate over the lower devices of dev: lower is the struct net_device *
+ * cursor and iter is a struct list_head * scratch variable. Each step goes
+ * through netdev_lower_get_next_dev_rcu(); the loop ends when it returns
+ * NULL.
+ */
+#define netdev_for_each_lower_dev_rcu(dev, lower, iter) \
+	for (iter = &(dev)->lower_dev_list, \
+	     lower = netdev_lower_get_next_dev_rcu(dev, &(iter)); \
+	     lower; \
+	     lower = netdev_lower_get_next_dev_rcu(dev, &(iter)))
+
 extern struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
 extern struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
 extern int netdev_upper_dev_link(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index 5c713f2..65ed610 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4468,6 +4468,31 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_master_upper_dev_get);
 
+/* netdev_lower_get_next_dev_rcu - step to the next entry of the lower list
+ * @dev: device whose lower_dev_list is being walked
+ * @iter: in/out cursor; points at the list_head of the current position
+ *
+ * Returns the net device stored in the entry following *iter and advances
+ * *iter to that entry. Returns NULL, leaving *iter untouched, once the walk
+ * is back at the head of dev's lower list. A one-time warning is emitted
+ * when the RCU read lock is not held.
+ */
+struct net_device *netdev_lower_get_next_dev_rcu(struct net_device *dev,
+						 struct list_head **iter)
+{
+	struct netdev_adjacent *adj;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	adj = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+	if (&adj->list != &dev->lower_dev_list) {
+		*iter = &adj->list;
+		return adj->dev;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(netdev_lower_get_next_dev_rcu);
+
 /* netdev_upper_get_next_dev_rcu - Get the next dev from upper list
  * @dev: device
  * @iter: list_head ** of the current position

^ permalink raw reply related

* [RFC PATCH 1/4] net: rtnetlink: make priv_size a function for devs with dynamic size
From: John Fastabend @ 2013-09-11 18:46 UTC (permalink / raw)
  To: stephen, bhutchings, ogerlitz
  Cc: vfalico, john.ronciak, netdev, shannon.nelson
In-Reply-To: <20130911184441.26914.10336.stgit@nitbit.x32>

The priv_size rtnl_link_op today is a fixed size_t value.

For upcoming VSI support via rtnl_link_ops it is useful to allow this
size to be configurable. This patch converts the existing static
definition into a function that returns the size_t value.

To make this conversion as easy as possible the patch uses a new
macro RTNL_LINK_OPS_PRIV_SIZE which existing users can call to
generate a function equivalent to the previous static value.

RFC NOTE: I'm not entirely sure the macro makes the code easier
          to read or just obfuscates what is really happening.
          Any opinions?

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_netlink.c |    4 +++-
 drivers/net/bonding/bond_main.c              |    4 +++-
 drivers/net/caif/caif_hsi.c                  |    4 +++-
 drivers/net/ifb.c                            |    4 +++-
 drivers/net/macvlan.c                        |    4 +++-
 drivers/net/nlmon.c                          |    4 +++-
 drivers/net/team/team.c                      |    4 +++-
 drivers/net/tun.c                            |    4 +++-
 drivers/net/veth.c                           |    4 +++-
 drivers/net/vxlan.c                          |    4 +++-
 include/net/rtnetlink.h                      |   11 ++++++++++-
 net/8021q/vlan_netlink.c                     |    4 +++-
 net/bridge/br_netlink.c                      |    4 +++-
 net/core/rtnetlink.c                         |    2 +-
 net/ieee802154/6lowpan.c                     |    4 +++-
 net/ipv4/ip_gre.c                            |    6 ++++--
 net/ipv4/ip_tunnel.c                         |    2 +-
 net/ipv4/ip_vti.c                            |    4 +++-
 net/ipv4/ipip.c                              |    4 +++-
 net/ipv6/ip6_tunnel.c                        |    4 +++-
 net/ipv6/sit.c                               |    4 +++-
 21 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
index f81abe1..8fd93fc 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
@@ -155,11 +155,13 @@ static size_t ipoib_get_size(const struct net_device *dev)
 		nla_total_size(2);	/* IFLA_IPOIB_UMCAST */
 }
 
+RTNL_LINK_OPS_PRIV_SIZE(ipoib, ipoib_dev_priv);
+
 static struct rtnl_link_ops ipoib_link_ops __read_mostly = {
 	.kind		= "ipoib",
 	.maxtype	= IFLA_IPOIB_MAX,
 	.policy		= ipoib_policy,
-	.priv_size	= sizeof(struct ipoib_dev_priv),
+	.priv_size	= ipoib_priv_size,
 	.setup		= ipoib_setup,
 	.newlink	= ipoib_new_child_link,
 	.changelink	= ipoib_changelink,
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 39e5b1c..1b3caf4 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4503,9 +4503,11 @@ static unsigned int bond_get_num_tx_queues(void)
 	return tx_queues;
 }
 
+RTNL_LINK_OPS_PRIV_SIZE(bond, bonding);
+
 static struct rtnl_link_ops bond_link_ops __read_mostly = {
 	.kind			= "bond",
-	.priv_size		= sizeof(struct bonding),
+	.priv_size		= bond_priv_size,
 	.setup			= bond_setup,
 	.validate		= bond_validate,
 	.get_num_tx_queues	= bond_get_num_tx_queues,
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
index 5e40a8b..bf4f19a 100644
--- a/drivers/net/caif/caif_hsi.c
+++ b/drivers/net/caif/caif_hsi.c
@@ -1445,9 +1445,11 @@ err:
 	return -ENODEV;
 }
 
+RTNL_LINK_OPS_PRIV_SIZE(caif, cfhsi);
+
 static struct rtnl_link_ops caif_hsi_link_ops __read_mostly = {
 	.kind		= "cfhsi",
-	.priv_size	= sizeof(struct cfhsi),
+	.priv_size	= caif_priv_size,
 	.setup		= cfhsi_setup,
 	.maxtype	= __IFLA_CAIF_HSI_MAX,
 	.policy	= caif_hsi_policy,
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index a3bed28..ed8f5fa 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -251,9 +251,11 @@ static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+RTNL_LINK_OPS_PRIV_SIZE(ifb, ifb_private);
+
 static struct rtnl_link_ops ifb_link_ops __read_mostly = {
 	.kind		= "ifb",
-	.priv_size	= sizeof(struct ifb_private),
+	.priv_size	= ifb_priv_size,
 	.setup		= ifb_setup,
 	.validate	= ifb_validate,
 };
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 64dfaa3..6f293f7 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -934,10 +934,12 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = {
 	[IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 },
 };
 
+RTNL_LINK_OPS_PRIV_SIZE(macvlan, macvlan_dev);
+
 int macvlan_link_register(struct rtnl_link_ops *ops)
 {
 	/* common fields */
-	ops->priv_size		= sizeof(struct macvlan_dev);
+	ops->priv_size		= macvlan_priv_size,
 	ops->validate		= macvlan_validate;
 	ops->maxtype		= IFLA_MACVLAN_MAX;
 	ops->policy		= macvlan_policy;
diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c
index b57ce5f..693f357 100644
--- a/drivers/net/nlmon.c
+++ b/drivers/net/nlmon.c
@@ -154,9 +154,11 @@ static int nlmon_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+RTNL_LINK_OPS_PRIV_SIZE(nlmon, nlmon);
+
 static struct rtnl_link_ops nlmon_link_ops __read_mostly = {
 	.kind			= "nlmon",
-	.priv_size		= sizeof(struct nlmon),
+	.priv_size		= nlmon_priv_size,
 	.setup			= nlmon_setup,
 	.validate		= nlmon_validate,
 };
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 50e43e6..b5f0526 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2068,9 +2068,11 @@ static unsigned int team_get_num_rx_queues(void)
 	return TEAM_DEFAULT_NUM_RX_QUEUES;
 }
 
+RTNL_LINK_OPS_PRIV_SIZE(team, team);
+
 static struct rtnl_link_ops team_link_ops __read_mostly = {
 	.kind			= DRV_NAME,
-	.priv_size		= sizeof(struct team),
+	.priv_size		= team_priv_size,
 	.setup			= team_setup,
 	.newlink		= team_newlink,
 	.validate		= team_validate,
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a639de8..d5aebec 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1380,9 +1380,11 @@ static int tun_validate(struct nlattr *tb[], struct nlattr *data[])
 	return -EINVAL;
 }
 
+RTNL_LINK_OPS_PRIV_SIZE(tun, tun_struct);
+
 static struct rtnl_link_ops tun_link_ops __read_mostly = {
 	.kind		= DRV_NAME,
-	.priv_size	= sizeof(struct tun_struct),
+	.priv_size	= tun_priv_size,
 	.setup		= tun_setup,
 	.validate	= tun_validate,
 };
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index eee1f19..61f7122 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -434,9 +434,11 @@ static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
 	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
 };
 
+RTNL_LINK_OPS_PRIV_SIZE(veth, veth_priv);
+
 static struct rtnl_link_ops veth_link_ops = {
 	.kind		= DRV_NAME,
-	.priv_size	= sizeof(struct veth_priv),
+	.priv_size	= veth_priv_size,
 	.setup		= veth_setup,
 	.validate	= veth_validate,
 	.newlink	= veth_newlink,
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index bf64b41..cff4c45 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2616,11 +2616,13 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+RTNL_LINK_OPS_PRIV_SIZE(vxlan, vxlan_dev);
+
 static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
 	.kind		= "vxlan",
 	.maxtype	= IFLA_VXLAN_MAX,
 	.policy		= vxlan_policy,
-	.priv_size	= sizeof(struct vxlan_dev),
+	.priv_size	= vxlan_priv_size,
 	.setup		= vxlan_setup,
 	.validate	= vxlan_validate,
 	.newlink	= vxlan_newlink,
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 7026648..b6a2ffb 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -4,6 +4,14 @@
 #include <linux/rtnetlink.h>
 #include <net/netlink.h>
 
+/* RTNL_LINK_OPS_PRIV_SIZE - define a fixed-size priv_size callback
+ * @device: prefix of the generated function, which is named <device>_priv_size
+ * @size_struct: struct tag whose sizeof() the generated function returns
+ *
+ * The generated function ignores both of its arguments.
+ */
+#define RTNL_LINK_OPS_PRIV_SIZE(device, size_struct)		\
+static size_t device##_priv_size(struct net *src_net,		\
+				  struct nlattr *tb[])		\
+{								\
+	return sizeof(struct size_struct);			\
+}
+
 typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *);
 typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);
 typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *);
@@ -54,7 +62,8 @@ struct rtnl_link_ops {
 
 	const char		*kind;
 
-	size_t			priv_size;
+	size_t			(*priv_size)(struct net *src_net,
+					     struct nlattr *tb[]);
 	void			(*setup)(struct net_device *dev);
 
 	int			maxtype;
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 3091297..6052370 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -238,11 +238,13 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+RTNL_LINK_OPS_PRIV_SIZE(vlan, vlan_dev_priv);
+
 struct rtnl_link_ops vlan_link_ops __read_mostly = {
 	.kind		= "vlan",
 	.maxtype	= IFLA_VLAN_MAX,
 	.policy		= vlan_policy,
-	.priv_size	= sizeof(struct vlan_dev_priv),
+	.priv_size	= vlan_priv_size,
 	.setup		= vlan_setup,
 	.validate	= vlan_validate,
 	.newlink	= vlan_newlink,
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index b9259ef..3f4a792 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -469,9 +469,11 @@ static struct rtnl_af_ops br_af_ops = {
 	.get_link_af_size	= br_get_link_af_size,
 };
 
+RTNL_LINK_OPS_PRIV_SIZE(br, net_bridge);
+
 struct rtnl_link_ops br_link_ops __read_mostly = {
 	.kind		= "bridge",
-	.priv_size	= sizeof(struct net_bridge),
+	.priv_size	= br_priv_size,
 	.setup		= br_dev_setup,
 	.validate	= br_validate,
 	.dellink	= br_dev_delete,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2a0e21d..76320fb 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1673,7 +1673,7 @@ struct net_device *rtnl_create_link(struct net *net,
 		num_rx_queues = ops->get_num_rx_queues();
 
 	err = -ENOMEM;
-	dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup,
+	dev = alloc_netdev_mqs(ops->priv_size(net, tb), ifname, ops->setup,
 			       num_tx_queues, num_rx_queues);
 	if (!dev)
 		goto err;
diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
index c85e71e..8ee9235 100644
--- a/net/ieee802154/6lowpan.c
+++ b/net/ieee802154/6lowpan.c
@@ -1420,9 +1420,11 @@ static void lowpan_dellink(struct net_device *dev, struct list_head *head)
 	dev_put(real_dev);
 }
 
+RTNL_LINK_OPS_PRIV_SIZE(lowpan, lowpan_dev_info);
+
 static struct rtnl_link_ops lowpan_link_ops __read_mostly = {
 	.kind		= "lowpan",
-	.priv_size	= sizeof(struct lowpan_dev_info),
+	.priv_size	= lowpan_priv_size,
 	.setup		= lowpan_setup,
 	.newlink	= lowpan_newlink,
 	.dellink	= lowpan_dellink,
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d7aea4c..c3cfb7b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -731,11 +731,13 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
 };
 
+RTNL_LINK_OPS_PRIV_SIZE(ipgre, ip_tunnel);
+
 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
 	.kind		= "gre",
 	.maxtype	= IFLA_GRE_MAX,
 	.policy		= ipgre_policy,
-	.priv_size	= sizeof(struct ip_tunnel),
+	.priv_size	= ipgre_priv_size,
 	.setup		= ipgre_tunnel_setup,
 	.validate	= ipgre_tunnel_validate,
 	.newlink	= ipgre_newlink,
@@ -749,7 +751,7 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
 	.kind		= "gretap",
 	.maxtype	= IFLA_GRE_MAX,
 	.policy		= ipgre_policy,
-	.priv_size	= sizeof(struct ip_tunnel),
+	.priv_size	= ipgre_priv_size,
 	.setup		= ipgre_tap_setup,
 	.validate	= ipgre_tap_validate,
 	.newlink	= ipgre_newlink,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index ac9fabe..562dd12 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -293,7 +293,7 @@ static struct net_device *__ip_tunnel_create(struct net *net,
 	}
 
 	ASSERT_RTNL();
-	dev = alloc_netdev(ops->priv_size, name, ops->setup);
+	dev = alloc_netdev(ops->priv_size(net, NULL), name, ops->setup);
 	if (!dev) {
 		err = -ENOMEM;
 		goto failed;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index e805e7b..545cc20 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -416,11 +416,13 @@ static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
 	[IFLA_VTI_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
 };
 
+RTNL_LINK_OPS_PRIV_SIZE(vti, ip_tunnel);
+
 static struct rtnl_link_ops vti_link_ops __read_mostly = {
 	.kind		= "vti",
 	.maxtype	= IFLA_VTI_MAX,
 	.policy		= vti_policy,
-	.priv_size	= sizeof(struct ip_tunnel),
+	.priv_size	= vti_priv_size,
 	.setup		= vti_tunnel_setup,
 	.validate	= vti_tunnel_validate,
 	.newlink	= vti_newlink,
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 7f80fb4..ee5a926 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -408,11 +408,13 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_PMTUDISC]		= { .type = NLA_U8 },
 };
 
+RTNL_LINK_OPS_PRIV_SIZE(ipip, ip_tunnel);
+
 static struct rtnl_link_ops ipip_link_ops __read_mostly = {
 	.kind		= "ipip",
 	.maxtype	= IFLA_IPTUN_MAX,
 	.policy		= ipip_policy,
-	.priv_size	= sizeof(struct ip_tunnel),
+	.priv_size	= ipip_priv_size,
 	.setup		= ipip_tunnel_setup,
 	.newlink	= ipip_newlink,
 	.changelink	= ipip_changelink,
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 61355f7..2ab41b9 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1682,11 +1682,13 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_PROTO]		= { .type = NLA_U8 },
 };
 
+RTNL_LINK_OPS_PRIV_SIZE(ip6_tnl, ip6_tnl);
+
 static struct rtnl_link_ops ip6_link_ops __read_mostly = {
 	.kind		= "ip6tnl",
 	.maxtype	= IFLA_IPTUN_MAX,
 	.policy		= ip6_tnl_policy,
-	.priv_size	= sizeof(struct ip6_tnl),
+	.priv_size	= ip6_tnl_priv_size,
 	.setup		= ip6_tnl_dev_setup,
 	.validate	= ip6_tnl_validate,
 	.newlink	= ip6_tnl_newlink,
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 7ee5cb9..bd638ba 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1540,11 +1540,13 @@ static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = {
 #endif
 };
 
+RTNL_LINK_OPS_PRIV_SIZE(ipip6, ip_tunnel);
+
 static struct rtnl_link_ops sit_link_ops __read_mostly = {
 	.kind		= "sit",
 	.maxtype	= IFLA_IPTUN_MAX,
 	.policy		= ipip6_policy,
-	.priv_size	= sizeof(struct ip_tunnel),
+	.priv_size	= ipip6_priv_size,
 	.setup		= ipip6_tunnel_setup,
 	.validate	= ipip6_validate,
 	.newlink	= ipip6_newlink,

^ permalink raw reply related

* [RFC PATCH 0/4] Series short description
From: John Fastabend @ 2013-09-11 18:45 UTC (permalink / raw)
  To: stephen, bhutchings, ogerlitz
  Cc: vfalico, john.ronciak, netdev, shannon.nelson

This patch series implements virtual station interfaces or VSIs. The
VSI term comes from the IEEE std 802.1Qbg-2012 specification which
should be merged with 802.1Q proper at some point.

	3.18 Virtual Station Interface (VSI): An interface to a
	     virtual station that is attached to a DRP of an
	     edge relay.

A DRP (downlink relay port) is the link between an edge relay and
a VSI. An edge relay is basically a simple bridge that does not
need to support learning, flooding, xSTP, etc., which matches up well
with the ixgbe and, I believe, other hardware embedded bridge (ebridge)
implementations.

This series adds a new VSI rtnl link type. I chose to do this via
a link type because it allows us to reuse a lot of the existing
infrastructure to bring up a net device and it lets a VSI look
similar to a macvlan, the macvlan link type being the software
equivalent of a VSI. In many cases I can simply replace the
macvlan type with vsi in the ip command line tool and my existing
scripts work, but use the ebridge instead of SW.

The usage model looks like this,

# ip link add link p3p2 numtxqueues 2 numrxqueues 2 type vsi
# ip link add link p3p2 numtxqueues 2 numrxqueues 2 type vsi
# ip link add link p3p2 numtxqueues 4 numrxqueues 4 type vsi
# ip link set dev vsi0 addr 00:1b:21:69:9f:15
# ip link set dev vsi1 addr 00:1b:21:69:9f:16
# ip link set dev vsi2 addr 00:1b:21:69:9f:17
# ip link set dev vsi0 up
# ip link set dev vsi1 up
# ip link set dev vsi2 up
# ip link show
16: p3p2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP mode DEFAULT qlen 1000
    link/ether 00:1b:21:69:9f:09 brd ff:ff:ff:ff:ff:ff
17: vsi0@p3p2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP mode DEFAULT qlen 1000
    link/ether 00:1b:21:69:9f:15 brd ff:ff:ff:ff:ff:ff
18: vsi1@p3p2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP mode DEFAULT qlen 1000
    link/ether 00:1b:21:69:9f:16 brd ff:ff:ff:ff:ff:ff
19: vsi2@p3p2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP mode DEFAULT qlen 1000
    link/ether 00:1b:21:69:9f:17 brd ff:ff:ff:ff:ff:ff

And creates this topology,

      vsi0      vsi1       vsi2
       |         |          |
     +------------------------+
     |                        |
     |        ebridge         |
     |                        |
     +------------------------+
                 |
               p3p2

At this point each vsi# will receive their assigned MAC addresses.
Using the 'fdb' interfaces additional L2/L3/tunnel entries could be
added to the vsi#. And VSIs can be assigned to net name spaces.

The topology of the ebridge is tracked via the upper and lower dev
lists. Once we have Veaceslav Falico's work to expose this via sysfs,
the topology will be visible, although it can also be learned
via iflink:ifindex to some extent.

PATCH DESCRIPTION:

The first patch extends the rtnl link ops priv_size routine so VSI link
types can set the private space for the netdev being created. This is
required because device drivers will use this space.

The second patch adds some helper routines to traverse the lower dev
list.

The third patch is the interface work to support VSI devices.

And the last patch is an implementation for ixgbe. Notice there are
still a few items I need to clean up on this patch before submitting
but it is working, without DCB/FCoE, now. And I think I can simplify
it some to bring down the line count.

My plan would be to submit this as a real patch after net-next opens
but I wanted to see if there was any initial feedback especially
related to the VSI link type.

phew... sorry that was a bit long winded.

---

John Fastabend (4):
      net: rtnetlink: make priv_size a function for devs with dynamic size
      net: Add lower dev list helpers
      net: VSI: Add virtual station interface support
      ixgbe: Adding VSI support to ixgbe


 drivers/infiniband/ulp/ipoib/ipoib_netlink.c     |    4 
 drivers/net/Kconfig                              |    9 
 drivers/net/Makefile                             |    1 
 drivers/net/bonding/bond_main.c                  |    4 
 drivers/net/caif/caif_hsi.c                      |    4 
 drivers/net/ethernet/intel/ixgbe/Makefile        |    3 
 drivers/net/ethernet/intel/ixgbe/ixgbe.h         |   32 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |    4 
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c     |   15 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  307 +++++++++++-----
 drivers/net/ethernet/intel/ixgbe/ixgbe_vsi.c     |  428 ++++++++++++++++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_vsi.h     |   71 ++++
 drivers/net/ifb.c                                |    4 
 drivers/net/macvlan.c                            |    4 
 drivers/net/nlmon.c                              |    4 
 drivers/net/team/team.c                          |    4 
 drivers/net/tun.c                                |    4 
 drivers/net/veth.c                               |    4 
 drivers/net/vsi.c                                |  124 ++++++
 drivers/net/vxlan.c                              |    4 
 include/linux/netdevice.h                        |   35 ++
 include/net/rtnetlink.h                          |   11 +
 include/uapi/linux/if.h                          |    1 
 net/8021q/vlan_netlink.c                         |    4 
 net/bridge/br_netlink.c                          |    4 
 net/core/dev.c                                   |   25 +
 net/core/rtnetlink.c                             |    2 
 net/ieee802154/6lowpan.c                         |    4 
 net/ipv4/ip_gre.c                                |    6 
 net/ipv4/ip_tunnel.c                             |    2 
 net/ipv4/ip_vti.c                                |    4 
 net/ipv4/ipip.c                                  |    4 
 net/ipv6/ip6_tunnel.c                            |    4 
 net/ipv6/sit.c                                   |    4 
 34 files changed, 1028 insertions(+), 116 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_vsi.c
 create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_vsi.h
 create mode 100644 drivers/net/vsi.c

-- 
Signature

^ permalink raw reply

* Re: drivers/net/ethernet/nvidia/forcedeth.c saved_config_space[size] access patch
From: Sergei Shtylyov @ 2013-09-11 18:44 UTC (permalink / raw)
  To: Marc Weber; +Cc: netdev, linux-kernel, David S. Miller, Jiri Kosina
In-Reply-To: <1378682421-sup-4422@nixos>

Hello.

On 09/09/2013 03:39 AM, Marc Weber wrote:

>    1) VER3 and _MAX are of same size:
>    #define NV_PCI_REGSZ_VER3       0x604
>    #define NV_PCI_REGSZ_MAX        0x604


>    2) It looks like there is a case where VER3 gets assigned to
>        register_size:

>    if (id->driver_data &
>    (DEV_HAS_VLAN|DEV_HAS_MSI_X|DEV_HAS_POWER_CNTRL|DEV_HAS_STATISTICS_V2|DEV_HAS_STATISTICS_V
>                    np->register_size = NV_PCI_REGSZ_VER3;

>    3) the definition of saved_config_space is MAX divided by 4 (size of u32)
>    struct fe_priv {
>      [...]
>      u32 saved_config_space[NV_PCI_REGSZ_MAX/4]

>    4) This loop doesn't stop at [size-1]:
>      Thus there is the risk that it overwrites the field after
>      saved_config_space. If that's desired behaviour at least a comment
>      is missing IMHO:

>    for (i = 0; i <= np->register_size/sizeof(u32); i++)
>       np->saved_config_space[i] = readl(base + i*sizeof(u32));

>    Such for loop is used twice in forcedeth.c

> Patch against 4de9ad9bc08 (Fri Sep 6 11:14:33) attached fixing both
> using < instead of <=.

> If you think I've hit a small bug just fix and commit.
> I don't care much about my ownership of this patch.

> I didn't test this patch because I don't have the hardware and I think
> its a trivial case.

    Don't attach the patches, send them inline instead. And please mark the 
mails with patches with [PATCH] in the subject.

> Marc Weber

WBR, Sergei

^ permalink raw reply

* Re: [PATCH 1/1] bridge: fix message_age_timer calculation
From: Sergei Shtylyov @ 2013-09-11 18:40 UTC (permalink / raw)
  To: Chris Healy; +Cc: Stephen Hemminger, netdev, bridge, David S. Miller, buytenh
In-Reply-To: <1378745768-4495-1-git-send-email-cphealy@gmail.com>

Hello.

On 09/09/2013 08:56 PM, Chris Healy wrote:

> This changes the message_age_timer calculation to use the BPDU's max age as opposed to the local bridge's max age.  This is in accordance with section 8.6.2.3.2 Step 2 of the 802.1D-1998 specification.

    You should wrap your changelog lines at 80 chars at most, preferably less.

> With the current implementation, when running with very large bridge diameters, convergence will not always occur even if a root bridge is configured to have a longer max age.

> Tested successfully on bridge diameters of ~200.

> Signed-off-by: Chris Healy <cphealy@gmail.com>

WBR, Sergei

^ permalink raw reply

* Re: [PATCH 1/1] net: sched: Make netns available for ematch extensions
From: Jozsef Kadlecsik @ 2013-09-11 18:37 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, V. Lavrov
In-Reply-To: <1378909978.21474.3.camel@edumazet-glaptop>

On Wed, 11 Sep 2013, Eric Dumazet wrote:

> On Wed, 2013-09-11 at 07:31 -0700, Eric Dumazet wrote:
> 
> > Note that qdisc_dev(tp->q) should give you the pointer to device
> > 
> > Then ->nd_net gives you the struct net pointer.
> > 
> > On management path, this should be enough ;)

Ohh, this is great, thanks indeed! Somehow I overlooked tp->q when 
searching to find a pointer to struct net.

> (A device can be moved from net xxxx to net yyyy)

So something like dev_net(qdisc_dev(tp->q)) is all we need.
 
Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.mta.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply

* [PATCH 2/2] tg3: Use pci_dev pm_cap
From: Jon Mason @ 2013-09-11 18:22 UTC (permalink / raw)
  To: netdev; +Cc: Nithin Nayak Sujir, Michael Chan
In-Reply-To: <1378923760-16232-1-git-send-email-jdmason@kudzu.us>

Use the already existing pm_cap variable in struct pci_dev for
determining the power management offset.  This saves the driver from
having to keep track of an extra variable.

Signed-off-by: Jon Mason <jdmason@kudzu.us>
Cc: Nithin Nayak Sujir <nsujir@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
---
 drivers/net/ethernet/broadcom/tg3.c |    5 ++---
 drivers/net/ethernet/broadcom/tg3.h |    1 -
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 5701f3d..938e05c 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -16192,12 +16192,12 @@ static int tg3_get_invariants(struct tg3 *tp, const struct pci_device_id *ent)
 			 * So explicitly force the chip into D0 here.
 			 */
 			pci_read_config_dword(tp->pdev,
-					      tp->pm_cap + PCI_PM_CTRL,
+					      tp->pdev->pm_cap + PCI_PM_CTRL,
 					      &pm_reg);
 			pm_reg &= ~PCI_PM_CTRL_STATE_MASK;
 			pm_reg |= PCI_PM_CTRL_PME_ENABLE | 0 /* D0 */;
 			pci_write_config_dword(tp->pdev,
-					       tp->pm_cap + PCI_PM_CTRL,
+					       tp->pdev->pm_cap + PCI_PM_CTRL,
 					       pm_reg);
 
 			/* Also, force SERR#/PERR# in PCI command. */
@@ -17346,7 +17346,6 @@ static int tg3_init_one(struct pci_dev *pdev,
 	tp = netdev_priv(dev);
 	tp->pdev = pdev;
 	tp->dev = dev;
-	tp->pm_cap = pdev->pm_cap;
 	tp->rx_mode = TG3_DEF_RX_MODE;
 	tp->tx_mode = TG3_DEF_TX_MODE;
 	tp->irq_sync = 1;
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index ddb8be1..7025780 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -3234,7 +3234,6 @@ struct tg3 {
 	u8				pci_lat_timer;
 
 	int				pci_fn;
-	int				pm_cap;
 	int				msi_cap;
 	int				pcix_cap;
 	int				pcie_readrq;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 1/2] bnx2x: Use pci_dev pm_cap
From: Jon Mason @ 2013-09-11 18:22 UTC (permalink / raw)
  To: netdev; +Cc: Eilon Greenstein

Use the already existing pm_cap variable in struct pci_dev for
determining the power management offset.  This saves the driver from
having to keep track of an extra variable.

Signed-off-by: Jon Mason <jdmason@kudzu.us>
Cc: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h         |    1 -
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c     |    8 ++++----
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c |    4 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c    |   10 +++++-----
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 0c33802..70b6a05 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1542,7 +1542,6 @@ struct bnx2x {
 	 */
 	bool			fcoe_init;
 
-	int			pm_cap;
 	int			mrrs;
 
 	struct delayed_work	sp_task;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 2361bf2..44a1261 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -3000,16 +3000,16 @@ int bnx2x_set_power_state(struct bnx2x *bp, pci_power_t state)
 	u16 pmcsr;
 
 	/* If there is no power capability, silently succeed */
-	if (!bp->pm_cap) {
+	if (!bp->pdev->pm_cap) {
 		BNX2X_DEV_INFO("No power capability. Breaking.\n");
 		return 0;
 	}
 
-	pci_read_config_word(bp->pdev, bp->pm_cap + PCI_PM_CTRL, &pmcsr);
+	pci_read_config_word(bp->pdev, bp->pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
 
 	switch (state) {
 	case PCI_D0:
-		pci_write_config_word(bp->pdev, bp->pm_cap + PCI_PM_CTRL,
+		pci_write_config_word(bp->pdev, bp->pdev->pm_cap + PCI_PM_CTRL,
 				      ((pmcsr & ~PCI_PM_CTRL_STATE_MASK) |
 				       PCI_PM_CTRL_PME_STATUS));
 
@@ -3033,7 +3033,7 @@ int bnx2x_set_power_state(struct bnx2x *bp, pci_power_t state)
 		if (bp->wol)
 			pmcsr |= PCI_PM_CTRL_PME_ENABLE;
 
-		pci_write_config_word(bp->pdev, bp->pm_cap + PCI_PM_CTRL,
+		pci_write_config_word(bp->pdev, bp->pdev->pm_cap + PCI_PM_CTRL,
 				      pmcsr);
 
 		/* No more memory access after this point until
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 2612e3c..324de5f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -1387,9 +1387,9 @@ static bool bnx2x_is_nvm_accessible(struct bnx2x *bp)
 	u16 pm = 0;
 	struct net_device *dev = pci_get_drvdata(bp->pdev);
 
-	if (bp->pm_cap)
+	if (bp->pdev->pm_cap)
 		rc = pci_read_config_word(bp->pdev,
-					  bp->pm_cap + PCI_PM_CTRL, &pm);
+					  bp->pdev->pm_cap + PCI_PM_CTRL, &pm);
 
 	if ((rc && !netif_running(dev)) ||
 	    (!rc && ((pm & PCI_PM_CTRL_STATE_MASK) != (__force u16)PCI_D0)))
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 634a793..41d6f2d 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -8651,6 +8651,7 @@ u32 bnx2x_send_unload_req(struct bnx2x *bp, int unload_mode)
 	else if (bp->wol) {
 		u32 emac_base = port ? GRCBASE_EMAC1 : GRCBASE_EMAC0;
 		u8 *mac_addr = bp->dev->dev_addr;
+		struct pci_dev *pdev = bp->pdev;
 		u32 val;
 		u16 pmc;
 
@@ -8667,9 +8668,9 @@ u32 bnx2x_send_unload_req(struct bnx2x *bp, int unload_mode)
 		EMAC_WR(bp, EMAC_REG_EMAC_MAC_MATCH + entry + 4, val);
 
 		/* Enable the PME and clear the status */
-		pci_read_config_word(bp->pdev, bp->pm_cap + PCI_PM_CTRL, &pmc);
+		pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmc);
 		pmc |= PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS;
-		pci_write_config_word(bp->pdev, bp->pm_cap + PCI_PM_CTRL, pmc);
+		pci_write_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, pmc);
 
 		reset_code = DRV_MSG_CODE_UNLOAD_REQ_WOL_EN;
 
@@ -10398,7 +10399,7 @@ static void bnx2x_get_common_hwinfo(struct bnx2x *bp)
 		break;
 	}
 
-	pci_read_config_word(bp->pdev, bp->pm_cap + PCI_PM_PMC, &pmc);
+	pci_read_config_word(bp->pdev, bp->pdev->pm_cap + PCI_PM_PMC, &pmc);
 	bp->flags |= (pmc & PCI_PM_CAP_PME_D3cold) ? 0 : NO_WOL_FLAG;
 
 	BNX2X_DEV_INFO("%sWoL capable\n",
@@ -12140,8 +12141,7 @@ static int bnx2x_init_dev(struct bnx2x *bp, struct pci_dev *pdev,
 	}
 
 	if (IS_PF(bp)) {
-		bp->pm_cap = pdev->pm_cap;
-		if (bp->pm_cap == 0) {
+		if (!pdev->pm_cap) {
 			dev_err(&bp->pdev->dev,
 				"Cannot find power management capability, aborting\n");
 			rc = -EIO;
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH 20/52] net: fealnx: remove unnecessary pci_set_drvdata()
From: Sergei Shtylyov @ 2013-09-11 18:19 UTC (permalink / raw)
  To: Jingoo Han; +Cc: 'David S. Miller', netdev
In-Reply-To: <005e01ceaec2$23e32420$6ba96c60$%han@samsung.com>

Hello.

On 09/11/2013 11:40 AM, Jingoo Han wrote:

> The driver core clears the driver data to NULL after device_release
> or on probe failure. Thus, it is not needed to manually clear the
> device driver data to NULL.

> Signed-off-by: Jingoo Han <jg1.han@samsung.com>
> ---
>   drivers/net/ethernet/fealnx.c |    4 ++--
>   1 file changed, 2 insertions(+), 2 deletions(-)

> diff --git a/drivers/net/ethernet/fealnx.c b/drivers/net/ethernet/fealnx.c
> index c706b7a..99194d1 100644
> --- a/drivers/net/ethernet/fealnx.c
> +++ b/drivers/net/ethernet/fealnx.c
> @@ -699,9 +699,9 @@ static void fealnx_remove_one(struct pci_dev *pdev)
>   		pci_iounmap(pdev, np->mem);
>   		free_netdev(dev);
>   		pci_release_regions(pdev);
> -		pci_set_drvdata(pdev, NULL);
> -	} else
> +	} else {
>   		printk(KERN_ERR "fealnx: remove for unknown device\n");
> +	}

    No "drive-by" coding style fixes, please.

WBR, Sergei

^ permalink raw reply

* Re: [PATCH 29/52] net: neterion: remove unnecessary pci_set_drvdata()
From: Jon Mason @ 2013-09-11 18:05 UTC (permalink / raw)
  To: Jingoo Han; +Cc: David S. Miller, netdev
In-Reply-To: <006701ceaec3$dce53e30$96afba90$%han@samsung.com>

On Wed, Sep 11, 2013 at 12:52 AM, Jingoo Han <jg1.han@samsung.com> wrote:
> The driver core clears the driver data to NULL after device_release
> or on probe failure. Thus, it is not needed to manually clear the
> device driver data to NULL.
>
> Signed-off-by: Jingoo Han <jg1.han@samsung.com>

Looks sane to me.
Acked-by: Jon Mason <jdmason@kudzu.us>

> ---
>  drivers/net/ethernet/neterion/s2io.c           |    2 --
>  drivers/net/ethernet/neterion/vxge/vxge-main.c |    2 --
>  2 files changed, 4 deletions(-)
>
> diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c
> index 51b0094..9eeddbd 100644
> --- a/drivers/net/ethernet/neterion/s2io.c
> +++ b/drivers/net/ethernet/neterion/s2io.c
> @@ -8185,7 +8185,6 @@ mem_alloc_failed:
>         free_shared_mem(sp);
>         pci_disable_device(pdev);
>         pci_release_regions(pdev);
> -       pci_set_drvdata(pdev, NULL);
>         free_netdev(dev);
>
>         return ret;
> @@ -8221,7 +8220,6 @@ static void s2io_rem_nic(struct pci_dev *pdev)
>         iounmap(sp->bar0);
>         iounmap(sp->bar1);
>         pci_release_regions(pdev);
> -       pci_set_drvdata(pdev, NULL);
>         free_netdev(dev);
>         pci_disable_device(pdev);
>  }
> diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
> index 5a20eaf..8614eeb 100644
> --- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
> +++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
> @@ -4739,7 +4739,6 @@ _exit6:
>  _exit5:
>         vxge_device_unregister(hldev);
>  _exit4:
> -       pci_set_drvdata(pdev, NULL);
>         vxge_hw_device_terminate(hldev);
>         pci_disable_sriov(pdev);
>  _exit3:
> @@ -4782,7 +4781,6 @@ static void vxge_remove(struct pci_dev *pdev)
>                 vxge_free_mac_add_list(&vdev->vpaths[i]);
>
>         vxge_device_unregister(hldev);
> -       pci_set_drvdata(pdev, NULL);
>         /* Do not call pci_disable_sriov here, as it will break child devices */
>         vxge_hw_device_terminate(hldev);
>         iounmap(vdev->bar0);
> --
> 1.7.10.4
>
>

^ permalink raw reply

* Re: [PATCH nf] netfilter: use RCU safe kfree for conntrack extensions
From: Eric Dumazet @ 2013-09-11 17:54 UTC (permalink / raw)
  To: Michal Kubecek
  Cc: Phil Oester, netfilter-devel, netdev, Pablo Neira Ayuso,
	Patrick McHardy, Jozsef Kadlecsik
In-Reply-To: <20130911174248.GA8318@unicorn.suse.cz>

On Wed, 2013-09-11 at 19:42 +0200, Michal Kubecek wrote:

> Yes, the patch submitted here is against current nf branch. For
> SLES 11 SP1 (with 2.6.32 kernel), I'm going to use call_rcu() the way
> original commit 68b80f11 does. 

Well, please just submit a patch for current tree, using kfree_rcu()

When backporting to stable branches, the needed adaptation will be done.

^ permalink raw reply

* Re: [PATCH nf] netfilter: use RCU safe kfree for conntrack extensions
From: Pablo Neira Ayuso @ 2013-09-11 17:50 UTC (permalink / raw)
  To: Phil Oester
  Cc: Michal Kubecek, netfilter-devel, netdev, Patrick McHardy,
	Jozsef Kadlecsik
In-Reply-To: <20130911170946.GA2926@linuxace.com>

On Wed, Sep 11, 2013 at 10:09:47AM -0700, Phil Oester wrote:
> On Wed, Sep 11, 2013 at 05:28:05PM +0200, Michal Kubecek wrote:
> > > Looking at
> > > your proposed fix, the NAT extension data should have been cleaned
> > > from the bysource list in nf_nat_cleanup_conntrack (via __nf_ct_ext_destroy)
> > > before reaching the kfree.  Would you agree?
> > 
> > It is cleaned from the list but as it is an RCU list, other readers can
> > still be holding pointers to it. We have to wait for the RCU grace
> > period before we can reuse it.
> 
> Agreed - looks like your fix should work.  However, two nits:
> 
> 1) normally RCU functions have _rcu suffixes.  So nf_ct_ext_free should
> become nf_ct_ext_free_rcu.

That suffix is used when the function must be called while holding the
RCU read lock, which is not the case here. I'll take this patch.

> 2) kfree_rcu was not added to the kernel until 3.0.  All of the bug
> reports I've been looking into (including the original in netfilter bugzilla
> at http://bugzilla.netfilter.org/show_bug.cgi?id=714) have been reported in
> 2.6.32 or earlier kernels.  So a different fix would need to be backported for
> -stable.  For that, we would probably export __nf_ct_ext_free_rcu from
> nf_conntrack_extend.c and change the kfree call in nf_ct_ext_free_rcu to
> call_rcu(&ct->ext->rcu, __nf_ct_ext_free_rcu). Of course the alternative
> is just to use this fix for both old and new kernels for simplicity.

Either way, we need a specific backport for 2.6.x indeed.

Thanks for tracking down this issue.

^ permalink raw reply

* Re: [PATCH nf] netfilter: use RCU safe kfree for conntrack extensions
From: Michal Kubecek @ 2013-09-11 17:42 UTC (permalink / raw)
  To: Phil Oester
  Cc: netfilter-devel, netdev, Pablo Neira Ayuso, Patrick McHardy,
	Jozsef Kadlecsik
In-Reply-To: <20130911170946.GA2926@linuxace.com>

On Wed, Sep 11, 2013 at 10:09:47AM -0700, Phil Oester wrote:
> 
> 1) normally RCU functions have _rcu suffixes.  So nf_ct_ext_free should
> become nf_ct_ext_free_rcu.

Right. I'll post updated version with the rename tomorrow.

> 2) kfree_rcu was not added to the kernel until 3.0.  All of the bug
> reports I've been looking into (including the original in netfilter bugzilla
> at http://bugzilla.netfilter.org/show_bug.cgi?id=714) have been reported in
> 2.6.32 or earlier kernels.  So a different fix would need to be backported for
> -stable.  For that, we would probably export __nf_ct_ext_free_rcu from
> nf_conntrack_extend.c and change the kfree call in nf_ct_ext_free_rcu to
> call_rcu(&ct->ext->rcu, __nf_ct_ext_free_rcu).

Yes, the patch submitted here is against current nf branch. For
SLES 11 SP1 (with 2.6.32 kernel), I'm going to use call_rcu() the way
original commit 68b80f11 does. IIRC the only pre-3.0 stable branch still
maintained is 2.6.32, all others are 3.0 or newer so they have
kfree_rcu() and also use it in __nf_ct_ext_add() since commit 1f8d36a1. 

                                                        Michal Kubecek


^ permalink raw reply

* Re: [PATCH 14/52] net: cxgb4vf: remove unnecessary pci_set_drvdata()
From: Casey Leedom @ 2013-09-11 17:24 UTC (permalink / raw)
  To: Jingoo Han; +Cc: 'David S. Miller', netdev
In-Reply-To: <005801ceaec1$6b8d3320$42a79960$%han@samsung.com>

   I agree that the redundant pci_set_drvdata(pdev, NULL) in 
cxgb4vf_pci_probe() under the err_release_regions: label is unneeded, 
but don't we need to NULL out the PCI Driver Data under the 
err_free_adapter: label and also in cxgb4vf_pci_remove()?  Or is that 
handled automatically in the PCI infrastructure code which calls the 
Device Probe and Remove routines?  Mostly I was just being an 
obsessively clean housewife assuming that we'd want to clean up these 
references ...

Casey

On 09/11/13 00:34, Jingoo Han wrote:
> The driver core clears the driver data to NULL after device_release
> or on probe failure. Thus, it is not needed to manually clear the
> device driver data to NULL.
>
> Signed-off-by: Jingoo Han <jg1.han@samsung.com>
> ---
>   drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c |    3 ---
>   1 file changed, 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
> index 40c22e7..43bb012 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
> @@ -2782,11 +2782,9 @@ err_unmap_bar:
>   
>   err_free_adapter:
>   	kfree(adapter);
> -	pci_set_drvdata(pdev, NULL);
>   
>   err_release_regions:
>   	pci_release_regions(pdev);
> -	pci_set_drvdata(pdev, NULL);
>   	pci_clear_master(pdev);
>   
>   err_disable_device:
> @@ -2851,7 +2849,6 @@ static void cxgb4vf_pci_remove(struct pci_dev *pdev)
>   		}
>   		iounmap(adapter->regs);
>   		kfree(adapter);
> -		pci_set_drvdata(pdev, NULL);
>   	}
>   
>   	/*

^ permalink raw reply

* Re: [PATCH nf] netfilter: use RCU safe kfree for conntrack extensions
From: Phil Oester @ 2013-09-11 17:09 UTC (permalink / raw)
  To: Michal Kubecek
  Cc: netfilter-devel, netdev, Pablo Neira Ayuso, Patrick McHardy,
	Jozsef Kadlecsik
In-Reply-To: <20130911152804.GA5397@unicorn.suse.cz>

On Wed, Sep 11, 2013 at 05:28:05PM +0200, Michal Kubecek wrote:
> > Looking at
> > your proposed fix, the NAT extension data should have been cleaned
> > from the bysource list in nf_nat_cleanup_conntrack (via __nf_ct_ext_destroy)
> > before reaching the kfree.  Would you agree?
> 
> It is cleaned from the list but as it is an RCU list, other readers can
> still be holding pointers to it. We have to wait for the RCU grace
> period before we can reuse it.

Agreed - looks like your fix should work.  However, two nits:

1) normally RCU functions have _rcu suffixes.  So nf_ct_ext_free should
become nf_ct_ext_free_rcu.

2) kfree_rcu was not added to the kernel until 3.0.  All of the bug
reports I've been looking into (including the original in netfilter bugzilla
at http://bugzilla.netfilter.org/show_bug.cgi?id=714) have been reported in
2.6.32 or earlier kernels.  So a different fix would need to be backported for
-stable.  For that, we would probably export __nf_ct_ext_free_rcu from
nf_conntrack_extend.c and change the kfree call in nf_ct_ext_free_rcu to
call_rcu(&ct->ext->rcu, __nf_ct_ext_free_rcu). Of course the alternative
is just to use this fix for both old and new kernels for simplicity.

> No, it is a bugreport from our customer. And even that customer
> encountered it only once so far. Which is not very surprising as to
> reproduce it, you have to be (un)lucky twice: first to have someone
> overwrite the area soon enough and second to have someone access the
> area after it is overwritten.

Yes, hitting this seems dependent upon phase of the moon.

Phil

^ permalink raw reply

* [v3.11][Regression][Resend] skge: add dma_mapping check
From: Joseph Salisbury @ 2013-09-11 16:42 UTC (permalink / raw)
  To: stephen; +Cc: davem@davemloft.net, mlindner, netdev@vger.kernel.org,
	linux-kernel
In-Reply-To: <521E1552.4010100@canonical.com>

> Hi Stephen,
>
> A bug was opened against the Ubuntu kernel[0].  A bug has also been
> opened in bugzilla[1].  After a kernel bisect, it was found that
> reverting the following commit resolved this bug:
>
> commit 136d8f377e1575463b47840bc5f1b22d94bf8f63
> Author: stephen hemminger <stephen@networkplumber.org>
> Date:   Sun Aug 4 17:22:34 2013 -0700
>
>     skge: add dma_mapping check
>
>
> The regression was introduced as of v3.11-rc6.
>
> I see that you are the author of this patch, so I wanted to run this by
> you.  I was thinking of requesting a revert for v3.11, but I wanted to
> get your feedback first.
>
>
> Thanks,
>
> Joe
>
> [0] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1216745
> [1] https://bugzilla.kernel.org/show_bug.cgi?id=60784

^ permalink raw reply

* Re: 3.10.0 network trace
From: Yuchung Cheng @ 2013-09-11 16:32 UTC (permalink / raw)
  To: Josh Boyer; +Cc: Michael Sterrett, netdev, Neal Cardwell, Nandita Dukkipati
In-Reply-To: <CA+5PVA5bDAz5a-TrM4ALA9vJz2JbE14VT7xM5=WqaRg=Ro0_xQ@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1080 bytes --]

On Wed, Sep 11, 2013 at 5:49 AM, Josh Boyer <jwboyer@fedoraproject.org> wrote:
> On Sun, Jul 21, 2013 at 9:36 PM, Michael Sterrett <michael@sterretts.net> wrote:
>> Since upgraded to 3.10.1 which exhibits the same issue (probably not
>> surprising).
>>
>> sysctl net.ipv4.tcp_frto=0 net.ipv4.tcp_early_retrans=3 seems to take
>> care of it.
>
> We're still getting reports of this with 3.10.10 in Fedora [1].  At
> least one of the reporters has said those sysctl settings didn't help.
>
> Is there anything else that can be gathered to help track this down?
Since frto=0 didn't help, I suspect it's either a bug introduced in
the tail loss probe patch 9b717a8d (tcp: TLP loss detection) or other
changes related to loss recovery.

Could you try disabling TLP with
sysctl net.ipv4.tcp_frto=0 net.ipv4.tcp_early_retrans=1

If it still does not work, disable any form of early retransmit with
sysctl net.ipv4.tcp_frto=0 net.ipv4.tcp_early_retrans=0

If you can test a custom-built kernel, please try this debugging patch.

>
> josh
>
> [1] https://bugzilla.redhat.com/show_bug.cgi?id=989251

[-- Attachment #2: 0001-tcp-debug-fastretrans-warning.patch --]
[-- Type: application/octet-stream, Size: 1525 bytes --]

From 62561c91d4655539781113e0fa04ee681b22a2a4 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 31 Jul 2013 09:22:35 -0700
Subject: [PATCH] tcp: debug fastretrans warning

printk a ton of states to debug fastretrans warning

Signed-off-by: Yuchung Cheng <ycheng@google.com>
---
 net/ipv4/tcp_input.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c62257..c18dab1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2773,7 +2773,20 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 	/* D. Check state exit conditions. State can be terminated
 	 *    when high_seq is ACKed. */
 	if (icsk->icsk_ca_state == TCP_CA_Open) {
-		WARN_ON(tp->retrans_out != 0);
+		if (WARN_ON(tp->retrans_out != 0)) {
+			printk(KERN_DEBUG "%pI4:%u F0x%x S%u s%d IF%u+%u-%u-%u"
+			       "f%u ur%u rr%u rt%u um%u hs%u nxt%u\n",
+			       &inet_sk(sk)->inet_daddr,
+			       ntohs(inet_sk(sk)->inet_dport),
+			       flag, sk->sk_state, tp->rx_opt.sack_ok,
+			       tp->packets_out, tp->retrans_out,
+			       tp->sacked_out, tp->lost_out,
+			       tp->frto, tp->undo_retrans,
+			       tp->reordering, icsk->icsk_retransmits,
+			       tp->undo_marker ? tp->undo_marker-tp->snd_una:0,
+			       tp->high_seq - tp->snd_una,
+			       tp->snd_nxt - tp->snd_una);
+		}
 		tp->retrans_stamp = 0;
 	} else if (!before(tp->snd_una, tp->high_seq)) {
 		switch (icsk->icsk_ca_state) {
-- 
1.8.3


^ permalink raw reply related

* Re: [PATCH net 2/4] bridge: Handle priority-tagged frames properly
From: Vlad Yasevich @ 2013-09-11 16:32 UTC (permalink / raw)
  To: Toshiaki Makita; +Cc: David S. Miller, netdev
In-Reply-To: <1378882832.3495.12.camel@ubuntu-vm-makita>

On 09/11/2013 03:00 AM, Toshiaki Makita wrote:
> On Tue, 2013-09-10 at 10:03 -0400, Vlad Yasevich wrote:
>> On 09/10/2013 06:34 AM, Toshiaki Makita wrote:
>>> IEEE 802.1Q says that when we receive priority-tagged (VID 0) frames
>>> use the PVID for the port as its VID.
>>> (See IEEE 802.1Q-2005 6.7.1 and Table 9-2)
>>>
>>> Apply the PVID to not only untagged frames but also priority-tagged frames.
>>>
>>> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
>>> ---
>>>    net/bridge/br_vlan.c | 27 ++++++++++++++++++++-------
>>>    1 file changed, 20 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
>>> index 21b6d21..5a9c44a 100644
>>> --- a/net/bridge/br_vlan.c
>>> +++ b/net/bridge/br_vlan.c
>>> @@ -189,6 +189,8 @@ out:
>>>    bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
>>>    			struct sk_buff *skb, u16 *vid)
>>>    {
>>> +	int err;
>>> +
>>>    	/* If VLAN filtering is disabled on the bridge, all packets are
>>>    	 * permitted.
>>>    	 */
>>> @@ -201,20 +203,31 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v,
>>>    	if (!v)
>>>    		return false;
>>>
>>> -	if (br_vlan_get_tag(skb, vid)) {
>>> +	err = br_vlan_get_tag(skb, vid);
>>> +	if (!*vid) {
>>>    		u16 pvid = br_get_pvid(v);
>>>
>>> -		/* Frame did not have a tag.  See if pvid is set
>>> -		 * on this port.  That tells us which vlan untagged
>>> -		 * traffic belongs to.
>>> +		/* Frame had a tag with VID 0 or did not have a tag.
>>> +		 * See if pvid is set on this port.  That tells us which
>>> +		 * vlan untagged or priority-tagged traffic belongs to.
>>>    		 */
>>>    		if (pvid == VLAN_N_VID)
>>>    			return false;
>>>
>>> -		/* PVID is set on this port.  Any untagged ingress
>>> -		 * frame is considered to belong to this vlan.
>>> +		/* PVID is set on this port.  Any untagged or priority-tagged
>>> +		 * ingress frame is considered to belong to this vlan.
>>>    		 */
>>> -		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), pvid);
>>> +		if (likely(err))
>>> +			/* Untagged Frame. */
>>> +			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), pvid);
>>> +		else
>>> +			/* Priority-tagged Frame.
>>> +			 * At this point, We know that skb->vlan_tci had
>>> +			 * VLAN_TAG_PRESENT bit and its VID field was 0x000.
>>> +			 * We update only VID field and preserve PCP field.
>>> +			 */
>>> +			skb->vlan_tci |= pvid;
>>> +
>>
>> In the case of a priority tagged frame, we should unroll the
>> modification above and restore the VID field to 0.  Otherwise, you
>> may end up either stripping the vlan header completely or forwarding
>> with pvid of the ingress port.
>
> Thank you for reviewing.
>
> It is my intended behavior that an incoming priority-tagged frame is
> forwarded as a frame untagged or tagged with pvid.
>
> IEEE 802.1Q-2011:
>
>    section 8.1.7 Conversion of frame formats
>
>    NOTE - As all incoming frames, including priority-tagged frames, are
>    classified as belonging to a VLAN, the transmitting Port transmits
>    VLAN-tagged frames or untagged frames. Hence, a station sending a
>    priority-tagged frame via a Bridge will receive a response that is
>    either VLAN-tagged or untagged, as described in 8.5.
>
>    3. Definitions
>
>    3.132 Priority-tagged frame: A tagged frame whose tag header carries
>    priority information but carries no VLAN identification information.
>
>    3.203 VLAN-tagged frame: A VLAN-tagged frame is a tagged frame whose
>    tag header carries *both* VLAN identification and priority
>    information.
>
> Toshiaki Makita
>

Hmm..  The problem is that if a system attached to a port configures a
vlan interface with vid 0 and some priority mappings, then that
interface will not be able to properly receive traffic, as the bridge 
now will never transmit priority tagged frames.

-vlad

>>
>> -vlad
>>>    		return true;
>>>    	}
>>>
>>>
>
>

^ permalink raw reply

* Re: [PATCH] ppc: bpf_jit: support MOD operation
From: Vladimir Murzin @ 2013-09-11 16:15 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Benjamin Herrenschmidt, linuxppc-dev, paulus, davem, Matt Evans,
	netdev
In-Reply-To: <5226DB64.3020207@redhat.com>

On Wed, Sep 04, 2013 at 09:04:04AM +0200, Daniel Borkmann wrote:
> On 09/03/2013 10:52 PM, Daniel Borkmann wrote:
> > On 09/03/2013 09:58 PM, Vladimir Murzin wrote:
> [...]
> >>> Do you have a test case/suite by any chance ?
> >>>
> >>> Ben.
> >>>
> >>
> >> Hi Ben!
> >>
> >> Thanks for your feedback.
> >>
> >> This patch is only compile tested. I have no real hardware, but I'll
> >> probably bring up qemu ppc64 till end of the week...
> >> Meanwhile, I've made simple how-to for testing. You can use it if you wish.
> >> It is mainly based on the [1] and rechecked on x86-64.

Finally I've managed to bring up qemu ppc64 and do some simple testing. As a
result I could see a difference in the opcodes for the divide instruction - I've
just sent the patch for that.

WRT the mod instruction, the result is:

For BPF program

(000) ldh      [12]
(001) jeq      #0x800           jt 2	jf 10
(002) ldh      [16]
(003) sub      #20
(004) mod      #5
(005) jeq      #0x0             jt 10	jf 6
(006) ldb      [20]
(007) and      #0x20
(008) jeq      #0x20            jt 9	jf 10
(009) ret      #65535
(010) ret      #0

The following code is generated (with patch divw to divwu applied)

244 bytes emitted from JIT compiler (pass:3, flen:11)
d0000000015c0018 + <x>:
   0:	mflr    r0
   4:	std     r0,16(r1)
   8:	std     r14,-144(r1)
   c:	std     r15,-136(r1)
  10:	stdu    r1,-288(r1)
  14:	lwz     r7,108(r3)
  18:	lwz     r15,104(r3)
  1c:	subf    r15,r7,r15
  20:	ld      r14,216(r3)
  24:	lis     r7,-16384
  28:	rldicr  r7,r7,32,31
  2c:	oris    r7,r7,9
  30:	ori     r7,r7,43428
  34:	mtlr    r7
  38:	li      r6,12
  3c:	blrl
  40:	blt-    0x00000000000000dc
  44:	nop
  48:	cmplwi  r4,2048
  4c:	bne-    0x00000000000000d8
  50:	nop
  54:	lis     r7,-16384
  58:	rldicr  r7,r7,32,31
  5c:	oris    r7,r7,9
  60:	ori     r7,r7,43428
  64:	mtlr    r7
  68:	li      r6,16
  6c:	blrl
  70:	blt-    0x00000000000000dc
  74:	nop
  78:	addi    r4,r4,-20
  7c:	li      r8,5
  80:	divwu   r7,r4,r8
  84:	mullw   r7,r8,r7
  88:	subf    r4,r7,r4
  8c:	cmplwi  r4,0
  90:	beq-    0x00000000000000d8
  94:	nop
  98:	lis     r7,-16384
  9c:	rldicr  r7,r7,32,31
  a0:	oris    r7,r7,9
  a4:	ori     r7,r7,43456
  a8:	mtlr    r7
  ac:	li      r6,20
  b0:	blrl
  b4:	blt-    0x00000000000000dc
  b8:	nop
  bc:	andi.   r4,r4,32
  c0:	cmplwi  r4,32
  c4:	bne-    0x00000000000000d8
  c8:	nop
  cc:	li      r3,-1
  d0:	addis   r3,r3,1
  d4:	b       0x00000000000000dc
  d8:	li      r3,0
  dc:	addi    r1,r1,288
  e0:	ld      r0,16(r1)
  e4:	mtlr    r0
  e8:	ld      r14,-144(r1)
  ec:	ld      r15,-136(r1)
  f0:	blr

Raw codes are

flen=11 proglen=244 pass=3 image=d0000000015c0018
JIT code: 00000000: 7c 08 02 a6 f8 01 00 10 f9 c1 ff 70 f9 e1 ff 78
JIT code: 00000010: f8 21 fe e1 80 e3 00 6c 81 e3 00 68 7d e7 78 50
JIT code: 00000020: e9 c3 00 d8 3c e0 c0 00 78 e7 07 c6 64 e7 00 09
JIT code: 00000030: 60 e7 a9 a4 7c e8 03 a6 38 c0 00 0c 4e 80 00 21
JIT code: 00000040: 41 80 00 9c 60 00 00 00 28 04 08 00 40 82 00 8c
JIT code: 00000050: 60 00 00 00 3c e0 c0 00 78 e7 07 c6 64 e7 00 09
JIT code: 00000060: 60 e7 a9 a4 7c e8 03 a6 38 c0 00 10 4e 80 00 21
JIT code: 00000070: 41 80 00 6c 60 00 00 00 38 84 ff ec 39 00 00 05
JIT code: 00000080: 7c e4 43 96 7c e8 39 d6 7c 87 20 50 28 04 00 00
JIT code: 00000090: 41 82 00 48 60 00 00 00 3c e0 c0 00 78 e7 07 c6
JIT code: 000000a0: 64 e7 00 09 60 e7 a9 c0 7c e8 03 a6 38 c0 00 14
JIT code: 000000b0: 4e 80 00 21 41 80 00 28 60 00 00 00 70 84 00 20
JIT code: 000000c0: 28 04 00 20 40 82 00 14 60 00 00 00 38 60 ff ff
JIT code: 000000d0: 3c 63 00 01 48 00 00 08 38 60 00 00 38 21 01 20
JIT code: 000000e0: e8 01 00 10 7c 08 03 a6 e9 c1 ff 70 e9 e1 ff 78
JIT code: 000000f0: 4e 80 00 20

Ben,

How do you feel about it?

> >
> > Please also cc netdev on BPF related changes.
> >
> > Actually, your test plan can be further simplified ...
> >
> > For retrieving and disassembling the JIT image, we have bpf_jit_disasm [1].
> >
> >   1) echo 2 > /proc/sys/net/core/bpf_jit_enable
> >   2) ... attach filter ...
> >   3) bpf_jit_disasm -o
> >
> > For generating a simple stupid test filter, you can use bpfc [2] (also
> > see its man page). E.g. ...
> >
> >    # cat blub
> >    ldi #10
> >    mod #8
> >    ret a
> >    # bpfc blub
> >    { 0x0, 0, 0, 0x0000000a },
> >    { 0x94, 0, 0, 0x00000008 },
> >    { 0x16, 0, 0, 0x00000000 },
> 
> Plus something like ...
> 
> ldxi #0
> mod x
> ret a
> 

Thanks Daniel!

Unfortunately, I couldn't trigger the JIT compiler with the bpfc/netsniff-ng
pair (even on x86-64). I guess I missed something. I'd be very grateful if you
could point out my mistakes.

> For longer-term testing, also trinity has BPF support. ;)
> 

Wow! Could you give some hint on how to run this for BPF only?

> > And load this array e.g. either into a small C program that attaches this
> > as BPF filter, or simply do bpfc blub > blub2 and run netsniff-ng -f blub2\
> > -s -i eth0, that should also do it.
> >
> > Then, when attached, the kernel should truncate incoming frames for pf_packet
> > into max length of 2, just as an example.
> >
> >    [1] kernel tree, tools/net/bpf_jit_disasm.c
> >    [2] git clone git://github.com/borkmann/netsniff-ng.git

Thanks
Vladimir

^ permalink raw reply

* hi
From: ZHAO Gang @ 2013-09-11 16:06 UTC (permalink / raw)
  To: netdev

subscribe netdev

^ permalink raw reply

* RE: usbnet transmit path problems
From: David Laight @ 2013-09-11 16:05 UTC (permalink / raw)
  To: Ming Lei; +Cc: Oliver Neukum, Network Development, linux-usb
In-Reply-To: <CACVXFVN6ZHLesrsMVNMWrikRs1mMbk=aZD9qZybNn1gB7aFTZQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

> On Wed, Sep 11, 2013 at 8:56 PM, David Laight <David.Laight-JxhZ9S5GRejQT0dZR+AlfA@public.gmane.org> wrote:
> >> > > 2) If 'length % dev->maxpacket == 0' for a multi-fragment packet then
> >> > >    the extra byte isn't added correctly (the code probably falls off
> >> > >    the end of the scatter-gather list).
> >> >
> >> > Indeed. Ming Lei, should usbnet handle this in the sg case or better
> >> > leave it to the subdriver you introduced this for?
> >
> > Is the ZLP issue a problem with the host or with the target?
> 
> Sorry, what do you mean the ZLP issue here? I understand Oliver
> thinks one commit from me may break ZLP handling, are you discussing
> this problem? If not, could you explain it in a bit more detail?

I was thinking of the general ZLP problem.
 
> > If it is a host problem then the necessity comes from the host,
> > but the fix needs to be target dependent.
> > If it is a common target problem then generic code can apply
> > a common fix.
> 
> All usbnet device should have sent one ZLP in case the size of
> bulk out transfer can be divided by max packet size, but the one
> byte transfer might be introduced for avoiding some target problem
> (can't deal with zlp well), as said by David, see below discussion:
> 
>    http://marc.info/?l=linux-usb&m=127067487604112&w=2

AFAICT the code avoids sending a zero length packet (that would
terminate a USB bulk transfer packet) by increasing the length
of the bulk packet by (at least) one byte.

> > AFAICT there are at least 3 fixes:
> > 1) Extend the ethernet frame by one byte and hope the receiving
> >    system doesn't object to the padding.
> >    This is probably the only option if tx_fixup() doesn't
> >    add a header.
> > 2) Put the ethernet frame length in the header and have the
> >    target discard the added pad byte (ax88179_178a.c).
> > 3) Add a second zero-length frame in the same USB data block
> >    (ax88172a.c).
> 
> Why do we need the above 3 fixes? The patch in my last email can
> fix the problem which is introduced recently, can't it?

I meant there are 3 ways of avoiding the ZLP, each driver will
pick one of them.

I've just looked at all the drivers in net/usb.
It doesn't look like they all handle fragmented skb, shared skb,
or ZLP properly.

A lot of common code could be removed if usbnet knew the size of the
header and allocated it before calling tx_fixup().

None of this is helping me sort out why netperf udp rr tests with
burst 19 are losing all the packets at once :-(

	David



--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [net  8/8] i40e: include i40e in kernel proper
From: Jeff Kirsher @ 2013-09-11 15:37 UTC (permalink / raw)
  To: Waskiewicz Jr, Peter P
  Cc: Joe Perches, davem@davemloft.net, Brandeburg, Jesse,
	netdev@vger.kernel.org, gospo@redhat.com, sassmann@redhat.com,
	Nelson, Shannon, e1000-devel@lists.sourceforge.net
In-Reply-To: <1378913071.3863.27.camel@ppwaskie-mobl2>

[-- Attachment #1: Type: text/plain, Size: 671 bytes --]

On Wed, 2013-09-11 at 08:24 -0700, Waskiewicz Jr, Peter P wrote:
> On Wed, 2013-09-11 at 05:20 -0700, Joe Perches wrote:
> > On Wed, 2013-09-11 at 02:50 -0700, Jeff Kirsher wrote:
> > > New driver build option is CONFIG_I40E
> > 
> > > diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
> > []
> > > +i40e.txt
> > > +	- README for the Intel Ethernet Controller XL710 Driver (i40e).
> > 
> > Just curious but why the XL710 / i40e name mismatch?
> 
> ixgbe is a good example.  ixgbe is the driver name, but it supports the
> 82598, 82599 / X520, and X540 chips.
> 

Another good example is e100, e1000, e1000e, and igb. :-)

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [net  8/8] i40e: include i40e in kernel proper
From: Joe Perches @ 2013-09-11 15:31 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: davem, Jesse Brandeburg, netdev, gospo, sassmann, Shannon Nelson,
	PJ Waskiewicz, e1000-devel
In-Reply-To: <1378909937.2026.8.camel@jtkirshe-mobl>

On Wed, 2013-09-11 at 07:32 -0700, Jeff Kirsher wrote:
> On Wed, 2013-09-11 at 05:20 -0700, Joe Perches wrote:
> > On Wed, 2013-09-11 at 02:50 -0700, Jeff Kirsher wrote:
> > > New driver build option is CONFIG_I40E
> > 
> > > diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
> > []
> > > +i40e.txt
> > > +	- README for the Intel Ethernet Controller XL710 Driver (i40e).
> > 
> > Just curious but why the XL710 / i40e name mismatch?
> 
> i40e stands for Intel 40 GbE Ethernet which is more generic than naming
> the driver the same as the first part (XL710).  That way when future
> silicon is made, we are not stuck with a driver named after previous
> silicon.

From the intro and most of the rest of the patches:

----------------------------
This series implements the new i40e driver for Intel's upcoming
Intel(R) Ethernet Controller XL710 Family of devices.
----------------------------

If the xl710 is a specific instance of the i40e family,
I think this is incorrect.

^ permalink raw reply

* Re: [PATCH net v2] net: sctp: fix ipv6 ipsec encryption bug in sctp_v6_xmit
From: Vlad Yasevich @ 2013-09-11 15:30 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: davem, netdev, linux-sctp, adobriyan, Steffen Klassert,
	Hannes Frederic Sowa
In-Reply-To: <1378911516-15942-1-git-send-email-dborkman@redhat.com>

On 09/11/2013 10:58 AM, Daniel Borkmann wrote:
> Alan Chester reported an issue with IPv6 on SCTP that IPsec traffic is not
> being encrypted, whereas on IPv4 it is. Setting up an AH + ESP transport
> does not seem to have the desired effect:
>
> SCTP + IPv4:
>
>    22:14:20.809645 IP (tos 0x2,ECT(0), ttl 64, id 0, offset 0, flags [DF], proto AH (51), length 116)
>      192.168.0.2 > 192.168.0.5: AH(spi=0x00000042,sumlen=16,seq=0x1): ESP(spi=0x00000044,seq=0x1), length 72
>    22:14:20.813270 IP (tos 0x2,ECT(0), ttl 64, id 0, offset 0, flags [DF], proto AH (51), length 340)
>      192.168.0.5 > 192.168.0.2: AH(spi=0x00000043,sumlen=16,seq=0x1):
>
> SCTP + IPv6:
>
>    22:31:19.215029 IP6 (class 0x02, hlim 64, next-header SCTP (132) payload length: 364)
>      fe80::222:15ff:fe87:7fc.3333 > fe80::92e6:baff:fe0d:5a54.36767: sctp
>      1) [INIT ACK] [init tag: 747759530] [rwnd: 62464] [OS: 10] [MIS: 10]
>
> Moreover, Alan says:
>
>    This problem was seen with both Racoon and Racoon2. Other people have seen
>    this with OpenSwan. When IPsec is configured to encrypt all upper layer
>    protocols the SCTP connection does not initialize. After using Wireshark to
>    follow packets, this is because the SCTP packet leaves Box A unencrypted and
>    Box B believes all upper layer protocols are to be encrypted so it drops
>    this packet, causing the SCTP connection to fail to initialize. When IPsec
>    is configured to encrypt just SCTP, the SCTP packets are observed unencrypted.
>
> In fact, using `socat sctp6-listen:3333 -` on one end and transferring "plaintext"
> string on the other end, results in cleartext on the wire where SCTP eventually
> does not report any errors, thus in the latter case that Alan reports, the
> non-paranoid user might think he's communicating over an encrypted transport on
> SCTP although he's not (tcpdump ... -X):
>
>    ...
>    0x0030: 5d70 8e1a 0003 001a 177d eb6c 0000 0000  ]p.......}.l....
>    0x0040: 0000 0000 706c 6169 6e74 6578 740a 0000  ....plaintext...
>
> Only in /proc/net/xfrm_stat we can see XfrmInTmplMismatch increasing on the
> receiver side. Initial follow-up analysis from Alan's bug report was done by
> Alexey Dobriyan. Also thanks to Vlad Yasevich for feedback on this.
>
> SCTP has its own implementation of sctp_v6_xmit() not calling inet6_csk_xmit().
> This has the implication that it probably never really got updated along with
> changes in inet6_csk_xmit() and therefore does not seem to invoke xfrm handlers.
>
> SCTP's IPv4 xmit however, properly calls ip_queue_xmit() to do the work. Since
> a call to inet6_csk_xmit() would solve this problem, but result in unnecessary
> route lookups, let us just use the cached flowi6 instead that we got through
> sctp_v6_get_dst(). Since all SCTP packets are being sent through sctp_packet_transmit(),
> we do the route lookup / flow caching in sctp_transport_route(), hold it in
> tp->dst and skb_dst_set() right after that. If we would alter fl6->daddr in
> sctp_v6_xmit() to np->opt->srcrt, we possibly could run into the same effect
> of not having xfrm layer pick it up, hence, use fl6_update_dst() in sctp_v6_get_dst()
> instead to get the correct source routed dst entry, which we assign to the skb.
>
> Also source address routing example from 625034113 ("sctp: fix sctp to work with
> ipv6 source address routing") still works with this patch! Nevertheless, in RFC5095
> it is actually 'recommended' to not use that anyway due to traffic amplification [1].
> So it seems we're not supposed to do that anyway in sctp_v6_xmit(). Moreover, if
> we overwrite the flow destination here, the lower IPv6 layer will be unable to
> put the correct destination address into IP header, as routing header is added in
> ipv6_push_nfrag_opts() but then probably with wrong final destination. Things aside,
> result of this patch is that we do not have any XfrmInTmplMismatch increase plus on
> the wire with this patch it now looks like:
>
> SCTP + IPv6:
>
>    08:17:47.074080 IP6 2620:52:0:102f:7a2b:cbff:fe27:1b0a > 2620:52:0:102f:213:72ff:fe32:7eba:
>      AH(spi=0x00005fb4,seq=0x1): ESP(spi=0x00005fb5,seq=0x1), length 72
>    08:17:47.074264 IP6 2620:52:0:102f:213:72ff:fe32:7eba > 2620:52:0:102f:7a2b:cbff:fe27:1b0a:
>      AH(spi=0x00003d54,seq=0x1): ESP(spi=0x00003d55,seq=0x1), length 296
>
> This fixes Kernel Bugzilla 24412. This security issue seems to be present since
> 2.6.18 kernels. Let's just hope some big passive adversary in the wild didn't have
> its fun with that. lksctp-tools IPv6 regression test suite passes as well with
> this patch.
>
>   [1] http://www.secdev.org/conf/IPv6_RH_security-csw07.pdf
>
> Reported-by: Alan Chester <alan.chester@tekelec.com>
> Reported-by: Alexey Dobriyan <adobriyan@gmail.com>
> Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
> Cc: Steffen Klassert <steffen.klassert@secunet.com>
> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>

Acked-by: Vlad Yasevich <vyasevich@gmail.com>

-vlad

> ---
>   v1->v2:
>    - use cached flow
>    - improved commit msg
>
>   net/sctp/ipv6.c | 42 +++++++++++++-----------------------------
>   1 file changed, 13 insertions(+), 29 deletions(-)
>
> diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
> index da613ce..4f52e2c 100644
> --- a/net/sctp/ipv6.c
> +++ b/net/sctp/ipv6.c
> @@ -204,44 +204,23 @@ out:
>   		in6_dev_put(idev);
>   }
>
> -/* Based on tcp_v6_xmit() in tcp_ipv6.c. */
>   static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
>   {
>   	struct sock *sk = skb->sk;
>   	struct ipv6_pinfo *np = inet6_sk(sk);
> -	struct flowi6 fl6;
> -
> -	memset(&fl6, 0, sizeof(fl6));
> -
> -	fl6.flowi6_proto = sk->sk_protocol;
> -
> -	/* Fill in the dest address from the route entry passed with the skb
> -	 * and the source address from the transport.
> -	 */
> -	fl6.daddr = transport->ipaddr.v6.sin6_addr;
> -	fl6.saddr = transport->saddr.v6.sin6_addr;
> -
> -	fl6.flowlabel = np->flow_label;
> -	IP6_ECN_flow_xmit(sk, fl6.flowlabel);
> -	if (ipv6_addr_type(&fl6.saddr) & IPV6_ADDR_LINKLOCAL)
> -		fl6.flowi6_oif = transport->saddr.v6.sin6_scope_id;
> -	else
> -		fl6.flowi6_oif = sk->sk_bound_dev_if;
> -
> -	if (np->opt && np->opt->srcrt) {
> -		struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt;
> -		fl6.daddr = *rt0->addr;
> -	}
> +	struct flowi6 *fl6 = &transport->fl.u.ip6;
>
>   	pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
> -		 skb->len, &fl6.saddr, &fl6.daddr);
> +		 skb->len, &fl6->saddr, &fl6->daddr);
>
> -	SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
> +	IP6_ECN_flow_xmit(sk, fl6->flowlabel);
>
>   	if (!(transport->param_flags & SPP_PMTUD_ENABLE))
>   		skb->local_df = 1;
>
> -	return ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
> +	SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
> +
> +	return ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
>   }
>
>   /* Returns the dst cache entry for the given source and destination ip
> @@ -254,10 +233,12 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
>   	struct dst_entry *dst = NULL;
>   	struct flowi6 *fl6 = &fl->u.ip6;
>   	struct sctp_bind_addr *bp;
> +	struct ipv6_pinfo *np = inet6_sk(sk);
>   	struct sctp_sockaddr_entry *laddr;
>   	union sctp_addr *baddr = NULL;
>   	union sctp_addr *daddr = &t->ipaddr;
>   	union sctp_addr dst_saddr;
> +	struct in6_addr *final_p, final;
>   	__u8 matchlen = 0;
>   	__u8 bmatchlen;
>   	sctp_scope_t scope;
> @@ -281,7 +262,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
>   		pr_debug("src=%pI6 - ", &fl6->saddr);
>   	}
>
> -	dst = ip6_dst_lookup_flow(sk, fl6, NULL, false);
> +	final_p = fl6_update_dst(fl6, np->opt, &final);
> +	dst = ip6_dst_lookup_flow(sk, fl6, final_p, false);
>   	if (!asoc || saddr)
>   		goto out;
>
> @@ -333,10 +315,12 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
>   		}
>   	}
>   	rcu_read_unlock();
> +
>   	if (baddr) {
>   		fl6->saddr = baddr->v6.sin6_addr;
>   		fl6->fl6_sport = baddr->v6.sin6_port;
> -		dst = ip6_dst_lookup_flow(sk, fl6, NULL, false);
> +		final_p = fl6_update_dst(fl6, np->opt, &final);
> +		dst = ip6_dst_lookup_flow(sk, fl6, final_p, false);
>   	}
>
>   out:
>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox