Netdev List

Netdev List
 help / color / mirror / Atom feed

* ethtool 3.4.2 released
From: Ben Hutchings @ 2012-07-17 15:31 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 756 bytes --]

ethtool version 3.4.2 has been released.  This fixes various bugs.

Home page: https://ftp.kernel.org/pub/software/network/ethtool/
Download link:
https://ftp.kernel.org/pub/software/network/ethtool/ethtool-3.4.2.tar.gz

Release notes:

	* Fix: Fix regression in RX NFC rule insertion for drivers that do
	  not select rule locations (-N/-U option)
	* Fix: Remove bogus error message when changing offload settings
	  on Linux < 2.6.39 (-K option)
	* Fix: Use alternate method to check for VLAN tag offload on Linux
	  < 2.6.37 (-k option)

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.



[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* [patch net-next 2/2] team: add netpoll support
From: Jiri Pirko @ 2012-07-17 15:22 UTC (permalink / raw)
  To: netdev; +Cc: davem
In-Reply-To: <1342538556-22601-1-git-send-email-jiri@resnulli.us>

It's done in a very similar way to how it is done in bonding and bridge.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
 drivers/net/team/team.c                   |  113 +++++++++++++++++++++++++++++
 drivers/net/team/team_mode_activebackup.c |    3 +-
 drivers/net/team/team_mode_broadcast.c    |    7 +-
 drivers/net/team/team_mode_loadbalance.c  |    3 +-
 drivers/net/team/team_mode_roundrobin.c   |    3 +-
 include/linux/if_team.h                   |   33 +++++++++
 6 files changed, 152 insertions(+), 10 deletions(-)

diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 3620c63..1a13470 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -18,6 +18,7 @@
 #include <linux/ctype.h>
 #include <linux/notifier.h>
 #include <linux/netdevice.h>
+#include <linux/netpoll.h>
 #include <linux/if_vlan.h>
 #include <linux/if_arp.h>
 #include <linux/socket.h>
@@ -787,6 +788,58 @@ static void team_port_leave(struct team *team, struct team_port *port)
 	dev_put(team->dev);
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static int team_port_enable_netpoll(struct team *team, struct team_port *port)
+{
+	struct netpoll *np;
+	int err;
+
+	np = kzalloc(sizeof(*np), GFP_KERNEL);
+	if (!np)
+		return -ENOMEM;
+
+	err = __netpoll_setup(np, port->dev);
+	if (err) {
+		kfree(np);
+		return err;
+	}
+	port->np = np;
+	return err;
+}
+
+static void team_port_disable_netpoll(struct team_port *port)
+{
+	struct netpoll *np = port->np;
+
+	if (!np)
+		return;
+	port->np = NULL;
+
+	/* Wait for transmitting packets to finish before freeing. */
+	synchronize_rcu_bh();
+	__netpoll_cleanup(np);
+	kfree(np);
+}
+
+static struct netpoll_info *team_netpoll_info(struct team *team)
+{
+	return team->dev->npinfo;
+}
+
+#else
+static int team_port_enable_netpoll(struct team *team, struct team_port *port)
+{
+	return 0;
+}
+static void team_port_disable_netpoll(struct team_port *port)
+{
+}
+static struct netpoll_info *team_netpoll_info(struct team *team)
+{
+	return NULL;
+}
+#endif
+
 static void __team_port_change_check(struct team_port *port, bool linkup);
 
 static int team_port_add(struct team *team, struct net_device *port_dev)
@@ -853,6 +906,15 @@ static int team_port_add(struct team *team, struct net_device *port_dev)
 		goto err_vids_add;
 	}
 
+	if (team_netpoll_info(team)) {
+		err = team_port_enable_netpoll(team, port);
+		if (err) {
+			netdev_err(dev, "Failed to enable netpoll on device %s\n",
+				   portname);
+			goto err_enable_netpoll;
+		}
+	}
+
 	err = netdev_set_master(port_dev, dev);
 	if (err) {
 		netdev_err(dev, "Device %s failed to set master\n", portname);
@@ -892,6 +954,9 @@ err_handler_register:
 	netdev_set_master(port_dev, NULL);
 
 err_set_master:
+	team_port_disable_netpoll(port);
+
+err_enable_netpoll:
 	vlan_vids_del_by_dev(port_dev, dev);
 
 err_vids_add:
@@ -932,6 +997,7 @@ static int team_port_del(struct team *team, struct net_device *port_dev)
 	list_del_rcu(&port->list);
 	netdev_rx_handler_unregister(port_dev);
 	netdev_set_master(port_dev, NULL);
+	team_port_disable_netpoll(port);
 	vlan_vids_del_by_dev(port_dev, dev);
 	dev_close(port_dev);
 	team_port_leave(team, port);
@@ -1307,6 +1373,48 @@ static int team_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid)
 	return 0;
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void team_poll_controller(struct net_device *dev)
+{
+}
+
+static void __team_netpoll_cleanup(struct team *team)
+{
+	struct team_port *port;
+
+	list_for_each_entry(port, &team->port_list, list)
+		team_port_disable_netpoll(port);
+}
+
+static void team_netpoll_cleanup(struct net_device *dev)
+{
+	struct team *team = netdev_priv(dev);
+
+	mutex_lock(&team->lock);
+	__team_netpoll_cleanup(team);
+	mutex_unlock(&team->lock);
+}
+
+static int team_netpoll_setup(struct net_device *dev,
+			      struct netpoll_info *npifo)
+{
+	struct team *team = netdev_priv(dev);
+	struct team_port *port;
+	int err;
+
+	mutex_lock(&team->lock);
+	list_for_each_entry(port, &team->port_list, list) {
+		err = team_port_enable_netpoll(team, port);
+		if (err) {
+			__team_netpoll_cleanup(team);
+			break;
+		}
+	}
+	mutex_unlock(&team->lock);
+	return err;
+}
+#endif
+
 static int team_add_slave(struct net_device *dev, struct net_device *port_dev)
 {
 	struct team *team = netdev_priv(dev);
@@ -1363,6 +1471,11 @@ static const struct net_device_ops team_netdev_ops = {
 	.ndo_get_stats64	= team_get_stats64,
 	.ndo_vlan_rx_add_vid	= team_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= team_vlan_rx_kill_vid,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= team_poll_controller,
+	.ndo_netpoll_setup	= team_netpoll_setup,
+	.ndo_netpoll_cleanup	= team_netpoll_cleanup,
+#endif
 	.ndo_add_slave		= team_add_slave,
 	.ndo_del_slave		= team_del_slave,
 	.ndo_fix_features	= team_fix_features,
diff --git a/drivers/net/team/team_mode_activebackup.c b/drivers/net/team/team_mode_activebackup.c
index 253b8a5..6262b4d 100644
--- a/drivers/net/team/team_mode_activebackup.c
+++ b/drivers/net/team/team_mode_activebackup.c
@@ -43,8 +43,7 @@ static bool ab_transmit(struct team *team, struct sk_buff *skb)
 	active_port = rcu_dereference_bh(ab_priv(team)->active_port);
 	if (unlikely(!active_port))
 		goto drop;
-	skb->dev = active_port->dev;
-	if (dev_queue_xmit(skb))
+	if (team_dev_queue_xmit(team, active_port, skb))
 		return false;
 	return true;
 
diff --git a/drivers/net/team/team_mode_broadcast.c b/drivers/net/team/team_mode_broadcast.c
index 5562345..c96e4d2 100644
--- a/drivers/net/team/team_mode_broadcast.c
+++ b/drivers/net/team/team_mode_broadcast.c
@@ -29,8 +29,8 @@ static bool bc_transmit(struct team *team, struct sk_buff *skb)
 			if (last) {
 				skb2 = skb_clone(skb, GFP_ATOMIC);
 				if (skb2) {
-					skb2->dev = last->dev;
-					ret = dev_queue_xmit(skb2);
+					ret = team_dev_queue_xmit(team, last,
+								  skb2);
 					if (!sum_ret)
 						sum_ret = ret;
 				}
@@ -39,8 +39,7 @@ static bool bc_transmit(struct team *team, struct sk_buff *skb)
 		}
 	}
 	if (last) {
-		skb->dev = last->dev;
-		ret = dev_queue_xmit(skb);
+		ret = team_dev_queue_xmit(team, last, skb);
 		if (!sum_ret)
 			sum_ret = ret;
 	}
diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c
index 51a4b19..cdc31b5 100644
--- a/drivers/net/team/team_mode_loadbalance.c
+++ b/drivers/net/team/team_mode_loadbalance.c
@@ -217,8 +217,7 @@ static bool lb_transmit(struct team *team, struct sk_buff *skb)
 	port = select_tx_port_func(team, lb_priv, skb, hash);
 	if (unlikely(!port))
 		goto drop;
-	skb->dev = port->dev;
-	if (dev_queue_xmit(skb))
+	if (team_dev_queue_xmit(team, port, skb))
 		return false;
 	lb_update_tx_stats(tx_bytes, lb_priv, get_lb_port_priv(port), hash);
 	return true;
diff --git a/drivers/net/team/team_mode_roundrobin.c b/drivers/net/team/team_mode_roundrobin.c
index 0cf38e9..ad7ed0e 100644
--- a/drivers/net/team/team_mode_roundrobin.c
+++ b/drivers/net/team/team_mode_roundrobin.c
@@ -55,8 +55,7 @@ static bool rr_transmit(struct team *team, struct sk_buff *skb)
 	port = __get_first_port_up(team, port);
 	if (unlikely(!port))
 		goto drop;
-	skb->dev = port->dev;
-	if (dev_queue_xmit(skb))
+	if (team_dev_queue_xmit(team, port, skb))
 		return false;
 	return true;
 
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index dfa0c8e..7fd0cde 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -13,6 +13,8 @@
 
 #ifdef __KERNEL__
 
+#include <linux/netpoll.h>
+
 struct team_pcpu_stats {
 	u64			rx_packets;
 	u64			rx_bytes;
@@ -60,6 +62,10 @@ struct team_port {
 		unsigned int mtu;
 	} orig;
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	struct netpoll *np;
+#endif
+
 	long mode_priv[0];
 };
 
@@ -73,6 +79,33 @@ static inline bool team_port_txable(struct team_port *port)
 	return port->linkup && team_port_enabled(port);
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static inline void team_netpoll_send_skb(struct team_port *port,
+					 struct sk_buff *skb)
+{
+	struct netpoll *np = port->np;
+
+	if (np)
+		netpoll_send_skb(np, skb);
+}
+#else
+static inline void team_netpoll_send_skb(struct team_port *port,
+					 struct sk_buff *skb)
+{
+}
+#endif
+
+static inline int team_dev_queue_xmit(struct team *team, struct team_port *port,
+				      struct sk_buff *skb)
+{
+	skb->dev = port->dev;
+	if (unlikely(netpoll_tx_running(port->dev))) {
+		team_netpoll_send_skb(port, skb);
+		return 0;
+	}
+	return dev_queue_xmit(skb);
+}
+
 struct team_mode_ops {
 	int (*init)(struct team *team);
 	void (*exit)(struct team *team);
-- 
1.7.10.4

^ permalink raw reply related

* [patch net-next 1/2] netpoll: move np->dev and np->dev_name init into __netpoll_setup()
From: Jiri Pirko @ 2012-07-17 15:22 UTC (permalink / raw)
  To: netdev; +Cc: davem
In-Reply-To: <1342538556-22601-1-git-send-email-jiri@resnulli.us>

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
---
 drivers/net/bonding/bond_main.c |    4 +---
 include/linux/netpoll.h         |    2 +-
 net/8021q/vlan_dev.c            |    5 +----
 net/bridge/br_device.c          |    5 +----
 net/core/netpoll.c              |   10 +++++-----
 5 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 4ddcc3e..1eb3979 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1240,9 +1240,7 @@ static inline int slave_enable_netpoll(struct slave *slave)
 	if (!np)
 		goto out;
 
-	np->dev = slave->dev;
-	strlcpy(np->dev_name, slave->dev->name, IFNAMSIZ);
-	err = __netpoll_setup(np);
+	err = __netpoll_setup(np, slave->dev);
 	if (err) {
 		kfree(np);
 		goto out;
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 5dfa091..28f5389 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -43,7 +43,7 @@ struct netpoll_info {
 void netpoll_send_udp(struct netpoll *np, const char *msg, int len);
 void netpoll_print_options(struct netpoll *np);
 int netpoll_parse_options(struct netpoll *np, char *opt);
-int __netpoll_setup(struct netpoll *np);
+int __netpoll_setup(struct netpoll *np, struct net_device *ndev);
 int netpoll_setup(struct netpoll *np);
 int netpoll_trap(void);
 void netpoll_set_trap(int trap);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index da1bc9c..73a2a83 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -681,10 +681,7 @@ static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *n
 	if (!netpoll)
 		goto out;
 
-	netpoll->dev = real_dev;
-	strlcpy(netpoll->dev_name, real_dev->name, IFNAMSIZ);
-
-	err = __netpoll_setup(netpoll);
+	err = __netpoll_setup(netpoll, real_dev);
 	if (err) {
 		kfree(netpoll);
 		goto out;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 929e48aed..f4be1bb 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -246,10 +246,7 @@ int br_netpoll_enable(struct net_bridge_port *p)
 	if (!np)
 		goto out;
 
-	np->dev = p->dev;
-	strlcpy(np->dev_name, p->dev->name, IFNAMSIZ);
-
-	err = __netpoll_setup(np);
+	err = __netpoll_setup(np, p->dev);
 	if (err) {
 		kfree(np);
 		goto out;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index f9f40b9..b4c90e4 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -715,14 +715,16 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 }
 EXPORT_SYMBOL(netpoll_parse_options);
 
-int __netpoll_setup(struct netpoll *np)
+int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
 {
-	struct net_device *ndev = np->dev;
 	struct netpoll_info *npinfo;
 	const struct net_device_ops *ops;
 	unsigned long flags;
 	int err;
 
+	np->dev = ndev;
+	strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
+
 	if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
 	    !ndev->netdev_ops->ndo_poll_controller) {
 		np_err(np, "%s doesn't support polling, aborting\n",
@@ -851,13 +853,11 @@ int netpoll_setup(struct netpoll *np)
 		np_info(np, "local IP %pI4\n", &np->local_ip);
 	}
 
-	np->dev = ndev;
-
 	/* fill up the skb queue */
 	refill_skbs();
 
 	rtnl_lock();
-	err = __netpoll_setup(np);
+	err = __netpoll_setup(np, ndev);
 	rtnl_unlock();
 
 	if (err)
-- 
1.7.10.4

^ permalink raw reply related

* [patch net-next 0/2] team: add netpoll support
From: Jiri Pirko @ 2012-07-17 15:22 UTC (permalink / raw)
  To: netdev; +Cc: davem

Also contains a little change to netpoll core.

Jiri Pirko (2):
  netpoll: move np->dev and np->dev_name init into __netpoll_setup()
  team: add netpoll support

 drivers/net/bonding/bond_main.c           |    4 +-
 drivers/net/team/team.c                   |  113 +++++++++++++++++++++++++++++
 drivers/net/team/team_mode_activebackup.c |    3 +-
 drivers/net/team/team_mode_broadcast.c    |    7 +-
 drivers/net/team/team_mode_loadbalance.c  |    3 +-
 drivers/net/team/team_mode_roundrobin.c   |    3 +-
 include/linux/if_team.h                   |   33 +++++++++
 include/linux/netpoll.h                   |    2 +-
 net/8021q/vlan_dev.c                      |    5 +-
 net/bridge/br_device.c                    |    5 +-
 net/core/netpoll.c                        |   10 +--
 11 files changed, 161 insertions(+), 27 deletions(-)

-- 
1.7.10.4

^ permalink raw reply

* PATCH net/ipv6/mip6.c destopt corruption
From: András Takács @ 2012-07-17 14:47 UTC (permalink / raw)
  To: netdev
In-Reply-To: <A9CE2E85-2DDC-4182-B494-431A6A62BC95@wakoond.hu>


Dear All,


I have added a lot of debug messages to the kernel source and finally found the problem. When the kernel creates the skb from the iovec (ip6_append_data), it sets the network header pointer to the wrong position. It is shifted by 24 bytes (the length of the HAO dest. opt. header with padding).

After this point, the message will be corrupted, the beginning (the first 24 bytes) of the MH part will be truncated. Later, when the kernel adds the dest. opt. header itself, there isn't any issue.

So, back to the wrong network header pointer. It is shifted by exthdrlen (= 24) by the skb_set_network_header() function. This exthdrlen comes from rt->rt6i_nfheader_len, which comes from the dst_entry chain. This nfheader_len value comes from the header_len of the desired xfrm type (in this case hao dest opt):

(net/xfrm/xfrm_policy.c: xfrm_bundle_create)
header_len += xfrm[i]->props.header_len;
if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT)
	nfheader_len += xfrm[i]->props.header_len;
...
xfrm_init_path((struct xfrm_dst *)dst0, dst, nfheader_len);

I have run a fast grep on the kernel tree, and this XFRM_TYPE_NON_FRAGMENT does not have any effect, just sets (or not) nfheader_len here. So, the following patch solves the issue:

diff -Nuar linux-3.4.2-orig/net/ipv6/mip6.c linux-3.4.2/net/ipv6/mip6.c
--- linux-3.4.2-orig/net/ipv6/mip6.c	2012-07-17 15:18:30.148777104 +0200
+++ linux-3.4.2/net/ipv6/mip6.c	2012-07-17 15:21:12.104779113 +0200
@@ -338,7 +338,7 @@
 	.description	= "MIP6DESTOPT",
 	.owner		= THIS_MODULE,
 	.proto	     	= IPPROTO_DSTOPTS,
-	.flags		= XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR,
+	.flags		= XFRM_TYPE_LOCAL_COADDR,
 	.init_state	= mip6_destopt_init_state,
 	.destructor	= mip6_destopt_destroy,
 	.input		= mip6_destopt_input,
@@ -471,7 +471,7 @@
 	.description	= "MIP6RT",
 	.owner		= THIS_MODULE,
 	.proto	     	= IPPROTO_ROUTING,
-	.flags		= XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR,
+	.flags		= XFRM_TYPE_REMOTE_COADDR,
 	.init_state	= mip6_rthdr_init_state,
 	.destructor	= mip6_rthdr_destroy,
 	.input		= mip6_rthdr_input,

What do you think about this fix? Does it have any drawback?


Regards,
András


On Jul 16, 2012, at 3:40 PM, András Takács wrote:

> 
> Dear All,
> 
> 
> I have serious problems with HAO dest opt XFRM processing. In the past few days I have tried to find the problem, and I figured out the following:
> 
> 1. case: No XFRM rules
> It works fine (as it was described in my previous e-mail)
> 
> 2. case: HAO RO XFRM processing
> I have created the following rules manually:
> sudo ip -6 xfrm policy add src 2001:470:7210:10::11 dst 2001:470:7210:10::1000 proto 135 type 5 dir out priority 2 ptype sub tmpl src 2001:470:7210:10::11 dst 2001:470:7210:10::1000 proto hao reqid 0 mode ro
> sudo ip -6 xfrm state add src 2001:470:7210:10::11 dst 2001:470:7210:10::1000 proto hao reqid 0 mode ro replay-window 0 coa 2001:470:7210:11:20c:29ff:fe46:a0e3 sel src 2001:470:7210:10::11 dst 2001:470:7210:10::1000
> 
> The message format is corrupted, because during the xfrm processing, the beginning of the MH part will be overwritten by the DST OPT header.
> 
> 3. case: ESP TUNNEL XFRM
> I have created ESP TUNNEL XFRM rules manually, and it worked fine.
> So the problem has to be somewhere in the net/ipv6/mip6.c or net/ipv6/xfrm_mode_ro.c files.
> 
> -------------
> 
> I added a lot of debug printk statements to the source, and I have figured out the following:
> 
> When the kernel creates the skb from the iovec, it seems to be ok (in ip6_append_data):
> 
> skb->data to skb->tail:
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 57 39 B1 FB 87 39 B1 FB 64 92 FF FF 18 00 00 00 00 00 00 00 00 00 00 00 3B 03 05 00 00 00 00 01 00 00 00 02 01 00 03 10 20 01 04 70 72 10 00 11 02 0C 29 FF FE 46 A0 E3
> 
> Unfortunately at the beginning of the xfrm6_ro_output function, it seems to be corrupt:
> 
> 60 00 00 00 00 08 87 40 20 01 04 70 72 10 00 10 00 00 00 00 00 00 00 11 20 01 04 70 72 10 00 10 00 00 00 00 00 00 10 00 02 0C 29 FF FE 46 A0 E3
> 
> Here the first 24 bytes of the MH part are missing. This is quite suspicious, because the DST OPT header (with the necessary padding) is exactly the same length.
> 
> After this point xfrm6_ro_output and mip6_destopt_output work fine, and insert the DST OPT header into this truncated skb.
> 
> 
> Could you please help me to find the connection ("call - graph" ?) between ip6_append_data and xfrm6_ro_output? I can't find the point where it fails. In ip6_append_data, the beginning of the skb is reserved for the IPv6 header, but where will be this part filled with the right values?
> 
> 
> Thank you very much for your help!
> 
> 
> Regards,
> András
> 
> 
> On Jun 21, 2012, at 10:41 PM, Andras Takacs wrote:
> 
>> Dear All,
>> 
>> I'm working with Mobile IPv6 systems, and I'm setting up a new MIP6 environment. I would like to use the latest stable kernel, so I'm using 3.4.2. Unfortunately I have some serious problems with destination option XFRM processing. I have done the following tests to find the issue:
>> 
>> First case: No XFRM policies and states.
>> Sending MH messages without destopt header.
>> In this case the message format is OK, I have tested it with tcpdump and wireshark.
>> 
>> 21:33:58.817130 IP6 2001:470:7210:10::11 > 2001:470:7210:10::1000: mobility: BU seq#=1 lifetime=8
>> 	0x0000:  6000 0000 0020 8740 2001 0470 7210 0010  `......@...pr...
>> 	0x0010:  0000 0000 0000 0011 2001 0470 7210 0010  ...........pr...
>> 	0x0020:  0000 0000 0000 1000 3b03 0500 1c46 0001  ........;....F..
>> 	0x0030:  0000 0002 0100 0310 2001 0470 7210 0011  ...........pr...
>> 	0x0040:  020c 29ff fe46 a0e3                      ..)..F..
>> 
>> Second case: Adding destopt XFRM policy and state:
>> 
>> ip -6 xfrm policy add src 2001:470:7210:10::11 dst 2001:470:7210:10::1000 proto 135 type 5 dir out priority 2 ptype sub tmpl src 2001:470:7210:10::11 dst 2001:470:7210:10::1000 proto hao reqid 0 mode ro level use
>> ip -6 xfrm state add src 2001:470:7210:10::11 dst 2001:470:7210:10::1000 proto hao reqid 0 mode ro replay-window 0 coa 2001:470:7210:11:20c:29ff:fe46:a0e3 sel src 2001:470:7210:10::11 dst 2001:470:7210:10::1000
>> 
>> In this case, the message format is corrupted:
>> 
>> 21:30:42.350315 IP6 2001:470:7210:11:20c:29ff:fe46:a0e3 > 2001:470:7210:10::1000: DSTOPT mobility: type-#41 len=12
>> 	0x0000:  6000 0000 0020 3c40 2001 0470 7210 0011  `.....<@...pr...
>> 	0x0010:  020c 29ff fe46 a0e3 2001 0470 7210 0010  ..)..F.....pr...
>> 	0x0020:  0000 0000 0000 1000 8702 0102 0000 c910  ................
>> 	0x0030:  2001 0470 7210 0010 0000 0000 0000 0011  ...pr...........
>> 	0x0040:  020c 29ff fe46 a0e3 
>> 
>> As you can see, the IPv6 header is OK. Next, the destination option header is OK. Finally, the following part of the packet isn't OK. If you compare the two dumps carefully, you will see that the last 8 bytes are identical. The mip6_destopt_output function adds the destination option header correctly, but overwrites the existing MH header, and doesn't shift it after the destopt header.
>> 
>> I'm not familiar with the XFRM framework enough to fix the problem. :(
>> Could anyone help me to fix this issue?
>> 
>> The last environment, which worked fine was built on 2.6.35 version. The problem happened between 2.6.35 and 3.4.2. Sorry, I know, it is a quite big interval. :(
>> 
>> Thanks!
>> 
>> 
>> Best regards,
>> András Takács
> 

^ permalink raw reply

* Re: [PATCH net-next] tcp: implement RFC 5961 4.2
From: David Miller @ 2012-07-17 14:41 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, kkiran
In-Reply-To: <1342525290.2626.459.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 17 Jul 2012 13:41:30 +0200

> From: Eric Dumazet <edumazet@google.com>
> 
> Implement the RFC 5961 mitigation against Blind
> Reset attack using SYN bit.
> 
> Section 4.2 of RFC 5961 advises to send a Challenge ACK and drop
> incoming packet, instead of resetting the session.
> 
> Add a new SNMP counter to count number of challenge acks sent
> in response to SYN packets.
> (netstat -s | grep TCPSYNChallenge)
> 
> Remove obsolete TCPAbortOnSyn, since we no longer abort a TCP session
> because of a SYN flag.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Looks good, applied, thanks Eric.

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2012-07-17 14:36 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


I know this looks like a lot more than you want to see right now,
however a) the stuff here are real OOPS'ers, memory leaks, and
regressions and b) it's been a full 2 weeks since I last sent bug
fixes your way.

If it makes you feel any better, my default has been to toss fixes
into net-next unless it was really serious like the stuff below.

I have a CIPSO ipv4 option processing oops'er I intend to work on
fixing myself if the maintainer of the code doesn't look at it in the
next 24 hours.

1) IPVS oops'ers:
   a) Should not reset skb->nf_bridge in forwarding hook (Lin Ming)
   b) 3.4 commit can cause ip_vs_control_cleanup to be invoked after
      the ipvs_core_ops are unregistered during rmmod (Julian Anastasov)

2) ixgbevf bringup failure can crash in TX descriptor cleanup (Alexander Duyck)

3) AX25 switch missing break statement hoses ROSE sockets (Alan Cox)

4) CAIF accesses freed per-net memory (Sjur Brandeland)

5) Network cgroup code has out-of-bounds accesses (Eric Dumazet), and accesses
   freed memory (Gao Feng)

6) Fix a crash in SCTP reported by Dave Jones caused by freeing an association
   still on a list (Neil Horman)

7) __netdev_alloc_skb() regresses on GFP_DMA using drivers because that GFP
   flag is not being retained for the allocation (Eric Dumazet).

8) Missing NULL check in sch_sfb netlink message parsing (Alan Cox)

9) bnx2 crashes because TX index iteration is not bounded correctly (Michael
   Chan)

10) IPoIB generates warnings in TCP queue collapsing (via
    skb_try_coalesce) because it does not set skb->truesize correctly
    (Eric Dumazet)

11) vlan_info objects leak for the implicit vlan with ID 0 (Amir Hanania)

12) A fix for TX time stamp handling in gianfar does not transfer
    socket ownership from one packet to another correctly, resulting
    in a socket write space imbalance (Eric Dumazet)

13) Julia Lawall found several cases where we do a list iteration, and
    then at the loop termination unconditionally assume we ended up with
    real list object, rather than the list head itself (CNIC, RXRPC,
    mISDN).

14) The bonding driver handles procfs moving incorrectly when a device
    it manages is moved from one namespace to another (Eric Biederman)

15) Missing memory barriers in stmmac descriptor accesses result in
    various crashes (Deepak Sikri)

16) Fix handling of broadcast packets in batman-adv (Simon Wunderlich)

17) Properly check the sanity of sendmsg() lengths in ieee802154's
    dgram_sendmsg().  Dave Jones and others have hit and reported this
    bug (Sasha Levin)

18) Some drivers (b44 and b43legacy) on 64-bit machines stopped
    working because of how netdev_alloc_skb() was adjusted.  Such
    drivers should now use alloc_skb() for obtaining bounce buffers.
    (Eric Dumazet)

19) atl1c mis-managed its link state in that it stops the queue by
    hand on link down.  The generic networking takes care of that and
    this double stop locks the queue down.  So simply removing the
    driver's queue stop call fixes the problem (Cloud Ren)

20) Fix out-of-memory due to mis-accounting in netem packet scheduler
    (Eric Dumazet)

21) If DCB and SR-IOV are configured at the same time in IXGBE the chip
    will hang because this is not supported (Alexander Duyck)

22) A commit to stop drivers using netdev->base_addr broke the CNIC
    driver (Michael Chan)    

23) Timeout regression in ipset caused by an attempt to fix an overflow
    bug (Jozsef Kadlecsik).

24) mac80211 minstrel code allocates memory using incorrect size
    (Thomas Huehn)

25) llcp_sock_getname() needs to check for a NULL device otherwise we
    OOPS (Sasha Levin)

26) mwifiex leaks memory (Bing Zhao)

27) Propagate iwlwifi fix to iwlegacy, even when we're not associated
    we need to monitor for stuck queues in the watchdog handler
    (Stanislaw Gruszka)

Please pull, thanks a lot.

The following changes since commit 9e85a6f9dc231f3ed3c1dc1b12217505d970142a:

  Merge tag 'clk-fixes-for-linus' of git://git.linaro.org/people/mturquette/linux (2012-07-03 18:06:49 -0700)

are available in the git repository at:


  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net master

for you to fetch changes up to 602e65a3b0c4f6b09fba19817ff798647a08e706:

  Merge branch 'master' of git://1984.lsi.us.es/nf (2012-07-17 03:19:33 -0700)

----------------------------------------------------------------

Alan Cox (2):
      sch_sfb: Fix missing NULL check
      ax25: Fix missing break

Alexander Duyck (2):
      ixgbe: DCB and SR-IOV can not co-exist and will cause hangs
      ixgbevf: Fix panic when loading driver

Amir Hanania (1):
      net: Fix memory leak - vlan_info struct

Bing Zhao (1):
      mwifiex: fix Coverity SCAN CID 709078: Resource leak (RESOURCE_LEAK)

Bjørn Mork (1):
      net: qmi_wwan: add ZTE MF60

Bruce Allan (1):
      e1000e: fix test for PHY being accessible on 82577/8/9 and I217

Cloud Ren (1):
      atl1c: fix issue of transmit queue 0 timed out

David Daney (1):
      netdev/phy: Fixup lockdep warnings in mdio-mux.c

David S. Miller (4):
      Merge branch 'master' of git://1984.lsi.us.es/nf
      Merge tag 'batman-adv-fix-for-davem' of git://git.open-mesh.org/linux-merge
      Merge branch 'master' of git://git.kernel.org/.../jkirsher/net
      Merge branch 'master' of git://1984.lsi.us.es/nf

Deepak Sikri (2):
      stmmac: Fix for nfs hang on multiple reboot
      stmmac: Fix for higher mtu size handling

Dmitry Eremin-Solenikov (1):
      MAINTAINERS: reflect actual changes in IEEE 802.15.4 maintainership

Eliad Peller (1):
      mac80211: destroy assoc_data correctly if assoc fails

Emmanuel Grumbach (1):
      iwlegacy: don't mess up the SCD when removing a key

Eric Dumazet (6):
      net: dont use __netdev_alloc_skb for bounce buffer
      netem: add limitation to reordered packets
      net: cgroup: fix out of bounds accesses
      gianfar: fix potential sk_wmem_alloc imbalance
      IPoIB: fix skb truesize underestimatiom
      net: respect GFP_DMA in __netdev_alloc_skb()

Eric W. Biederman (2):
      bonding: Manage /proc/net/bonding/ entries from the netdev events
      bonding: debugfs and network namespaces are incompatible

Gao feng (2):
      cgroup: fix panic in netprio_cgroup
      net: cgroup: fix access the unallocated memory in netprio cgroup

John W. Linville (1):
      Merge branch 'master' of git://git.kernel.org/.../linville/wireless into for-davem

Jozsef Kadlecsik (1):
      netfilter: ipset: timeout fixing bug broke SET target special timeout value

Julia Lawall (3):
      drivers/isdn/mISDN/stack.c: remove invalid reference to list iterator variable
      net/rxrpc/ar-peer.c: remove invalid reference to list iterator variable
      drivers/net/ethernet/broadcom/cnic.c: remove invalid reference to list iterator variable

Julian Anastasov (1):
      ipvs: fix oops in ip_vs_dst_event on rmmod

Lin Ming (1):
      ipvs: fix oops on NAT reply in br_nf context

Michael Chan (2):
      cnic: Don't use netdev->base_addr
      bnx2: Fix bug in bnx2_free_tx_skbs().

Narendra K (1):
      ixgbevf: Prevent RX/TX statistics getting reset to zero

Neil Horman (1):
      sctp: Fix list corruption resulting from freeing an association on a list

Pablo Neira Ayuso (1):
      netfilter: nf_ct_ecache: fix crash with multiple containers, one shutting down

Sasha Levin (2):
      ieee802154: verify packet size before trying to allocate it
      NFC: Prevent NULL deref when getting socket name

Simon Wunderlich (1):
      batman-adv: check incoming packet type for bla

Sjur Brændeland (1):
      caif: Fix access to freed pernet memory

Stanislaw Gruszka (2):
      rt2x00usb: fix indexes ordering on RX queue kick
      iwlegacy: always monitor for stuck queues

Thomas Huehn (1):
      mac80211: correct size the argument to kzalloc in minstrel_ht

Tushar Dave (1):
      e1000e: Correct link check logic for 82571 serdes

 MAINTAINERS                                       |    3 +-
 drivers/infiniband/ulp/ipoib/ipoib_ib.c           |   12 ++++---
 drivers/isdn/mISDN/stack.c                        |    4 +--
 drivers/net/bonding/bond_debugfs.c                |    2 +-
 drivers/net/bonding/bond_main.c                   |    9 ++++--
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c   |    1 -
 drivers/net/ethernet/broadcom/b44.c               |    4 +--
 drivers/net/ethernet/broadcom/bnx2.c              |    6 ++--
 drivers/net/ethernet/broadcom/cnic.c              |   10 ++++--
 drivers/net/ethernet/freescale/gianfar.c          |    7 ++--
 drivers/net/ethernet/intel/e1000e/82571.c         |    3 ++
 drivers/net/ethernet/intel/e1000e/ich8lan.c       |   42 ++++++++++++++++++------
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |    5 +++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |   15 ++-------
 drivers/net/ethernet/stmicro/stmmac/ring_mode.c   |    3 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |    3 ++
 drivers/net/phy/mdio-mux.c                        |   10 ++++--
 drivers/net/usb/qmi_wwan.c                        |   18 +++++++++++
 drivers/net/wireless/b43legacy/dma.c              |    2 +-
 drivers/net/wireless/iwlegacy/4965-mac.c          |    4 +--
 drivers/net/wireless/iwlegacy/common.c            |   14 ++++----
 drivers/net/wireless/mwifiex/cfg80211.c           |    1 +
 drivers/net/wireless/rt2x00/rt2x00usb.c           |    2 +-
 include/net/ip_vs.h                               |    2 +-
 include/net/netfilter/nf_conntrack_ecache.h       |    2 +-
 net/8021q/vlan.c                                  |    3 ++
 net/ax25/af_ax25.c                                |    1 +
 net/batman-adv/bridge_loop_avoidance.c            |   15 ++++++---
 net/batman-adv/bridge_loop_avoidance.h            |    5 +--
 net/batman-adv/soft-interface.c                   |    6 +++-
 net/caif/caif_dev.c                               |    2 +-
 net/core/dev.c                                    |    8 +++--
 net/core/netprio_cgroup.c                         |   78 +++++++++++++++++++++++++++++++++------------
 net/core/skbuff.c                                 |    2 +-
 net/ieee802154/dgram.c                            |   12 +++----
 net/mac80211/mlme.c                               |    6 ++--
 net/mac80211/rc80211_minstrel_ht.c                |    2 +-
 net/netfilter/ipvs/ip_vs_ctl.c                    |    5 +--
 net/netfilter/xt_set.c                            |    4 ++-
 net/nfc/llcp/sock.c                               |    2 +-
 net/rxrpc/ar-peer.c                               |    2 +-
 net/sched/sch_netem.c                             |   42 +++++++++---------------
 net/sched/sch_sfb.c                               |    2 ++
 net/sctp/input.c                                  |    7 ++--
 net/sctp/socket.c                                 |   12 +++++--
 45 files changed, 256 insertions(+), 144 deletions(-)

^ permalink raw reply

* Re: [PATCH 5/5] ipv4: Add FIB nexthop exceptions.
From: David Miller @ 2012-07-17 14:25 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1342533605.2626.680.camel@edumazet-glaptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 17 Jul 2012 16:00:05 +0200

> On Tue, 2012-07-17 at 06:14 -0700, David Miller wrote:
>> In a regime where we have subnetted route entries, we need a way to
>> store persistent storage about destination specific learned values
>> such as redirects and PMTU values.
>> 
>> This is implemented here via nexthop exceptions.
>> 
>> The initial implementation is a simple linked list, and can be
>> expanded to a hash table when it is shown to be justified.
> 
> Say a typical host uses a single default route, I am trying to convince
> myself it can really use a simple linked list ?
> 
> Arent PMTU entries added by messages coming from untrusted sources ?

They are trusted when we validate them at the socket layer, at least
as is done for TCP.

I totally agree that we'll need to adjust the list into something more
sophisticated, but that's an implementation detail rather than
something that requires the actual infrastructure to be redone.

^ permalink raw reply

* RE: [PATCH 1/4] pch_gbe: Fix the checksum fill to the error location
From: Andy Cress @ 2012-07-17 14:20 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Zhong Hongbo
In-Reply-To: <1342510387.2626.174.camel@edumazet-glaptop>

Eric,

This is intriguing, and the data copy would also explain why this transmit path is slow and susceptible to transmit timeouts.
I want to apply and test your proposed patch, but I'll have to do that next week.

Andy

-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com] 
Sent: Tuesday, July 17, 2012 3:33 AM
To: Andy Cress
Cc: netdev@vger.kernel.org; Zhong Hongbo
Subject: Re: [PATCH 1/4] pch_gbe: Fix the checksum fill to the error location

On Tue, 2012-07-17 at 09:09 +0200, Eric Dumazet wrote:

> Hmm... I fail to understand why you care about NIC doing checksums,
> while pch_gbe_tx_queue() make a _copy_ of each outgoing
> packets.
> 
> There _must_ be a way to avoid most of these copies (ie not touching
> payload), only mess with the header to insert these 2 nul bytes ?
> 
> /* [Header:14][payload] ---> [Header:14][paddong:2][payload]    */
> 
> So at device setup : dev->needed_headroom = 2;
> 
> and in xmit,
> 
> 	if (skb_headroom(skb) < 2) {
> 		struct sk_buff *skb_new;
> 
> 		skb_new = skb_realloc_headroom(skb, 2);
> 		if (!skb_new) { handle error }
> 		consume_skb(skb);
> 		skb = skb_new;
> 	}
> 	ptr = skb_push(skb, 2);
> 	memmove(ptr, ptr + 2, ETH_HLEN);
> 	ptr[ETH_HLEN] = 0;
> 	ptr[ETH_HLEN + 1] = 0;
> 
> 

Something like the following (untested) patch


 drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c |   55 +++++-----
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
index b100656..2d3d982 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
@@ -1163,7 +1163,7 @@ static void pch_gbe_tx_queue(struct pch_gbe_adapter *adapter,
 	struct pch_gbe_hw *hw = &adapter->hw;
 	struct pch_gbe_tx_desc *tx_desc;
 	struct pch_gbe_buffer *buffer_info;
-	struct sk_buff *tmp_skb;
+	char *ptr;
 	unsigned int frame_ctrl;
 	unsigned int ring_num;
 
@@ -1221,18 +1221,27 @@ static void pch_gbe_tx_queue(struct pch_gbe_adapter *adapter,
 
 
 	buffer_info = &tx_ring->buffer_info[ring_num];
-	tmp_skb = buffer_info->skb;
+	if (skb_headroom(skb) < 2) {
+		struct sk_buff *skb_new;
+
+		skb_new = skb_realloc_headroom(skb, 2);
+		if (!skb_new) {
+			tx_ring->next_to_use = ring_num;
+			dev_kfree_skb_any(skb);
+			return;
+		}
+		consume_skb(skb);
+		skb = skb_new;
+	}
 
 	/* [Header:14][payload] ---> [Header:14][paddong:2][payload]    */
-	memcpy(tmp_skb->data, skb->data, ETH_HLEN);
-	tmp_skb->data[ETH_HLEN] = 0x00;
-	tmp_skb->data[ETH_HLEN + 1] = 0x00;
-	tmp_skb->len = skb->len;
-	memcpy(&tmp_skb->data[ETH_HLEN + 2], &skb->data[ETH_HLEN],
-	       (skb->len - ETH_HLEN));
+	ptr = skb_push(skb, 2);
+	memmove(ptr, ptr + 2, ETH_HLEN);
+	ptr[ETH_HLEN] = 0x00;
+	ptr[ETH_HLEN + 1] = 0x00;
 	/*-- Set Buffer information --*/
-	buffer_info->length = tmp_skb->len;
-	buffer_info->dma = dma_map_single(&adapter->pdev->dev, tmp_skb->data,
+	buffer_info->length = skb->len;
+	buffer_info->dma = dma_map_single(&adapter->pdev->dev, skb->data,
 					  buffer_info->length,
 					  DMA_TO_DEVICE);
 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
@@ -1240,18 +1249,20 @@ static void pch_gbe_tx_queue(struct pch_gbe_adapter *adapter,
 		buffer_info->dma = 0;
 		buffer_info->time_stamp = 0;
 		tx_ring->next_to_use = ring_num;
+		dev_kfree_skb_any(skb);
 		return;
 	}
 	buffer_info->mapped = true;
 	buffer_info->time_stamp = jiffies;
+	buffer_info->skb = skb;
 
 	/*-- Set Tx descriptor --*/
 	tx_desc = PCH_GBE_TX_DESC(*tx_ring, ring_num);
-	tx_desc->buffer_addr = (buffer_info->dma);
-	tx_desc->length = (tmp_skb->len);
-	tx_desc->tx_words_eob = ((tmp_skb->len + 3));
+	tx_desc->buffer_addr = buffer_info->dma;
+	tx_desc->length = skb->len;
+	tx_desc->tx_words_eob = skb->len + 3;
 	tx_desc->tx_frame_ctrl = (frame_ctrl);
-	tx_desc->gbec_status = (DSC_INIT16);
+	tx_desc->gbec_status = DSC_INIT16;
 
 	if (unlikely(++ring_num == tx_ring->count))
 		ring_num = 0;
@@ -1265,7 +1276,6 @@ static void pch_gbe_tx_queue(struct pch_gbe_adapter *adapter,
 	pch_tx_timestamp(adapter, skb);
 #endif
 
-	dev_kfree_skb_any(skb);
 }
 
 /**
@@ -1543,19 +1553,12 @@ static void pch_gbe_alloc_tx_buffers(struct pch_gbe_adapter *adapter,
 					struct pch_gbe_tx_ring *tx_ring)
 {
 	struct pch_gbe_buffer *buffer_info;
-	struct sk_buff *skb;
 	unsigned int i;
-	unsigned int bufsz;
 	struct pch_gbe_tx_desc *tx_desc;
 
-	bufsz =
-	    adapter->hw.mac.max_frame_size + PCH_GBE_DMA_ALIGN + NET_IP_ALIGN;
-
 	for (i = 0; i < tx_ring->count; i++) {
 		buffer_info = &tx_ring->buffer_info[i];
-		skb = netdev_alloc_skb(adapter->netdev, bufsz);
-		skb_reserve(skb, PCH_GBE_DMA_ALIGN);
-		buffer_info->skb = skb;
+		buffer_info->skb = NULL;
 		tx_desc = PCH_GBE_TX_DESC(*tx_ring, i);
 		tx_desc->gbec_status = (DSC_INIT16);
 	}
@@ -1622,9 +1625,9 @@ pch_gbe_clean_tx(struct pch_gbe_adapter *adapter,
 					 buffer_info->length, DMA_TO_DEVICE);
 			buffer_info->mapped = false;
 		}
-		if (buffer_info->skb) {
-			pr_debug("trim buffer_info->skb : %d\n", i);
-			skb_trim(buffer_info->skb, 0);
+		if (skb) {
+			dev_kfree_skb_any(skb);
+			buffer_info->skb = NULL;
 		}
 		tx_desc->gbec_status = DSC_INIT16;
 		if (unlikely(++i == tx_ring->count))



^ permalink raw reply related

* Re: [PATCH] [RFC] tcp: TSQ - do not always throttle.
From: Krishna Kumar2 @ 2012-07-17 14:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev
In-Reply-To: <1342530654.2626.563.camel@edumazet-glaptop>

Eric Dumazet <eric.dumazet@gmail.com> wrote on 07/17/2012 06:40:54 PM:

> > Do not throttle if sysctl_tcp_limit_output_bytes==0.
> >
> > Maybe it is better to throttle earlier in the loop, after
> > calling tcp_init_tso_segs().
> >
>
> I wonder why, and why you put this question in a changelog instead of
> outside of it...
>
> Idea was to avoid setting TSQ_THROTTLED if we break out the loop.

The reason I mentioned it (in the wrong place) is because
I thought this is a likely case and the checks before that
might all pass only to get throttled. Some of the checks
are quite lengthy.

> About disabling TSQ, my initial intent was to instead use a negative
> sysctl_tcp_limit_output_bytes value.
>
> Thats why I have in tcp_transmit_skb() :
>
> skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
>         tcp_wfree : sock_wfree;
>
> So I suggest you change the tcp_write_xmit(() test to a single unsigned
> compare :
>
> if (atomic_read(&sk->sk_wmem_alloc) >=
>     (unsigned) sysctl_tcp_limit_output_bytes) {
>
> Also use :
>
> skb->destructor = (sysctl_tcp_limit_output_bytes >= 0) ?
>   tcp_wfree : sock_wfree;
>
> and document the 'negative value disables TSQ' in
> Documentation/networking/ip-sysctl.txt

Sure, will post with this change.

thanks,
- KK

^ permalink raw reply

* Re: [PATCH 5/5] ipv4: Add FIB nexthop exceptions.
From: Eric Dumazet @ 2012-07-17 14:00 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120717.061435.1733209287175819043.davem@davemloft.net>

On Tue, 2012-07-17 at 06:14 -0700, David Miller wrote:
> In a regime where we have subnetted route entries, we need a way to
> store persistent storage about destination specific learned values
> such as redirects and PMTU values.
> 
> This is implemented here via nexthop exceptions.
> 
> The initial implementation is a simple linked list, and can be
> expanded to a hash table when it is shown to be justified.

Say a typical host uses a single default route; I am trying to convince
myself it can really use a simple linked list?

Aren't PMTU entries added by messages coming from untrusted sources?

^ permalink raw reply

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
From: Eric Dumazet @ 2012-07-17 13:50 UTC (permalink / raw)
  To: David Miller
  Cc: David.Laight, rick.jones2, cascardo, netdev, yevgenyp, ogerlitz,
	amirv, brking, leitao, klebers
In-Reply-To: <20120717.055005.1912765690890797652.davem@davemloft.net>

On Tue, 2012-07-17 at 05:50 -0700, David Miller wrote:
> From: "David Laight" <David.Laight@ACULAB.COM>
> Date: Tue, 17 Jul 2012 13:42:04 +0100
> 
> > Would there be any mileage in permanently allocating IOMMU
> > virtual address to the ring entries, then 'just' assigning
> > the correct physical address during rx/tx setup?
> 
> There is a not a one to one mapping between these two entities,
> in particular on the transmit side.
> 
> A transmit packet can have multiple segments, some of which are
> larger than one IOMMU page.

And on rx side, permanently allocating IOMMU would need to copy all
incoming frames to newly allocated memory.

Can't this IOMMU performance problem be solved on the IOMMU side,
instead of having to shuffle things in all drivers?

^ permalink raw reply

* Re: [GIT PULL nf] IPVS
From: Simon Horman @ 2012-07-17 13:50 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer
In-Reply-To: <20120717101406.GC3812@1984>

On Tue, Jul 17, 2012 at 12:14:06PM +0200, Pablo Neira Ayuso wrote:
> On Wed, Jul 11, 2012 at 09:19:20AM +0900, Simon Horman wrote:
> > 
> > Hi Pablo,
> > 
> > this pull request consists of three bug fixes for IPVS.
> > Please consider for inclusion in 3.5 and stable.
> > 
> > The bug fix from Julian, "ipvs: fix oops in ip_vs_dst_event on rmmod"
> > fixes a regression introduced in 3.4 and thus I believe it is
> > only relevant to 3.5 and 3.4-stable.
> > 
> > The other two fixes appear to have been present since at least 2.6.37
> > (there were a lot of changes to IPVS around that time).
> 
> I have passed the two of these patches to David. The one for the FTP
> needs a consistent description.
> 
> It's fairly late in the development cycle (-rc7), but these are small.
> Let's see if David is still in time to accept them. Otherwise, they go
> to net-next and we will ask for -stable submission.

Thanks, it seems that David was in an accepting mood.

^ permalink raw reply

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
From: David Miller @ 2012-07-17 13:46 UTC (permalink / raw)
  To: David.Laight
  Cc: rick.jones2, cascardo, netdev, yevgenyp, ogerlitz, amirv, brking,
	leitao, klebers
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6026B6F8D@saturn3.aculab.com>

From: "David Laight" <David.Laight@ACULAB.COM>
Date: Tue, 17 Jul 2012 14:36:11 +0100

> The driver will already have to cope with 'too many segments'
> (I remember being passed a full sized frame made of a list
> of 1-byte message blocks...)

Barring driver hardware bug workarounds, no it does not have to cope
with that.  The code is extremely simple now.

All the driver has to do is assume that a new TX packet can never
consume more than MAX_SKB_FRAGS.

Therefore it simply stops the queue if less than MAX_SKB_FRAGS
segments remain after queueing a transmit.

Your suggestion will significantly complicate driver TX paths.

If you're going to suggest a solution, it has to be completely
general enough to work in the current state of affairs, and
your idea absolutely does not.

^ permalink raw reply

* RE: [PATCH] mlx4_en: map entire pages to increase throughput
From: David Laight @ 2012-07-17 13:36 UTC (permalink / raw)
  To: David Miller
  Cc: rick.jones2, cascardo, netdev, yevgenyp, ogerlitz, amirv, brking,
	leitao, klebers
In-Reply-To: <20120717.055005.1912765690890797652.davem@davemloft.net>

> > Would there be any mileage in permanently allocating IOMMU
> > virtual address to the ring entries, then 'just' assigning
> > the correct physical address during rx/tx setup?
> 
> There is a not a one to one mapping between these two entities,
> in particular on the transmit side.
> 
> A transmit packet can have multiple segments, some of which are
> larger than one IOMMU page.

A SMOP :-) TX is probably easier than RX.
Each tx segment will already go into a separate ring entry,
page boundaries could do the same.
The driver will already have to cope with 'too many segments'
(I remember being passed a full sized frame made of a list
of 1-byte message blocks...)

Or allocate enough sequential IOMMU pages for the longest
tx segment for every ring entry - after all that is already
the 'worst case' allocation!

	David

^ permalink raw reply

* [PATCH 5/5] ipv4: Add FIB nexthop exceptions.
From: David Miller @ 2012-07-17 13:14 UTC (permalink / raw)
  To: netdev


In a regime where we have subnetted route entries, we need a way to
keep persistent storage for destination-specific learned values
such as redirects and PMTU values.

This is implemented here via nexthop exceptions.

The initial implementation is a simple linked list, and can be
expanded to a hash table when it is shown to be justified.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |    9 ++
 net/ipv4/fib_semantics.c |   15 ++++
 net/ipv4/route.c         |  216 +++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 209 insertions(+), 31 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 5697ace..b6b400f 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -46,6 +46,14 @@ struct fib_config {
 
 struct fib_info;
 
+struct fib_nh_exception {
+	struct hlist_node	fnhe_node;
+	__be32			fnhe_daddr;
+	u32			fnhe_pmtu;
+	u32			fnhe_gw;
+	unsigned long		fnhe_expires;
+};
+
 struct fib_nh {
 	struct net_device	*nh_dev;
 	struct hlist_node	nh_hash;
@@ -63,6 +71,7 @@ struct fib_nh {
 	__be32			nh_gw;
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
+	struct hlist_head	nh_exceptions;
 };
 
 /*
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d71bfbd..d266096 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,6 +140,18 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
 	},
 };
 
+static void free_nh_exceptions(struct fib_nh *nh)
+{
+	struct hlist_head *head = &nh->nh_exceptions;
+	struct hlist_node *node, *tmp;
+	struct fib_nh_exception *fnhe;
+
+	hlist_for_each_entry_safe(fnhe, node, tmp, head, fnhe_node) {
+		hlist_del(node);
+		kfree(fnhe);
+	}
+}
+
 /* Release a nexthop info record */
 static void free_fib_info_rcu(struct rcu_head *head)
 {
@@ -148,6 +160,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
 	change_nexthops(fi) {
 		if (nexthop_nh->nh_dev)
 			dev_put(nexthop_nh->nh_dev);
+		if (!hlist_empty(&nexthop_nh->nh_exceptions))
+			free_nh_exceptions(nexthop_nh);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
@@ -777,6 +791,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	fi->fib_nhs = nhs;
 	change_nexthops(fi) {
 		nexthop_nh->nh_parent = fi;
+		INIT_HLIST_HEAD(&nexthop_nh->nh_exceptions);
 	} endfor_nexthops(fi)
 
 	if (cfg->fc_mx) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b35d3bf..c27ca8f4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1275,14 +1275,93 @@ static void rt_del(unsigned int hash, struct rtable *rt)
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
+			     const struct iphdr *iph,
+			     int oif, u8 tos,
+			     u8 prot, u32 mark, int flow_flags)
+{
+	if (sk) {
+		const struct inet_sock *inet = inet_sk(sk);
+
+		oif = sk->sk_bound_dev_if;
+		mark = sk->sk_mark;
+		tos = RT_CONN_FLAGS(sk);
+		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
+	}
+	flowi4_init_output(fl4, oif, mark, tos,
+			   RT_SCOPE_UNIVERSE, prot,
+			   flow_flags,
+			   iph->daddr, iph->saddr, 0, 0);
+}
+
+static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	int oif = skb->dev->ifindex;
+	u8 tos = RT_TOS(iph->tos);
+	u8 prot = iph->protocol;
+	u32 mark = skb->mark;
+
+	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
+}
+
+static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	struct ip_options_rcu *inet_opt;
+	__be32 daddr = inet->inet_daddr;
+
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+			   inet_sk_flowi_flags(sk),
+			   daddr, inet->inet_saddr, 0, 0);
+	rcu_read_unlock();
+}
+
+static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
+				 struct sk_buff *skb)
+{
+	if (skb)
+		build_skb_flow_key(fl4, skb, sk);
+	else
+		build_sk_flow_key(fl4, sk);
+}
+
+static DEFINE_SPINLOCK(fnhe_lock);
+
+static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+	struct hlist_head *head = &nh->nh_exceptions;
+	struct fib_nh_exception *fnhe;
+	struct hlist_node *node;
+
+	hlist_for_each_entry(fnhe, node, head, fnhe_node) {
+		if (fnhe->fnhe_daddr == daddr)
+			return fnhe;
+	}
+
+	fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+	if (!fnhe)
+		return NULL;
+
+	fnhe->fnhe_daddr = daddr;
+	hlist_add_head(&fnhe->fnhe_node, head);
+	return fnhe;
+}
+
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
 {
 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 	__be32 old_gw = ip_hdr(skb)->saddr;
 	struct net_device *dev = skb->dev;
 	struct in_device *in_dev;
+	struct fib_result res;
 	struct neighbour *n;
-	struct rtable *rt;
 	struct net *net;
 
 	switch (icmp_hdr(skb)->code & 7) {
@@ -1296,7 +1375,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
 		return;
 	}
 
-	rt = (struct rtable *) dst;
 	if (rt->rt_gateway != old_gw)
 		return;
 
@@ -1320,11 +1398,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
 			goto reject_redirect;
 	}
 
-	n = ipv4_neigh_lookup(dst, NULL, &new_gw);
+	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 	if (n) {
 		if (!(n->nud_state & NUD_VALID)) {
 			neigh_event_send(n, NULL);
 		} else {
+			if (fib_lookup(net, fl4, &res) == 0) {
+				struct fib_nh *nh = &FIB_RES_NH(res);
+				struct fib_nh_exception *fnhe;
+
+				spin_lock_bh(&fnhe_lock);
+				fnhe = find_or_create_fnhe(nh, fl4->daddr);
+				if (fnhe)
+					fnhe->fnhe_gw = new_gw;
+				spin_unlock_bh(&fnhe_lock);
+			}
 			rt->rt_gateway = new_gw;
 			rt->rt_flags |= RTCF_REDIRECTED;
 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
@@ -1349,6 +1437,17 @@ reject_redirect:
 	;
 }
 
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
+{
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	rt = (struct rtable *) dst;
+
+	ip_rt_build_flow_key(&fl4, sk, skb);
+	__ip_do_redirect(rt, skb, &fl4);
+}
+
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 {
 	struct rtable *rt = (struct rtable *)dst;
@@ -1508,33 +1607,51 @@ out:	kfree_skb(skb);
 	return 0;
 }
 
-static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
-			      struct sk_buff *skb, u32 mtu)
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
-	struct rtable *rt = (struct rtable *) dst;
-
-	dst_confirm(dst);
+	struct fib_result res;
 
 	if (mtu < ip_rt_min_pmtu)
 		mtu = ip_rt_min_pmtu;
 
+	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
+		struct fib_nh *nh = &FIB_RES_NH(res);
+		struct fib_nh_exception *fnhe;
+
+		spin_lock_bh(&fnhe_lock);
+		fnhe = find_or_create_fnhe(nh, fl4->daddr);
+		if (fnhe) {
+			fnhe->fnhe_pmtu = mtu;
+			fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
+		}
+		spin_unlock_bh(&fnhe_lock);
+	}
 	rt->rt_pmtu = mtu;
 	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
 }
 
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			      struct sk_buff *skb, u32 mtu)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	struct flowi4 fl4;
+
+	ip_rt_build_flow_key(&fl4, sk, skb);
+	__ip_rt_update_pmtu(rt, &fl4, mtu);
+}
+
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 		      int oif, u32 mark, u8 protocol, int flow_flags)
 {
-	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
 	struct flowi4 fl4;
 	struct rtable *rt;
 
-	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
-			   protocol, flow_flags,
-			   iph->daddr, iph->saddr, 0, 0);
+	__build_flow_key(&fl4, NULL, iph, oif,
+			 RT_TOS(iph->tos), protocol, mark, flow_flags);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
-		ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu);
+		__ip_rt_update_pmtu(rt, &fl4, mtu);
 		ip_rt_put(rt);
 	}
 }
@@ -1542,27 +1659,31 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
 
 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 {
-	const struct inet_sock *inet = inet_sk(sk);
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
 
-	return ipv4_update_pmtu(skb, sock_net(sk), mtu,
-				sk->sk_bound_dev_if, sk->sk_mark,
-				inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
-				inet_sk_flowi_flags(sk));
+	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+	rt = __ip_route_output_key(sock_net(sk), &fl4);
+	if (!IS_ERR(rt)) {
+		__ip_rt_update_pmtu(rt, &fl4, mtu);
+		ip_rt_put(rt);
+	}
 }
 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
 
 void ipv4_redirect(struct sk_buff *skb, struct net *net,
 		   int oif, u32 mark, u8 protocol, int flow_flags)
 {
-	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
 	struct flowi4 fl4;
 	struct rtable *rt;
 
-	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
-			   protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
+	__build_flow_key(&fl4, NULL, iph, oif,
+			 RT_TOS(iph->tos), protocol, mark, flow_flags);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
-		ip_do_redirect(&rt->dst, NULL, skb);
+		__ip_do_redirect(rt, skb, &fl4);
 		ip_rt_put(rt);
 	}
 }
@@ -1570,12 +1691,16 @@ EXPORT_SYMBOL_GPL(ipv4_redirect);
 
 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
 {
-	const struct inet_sock *inet = inet_sk(sk);
+	const struct iphdr *iph = (const struct iphdr *) skb->data;
+	struct flowi4 fl4;
+	struct rtable *rt;
 
-	return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
-			     sk->sk_mark,
-			     inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
-			     inet_sk_flowi_flags(sk));
+	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
+	rt = __ip_route_output_key(sock_net(sk), &fl4);
+	if (!IS_ERR(rt)) {
+		__ip_do_redirect(rt, skb, &fl4);
+		ip_rt_put(rt);
+	}
 }
 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
 
@@ -1722,14 +1847,43 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
 	dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 }
 
+static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
+{
+	struct hlist_head *head = &nh->nh_exceptions;
+	struct fib_nh_exception *fnhe;
+	struct hlist_node *node;
+
+	spin_lock_bh(&fnhe_lock);
+	hlist_for_each_entry(fnhe, node, head, fnhe_node) {
+		if (fnhe->fnhe_daddr == daddr) {
+			if (fnhe->fnhe_pmtu) {
+				unsigned long expires = fnhe->fnhe_expires;
+				unsigned long diff = jiffies - expires;
+
+				if (time_before(jiffies, expires)) {
+					rt->rt_pmtu = fnhe->fnhe_pmtu;
+					dst_set_expires(&rt->dst, diff);
+				}
+			}
+			if (fnhe->fnhe_gw)
+				rt->rt_gateway = fnhe->fnhe_gw;
+			break;
+		}
+	}
+	spin_unlock_bh(&fnhe_lock);
+}
+
 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 			   const struct fib_result *res,
 			   struct fib_info *fi, u16 type, u32 itag)
 {
 	if (fi) {
-		if (FIB_RES_GW(*res) &&
-		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
-			rt->rt_gateway = FIB_RES_GW(*res);
+		struct fib_nh *nh = &FIB_RES_NH(*res);
+
+		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
+			rt->rt_gateway = nh->nh_gw;
+		if (unlikely(!hlist_empty(&nh->nh_exceptions)))
+			rt_bind_exception(rt, nh, fl4->daddr);
 		rt_init_metrics(rt, fl4, fi);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 4/5] net: Pass optional SKB and SK arguments to dst_ops->{update_pmtu,redirect}()
From: David Miller @ 2012-07-17 13:14 UTC (permalink / raw)
  To: netdev


This will be used so that we can compose a full flow key.

Even though we have a route in this context, we need more.  In the
future the routes will be without destination address, source address,
etc. keying.  One ipv4 route will cover entire subnets, etc.

In this environment we have to have a way to possess persistent storage
for redirects and PMTU information.  This persistent storage will exist
in the FIB tables, and that's why we'll need to be able to rebuild a
full lookup flow key here.  Using that flow key will do a fib_lookup()
and create/update the persistent entry.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c |    2 +-
 include/net/dst_ops.h                   |    6 ++++--
 net/bridge/br_netfilter.c               |    6 ++++--
 net/dccp/ipv4.c                         |    2 +-
 net/dccp/ipv6.c                         |    2 +-
 net/decnet/dn_route.c                   |   12 ++++++++----
 net/ipv4/inet_connection_sock.c         |    2 +-
 net/ipv4/ip_gre.c                       |    2 +-
 net/ipv4/ipip.c                         |    2 +-
 net/ipv4/route.c                        |   21 +++++++++++++--------
 net/ipv4/tcp_ipv4.c                     |    2 +-
 net/ipv4/xfrm4_policy.c                 |   10 ++++++----
 net/ipv6/inet6_connection_sock.c        |    2 +-
 net/ipv6/ip6_tunnel.c                   |    6 +++---
 net/ipv6/route.c                        |   21 +++++++++++++--------
 net/ipv6/sit.c                          |    2 +-
 net/ipv6/tcp_ipv6.c                     |    2 +-
 net/ipv6/xfrm6_policy.c                 |   10 ++++++----
 net/netfilter/ipvs/ip_vs_xmit.c         |    4 ++--
 net/sctp/input.c                        |    2 +-
 net/sctp/transport.c                    |    2 +-
 21 files changed, 71 insertions(+), 49 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 014504d..1ca7322 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -1397,7 +1397,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
 	int e = skb_queue_empty(&priv->cm.skb_queue);
 
 	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 	skb_queue_tail(&priv->cm.skb_queue, skb);
 	if (e)
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 085931f..d079fc6 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -24,8 +24,10 @@ struct dst_ops {
 					  struct net_device *dev, int how);
 	struct dst_entry *	(*negative_advice)(struct dst_entry *);
 	void			(*link_failure)(struct sk_buff *);
-	void			(*update_pmtu)(struct dst_entry *dst, u32 mtu);
-	void			(*redirect)(struct dst_entry *dst, struct sk_buff *skb);
+	void			(*update_pmtu)(struct dst_entry *dst, struct sock *sk,
+					       struct sk_buff *skb, u32 mtu);
+	void			(*redirect)(struct dst_entry *dst, struct sock *sk,
+					    struct sk_buff *skb);
 	int			(*local_out)(struct sk_buff *skb);
 	struct neighbour *	(*neigh_lookup)(const struct dst_entry *dst,
 						struct sk_buff *skb,
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 81f76c4..68e8f36 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -111,11 +111,13 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
 	 pppoe_proto(skb) == htons(PPP_IPV6) && \
 	 brnf_filter_pppoe_tagged)
 
-static void fake_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			     struct sk_buff *skb, u32 mtu)
 {
 }
 
-static void fake_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void fake_redirect(struct dst_entry *dst, struct sock *sk,
+			  struct sk_buff *skb)
 {
 }
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 683902f..ab4f44c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -193,7 +193,7 @@ static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
 	struct dst_entry *dst = __sk_dst_check(sk, 0);
 
 	if (dst)
-		dst->ops->redirect(dst, skb);
+		dst->ops->redirect(dst, sk, skb);
 }
 
 /*
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 3ee0342..56840b2 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -134,7 +134,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
 
 		if (dst)
-			dst->ops->redirect(dst, skb);
+			dst->ops->redirect(dst, sk, skb);
 	}
 
 	if (type == ICMPV6_PKT_TOOBIG) {
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index e9c4e2e..47de90d 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -117,8 +117,10 @@ static void dn_dst_destroy(struct dst_entry *);
 static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how);
 static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
 static void dn_dst_link_failure(struct sk_buff *);
-static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu);
-static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb);
+static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			       struct sk_buff *skb , u32 mtu);
+static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
+			    struct sk_buff *skb);
 static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
 					     struct sk_buff *skb,
 					     const void *daddr);
@@ -266,7 +268,8 @@ static int dn_dst_gc(struct dst_ops *ops)
  * We update both the mtu and the advertised mss (i.e. the segment size we
  * advertise to the other end).
  */
-static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			       struct sk_buff *skb, u32 mtu)
 {
 	struct dn_route *rt = (struct dn_route *) dst;
 	struct neighbour *n = rt->n;
@@ -294,7 +297,8 @@ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu)
 	}
 }
 
-static void dn_dst_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
+			    struct sk_buff *skb)
 {
 }
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 200d218..3ea4652 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -840,7 +840,7 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
 		if (!dst)
 			goto out;
 	}
-	dst->ops->update_pmtu(dst, mtu);
+	dst->ops->update_pmtu(dst, sk, NULL, mtu);
 
 	dst = __sk_dst_check(sk, 0);
 	if (!dst)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0c31235..42c44b1 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -833,7 +833,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 
 	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 	if (skb->protocol == htons(ETH_P_IP)) {
 		df |= (old_iph->frag_off&htons(IP_DF));
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c2d0e6d..2c2c35b 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -519,7 +519,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 
 		if (skb_dst(skb))
-			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 		if ((old_iph->frag_off & htons(IP_DF)) &&
 		    mtu < ntohs(old_iph->tot_len)) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index aad2181..b35d3bf 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -148,8 +148,10 @@ static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 static void		 ipv4_dst_destroy(struct dst_entry *dst);
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void		 ipv4_link_failure(struct sk_buff *skb);
-static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
-static void		 ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb);
+static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+					   struct sk_buff *skb, u32 mtu);
+static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
+					struct sk_buff *skb);
 static int rt_garbage_collect(struct dst_ops *ops);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -1273,7 +1275,7 @@ static void rt_del(unsigned int hash, struct rtable *rt)
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static void ip_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 {
 	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 	__be32 old_gw = ip_hdr(skb)->saddr;
@@ -1506,7 +1508,8 @@ out:	kfree_skb(skb);
 	return 0;
 }
 
-static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			      struct sk_buff *skb, u32 mtu)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
@@ -1531,7 +1534,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 			   iph->daddr, iph->saddr, 0, 0);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
-		ip_rt_update_pmtu(&rt->dst, mtu);
+		ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu);
 		ip_rt_put(rt);
 	}
 }
@@ -1559,7 +1562,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
 			   protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
 	rt = __ip_route_output_key(net, &fl4);
 	if (!IS_ERR(rt)) {
-		ip_do_redirect(&rt->dst, skb);
+		ip_do_redirect(&rt->dst, NULL, skb);
 		ip_rt_put(rt);
 	}
 }
@@ -2587,11 +2590,13 @@ static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
 	return mtu ? : dst->dev->mtu;
 }
 
-static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+					  struct sk_buff *skb, u32 mtu)
 {
 }
 
-static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+				       struct sk_buff *skb)
 {
 }
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b8e7e05..d9caf5c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -319,7 +319,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
 	struct dst_entry *dst = __sk_dst_check(sk, 0);
 
 	if (dst)
-		dst->ops->redirect(dst, skb);
+		dst->ops->redirect(dst, sk, skb);
 }
 
 /*
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 737131c..fcf7678 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -194,20 +194,22 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
 	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
 }
 
-static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			      struct sk_buff *skb, u32 mtu)
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 	struct dst_entry *path = xdst->route;
 
-	path->ops->update_pmtu(path, mtu);
+	path->ops->update_pmtu(path, sk, skb, mtu);
 }
 
-static void xfrm4_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
+			   struct sk_buff *skb)
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 	struct dst_entry *path = xdst->route;
 
-	path->ops->redirect(path, skb);
+	path->ops->redirect(path, sk, skb);
 }
 
 static void xfrm4_dst_destroy(struct dst_entry *dst)
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 62539a4..4a0c4d2 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -269,7 +269,7 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
 
 	if (IS_ERR(dst))
 		return NULL;
-	dst->ops->update_pmtu(dst, mtu);
+	dst->ops->update_pmtu(dst, sk, NULL, mtu);
 
 	return inet6_csk_route_socket(sk);
 }
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 61d1065..db32846 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -609,10 +609,10 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (rel_info > dst_mtu(skb_dst(skb2)))
 			goto out;
 
-		skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info);
+		skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), NULL, skb2, rel_info);
 	}
 	if (rel_type == ICMP_REDIRECT)
-		skb_dst(skb2)->ops->redirect(skb_dst(skb2), skb2);
+		skb_dst(skb2)->ops->redirect(skb_dst(skb2), NULL, skb2);
 
 	icmp_send(skb2, rel_type, rel_code, htonl(rel_info));
 
@@ -952,7 +952,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 	if (mtu < IPV6_MIN_MTU)
 		mtu = IPV6_MIN_MTU;
 	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 	if (skb->len > mtu) {
 		*pmtu = mtu;
 		err = -EMSGSIZE;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2a4c8d4..31af1ed 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -78,8 +78,10 @@ static int		 ip6_dst_gc(struct dst_ops *ops);
 static int		ip6_pkt_discard(struct sk_buff *skb);
 static int		ip6_pkt_discard_out(struct sk_buff *skb);
 static void		ip6_link_failure(struct sk_buff *skb);
-static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
-static void		rt6_do_redirect(struct dst_entry *dst, struct sk_buff *skb);
+static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+					   struct sk_buff *skb, u32 mtu);
+static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
+					struct sk_buff *skb);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -187,11 +189,13 @@ static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
 	return mtu ? : dst->dev->mtu;
 }
 
-static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+					 struct sk_buff *skb, u32 mtu)
 {
 }
 
-static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+				      struct sk_buff *skb)
 {
 }
 
@@ -1071,7 +1075,8 @@ static void ip6_link_failure(struct sk_buff *skb)
 	}
 }
 
-static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			       struct sk_buff *skb, u32 mtu)
 {
 	struct rt6_info *rt6 = (struct rt6_info*)dst;
 
@@ -1108,7 +1113,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
 
 	dst = ip6_route_output(net, NULL, &fl6);
 	if (!dst->error)
-		ip6_rt_update_pmtu(dst, ntohl(mtu));
+		ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
 	dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
@@ -1136,7 +1141,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
 
 	dst = ip6_route_output(net, NULL, &fl6);
 	if (!dst->error)
-		rt6_do_redirect(dst, skb);
+		rt6_do_redirect(dst, NULL, skb);
 	dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_redirect);
@@ -1639,7 +1644,7 @@ static int ip6_route_del(struct fib6_config *cfg)
 	return err;
 }
 
-static void rt6_do_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	struct netevent_redirect netevent;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index fbf1622..3bd1bfc 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -807,7 +807,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		}
 
 		if (tunnel->parms.iph.daddr && skb_dst(skb))
-			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 		if (skb->len > mtu) {
 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ecdf241..c9dabdd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -367,7 +367,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
 
 		if (dst)
-			dst->ops->redirect(dst,skb);
+			dst->ops->redirect(dst, sk, skb);
 	}
 
 	if (type == ICMPV6_PKT_TOOBIG) {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index f5a9cb8..ef39812 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -207,20 +207,22 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
 	return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
 }
 
-static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
+static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			      struct sk_buff *skb, u32 mtu)
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 	struct dst_entry *path = xdst->route;
 
-	path->ops->update_pmtu(path, mtu);
+	path->ops->update_pmtu(path, sk, skb, mtu);
 }
 
-static void xfrm6_redirect(struct dst_entry *dst, struct sk_buff *skb)
+static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk,
+			   struct sk_buff *skb)
 {
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 	struct dst_entry *path = xdst->route;
 
-	path->ops->redirect(path, skb);
+	path->ops->redirect(path, sk, skb);
 }
 
 static void xfrm6_dst_destroy(struct dst_entry *dst)
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 71d6ecb..65b616a 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -797,7 +797,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error_put;
 	}
 	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 	df |= (old_iph->frag_off & htons(IP_DF));
 
@@ -913,7 +913,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 		goto tx_error_put;
 	}
 	if (skb_dst(skb))
-		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
 
 	if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) &&
 	    !skb_is_gso(skb)) {
diff --git a/net/sctp/input.c b/net/sctp/input.c
index a67bc31..c201b26 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -432,7 +432,7 @@ void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t,
 		return;
 	dst = sctp_transport_dst_check(t);
 	if (dst)
-		dst->ops->redirect(dst, skb);
+		dst->ops->redirect(dst, sk, skb);
 }
 
 /*
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index e69e1a2..a6b7ee9 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -249,7 +249,7 @@ void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 p
 		t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
 
 	if (dst) {
-		dst->ops->update_pmtu(dst, pmtu);
+		dst->ops->update_pmtu(dst, sk, NULL, pmtu);
 
 		dst = sctp_transport_dst_check(t);
 		if (!dst)
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 3/5] sctp: Adjust PMTU updates to accommodate route invalidation.
From: David Miller @ 2012-07-17 13:14 UTC (permalink / raw)
  To: netdev


This adjusts the call to dst_ops->update_pmtu() so that we can
transparently handle the fact that, in the future, the dst itself can
be invalidated by the PMTU update (when we have non-host routes cached
in sockets).

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h    |    4 ++--
 include/net/sctp/structs.h |    4 ++--
 net/sctp/associola.c       |    4 ++--
 net/sctp/input.c           |    4 ++--
 net/sctp/output.c          |    2 +-
 net/sctp/socket.c          |    6 +++---
 net/sctp/transport.c       |   12 ++++++++++--
 7 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 1f2735d..ff49964 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -519,10 +519,10 @@ static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu)
 	return frag;
 }
 
-static inline void sctp_assoc_pending_pmtu(struct sctp_association *asoc)
+static inline void sctp_assoc_pending_pmtu(struct sock *sk, struct sctp_association *asoc)
 {
 
-	sctp_assoc_sync_pmtu(asoc);
+	sctp_assoc_sync_pmtu(sk, asoc);
 	asoc->pmtu_pending = 0;
 }
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index fecdf31..536e439 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1091,7 +1091,7 @@ void sctp_transport_burst_limited(struct sctp_transport *);
 void sctp_transport_burst_reset(struct sctp_transport *);
 unsigned long sctp_transport_timeout(struct sctp_transport *);
 void sctp_transport_reset(struct sctp_transport *);
-void sctp_transport_update_pmtu(struct sctp_transport *, u32);
+void sctp_transport_update_pmtu(struct sock *, struct sctp_transport *, u32);
 void sctp_transport_immediate_rtx(struct sctp_transport *);
 
 
@@ -2003,7 +2003,7 @@ void sctp_assoc_update(struct sctp_association *old,
 
 __u32 sctp_association_get_next_tsn(struct sctp_association *);
 
-void sctp_assoc_sync_pmtu(struct sctp_association *);
+void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *);
 void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int);
 void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int);
 void sctp_assoc_set_primary(struct sctp_association *,
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index b16517e..8cf348e 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1360,7 +1360,7 @@ struct sctp_transport *sctp_assoc_choose_alter_transport(
 /* Update the association's pmtu and frag_point by going through all the
  * transports. This routine is called when a transport's PMTU has changed.
  */
-void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
+void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
 {
 	struct sctp_transport *t;
 	__u32 pmtu = 0;
@@ -1372,7 +1372,7 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc)
 	list_for_each_entry(t, &asoc->peer.transport_addr_list,
 				transports) {
 		if (t->pmtu_pending && t->dst) {
-			sctp_transport_update_pmtu(t, dst_mtu(t->dst));
+			sctp_transport_update_pmtu(sk, t, dst_mtu(t->dst));
 			t->pmtu_pending = 0;
 		}
 		if (!pmtu || (t->pathmtu < pmtu))
diff --git a/net/sctp/input.c b/net/sctp/input.c
index f050d45..a67bc31 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -408,10 +408,10 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc,
 
 	if (t->param_flags & SPP_PMTUD_ENABLE) {
 		/* Update transports view of the MTU */
-		sctp_transport_update_pmtu(t, pmtu);
+		sctp_transport_update_pmtu(sk, t, pmtu);
 
 		/* Update association pmtu. */
-		sctp_assoc_sync_pmtu(asoc);
+		sctp_assoc_sync_pmtu(sk, asoc);
 	}
 
 	/* Retransmit with the new pmtu setting.
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 539f35d..838e18b 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -410,7 +410,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	if (!sctp_transport_dst_check(tp)) {
 		sctp_transport_route(tp, NULL, sctp_sk(sk));
 		if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) {
-			sctp_assoc_sync_pmtu(asoc);
+			sctp_assoc_sync_pmtu(sk, asoc);
 		}
 	}
 	dst = dst_clone(tp->dst);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b3b8a8d..74bd3c4 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1853,7 +1853,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	if (asoc->pmtu_pending)
-		sctp_assoc_pending_pmtu(asoc);
+		sctp_assoc_pending_pmtu(sk, asoc);
 
 	/* If fragmentation is disabled and the message length exceeds the
 	 * association fragmentation point, return EMSGSIZE.  The I-D
@@ -2365,7 +2365,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
 	if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) {
 		if (trans) {
 			trans->pathmtu = params->spp_pathmtu;
-			sctp_assoc_sync_pmtu(asoc);
+			sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
 		} else if (asoc) {
 			asoc->pathmtu = params->spp_pathmtu;
 			sctp_frag_point(asoc, params->spp_pathmtu);
@@ -2382,7 +2382,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
 				(trans->param_flags & ~SPP_PMTUD) | pmtud_change;
 			if (update) {
 				sctp_transport_pmtu(trans, sctp_opt2sk(sp));
-				sctp_assoc_sync_pmtu(asoc);
+				sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
 			}
 		} else if (asoc) {
 			asoc->param_flags =
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 1dcceb6..e69e1a2 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -228,7 +228,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 		transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
 }
 
-void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
+void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 pmtu)
 {
 	struct dst_entry *dst;
 
@@ -245,8 +245,16 @@ void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
 	}
 
 	dst = sctp_transport_dst_check(t);
-	if (dst)
+	if (!dst)
+		t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
+
+	if (dst) {
 		dst->ops->update_pmtu(dst, pmtu);
+
+		dst = sctp_transport_dst_check(t);
+		if (!dst)
+			t->af_specific->get_dst(t, &t->saddr, &t->fl, sk);
+	}
 }
 
 /* Caches the dst entry and source address for a transport's destination
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 2/5] ipv6: Add helper inet6_csk_update_pmtu().
From: David Miller @ 2012-07-17 13:14 UTC (permalink / raw)
  To: netdev


This is the ipv6 version of inet_csk_update_pmtu().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet6_connection_sock.h |    2 ++
 net/dccp/ipv6.c                     |   35 +++----------------------
 net/ipv6/inet6_connection_sock.c    |   49 +++++++++++++++++++++++++----------
 net/ipv6/tcp_ipv6.c                 |   37 +++-----------------------
 4 files changed, 45 insertions(+), 78 deletions(-)

diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
index df2a857..04642c9 100644
--- a/include/net/inet6_connection_sock.h
+++ b/include/net/inet6_connection_sock.h
@@ -43,4 +43,6 @@ extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
 extern void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
 
 extern int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl);
+
+extern struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu);
 #endif /* _INET6_CONNECTION_SOCK_H */
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 090c080..3ee0342 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -145,39 +145,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
 			goto out;
 
-		/* icmp should have updated the destination cache entry */
-		dst = __sk_dst_check(sk, np->dst_cookie);
-		if (dst == NULL) {
-			struct inet_sock *inet = inet_sk(sk);
-			struct flowi6 fl6;
-
-			/* BUGGG_FUTURE: Again, it is not clear how
-			   to handle rthdr case. Ignore this complexity
-			   for now.
-			 */
-			memset(&fl6, 0, sizeof(fl6));
-			fl6.flowi6_proto = IPPROTO_DCCP;
-			fl6.daddr = np->daddr;
-			fl6.saddr = np->saddr;
-			fl6.flowi6_oif = sk->sk_bound_dev_if;
-			fl6.fl6_dport = inet->inet_dport;
-			fl6.fl6_sport = inet->inet_sport;
-			security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
-
-			dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
-			if (IS_ERR(dst)) {
-				sk->sk_err_soft = -PTR_ERR(dst);
-				goto out;
-			}
-		} else
-			dst_hold(dst);
-
-		dst->ops->update_pmtu(dst, ntohl(info));
+		dst = inet6_csk_update_pmtu(sk, ntohl(info));
+		if (!dst)
+			goto out;
 
-		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst))
 			dccp_sync_mss(sk, dst_mtu(dst));
-		} /* else let the usual retransmit timer handle it */
-		dst_release(dst);
 		goto out;
 	}
 
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index bceb144..62539a4 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -203,15 +203,13 @@ struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
 	return dst;
 }
 
-int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
+static struct dst_entry *inet6_csk_route_socket(struct sock *sk)
 {
-	struct sock *sk = skb->sk;
 	struct inet_sock *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct flowi6 fl6;
-	struct dst_entry *dst;
 	struct in6_addr *final_p, final;
-	int res;
+	struct dst_entry *dst;
+	struct flowi6 fl6;
 
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.flowi6_proto = sk->sk_protocol;
@@ -228,18 +226,29 @@ int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
 	final_p = fl6_update_dst(&fl6, np->opt, &final);
 
 	dst = __inet6_csk_dst_check(sk, np->dst_cookie);
-
-	if (dst == NULL) {
+	if (!dst) {
 		dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false);
 
-		if (IS_ERR(dst)) {
-			sk->sk_err_soft = -PTR_ERR(dst);
-			sk->sk_route_caps = 0;
-			kfree_skb(skb);
-			return PTR_ERR(dst);
-		}
+		if (!IS_ERR(dst))
+			__inet6_csk_dst_store(sk, dst, NULL, NULL);
+	}
+	return dst;
+}
 
-		__inet6_csk_dst_store(sk, dst, NULL, NULL);
+int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
+{
+	struct sock *sk = skb->sk;
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct flowi6 fl6;
+	struct dst_entry *dst;
+	int res;
+
+	dst = inet6_csk_route_socket(sk);
+	if (IS_ERR(dst)) {
+		sk->sk_err_soft = -PTR_ERR(dst);
+		sk->sk_route_caps = 0;
+		kfree_skb(skb);
+		return PTR_ERR(dst);
 	}
 
 	rcu_read_lock();
@@ -253,3 +262,15 @@ int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
 	return res;
 }
 EXPORT_SYMBOL_GPL(inet6_csk_xmit);
+
+struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+	struct dst_entry *dst = inet6_csk_route_socket(sk);
+
+	if (IS_ERR(dst))
+		return NULL;
+	dst->ops->update_pmtu(dst, mtu);
+
+	return inet6_csk_route_socket(sk);
+}
+EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3071f37..ecdf241 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -378,43 +378,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
 			goto out;
 
-		/* icmp should have updated the destination cache entry */
-		dst = __sk_dst_check(sk, np->dst_cookie);
-
-		if (dst == NULL) {
-			struct inet_sock *inet = inet_sk(sk);
-			struct flowi6 fl6;
-
-			/* BUGGG_FUTURE: Again, it is not clear how
-			   to handle rthdr case. Ignore this complexity
-			   for now.
-			 */
-			memset(&fl6, 0, sizeof(fl6));
-			fl6.flowi6_proto = IPPROTO_TCP;
-			fl6.daddr = np->daddr;
-			fl6.saddr = np->saddr;
-			fl6.flowi6_oif = sk->sk_bound_dev_if;
-			fl6.flowi6_mark = sk->sk_mark;
-			fl6.fl6_dport = inet->inet_dport;
-			fl6.fl6_sport = inet->inet_sport;
-			security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
-
-			dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false);
-			if (IS_ERR(dst)) {
-				sk->sk_err_soft = -PTR_ERR(dst);
-				goto out;
-			}
-
-		} else
-			dst_hold(dst);
-
-		dst->ops->update_pmtu(dst, ntohl(info));
+		dst = inet6_csk_update_pmtu(sk, ntohl(info));
+		if (!dst)
+			goto out;
 
 		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
 			tcp_sync_mss(sk, dst_mtu(dst));
 			tcp_simple_retransmit(sk);
-		} /* else let the usual retransmit timer handle it */
-		dst_release(dst);
+		}
 		goto out;
 	}
 
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 1/5] ipv4: Add helper inet_csk_update_pmtu().
From: David Miller @ 2012-07-17 13:14 UTC (permalink / raw)
  To: netdev


This abstracts away the call to dst_ops->update_pmtu() so that we can
transparently handle the fact that, in the future, the dst itself can
be invalidated by the PMTU update (when we have non-host routes cached
in sockets).

So we try to rebuild the socket cached route after the method
invocation if necessary.

This isn't used by SCTP because it needs to cache dsts per-transport,
and thus will need its own local version of this helper.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h |    2 ++
 net/dccp/ipv4.c                    |   11 ++-------
 net/ipv4/inet_connection_sock.c    |   46 ++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c                |   11 ++-------
 4 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 291e7ce..2cf44b4 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -337,4 +337,6 @@ extern int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
 				      char __user *optval, int __user *optlen);
 extern int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
 				      char __user *optval, unsigned int optlen);
+
+extern struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
 #endif /* _INET_CONNECTION_SOCK_H */
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 129ed8f..683902f 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -161,17 +161,10 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
 	if (sk->sk_state == DCCP_LISTEN)
 		return;
 
-	/* We don't check in the destentry if pmtu discovery is forbidden
-	 * on this route. We just assume that no packet_to_big packets
-	 * are send back when pmtu discovery is not active.
-	 * There is a small race when the user changes this flag in the
-	 * route, but I think that's acceptable.
-	 */
-	if ((dst = __sk_dst_check(sk, 0)) == NULL)
+	dst = inet_csk_update_pmtu(sk, mtu);
+	if (!dst)
 		return;
 
-	dst->ops->update_pmtu(dst, mtu);
-
 	/* Something is about to be wrong... Remember soft error
 	 * for the case, if this connection will not able to recover.
 	 */
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 76825be..200d218 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -803,3 +803,49 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
 }
 EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
 #endif
+
+static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_options_rcu *inet_opt;
+	__be32 daddr = inet->inet_daddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+
+	rcu_read_lock();
+	inet_opt = rcu_dereference(inet->inet_opt);
+	if (inet_opt && inet_opt->opt.srr)
+		daddr = inet_opt->opt.faddr;
+	fl4 = &fl->u.ip4;
+	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
+				   inet->inet_saddr, inet->inet_dport,
+				   inet->inet_sport, sk->sk_protocol,
+				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+	if (IS_ERR(rt))
+		rt = NULL;
+	if (rt)
+		sk_setup_caps(sk, &rt->dst);
+	rcu_read_unlock();
+
+	return &rt->dst;
+}
+
+struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
+{
+	struct dst_entry *dst = __sk_dst_check(sk, 0);
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (!dst) {
+		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+		if (!dst)
+			goto out;
+	}
+	dst->ops->update_pmtu(dst, mtu);
+
+	dst = __sk_dst_check(sk, 0);
+	if (!dst)
+		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
+out:
+	return dst;
+}
+EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7a0062c..b8e7e05 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -289,17 +289,10 @@ static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
 	if (sk->sk_state == TCP_LISTEN)
 		return;
 
-	/* We don't check in the destentry if pmtu discovery is forbidden
-	 * on this route. We just assume that no packet_to_big packets
-	 * are send back when pmtu discovery is not active.
-	 * There is a small race when the user changes this flag in the
-	 * route, but I think that's acceptable.
-	 */
-	if ((dst = __sk_dst_check(sk, 0)) == NULL)
+	dst = inet_csk_update_pmtu(sk, mtu);
+	if (!dst)
 		return;
 
-	dst->ops->update_pmtu(dst, mtu);
-
 	/* Something is about to be wrong... Remember soft error
 	 * for the case, if this connection will not able to recover.
 	 */
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 0/5] Long term PMTU/redirect storage in ipv4.
From: David Miller @ 2012-07-17 13:14 UTC (permalink / raw)
  To: netdev

These patches implement the final mechanism necessary to really allow
us to go without the route cache in ipv4.

We need a place to have long-term storage of PMTU/redirect information
which is independent of the routes themselves, yet does not get us
back into a situation where we have to write to metrics or anything
like that.

For this we use a "next-hop exception" table in the FIB nexthops.

Currently it is a simple linked list and uses a single global lock
for synchronization, but that can be easily adjusted as-needed.

The one thing I desperately want to avoid is having to create clone
routes in the FIB trie for this purpose, because that is very
expensive.   However, I'm willing to entertain such an idea later
if this current scheme proves to have downsides that the FIB trie
variant would not have.

In order to accommodate any such scheme, we need to be able to
produce a full flow key at PMTU/redirect time.  That required an
adjustment of the interface call-sites used to propagate these events.

For a PMTU/redirect with a fully specified socket, we pass that socket
and use it to produce the flow key.

Otherwise we use a passed in SKB to formulate the key.  There are two
cases that need to be distinguished, ICMP message processing (in which
case the IP header is at skb->data) and output packet processing
(mostly tunnels, and in all such cases the IP header is at ip_hdr(skb)).

We also have to make the code able to handle the case where the dst
itself passed into the dst_ops->{update_pmtu,redirect} method is
invalidated.  This matters for calls from sockets that have cached
that route.  We provide an inet{,6} helper function for this purpose,
and edit SCTP specially since it caches routes at the transport rather
than socket level.

Signed-off-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply

* Re: [PATCH] [RFC] tcp: TSQ - do not always throttle.
From: Eric Dumazet @ 2012-07-17 13:10 UTC (permalink / raw)
  To: Krishna Kumar; +Cc: davem, netdev
In-Reply-To: <20120717120358.16611.98190.sendpatchset@localhost.localdomain>

On Tue, 2012-07-17 at 17:33 +0530, Krishna Kumar wrote:
> Do not throttle if sysctl_tcp_limit_output_bytes==0.
> 
> Maybe it is better to throttle earlier in the loop, after
> calling tcp_init_tso_segs().
> 

I wonder why, and why you put this question in a changelog instead of
outside of it...

The idea was to avoid setting TSQ_THROTTLED if we break out of the loop.

About disabling TSQ, my initial intent was to instead use a negative
sysctl_tcp_limit_output_bytes value.

That's why I have in tcp_transmit_skb() :

skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
		  tcp_wfree : sock_wfree;

So I suggest you change the tcp_write_xmit() test to a single unsigned
compare :

if (atomic_read(&sk->sk_wmem_alloc) >=
    (unsigned) sysctl_tcp_limit_output_bytes) {

Also use :

skb->destructor = (sysctl_tcp_limit_output_bytes >= 0) ?
  tcp_wfree : sock_wfree;

and document the 'negative value disables TSQ' in
Documentation/networking/ip-sysctl.txt

^ permalink raw reply

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
From: David Miller @ 2012-07-17 12:50 UTC (permalink / raw)
  To: David.Laight
  Cc: rick.jones2, cascardo, netdev, yevgenyp, ogerlitz, amirv, brking,
	leitao, klebers
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6026B6F8B@saturn3.aculab.com>

From: "David Laight" <David.Laight@ACULAB.COM>
Date: Tue, 17 Jul 2012 13:42:04 +0100

> Would there be any mileage in permanently allocating IOMMU
> virtual address to the ring entries, then 'just' assigning
> the correct physical address during rx/tx setup?

There is not a one-to-one mapping between these two entities,
in particular on the transmit side.

A transmit packet can have multiple segments, some of which are
larger than one IOMMU page.

^ permalink raw reply

* RE: [PATCH] mlx4_en: map entire pages to increase throughput
From: David Laight @ 2012-07-17 12:42 UTC (permalink / raw)
  To: David Miller, rick.jones2
  Cc: cascardo, netdev, yevgenyp, ogerlitz, amirv, brking, leitao,
	klebers
In-Reply-To: <20120716.222903.367603216293954363.davem@davemloft.net>

> > That seems rather extraordinarily low - Power7 is supposed to be a
> > rather high performance CPU.  The last time I noticed O(3Gbit/s) on
> > 10G for bulk transfer was before the advent of LRO/GRO - that was in
> > the x86 space though.  Is mapping really that expensive with Power7?
> 
> Unfortunately, IOMMU mappings are incredibly expensive.  I see effects
> like this on Sparc too.

Would there be any mileage in permanently allocating IOMMU
virtual address to the ring entries, then 'just' assigning
the correct physical address during rx/tx setup?

A long time ago it used to be much faster on sparc systems
to receive into a permanently mapped buffer area and then
do a maximally aligned copy into the actual rx buffer.

	David

^ permalink raw reply

* Re: [PATCH v2] sctp: Fix list corruption resulting from freeing an association on a list
From: Neil Horman @ 2012-07-17 12:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, davej, vyasevich, sri, linux-sctp
In-Reply-To: <20120716.223250.2238626170464909220.davem@davemloft.net>

On Mon, Jul 16, 2012 at 10:32:50PM -0700, David Miller wrote:
> From: Neil Horman <nhorman@tuxdriver.com>
> Date: Mon, 16 Jul 2012 15:13:51 -0400
> 
> > A few days ago Dave Jones reported this oops:
>  ...
> > It appears from his analysis and some staring at the code that this is likely
> > occurring because an association is getting freed while still on the
> > sctp_assoc_hashtable.  As a result, we get a gpf when traversing the hashtable
> > while a freed node corrupts part of the list.
> > 
> > Nominally I would think that an imbalanced refcount was responsible for this,
> > but I can't seem to find any obvious imbalance.  What I did note however was
> > that the two places where we create an association using
> > sctp_primitive_ASSOCIATE (__sctp_connect and sctp_sendmsg), have failure paths
> > which free a newly created association after calling sctp_primitive_ASSOCIATE.
> > sctp_primitive_ASSOCIATE brings us into the sctp_sf_do_prm_asoc path, which
> > issues a SCTP_CMD_NEW_ASOC side effect, which in turn adds a new association to
> > the aforementioned hash table.  The sctp command interpreter that processes side
> > effects has no way to unwind previously processed commands, so freeing the
> > association from the __sctp_connect or sctp_sendmsg error path would lead to a
> > freed association remaining on this hash table.
> > 
> > I've fixed this by modifying sctp_[un]hash_established to use hlist_del_init,
> > which allows us to properly use hlist_unhashed to check if the node is on a
> > hashlist safely during a delete.  That in turn allows us to safely call
> > sctp_unhash_established in the __sctp_connect and sctp_sendmsg error paths
> > before freeing them, regardless of what the association's state is on the hash
> > list.
> > 
> > I noted, while I was doing this, that the __sctp_unhash_endpoint was using
> > hlist_unhashed in a similar fashion, but never nullified any removed nodes'
> > pointers to make that function work properly, so I fixed that up in a similar
> > fashion.
> > 
> > I attempted to test this using a virtual guest running the SCTP_RR test from
> > netperf in a loop while running the trinity fuzzer, both in a loop.  I wasn't
> > able to recreate the problem prior to this fix, nor was I able to trigger the
> > failure after (neither of which I suppose is surprising).  Given the trace above
> > however, I think it's likely that this is what we hit.
> > 
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > Reported-by: davej@redhat.com
> 
> Looks great, applied and queued up for -stable, thanks Neil.
> 

Thanks Dave!
Neil

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox