Netdev List
 help / color / mirror / Atom feed
* [PATCH 07/15] net: ethernet: ll_temac: Utilize phy_ethtool_nway_reset
From: Florian Fainelli @ 2016-11-15 18:06 UTC (permalink / raw)
  To: netdev
  Cc: davem, andrew, tremyfr, Florian Fainelli, Michal Simek,
	Sören Brinkmann, Florian Westphal, Felipe Balbi,
	moderated list:ARM/ZYNQ ARCHITECTURE, open list
In-Reply-To: <20161115180644.3941-1-f.fainelli@gmail.com>

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/xilinx/ll_temac_main.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index a9bd665fd122..bcd7b76dde9f 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -967,13 +967,8 @@ static const struct attribute_group temac_attr_group = {
 };
 
 /* ethtool support */
-static int temac_nway_reset(struct net_device *ndev)
-{
-	return phy_start_aneg(ndev->phydev);
-}
-
 static const struct ethtool_ops temac_ethtool_ops = {
-	.nway_reset = temac_nway_reset,
+	.nway_reset = phy_ethtool_nway_reset,
 	.get_link = ethtool_op_get_link,
 	.get_ts_info = ethtool_op_get_ts_info,
 	.get_link_ksettings = phy_ethtool_get_link_ksettings,
-- 
2.9.3

^ permalink raw reply related

* [PATCH 06/15] net: ethernet: smsc9420: Utilize phy_ethtool_nway_reset
From: Florian Fainelli @ 2016-11-15 18:06 UTC (permalink / raw)
  To: netdev
  Cc: davem, andrew, tremyfr, Florian Fainelli, Steve Glendinning,
	open list
In-Reply-To: <20161115180644.3941-1-f.fainelli@gmail.com>

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/smsc/smsc9420.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/smsc/smsc9420.c b/drivers/net/ethernet/smsc/smsc9420.c
index b7bfed4bc96b..3174aebb322f 100644
--- a/drivers/net/ethernet/smsc/smsc9420.c
+++ b/drivers/net/ethernet/smsc/smsc9420.c
@@ -254,14 +254,6 @@ static void smsc9420_ethtool_set_msglevel(struct net_device *netdev, u32 data)
 	pd->msg_enable = data;
 }
 
-static int smsc9420_ethtool_nway_reset(struct net_device *netdev)
-{
-	if (!netdev->phydev)
-		return -ENODEV;
-
-	return phy_start_aneg(netdev->phydev);
-}
-
 static int smsc9420_ethtool_getregslen(struct net_device *dev)
 {
 	/* all smsc9420 registers plus all phy registers */
@@ -417,7 +409,7 @@ static const struct ethtool_ops smsc9420_ethtool_ops = {
 	.get_drvinfo = smsc9420_ethtool_get_drvinfo,
 	.get_msglevel = smsc9420_ethtool_get_msglevel,
 	.set_msglevel = smsc9420_ethtool_set_msglevel,
-	.nway_reset = smsc9420_ethtool_nway_reset,
+	.nway_reset = phy_ethtool_nway_reset,
 	.get_link = ethtool_op_get_link,
 	.get_eeprom_len = smsc9420_ethtool_get_eeprom_len,
 	.get_eeprom = smsc9420_ethtool_get_eeprom,
-- 
2.9.3

^ permalink raw reply related

* [PATCH 05/15] net: smsc911x: Utilize phy_ethtool_nway_reset
From: Florian Fainelli @ 2016-11-15 18:06 UTC (permalink / raw)
  To: netdev
  Cc: davem, andrew, tremyfr, Florian Fainelli, Steve Glendinning,
	open list
In-Reply-To: <20161115180644.3941-1-f.fainelli@gmail.com>

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/smsc/smsc911x.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index cdb343f0c6e0..be09573c6ced 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -1956,11 +1956,6 @@ static void smsc911x_ethtool_getdrvinfo(struct net_device *dev,
 		sizeof(info->bus_info));
 }
 
-static int smsc911x_ethtool_nwayreset(struct net_device *dev)
-{
-	return phy_start_aneg(dev->phydev);
-}
-
 static u32 smsc911x_ethtool_getmsglevel(struct net_device *dev)
 {
 	struct smsc911x_data *pdata = netdev_priv(dev);
@@ -2132,7 +2127,7 @@ static int smsc911x_ethtool_set_eeprom(struct net_device *dev,
 static const struct ethtool_ops smsc911x_ethtool_ops = {
 	.get_link = ethtool_op_get_link,
 	.get_drvinfo = smsc911x_ethtool_getdrvinfo,
-	.nway_reset = smsc911x_ethtool_nwayreset,
+	.nway_reset = phy_ethtool_nway_reset,
 	.get_msglevel = smsc911x_ethtool_getmsglevel,
 	.set_msglevel = smsc911x_ethtool_setmsglevel,
 	.get_regs_len = smsc911x_ethtool_getregslen,
-- 
2.9.3

^ permalink raw reply related

* [PATCH 04/15] net: mv643xx_eth: Utilize phy_ethtool_nway_reset
From: Florian Fainelli @ 2016-11-15 18:06 UTC (permalink / raw)
  To: netdev
  Cc: davem, andrew, tremyfr, Florian Fainelli, Sebastian Hesselbarth,
	open list
In-Reply-To: <20161115180644.3941-1-f.fainelli@gmail.com>

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/marvell/mv643xx_eth.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 81b08d71c0f8..5f62c3d70df9 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -1639,14 +1639,6 @@ static void mv643xx_eth_get_drvinfo(struct net_device *dev,
 	strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
 }
 
-static int mv643xx_eth_nway_reset(struct net_device *dev)
-{
-	if (!dev->phydev)
-		return -EINVAL;
-
-	return genphy_restart_aneg(dev->phydev);
-}
-
 static int
 mv643xx_eth_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec)
 {
@@ -1770,7 +1762,7 @@ static int mv643xx_eth_get_sset_count(struct net_device *dev, int sset)
 
 static const struct ethtool_ops mv643xx_eth_ethtool_ops = {
 	.get_drvinfo		= mv643xx_eth_get_drvinfo,
-	.nway_reset		= mv643xx_eth_nway_reset,
+	.nway_reset		= phy_ethtool_nway_reset,
 	.get_link		= ethtool_op_get_link,
 	.get_coalesce		= mv643xx_eth_get_coalesce,
 	.set_coalesce		= mv643xx_eth_set_coalesce,
-- 
2.9.3

^ permalink raw reply related

* [PATCH 03/15] net: bcm63xx_enet: Utilize phy_ethtool_nway_reset
From: Florian Fainelli @ 2016-11-15 18:06 UTC (permalink / raw)
  To: netdev
  Cc: andrew, Florian Fainelli, Arnd Bergmann, xypron.glpk@gmx.de,
	open list, Jarod Wilson,
	maintainer:BROADCOM BCM63XX ARM ARCHITECTURE, Thierry Reding,
	davem, moderated list:BROADCOM BCM63XX ARM ARCHITECTURE, tremyfr
In-Reply-To: <20161115180644.3941-1-f.fainelli@gmail.com>

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 5c7acef1de2e..a43ab90c051e 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -1434,11 +1434,8 @@ static int bcm_enet_nway_reset(struct net_device *dev)
 	struct bcm_enet_priv *priv;
 
 	priv = netdev_priv(dev);
-	if (priv->has_phy) {
-		if (!dev->phydev)
-			return -ENODEV;
-		return genphy_restart_aneg(dev->phydev);
-	}
+	if (priv->has_phy)
+		return phy_ethtool_nway_reset(dev),
 
 	return -EOPNOTSUPP;
 }
-- 
2.9.3

^ permalink raw reply related

* [PATCH 02/15] net: nb8800: Utilize phy_ethtool_nway_reset
From: Florian Fainelli @ 2016-11-15 18:06 UTC (permalink / raw)
  To: netdev
  Cc: davem, andrew, tremyfr, Florian Fainelli, Måns Rullgård,
	Wei Yongjun, Jarod Wilson, Arnd Bergmann, open list
In-Reply-To: <20161115180644.3941-1-f.fainelli@gmail.com>

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/aurora/nb8800.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/aurora/nb8800.c b/drivers/net/ethernet/aurora/nb8800.c
index b59aa3541270..07ff6492402a 100644
--- a/drivers/net/ethernet/aurora/nb8800.c
+++ b/drivers/net/ethernet/aurora/nb8800.c
@@ -1037,16 +1037,6 @@ static const struct net_device_ops nb8800_netdev_ops = {
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
-static int nb8800_nway_reset(struct net_device *dev)
-{
-	struct phy_device *phydev = dev->phydev;
-
-	if (!phydev)
-		return -ENODEV;
-
-	return genphy_restart_aneg(phydev);
-}
-
 static void nb8800_get_pauseparam(struct net_device *dev,
 				  struct ethtool_pauseparam *pp)
 {
@@ -1165,7 +1155,7 @@ static void nb8800_get_ethtool_stats(struct net_device *dev,
 }
 
 static const struct ethtool_ops nb8800_ethtool_ops = {
-	.nway_reset		= nb8800_nway_reset,
+	.nway_reset		= phy_ethtool_nway_reset,
 	.get_link		= ethtool_op_get_link,
 	.get_pauseparam		= nb8800_get_pauseparam,
 	.set_pauseparam		= nb8800_set_pauseparam,
-- 
2.9.3

^ permalink raw reply related

* Re: linux-next: net->netns_ids is used after calling idr_destroy for it
From: Cong Wang @ 2016-11-15 18:04 UTC (permalink / raw)
  To: Andrei Vagin; +Cc: Nicolas Dichtel, Linux Kernel Network Developers
In-Reply-To: <CANaxB-x7FgxpAwpa-RgOvUptrCRUmAE6NFm3As+LuXUE=fHJyA@mail.gmail.com>

On Mon, Nov 14, 2016 at 10:23 PM, Andrei Vagin <avagin@gmail.com> wrote:
> Hi Nicolas,
>
> cleanup_net() calls idr_destroy(net->netns_ids) for network namespaces
> and then it calls unregister_netdevice_many() which calls
> idr_alloc(net0>netns_ids). It looks wrong, doesn't it?
>

netns id is designed to allocate lazily, but yeah it makes no sense
to allocate id for the netns being destroyed, not to mention idr is freed.

I will send a patch.

^ permalink raw reply

* Re: [PATCH net] sctp: use new rhlist interface on sctp transport rhashtable
From: Neil Horman @ 2016-11-15 18:04 UTC (permalink / raw)
  To: Xin Long
  Cc: network dev, linux-sctp, davem, Marcelo Ricardo Leitner,
	Vlad Yasevich, Herbert Xu, phil
In-Reply-To: <0a89ef8506db3bba6a37010bd5622cd145183ab4.1479223391.git.lucien.xin@gmail.com>

On Tue, Nov 15, 2016 at 11:23:11PM +0800, Xin Long wrote:
> Now sctp transport rhashtable uses hash(lport, dport, daddr) as the key
> to hash a node to one chain. If in one host thousands of assocs connect
> to one server with the same lport and different laddrs (although it's
> not a normal case), all the transports would be hashed into the same
> chain.
> 
> It may cause to keep returning -EBUSY when inserting a new node, as the
> chain is too long and sctp inserts a transport node in a loop, which
> could even lead to system hangs there.
> 
> The new rhlist interface works for this case that there are many nodes
> with the same key in one chain. It puts them into a list then makes this
> list be as a node of the chain.
> 
> This patch is to replace rhashtable_ interface with rhltable_ interface.
> Since a chain would not be too long and it would not return -EBUSY with
> this fix when inserting a node, the reinsert loop is also removed here.
> 
> Signed-off-by: Xin Long <lucien.xin@gmail.com>

Does this really buy us anything in this case though?  If the use case is that a
majority of the associations map to the same key, then you might avoid EBUSY for
the individual associaion that doesn't map there, but you still have to cope
with a huge linear search for the majority of the keys.

Might be more reasonable to mix saddr into the hash function so that your use
case gets spread through the hash table more evenly.

Neil

> ---
>  include/net/sctp/sctp.h    |  2 +-
>  include/net/sctp/structs.h |  4 +-
>  net/sctp/associola.c       |  8 +++-
>  net/sctp/input.c           | 93 ++++++++++++++++++++++++++--------------------
>  net/sctp/socket.c          |  7 +---
>  5 files changed, 64 insertions(+), 50 deletions(-)
> 
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index 31acc3f..f0dcaeb 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -164,7 +164,7 @@ void sctp_backlog_migrate(struct sctp_association *assoc,
>  			  struct sock *oldsk, struct sock *newsk);
>  int sctp_transport_hashtable_init(void);
>  void sctp_transport_hashtable_destroy(void);
> -void sctp_hash_transport(struct sctp_transport *t);
> +int sctp_hash_transport(struct sctp_transport *t);
>  void sctp_unhash_transport(struct sctp_transport *t);
>  struct sctp_transport *sctp_addrs_lookup_transport(
>  				struct net *net,
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index 11c3bf2..c5a2d83 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -124,7 +124,7 @@ extern struct sctp_globals {
>  	/* This is the sctp port control hash.	*/
>  	struct sctp_bind_hashbucket *port_hashtable;
>  	/* This is the hash of all transports. */
> -	struct rhashtable transport_hashtable;
> +	struct rhltable transport_hashtable;
>  
>  	/* Sizes of above hashtables. */
>  	int ep_hashsize;
> @@ -762,7 +762,7 @@ static inline int sctp_packet_empty(struct sctp_packet *packet)
>  struct sctp_transport {
>  	/* A list of transports. */
>  	struct list_head transports;
> -	struct rhash_head node;
> +	struct rhlist_head node;
>  
>  	/* Reference counting. */
>  	atomic_t refcnt;
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index f10d339..68428e1 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -700,11 +700,15 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
>  	/* Set the peer's active state. */
>  	peer->state = peer_state;
>  
> +	/* Add this peer into the transport hashtable */
> +	if (sctp_hash_transport(peer)) {
> +		sctp_transport_free(peer);
> +		return NULL;
> +	}
> +
>  	/* Attach the remote transport to our asoc.  */
>  	list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
>  	asoc->peer.transport_count++;
> -	/* Add this peer into the transport hashtable */
> -	sctp_hash_transport(peer);
>  
>  	/* If we do not yet have a primary path, set one.  */
>  	if (!asoc->peer.primary_path) {
> diff --git a/net/sctp/input.c b/net/sctp/input.c
> index a01a56e..458e506 100644
> --- a/net/sctp/input.c
> +++ b/net/sctp/input.c
> @@ -790,10 +790,9 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
>  
>  /* rhashtable for transport */
>  struct sctp_hash_cmp_arg {
> -	const struct sctp_endpoint	*ep;
> -	const union sctp_addr		*laddr;
> -	const union sctp_addr		*paddr;
> -	const struct net		*net;
> +	const union sctp_addr	*paddr;
> +	const struct net	*net;
> +	u16			lport;
>  };
>  
>  static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
> @@ -801,7 +800,6 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
>  {
>  	struct sctp_transport *t = (struct sctp_transport *)ptr;
>  	const struct sctp_hash_cmp_arg *x = arg->key;
> -	struct sctp_association *asoc;
>  	int err = 1;
>  
>  	if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr))
> @@ -809,19 +807,10 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
>  	if (!sctp_transport_hold(t))
>  		return err;
>  
> -	asoc = t->asoc;
> -	if (!net_eq(sock_net(asoc->base.sk), x->net))
> +	if (!net_eq(sock_net(t->asoc->base.sk), x->net))
> +		goto out;
> +	if (x->lport != htons(t->asoc->base.bind_addr.port))
>  		goto out;
> -	if (x->ep) {
> -		if (x->ep != asoc->ep)
> -			goto out;
> -	} else {
> -		if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port))
> -			goto out;
> -		if (!sctp_bind_addr_match(&asoc->base.bind_addr,
> -					  x->laddr, sctp_sk(asoc->base.sk)))
> -			goto out;
> -	}
>  
>  	err = 0;
>  out:
> @@ -851,11 +840,9 @@ static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed)
>  	const struct sctp_hash_cmp_arg *x = data;
>  	const union sctp_addr *paddr = x->paddr;
>  	const struct net *net = x->net;
> -	u16 lport;
> +	u16 lport = x->lport;
>  	u32 addr;
>  
> -	lport = x->ep ? htons(x->ep->base.bind_addr.port) :
> -			x->laddr->v4.sin_port;
>  	if (paddr->sa.sa_family == AF_INET6)
>  		addr = jhash(&paddr->v6.sin6_addr, 16, seed);
>  	else
> @@ -875,29 +862,32 @@ static const struct rhashtable_params sctp_hash_params = {
>  
>  int sctp_transport_hashtable_init(void)
>  {
> -	return rhashtable_init(&sctp_transport_hashtable, &sctp_hash_params);
> +	return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params);
>  }
>  
>  void sctp_transport_hashtable_destroy(void)
>  {
> -	rhashtable_destroy(&sctp_transport_hashtable);
> +	rhltable_destroy(&sctp_transport_hashtable);
>  }
>  
> -void sctp_hash_transport(struct sctp_transport *t)
> +int sctp_hash_transport(struct sctp_transport *t)
>  {
>  	struct sctp_hash_cmp_arg arg;
> +	int err;
>  
>  	if (t->asoc->temp)
> -		return;
> +		return 0;
>  
> -	arg.ep = t->asoc->ep;
> -	arg.paddr = &t->ipaddr;
>  	arg.net   = sock_net(t->asoc->base.sk);
> +	arg.paddr = &t->ipaddr;
> +	arg.lport = htons(t->asoc->base.bind_addr.port);
>  
> -reinsert:
> -	if (rhashtable_lookup_insert_key(&sctp_transport_hashtable, &arg,
> -					 &t->node, sctp_hash_params) == -EBUSY)
> -		goto reinsert;
> +	err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
> +				  &t->node, sctp_hash_params);
> +	if (err)
> +		pr_err_once("insert transport fail, errno %d\n", err);
> +
> +	return err;
>  }
>  
>  void sctp_unhash_transport(struct sctp_transport *t)
> @@ -905,39 +895,62 @@ void sctp_unhash_transport(struct sctp_transport *t)
>  	if (t->asoc->temp)
>  		return;
>  
> -	rhashtable_remove_fast(&sctp_transport_hashtable, &t->node,
> -			       sctp_hash_params);
> +	rhltable_remove(&sctp_transport_hashtable, &t->node,
> +			sctp_hash_params);
>  }
>  
> +/* return a transport with holding it */
>  struct sctp_transport *sctp_addrs_lookup_transport(
>  				struct net *net,
>  				const union sctp_addr *laddr,
>  				const union sctp_addr *paddr)
>  {
> +	struct rhlist_head *tmp, *list;
> +	struct sctp_transport *t;
>  	struct sctp_hash_cmp_arg arg = {
> -		.ep    = NULL,
> -		.laddr = laddr,
>  		.paddr = paddr,
>  		.net   = net,
> +		.lport = laddr->v4.sin_port,
>  	};
>  
> -	return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
> -				      sctp_hash_params);
> +	list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> +			       sctp_hash_params);
> +
> +	rhl_for_each_entry_rcu(t, tmp, list, node) {
> +		if (!sctp_transport_hold(t))
> +			continue;
> +
> +		if (sctp_bind_addr_match(&t->asoc->base.bind_addr,
> +					 laddr, sctp_sk(t->asoc->base.sk)))
> +			return t;
> +		sctp_transport_put(t);
> +	}
> +
> +	return NULL;
>  }
>  
> +/* return a transport without holding it, as it's only used under sock lock */
>  struct sctp_transport *sctp_epaddr_lookup_transport(
>  				const struct sctp_endpoint *ep,
>  				const union sctp_addr *paddr)
>  {
>  	struct net *net = sock_net(ep->base.sk);
> +	struct rhlist_head *tmp, *list;
> +	struct sctp_transport *t;
>  	struct sctp_hash_cmp_arg arg = {
> -		.ep    = ep,
>  		.paddr = paddr,
>  		.net   = net,
> +		.lport = htons(ep->base.bind_addr.port),
>  	};
>  
> -	return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
> -				      sctp_hash_params);
> +	list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> +			       sctp_hash_params);
> +
> +	rhl_for_each_entry_rcu(t, tmp, list, node)
> +		if (ep == t->asoc->ep)
> +			return t;
> +
> +	return NULL;
>  }
>  
>  /* Look up an association. */
> @@ -951,7 +964,7 @@ static struct sctp_association *__sctp_lookup_association(
>  	struct sctp_association *asoc = NULL;
>  
>  	t = sctp_addrs_lookup_transport(net, local, peer);
> -	if (!t || !sctp_transport_hold(t))
> +	if (!t)
>  		goto out;
>  
>  	asoc = t->asoc;
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index f23ad91..d5f4b4a 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -4392,10 +4392,7 @@ int sctp_transport_walk_start(struct rhashtable_iter *iter)
>  {
>  	int err;
>  
> -	err = rhashtable_walk_init(&sctp_transport_hashtable, iter,
> -				   GFP_KERNEL);
> -	if (err)
> -		return err;
> +	rhltable_walk_enter(&sctp_transport_hashtable, iter);
>  
>  	err = rhashtable_walk_start(iter);
>  	if (err && err != -EAGAIN) {
> @@ -4479,7 +4476,7 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
>  
>  	rcu_read_lock();
>  	transport = sctp_addrs_lookup_transport(net, laddr, paddr);
> -	if (!transport || !sctp_transport_hold(transport))
> +	if (!transport)
>  		goto out;
>  
>  	rcu_read_unlock();
> -- 
> 2.1.0
> 
> 

^ permalink raw reply

* Re: [PATCH/RFC] ravb: Support 1Gbps on R-Car H3 ES1.1+ and R-Car M3-W
From: Simon Horman @ 2016-11-15 17:53 UTC (permalink / raw)
  To: David Miller
  Cc: geert+renesas, sergei.shtylyov, arnd, netdev, linux-renesas-soc
In-Reply-To: <20161101093216.GE28143@verge.net.au>

On Tue, Nov 01, 2016 at 10:32:17AM +0100, Simon Horman wrote:
> On Mon, Oct 31, 2016 at 01:24:31PM -0400, David Miller wrote:
> > From: Geert Uytterhoeven <geert+renesas@glider.be>
> > Date: Mon, 31 Oct 2016 18:13:38 +0100
> > 
> > > The limitation to 10/100Mbit speeds on R-Car Gen3 is valid for R-Car H3
> > > ES1.0 only. Check for the exact SoC model to allow 1Gbps on newer
> > > revisions of R-Car H3, and on R-Car M3-W.
> > > 
> > > Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
> > > ---
> > > Tested on:
> > >   - r8a7795/salvator-x with R-Car H3 ES1.0 (limited to 100Mbps),
> > >   - r8a7795/salvator-x with R-Car H3 ES1.1 (1Gbps),
> > >   - r8a7796/salvator-x with R-Car M3-W ES1.0 (1Gbps).
> > > 
> > > This is marked as an RFC because it depends on:
> > >   A) the soc_device_match() infrastructure,
> > >   B) Renesas SoC core ESx.y handling.
> > > Hence I think the best merge strategy is to let this patch go in through
> > > Simon's Renesas tree.
> > > 
> > > David: If you agree, can you please provide your ack? Thanks!
> > 
> > Sure, no problem:
> > 
> > Acked-by: David S. Miller <davem@davemloft.net>
> 
> Thanks Dave.
> 
> Geert, please repost or otherwise ping me once the dependencies are in
> place and I should queue this up.

For the record, I have queued this up for v4.10.

^ permalink raw reply

* [PATCH net-next 2/2] net: marvell: Allow drivers to be built with COMPILE_TEST
From: Florian Fainelli @ 2016-11-15 17:35 UTC (permalink / raw)
  To: netdev
  Cc: davem, mw, arnd, gregory.clement, Shaohui.Xie, Igal.Liberman,
	Florian Fainelli
In-Reply-To: <20161115173548.32567-1-f.fainelli@gmail.com>

All Marvell Ethernet drivers actually build fine with COMPILE_TEST with
a few warnings.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/marvell/Kconfig | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig
index 2664827ddecd..1fbe0bfbeead 100644
--- a/drivers/net/ethernet/marvell/Kconfig
+++ b/drivers/net/ethernet/marvell/Kconfig
@@ -5,7 +5,7 @@
 config NET_VENDOR_MARVELL
 	bool "Marvell devices"
 	default y
-	depends on PCI || CPU_PXA168 || MV64X60 || PPC32 || PLAT_ORION || INET
+	depends on PCI || CPU_PXA168 || MV64X60 || PPC32 || PLAT_ORION || INET || COMPILE_TEST
 	---help---
 	  If you have a network (Ethernet) card belonging to this class, say Y.
 
@@ -18,7 +18,7 @@ if NET_VENDOR_MARVELL
 
 config MV643XX_ETH
 	tristate "Marvell Discovery (643XX) and Orion ethernet support"
-	depends on (MV64X60 || PPC32 || PLAT_ORION) && INET
+	depends on (MV64X60 || PPC32 || PLAT_ORION || COMPILE_TEST) && INET
 	select PHYLIB
 	select MVMDIO
 	---help---
@@ -55,7 +55,7 @@ config MVNETA_BM_ENABLE
 
 config MVNETA
 	tristate "Marvell Armada 370/38x/XP network interface support"
-	depends on PLAT_ORION
+	depends on PLAT_ORION || COMPILE_TEST
 	select MVMDIO
 	select FIXED_PHY
 	---help---
@@ -77,7 +77,7 @@ config MVNETA_BM
 
 config MVPP2
 	tristate "Marvell Armada 375 network interface support"
-	depends on MACH_ARMADA_375
+	depends on MACH_ARMADA_375 || COMPILE_TEST
 	select MVMDIO
 	---help---
 	  This driver supports the network interface units in the
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 1/2] net: fsl: Allow most drivers to be built with COMPILE_TEST
From: Florian Fainelli @ 2016-11-15 17:35 UTC (permalink / raw)
  To: netdev
  Cc: davem, mw, arnd, gregory.clement, Shaohui.Xie, Igal.Liberman,
	Florian Fainelli
In-Reply-To: <20161115173548.32567-1-f.fainelli@gmail.com>

There are only a handful of Freescale Ethernet drivers that don't
actually build with COMPILE_TEST:

* FEC, for which we would need to define a default register layout if no
  supported architecture is defined

* UCC_GETH which depends on PowerPC cpm.h header (which could be moved
  to a generic location)

We need to fix an unmet dependency to get there though:
warning: (FSL_XGMAC_MDIO) selects OF_MDIO which has unmet direct
dependencies (OF && PHYLIB)

which would result in CONFIG_OF_MDIO=[ym] without CONFIG_OF to be set.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/freescale/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig
index d1ca45fbb164..0df5835d8b94 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -8,7 +8,7 @@ config NET_VENDOR_FREESCALE
 	depends on FSL_SOC || QUICC_ENGINE || CPM1 || CPM2 || PPC_MPC512x || \
 		   M523x || M527x || M5272 || M528x || M520x || M532x || \
 		   ARCH_MXC || ARCH_MXS || (PPC_MPC52xx && PPC_BESTCOMM) || \
-		   ARCH_LAYERSCAPE
+		   ARCH_LAYERSCAPE || COMPILE_TEST
 	---help---
 	  If you have a network (Ethernet) card belonging to this class, say Y.
 
@@ -65,6 +65,7 @@ config FSL_PQ_MDIO
 config FSL_XGMAC_MDIO
 	tristate "Freescale XGMAC MDIO"
 	select PHYLIB
+	depends on OF
 	select OF_MDIO
 	---help---
 	  This driver supports the MDIO bus on the Fman 10G Ethernet MACs, and
-- 
2.9.3

^ permalink raw reply related

* [PATCH net-next 0/2] net: ethernet: Allow Marvell & Freescale to COMPILE_TEST
From: Florian Fainelli @ 2016-11-15 17:35 UTC (permalink / raw)
  To: netdev
  Cc: davem, mw, arnd, gregory.clement, Shaohui.Xie, Igal.Liberman,
	Florian Fainelli

Hi David,

This patch series allows most of the Freescale Ethernet drivers to be built
with COMPILE_TEST and all Marvell drivers to build with COMPILE_TEST.

This is helpful to increase build coverage without requiring hacking into
Kconfig anymore.

Florian Fainelli (2):
  net: fsl: Allow most drivers to be built with COMPILE_TEST
  net: marvell: Allow drivers to be built with COMPILE_TEST

 drivers/net/ethernet/freescale/Kconfig | 3 ++-
 drivers/net/ethernet/marvell/Kconfig   | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

-- 
2.9.3

^ permalink raw reply

* Re: [PATCH] icmp: Restore resistence to abnormal messages
From: Florian Westphal @ 2016-11-15 17:30 UTC (permalink / raw)
  To: David Miller
  Cc: googuy, kuznet, jmorris, yoshfuji, kaber, netdev, linux-kernel
In-Reply-To: <20161115.115657.798577230951109692.davem@redhat.com>

David Miller <davem@redhat.com> wrote:
> From: Vicente Jiménez <googuy@gmail.com>
> Date: Tue, 15 Nov 2016 17:49:43 +0100
> 
> > On Mon, Nov 14, 2016 at 7:36 PM, David Miller <davem@davemloft.net> wrote:
> >> From: Vicente Jimenez Aguilar <googuy@gmail.com>
> >> Date: Fri, 11 Nov 2016 21:20:18 +0100
> >>
> >>> @@ -819,6 +820,12 @@ static bool icmp_unreach(struct sk_buff *skb)
> >>>                               /* fall through */
> >>>                       case 0:
> >>>                               info = ntohs(icmph->un.frag.mtu);
> >>> +                             /* Handle weird case where next hop MTU is
> >>> +                              * equal to or exceeding dropped packet size
> >>> +                              */
> >>> +                             old_mtu = ntohs(iph->tot_len);
> >>> +                             if (info >= old_mtu)
> >>> +                                     info = old_mtu - 2;
> >>
> >> This isn't something the old code did.
> >>
> >> The old code behaved much differently.
> >>
> > I don't wanted to restore old behavior just fix a strange case that
> > was handle by this code where the next hop MTU reported by the router
> > is equal or greater than the actual path MTU. Because router
> > information is wrong, we need a way to guess a good packet size
> > ignoring router data. The simplest strategy that avoid odd numbers is
> > reducing dropped packet size by 2.
> 
> This whole approach seems arbitrary.
> 
> You haven't discussed in any way, what causes this in the first place.
> And what about that cause makes simply subtracting by 2 work well or
> not.
> 
> You have a very locallized, specific, situation on your end you want
> to fix.  But we must accept changes that handle things generically and
> in a way that would help more than just your specific case.

FWIW this is similar to the patch I sent a while ago:

https://patchwork.ozlabs.org/patch/493997/

I think in interest of robustness principle ("eat shit and don't die")
one of these changes should go in :-|

^ permalink raw reply

* Re: [PATCH net-next v3 0/7] vxlan: xmit improvements.
From: David Miller @ 2016-11-15 17:16 UTC (permalink / raw)
  To: pshelar; +Cc: netdev
In-Reply-To: <1479098638-4921-1-git-send-email-pshelar@ovn.org>

From: Pravin B Shelar <pshelar@ovn.org>
Date: Sun, 13 Nov 2016 20:43:51 -0800

> Following patch series improves vxlan fast path, removes
> duplicate code and simplifies vxlan xmit code path.
> 
> v2-v3:
> Removed unrelated warning fix from patch 2.
> rearranged error handling from patch 3
> Fixed stats updates in vxlan route lookup in patch 4
> 
> v1-v2:
> Fix compilation error when IPv6 support is not enabled.

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next v3 3/7] vxlan: simplify exception handling
From: Pravin Shelar @ 2016-11-15 17:15 UTC (permalink / raw)
  To: Jiri Benc; +Cc: Linux Kernel Network Developers
In-Reply-To: <20161115180453.26105ba8@griffin>

On Tue, Nov 15, 2016 at 9:04 AM, Jiri Benc <jbenc@redhat.com> wrote:
> On Tue, 15 Nov 2016 08:40:58 -0800, Pravin Shelar wrote:
>> On Tue, Nov 15, 2016 at 6:30 AM, Jiri Benc <jbenc@redhat.com> wrote:
>> > It would be a bit cleaner to do this assignment just after rt is
>> > assigned (but after the IS_ERR(rt) condition), get rid of the added
>> > ip_rt_put call above and move the existing ip_rt_put call in the bypass
>> > case just before the vxlan_encap_bypass call...
>> >
>> Does it really matters given that next patches in this series moves
>> this duplicate code and does pretty much what you are describing?
>
> Okay, right. I tried to look also at patches further in the series but
> it seemed to me this will leave an instance of ip_rt_put that could be
> avoided. But it will not.
>
> Acked-by: Jiri Benc <jbenc@redhat.com>
>
> (It would make the reviewers' life easier if the individual patches
> were more self contained. Ideally, each patch should be able to stand
> on its own. This unrelated code shuffling makes it too easy to miss
> things...)
>
I tried to keep one patch targeting single cleanup. But all patches
targeted same code, that makes it dependent on each other.

> Anyway, thanks for the cleanup!
>

Thanks for all reviews.

^ permalink raw reply

* Re: [PATCH net] gro_cells: mark napi struct as not busy poll candidates
From: Rolf Neugebauer @ 2016-11-15 17:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Eric W. Biederman, David Miller, Paul E. McKenney, Cong Wang,
	Linux Kernel Network Developers, Justin Cormack, Ian Campbell,
	Eric Dumazet, Rolf Neugebauer
In-Reply-To: <1479169722.8455.108.camel@edumazet-glaptop3.roam.corp.google.com>

On Tue, Nov 15, 2016 at 12:28 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> Rolf Neugebauer reported very long delays at netns dismantle.
>
> Eric W. Biederman was kind enough to look at this problem
> and noticed synchronize_net() occurring from netif_napi_del() that was
> added in linux-4.5
>
> Busy polling makes no sense for tunnels NAPI.
> If busy poll is used for sessions over tunnels, the poller will need to
> poll the physical device queue anyway.
>
> netif_tx_napi_add() could be used here, but function name is misleading,
> and renaming it is not stable material, so set NAPI_STATE_NO_BUSY_POLL
> bit directly.
>
> This will avoid inserting gro_cells napi structures in napi_hash[]
> and avoid the problematic synchronize_net() (per possible cpu) that
> Rolf reported.

Thanks for the quick turn-around. I've tested this on both 4.9-rc5 and
4.8.7  with different CPU configs and it dramatically reduced the
delay. I still see a small (1s) delay for creating a new namespace
after deleting one in one config but will follow up with the details
on the original email thread.

>
> Fixes: 93d05d4a320c ("net: provide generic busy polling to all NAPI drivers")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: Rolf Neugebauer <rolf.neugebauer@docker.com>
> Reported-by: Eric W. Biederman <ebiederm@xmission.com>

Tested-by: Rolf Neugebauer <rolf.neugebauer@docker.com>

> ---
>  include/net/gro_cells.h |    3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/include/net/gro_cells.h b/include/net/gro_cells.h
> index d15214d673b2e8e08fd6437b572278fb1359f10d..2a1abbf8da74368cd01adc40cef6c0644e059ef2 100644
> --- a/include/net/gro_cells.h
> +++ b/include/net/gro_cells.h
> @@ -68,6 +68,9 @@ static inline int gro_cells_init(struct gro_cells *gcells, struct net_device *de
>                 struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
>
>                 __skb_queue_head_init(&cell->napi_skbs);
> +
> +               set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
> +
>                 netif_napi_add(dev, &cell->napi, gro_cell_poll, 64);
>                 napi_enable(&cell->napi);
>         }
>
>

^ permalink raw reply

* Re: [PATCH] net: ethernet: Fix SGMII unable to switch speed and autonego failure
From: David Miller @ 2016-11-15 17:06 UTC (permalink / raw)
  To: ho.jia.jie; +Cc: peppe.cavallaro, alexandre.torgue, netdev, linux-kernel
In-Reply-To: <1479114409-20002-1-git-send-email-ho.jia.jie@intel.com>

From: ho.jia.jie@intel.com
Date: Mon, 14 Nov 2016 17:06:49 +0800

> From: Jia Jie Ho <ho.jia.jie@intel.com>
> 
> TSE PCS SGMII ethernet has an issue where switching speed doesn't work
> caused by a faulty register macro offset. This fixes the issue.
> 
> Signed-off-by: Jia Jie Ho <ho.jia.jie@intel.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next v3 3/7] vxlan: simplify exception handling
From: Jiri Benc @ 2016-11-15 17:04 UTC (permalink / raw)
  To: Pravin Shelar; +Cc: Linux Kernel Network Developers
In-Reply-To: <CAOrHB_CQEeknxznNu5u=PhD8pGdPxg+HBinTu7+rz77tj2+wNg@mail.gmail.com>

On Tue, 15 Nov 2016 08:40:58 -0800, Pravin Shelar wrote:
> On Tue, Nov 15, 2016 at 6:30 AM, Jiri Benc <jbenc@redhat.com> wrote:
> > It would be a bit cleaner to do this assignment just after rt is
> > assigned (but after the IS_ERR(rt) condition), get rid of the added
> > ip_rt_put call above and move the existing ip_rt_put call in the bypass
> > case just before the vxlan_encap_bypass call...
> >
> Does it really matters given that next patches in this series moves
> this duplicate code and does pretty much what you are describing?

Okay, right. I tried to look also at patches further in the series but
it seemed to me this will leave an instance of ip_rt_put that could be
avoided. But it will not.

Acked-by: Jiri Benc <jbenc@redhat.com>

(It would make the reviewers' life easier if the individual patches
were more self contained. Ideally, each patch should be able to stand
on its own. This unrelated code shuffling makes it too easy to miss
things...)

Anyway, thanks for the cleanup!

 Jiri

^ permalink raw reply

* Re: [PATCH net 1/3] net: phy: realtek: add eee advertisement disable options
From: Florian Fainelli @ 2016-11-15 17:03 UTC (permalink / raw)
  To: Andrew Lunn, Jerome Brunet
  Cc: netdev, devicetree, Carlo Caione, Kevin Hilman,
	Giuseppe Cavallaro, Alexandre TORGUE, Martin Blumenstingl,
	Andre Roth, Neil Armstrong, linux-amlogic, linux-arm-kernel,
	linux-kernel
In-Reply-To: <20161115163036.GB23231@lunn.ch>

On 11/15/2016 08:30 AM, Andrew Lunn wrote:
> On Tue, Nov 15, 2016 at 03:29:12PM +0100, Jerome Brunet wrote:
>> On some platforms, energy efficient ethernet with rtl8211 devices is
>> causing issue, like throughput drop or broken link.
>>
>> This was reported on the OdroidC2 (DWMAC + RTL8211F). While the issue root
>> cause is not fully understood yet, disabling EEE advertisement prevent auto
>> negotiation from enabling EEE.
>>
>> This patch provides options to disable 1000T and 100TX EEE advertisement
>> individually for the realtek phys supporting this feature.
> 
> Looking at the code, i don't see anything specific to RealTek
> here. This all seems generic. So should it be in phy.c and made a
> generic OF property which can be applied to any PHY which supports
> EEE.

Agreed. Just to be sure, Jerome, you did verify that with EEE no longer
advertised, ethtool --set-eee fails, right? The point is that you may be
able to disable EEE on boot, but if there is a way to re-enable it later
on, we would want to disable that too.
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next] udplite: fix NULL pointer dereference
From: David Miller @ 2016-11-15 16:59 UTC (permalink / raw)
  To: pabeni; +Cc: netdev, eric.dumazet, avagin, hannes
In-Reply-To: <ac389a539b9299b44f63a2d1f744a637a8126943.1479223645.git.pabeni@redhat.com>

From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 15 Nov 2016 16:37:53 +0100

> The commit 850cbaddb52d ("udp: use it's own memory accounting schema")
> assumes that the socket proto has memory accounting enabled,
> but this is not the case for UDPLITE.
> Fix it enabling memory accounting for UDPLITE and performing
> fwd allocated memory reclaiming on socket shutdown.
> UDP and UDPLITE share now the same memory accounting limits.
> Also drop the backlog receive operation, since is no more needed.
> 
> Fixes: 850cbaddb52d ("udp: use it's own memory accounting schema")
> Reported-by: Andrei Vagin <avagin@gmail.com>
> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>

Applied, thank you.

^ permalink raw reply

* Re: [PATCH v2 net-next 0/6] bpf: LRU map
From: David Miller @ 2016-11-15 16:59 UTC (permalink / raw)
  To: kafai; +Cc: netdev, ast, daniel, kernel-team
In-Reply-To: <20161115.115123.433376609853237994.davem@redhat.com>

From: David Miller <davem@redhat.com>
Date: Tue, 15 Nov 2016 11:51:23 -0500 (EST)

> From: Martin KaFai Lau <kafai@fb.com>
> Date: Fri, 11 Nov 2016 10:55:05 -0800
> 
>> This patch set adds LRU map implementation to the existing BPF map
>> family.
> 
> Series applied, thanks.

BTW, with Fedora 24's gcc-6.2.1 on x86_64 I'm getting this warning:

kernel/bpf/bpf_lru_list.c: In function ‘__bpf_lru_list_rotate_inactive.isra.3’:
kernel/bpf/bpf_lru_list.c:201:28: warning: ‘next’ may be used uninitialized in this function [-Wmaybe-uninitialized]
  l->next_inactive_rotation = next;
  ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~

^ permalink raw reply

* Re: [PATCH] icmp: Restore resistence to abnormal messages
From: David Miller @ 2016-11-15 16:56 UTC (permalink / raw)
  To: googuy; +Cc: kuznet, jmorris, yoshfuji, kaber, netdev, linux-kernel
In-Reply-To: <CAO1wt+b4OkJ4DmJ7_hUj_oKj4p_rRMuKUVpiBY7MSFMa=0WKQg@mail.gmail.com>

From: Vicente Jiménez <googuy@gmail.com>
Date: Tue, 15 Nov 2016 17:49:43 +0100

> On Mon, Nov 14, 2016 at 7:36 PM, David Miller <davem@davemloft.net> wrote:
>> From: Vicente Jimenez Aguilar <googuy@gmail.com>
>> Date: Fri, 11 Nov 2016 21:20:18 +0100
>>
>>> @@ -819,6 +820,12 @@ static bool icmp_unreach(struct sk_buff *skb)
>>>                               /* fall through */
>>>                       case 0:
>>>                               info = ntohs(icmph->un.frag.mtu);
>>> +                             /* Handle weird case where next hop MTU is
>>> +                              * equal to or exceeding dropped packet size
>>> +                              */
>>> +                             old_mtu = ntohs(iph->tot_len);
>>> +                             if (info >= old_mtu)
>>> +                                     info = old_mtu - 2;
>>
>> This isn't something the old code did.
>>
>> The old code behaved much differently.
>>
> I don't wanted to restore old behavior just fix a strange case that
> was handle by this code where the next hop MTU reported by the router
> is equal or greater than the actual path MTU. Because router
> information is wrong, we need a way to guess a good packet size
> ignoring router data. The simplest strategy that avoid odd numbers is
> reducing dropped packet size by 2.

This whole approach seems arbitrary.

You haven't discussed in any way, what causes this in the first place.
And what about that cause makes simply subtracting by 2 work well or
not.

You have a very locallized, specific, situation on your end you want
to fix.  But we must accept changes that handle things generically and
in a way that would help more than just your specific case.

^ permalink raw reply

* Re: [PATCH v2 net-next 0/6] bpf: LRU map
From: David Miller @ 2016-11-15 16:51 UTC (permalink / raw)
  To: kafai; +Cc: netdev, ast, daniel, kernel-team
In-Reply-To: <1478890511-1346984-1-git-send-email-kafai@fb.com>

From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 11 Nov 2016 10:55:05 -0800

> This patch set adds LRU map implementation to the existing BPF map
> family.

Series applied, thanks.

^ permalink raw reply

* Re: [PATCH] icmp: Restore resistence to abnormal messages
From: Vicente Jiménez @ 2016-11-15 16:49 UTC (permalink / raw)
  To: David Miller
  Cc: Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, netdev, linux-kernel
In-Reply-To: <20161114.133646.1687576478968660327.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 6602 bytes --]

On Mon, Nov 14, 2016 at 7:36 PM, David Miller <davem@davemloft.net> wrote:
> From: Vicente Jimenez Aguilar <googuy@gmail.com>
> Date: Fri, 11 Nov 2016 21:20:18 +0100
>
>> @@ -819,6 +820,12 @@ static bool icmp_unreach(struct sk_buff *skb)
>>                               /* fall through */
>>                       case 0:
>>                               info = ntohs(icmph->un.frag.mtu);
>> +                             /* Handle weird case where next hop MTU is
>> +                              * equal to or exceeding dropped packet size
>> +                              */
>> +                             old_mtu = ntohs(iph->tot_len);
>> +                             if (info >= old_mtu)
>> +                                     info = old_mtu - 2;
>
> This isn't something the old code did.
>
> The old code behaved much differently.
>
I don't wanted to restore old behavior just fix a strange case that
was handle by this code where the next hop MTU reported by the router
is equal or greater than the actual path MTU. Because router
information is wrong, we need a way to guess a good packet size
ignoring router data. The simplest strategy that avoid odd numbers is
reducing dropped packet size by 2.

> In the case where the new mtu was smaller than 68 or larger than
> the iph->tot_len value, it would do several things:
Larger OR EQUAL (our case) than the iph->tot_len.
>
> 1) First it would check for a BSD 4.2 anomaly and subtract old_mtu
>    by the IP header length.
This only happens when next hop MTU is zero. I don't care about this
case. It is handled at protocol level as the commit that removed that
function says.

>
> 2) Second, it would try to guess the intended MTU using the
>    mtu_plateau table.
>
Old kernel work because they reduced the current MTU by a small amount
using the mtu_plateau.
Not because we have some trouble with headers and we need the common 1492 value.

> I don't see any code where a subtraction by a fixed constant of 2
> occurred.
>
This is the simplest solution I came about. Because packets of size
old_mtu get dropped and suggested next hop MTU is equal or larger, we
just ignore wrong router information and try to get the correct MTU by
reducing current MTU by just a bit. I discarded subtracting by 1 to
avoid odd numbers.
For our case:
 - Old kernel sets MTU to 1492 from previous 1500 bytes and worked.
 - Newer kernel enters infinite loop sending dropped size packets.
 - Patched kernel sets MTU to 1498 from 1500 and go trough this abnormal router.
 - Microsoft Windows also recovers from those strange ICMP messages.

By the way, setting MTU to 1499 also failed to communicate for us.

> Nor can I figure out what that might accomplish.  If you really
> want to do this, you have to docuement what this 2 means, what
> it is accomplishing, and why you have choosen to accomplish it
> this way.
>
> Thanks.

I just wanted to restore the "new_mtu >= old_mtu" check and instead of
reducing the current packet size using a plateau function, decrement
current size by 2 (to avoid odd numbers). This solution solved our
problem setting MTU to 1498 and I think this must converge to any good
MTU value because it reduce packet size until we don't receive any
more ICMP fragmentation needed messages.

We can criticize that this solution could be slow to reach small MTU values.
But I suspect that those abnormal ICMP messages was caused by a router
bug and must be very close to a size that get through them. If
suggested MTU is smaller than current packet size, we set the MTU to
next hop MTU. If those packets that are equal in size to the suggested
one still get dropped, then this code try to overcome that. It had the
strength of being really simple and it must be ideally never be
executed (we are still trying to fix the abnormal router, but we don't
manage it). I suspect that for those weird case, a working size must
be very close to the suggested and it is far better than current
kernel that just take the value in next hop MTU without any check and
enter in a infinite loop sending dropped size packets. TCP connections
stall and finally time out.

I suspect that a router bug get size comparison wrong and the real
path MTU is 1500 (as it is in the other direction). We need to get
around this and currently we just lowered interface MTU at startup for
those devices.

Where do I need to put more explanation?
 - Code comment
 - Commit message

Attached capture file that show the problem: Abnormal ICMP messages
that drops and suggest 1500 bytes.

This is the long explanation that I emailed earlier:

"
In a large corporate network, we spotted this weird ICMP message after
a long troubleshooting. See attached capture file. Those ICMP "network
unreachable - fragmentation needed and don't fragment bit set"
messages are sent by a router that drop 1500 bytes IP packets and fill
the next hop MTU ICMP field with 1500.

Those messages cause the TCP connection to stall but only on newer
kernels. Older kernels set path MTU to 1492 and communicates
successfully.

After checking code and commit history, I spotted how commit
46517008e116 ("ipv4: Kill ip_rt_frag_needed().") from June 2012
changed ICMP messages handling by removing ip_rt_frag_needed function.

The relevant part of the ip_rt_frag_needed function that was removed is:

if (new_mtu < 68 || new_mtu >= old_mtu) {
        /* BSD 4.2 derived systems incorrectly adjust
         * tot_len by the IP header length, and report
         * a zero MTU in the ICMP message.
         */
        if (mtu == 0 &&
            old_mtu >= 68 + (iph->ihl << 2))
                old_mtu -= iph->ihl << 2;
        mtu = guess_mtu(old_mtu);
}

This condition handled the cases when next hop MTU where zero (less
than 68). Now this is handled by the protocol and fixed by commit
68b7107b6298 "ipv4: icmp: Fix pMTU handling for rare case".

But the rarest case when (next hop MTU) new_mtu >= old_mtu (dropped
packet length) was also removed. This commit restores this check.
Instead of using a table lookup like function guess_mtu uses, it just
try to set the path MTU decrementing by 2 bytes the dropped packet
size.

In our case, setting the path MTU to just 1498 (one iteration) worked.
This solution should converge in any case to a good value by small
steps. I don't think there's a need to a more complex solution.

The patched kernel worked perfectly setting the path MTU to 1498 from
the initial default interface value of 1500. This time I don't have a
capture file from inside the affected center, but all received packed
had a maximum size of 1498.
"

-- 
thanks
vicente

[-- Attachment #2: ICMP dropping and suggesting 1500.pcapng --]
[-- Type: application/x-pcapng, Size: 89664 bytes --]

^ permalink raw reply

* [net PATCH 2/2] ipv4: Fix memory leak in exception case for splitting tries
From: Alexander Duyck @ 2016-11-15 10:46 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, davem
In-Reply-To: <20161115104306.13711.67911.stgit@ahduyck-blue-test.jf.intel.com>

Fix a small memory leak that can occur where we leak a fib_alias in the
event of us not being able to insert it into the local table.

Fixes: 0ddcf43d5d4a0 ("ipv4: FIB Local/MAIN table collapse")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/fib_trie.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 735edc9..026f309 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1743,8 +1743,10 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
 				local_l = fib_find_node(lt, &local_tp, l->key);
 
 			if (fib_insert_alias(lt, local_tp, local_l, new_fa,
-					     NULL, l->key))
+					     NULL, l->key)) {
+				kmem_cache_free(fn_alias_kmem, new_fa);
 				goto out;
+			}
 		}
 
 		/* stop loop if key wrapped back to 0 */

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox