Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: net/l2tp:BUG: KASAN: use-after-free in l2tp_ip6_close
From: Cong Wang @ 2016-11-16 19:08 UTC (permalink / raw)
  To: Guillaume Nault; +Cc: Baozeng Ding, Linux Kernel Network Developers
In-Reply-To: <20161116163021.tblhptv37r6byjvp@alphalink.fr>

On Wed, Nov 16, 2016 at 8:30 AM, Guillaume Nault <g.nault@alphalink.fr> wrote:
> diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
> index fce25af..982f6c4 100644
> --- a/net/l2tp/l2tp_ip.c
> +++ b/net/l2tp/l2tp_ip.c
> @@ -251,8 +251,6 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
>         int ret;
>         int chk_addr_ret;
>
> -       if (!sock_flag(sk, SOCK_ZAPPED))
> -               return -EINVAL;
>         if (addr_len < sizeof(struct sockaddr_l2tpip))
>                 return -EINVAL;
>         if (addr->l2tp_family != AF_INET)
> @@ -267,6 +265,9 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
>         read_unlock_bh(&l2tp_ip_lock);
>
>         lock_sock(sk);
> +       if (!sock_flag(sk, SOCK_ZAPPED))
> +               goto out;
> +
>         if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip))
>                 goto out;
>
> diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
> index ad3468c..9978d01 100644
> --- a/net/l2tp/l2tp_ip6.c
> +++ b/net/l2tp/l2tp_ip6.c
> @@ -269,8 +269,6 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
>         int addr_type;
>         int err;
>
> -       if (!sock_flag(sk, SOCK_ZAPPED))
> -               return -EINVAL;
>         if (addr->l2tp_family != AF_INET6)
>                 return -EINVAL;
>         if (addr_len < sizeof(*addr))
> @@ -296,6 +294,9 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
>         lock_sock(sk);
>
>         err = -EINVAL;
> +       if (!sock_flag(sk, SOCK_ZAPPED))
> +               goto out_unlock;
> +
>         if (sk->sk_state != TCP_CLOSE)
>                 goto out_unlock;


Makes sense: it should prevent a concurrent caller from adding the socket
to the bind table twice after passing the __l2tp_ip_bind_lookup() check.

^ permalink raw reply

* [PATCH 1/3] net: stmmac: replace all pr_xxx by their netdev_xxx counterpart
From: Corentin Labbe @ 2016-11-16 19:09 UTC (permalink / raw)
  To: peppe.cavallaro, alexandre.torgue; +Cc: netdev, linux-kernel, LABBE Corentin

From: LABBE Corentin <clabbe.montjoie@gmail.com>

The stmmac driver uses lots of pr_xxx functions to print information.
This is bad since we cannot know which device logged the information
(all the more so if two stmmac devices are present).

Furthermore, it seems to wrongly assume that all log lines will always be
consecutive, by using a dev_xxx followed by some indented pr_xxx, like this:
kernel: sun7i-dwmac 1c50000.ethernet: no reset control found
kernel:  Ring mode enabled
kernel:  No HW DMA feature register supported
kernel:  Normal descriptors
kernel:  TX Checksum insertion supported

So this patch replaces all pr_xxx calls with their netdev_xxx counterparts,
except for some messages where netdev_xxx produces ugly output like:
sun7i-dwmac 1c50000.ethernet (unnamed net_device) (uninitialized): no reset control found
In those cases, I keep dev_xxx.

At the same time, I remove some "stmmac:" prefixes since
they would duplicate what dev_xxx already displays.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 204 ++++++++++++----------
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c |  14 +-
 2 files changed, 123 insertions(+), 95 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 8eb12353..791daf4 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -305,7 +305,7 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 			 */
 			spin_lock_irqsave(&priv->lock, flags);
 			if (priv->eee_active) {
-				pr_debug("stmmac: disable EEE\n");
+				netdev_dbg(priv->dev, "disable EEE\n");
 				del_timer_sync(&priv->eee_ctrl_timer);
 				priv->hw->mac->set_eee_timer(priv->hw, 0,
 							     tx_lpi_timer);
@@ -334,7 +334,7 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 		ret = true;
 		spin_unlock_irqrestore(&priv->lock, flags);
 
-		pr_debug("stmmac: Energy-Efficient Ethernet initialized\n");
+		netdev_dbg(priv->dev, "Energy-Efficient Ethernet initialized\n");
 	}
 out:
 	return ret;
@@ -456,8 +456,8 @@ static int stmmac_hwtstamp_ioctl(struct net_device *dev, struct ifreq *ifr)
 			   sizeof(struct hwtstamp_config)))
 		return -EFAULT;
 
-	pr_debug("%s config flags:0x%x, tx_type:0x%x, rx_filter:0x%x\n",
-		 __func__, config.flags, config.tx_type, config.rx_filter);
+	netdev_dbg(priv->dev, "%s config flags:0x%x, tx_type:0x%x, rx_filter:0x%x\n",
+		   __func__, config.flags, config.tx_type, config.rx_filter);
 
 	/* reserved for future extensions */
 	if (config.flags)
@@ -756,8 +756,9 @@ static void stmmac_adjust_link(struct net_device *dev)
 				break;
 			default:
 				if (netif_msg_link(priv))
-					pr_warn("%s: Speed (%d) not 10/100\n",
-						dev->name, phydev->speed);
+					netdev_warn(priv->dev,
+						    "Speed (%d) not 10/100\n",
+						    phydev->speed);
 				break;
 			}
 
@@ -810,10 +811,10 @@ static void stmmac_check_pcs_mode(struct stmmac_priv *priv)
 		    (interface == PHY_INTERFACE_MODE_RGMII_ID) ||
 		    (interface == PHY_INTERFACE_MODE_RGMII_RXID) ||
 		    (interface == PHY_INTERFACE_MODE_RGMII_TXID)) {
-			pr_debug("STMMAC: PCS RGMII support enable\n");
+			netdev_dbg(priv->dev, "PCS RGMII support enabled\n");
 			priv->hw->pcs = STMMAC_PCS_RGMII;
 		} else if (interface == PHY_INTERFACE_MODE_SGMII) {
-			pr_debug("STMMAC: PCS SGMII support enable\n");
+			netdev_dbg(priv->dev, "PCS SGMII support enabled\n");
 			priv->hw->pcs = STMMAC_PCS_SGMII;
 		}
 	}
@@ -848,15 +849,15 @@ static int stmmac_init_phy(struct net_device *dev)
 
 		snprintf(phy_id_fmt, MII_BUS_ID_SIZE + 3, PHY_ID_FMT, bus_id,
 			 priv->plat->phy_addr);
-		pr_debug("stmmac_init_phy:  trying to attach to %s\n",
-			 phy_id_fmt);
+		netdev_dbg(priv->dev, "stmmac_init_phy: trying to attach to %s\n",
+			   phy_id_fmt);
 
 		phydev = phy_connect(dev, phy_id_fmt, &stmmac_adjust_link,
 				     interface);
 	}
 
 	if (IS_ERR_OR_NULL(phydev)) {
-		pr_err("%s: Could not attach to PHY\n", dev->name);
+		netdev_err(priv->dev, "Could not attach to PHY\n");
 		if (!phydev)
 			return -ENODEV;
 
@@ -889,8 +890,9 @@ static int stmmac_init_phy(struct net_device *dev)
 	if (phydev->is_pseudo_fixed_link)
 		phydev->irq = PHY_POLL;
 
-	pr_debug("stmmac_init_phy:  %s: attached to PHY (UID 0x%x)"
-		 " Link = %d\n", dev->name, phydev->phy_id, phydev->link);
+	netdev_dbg(priv->dev,
+		   "stmmac_init_phy: attached to PHY (UID 0x%x) Link = %d\n",
+		   phydev->phy_id, phydev->link);
 
 	return 0;
 }
@@ -976,7 +978,8 @@ static int stmmac_init_rx_buffers(struct stmmac_priv *priv, struct dma_desc *p,
 
 	skb = __netdev_alloc_skb_ip_align(priv->dev, priv->dma_buf_sz, flags);
 	if (!skb) {
-		pr_err("%s: Rx init fails; skb is NULL\n", __func__);
+		netdev_err(priv->dev,
+			   "%s: Rx init fails; skb is NULL\n", __func__);
 		return -ENOMEM;
 	}
 	priv->rx_skbuff[i] = skb;
@@ -984,7 +987,7 @@ static int stmmac_init_rx_buffers(struct stmmac_priv *priv, struct dma_desc *p,
 						priv->dma_buf_sz,
 						DMA_FROM_DEVICE);
 	if (dma_mapping_error(priv->device, priv->rx_skbuff_dma[i])) {
-		pr_err("%s: DMA mapping error\n", __func__);
+		netdev_err(priv->dev, "%s: DMA mapping error\n", __func__);
 		dev_kfree_skb_any(skb);
 		return -EINVAL;
 	}
@@ -1035,11 +1038,12 @@ static int init_dma_desc_rings(struct net_device *dev, gfp_t flags)
 	priv->dma_buf_sz = bfsize;
 
 	if (netif_msg_probe(priv)) {
-		pr_debug("(%s) dma_rx_phy=0x%08x dma_tx_phy=0x%08x\n", __func__,
-			 (u32) priv->dma_rx_phy, (u32) priv->dma_tx_phy);
+		netdev_dbg(priv->dev, "(%s) dma_rx_phy=0x%08x dma_tx_phy=0x%08x\n",
+			   __func__, (u32)priv->dma_rx_phy,
+			   (u32)priv->dma_tx_phy);
 
 		/* RX INITIALIZATION */
-		pr_debug("\tSKB addresses:\nskb\t\tskb data\tdma data\n");
+		netdev_dbg(priv->dev, "SKB addresses:\nskb\t\tskb data\tdma data\n");
 	}
 	for (i = 0; i < DMA_RX_SIZE; i++) {
 		struct dma_desc *p;
@@ -1053,7 +1057,8 @@ static int init_dma_desc_rings(struct net_device *dev, gfp_t flags)
 			goto err_init_rx_buffers;
 
 		if (netif_msg_probe(priv))
-			pr_debug("[%p]\t[%p]\t[%x]\n", priv->rx_skbuff[i],
+			netdev_dbg(priv->dev, "[%p]\t[%p]\t[%x]\n",
+				   priv->rx_skbuff[i],
 				 priv->rx_skbuff[i]->data,
 				 (unsigned int)priv->rx_skbuff_dma[i]);
 	}
@@ -1386,7 +1391,8 @@ static void stmmac_tx_clean(struct stmmac_priv *priv)
 		if (netif_queue_stopped(priv->dev) &&
 		    stmmac_tx_avail(priv) > STMMAC_TX_THRESH) {
 			if (netif_msg_tx_done(priv))
-				pr_debug("%s: restart transmit\n", __func__);
+				netdev_dbg(priv->dev, "%s: restart transmit\n",
+					   __func__);
 			netif_wake_queue(priv->dev);
 		}
 		netif_tx_unlock(priv->dev);
@@ -1497,7 +1503,7 @@ static void stmmac_mmc_setup(struct stmmac_priv *priv)
 		dwmac_mmc_ctrl(priv->mmcaddr, mode);
 		memset(&priv->mmc, 0, sizeof(struct stmmac_counters));
 	} else
-		pr_info(" No MAC Management Counters available\n");
+		netdev_info(priv->dev, "No MAC Management Counters available\n");
 }
 
 /**
@@ -1510,18 +1516,18 @@ static void stmmac_mmc_setup(struct stmmac_priv *priv)
 static void stmmac_selec_desc_mode(struct stmmac_priv *priv)
 {
 	if (priv->plat->enh_desc) {
-		pr_info(" Enhanced/Alternate descriptors\n");
+		dev_info(priv->device, "Enhanced/Alternate descriptors\n");
 
 		/* GMAC older than 3.50 has no extended descriptors */
 		if (priv->synopsys_id >= DWMAC_CORE_3_50) {
-			pr_info("\tEnabled extended descriptors\n");
+			dev_info(priv->device, "Enabled extended descriptors\n");
 			priv->extend_desc = 1;
 		} else
-			pr_warn("Extended descriptors not supported\n");
+			dev_warn(priv->device, "Extended descriptors not supported\n");
 
 		priv->hw->desc = &enh_desc_ops;
 	} else {
-		pr_info(" Normal descriptors\n");
+		dev_info(priv->device, "Normal descriptors\n");
 		priv->hw->desc = &ndesc_ops;
 	}
 }
@@ -1562,8 +1568,8 @@ static void stmmac_check_ether_addr(struct stmmac_priv *priv)
 					     priv->dev->dev_addr, 0);
 		if (!is_valid_ether_addr(priv->dev->dev_addr))
 			eth_hw_addr_random(priv->dev);
-		pr_info("%s: device MAC address %pM\n", priv->dev->name,
-			priv->dev->dev_addr);
+		netdev_info(priv->dev, "device MAC address %pM\n",
+			    priv->dev->dev_addr);
 	}
 }
 
@@ -1671,7 +1677,8 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 	/* DMA initialization and SW reset */
 	ret = stmmac_init_dma_engine(priv);
 	if (ret < 0) {
-		pr_err("%s: DMA engine initialization failed\n", __func__);
+		netdev_err(priv->dev, "%s: DMA engine initialization failed\n",
+			   __func__);
 		return ret;
 	}
 
@@ -1700,7 +1707,7 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 
 	ret = priv->hw->mac->rx_ipc(priv->hw);
 	if (!ret) {
-		pr_warn(" RX IPC Checksum Offload disabled\n");
+		netdev_warn(priv->dev, "RX IPC Checksum Offload disabled\n");
 		priv->plat->rx_coe = STMMAC_RX_COE_NONE;
 		priv->hw->rx_csum = 0;
 	}
@@ -1725,10 +1732,11 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 #ifdef CONFIG_DEBUG_FS
 	ret = stmmac_init_fs(dev);
 	if (ret < 0)
-		pr_warn("%s: failed debugFS registration\n", __func__);
+		netdev_warn(priv->dev, "%s: failed debugFS registration\n",
+			    __func__);
 #endif
 	/* Start the ball rolling... */
-	pr_debug("%s: DMA RX/TX processes started...\n", dev->name);
+	netdev_dbg(priv->dev, "DMA RX/TX processes started...\n");
 	priv->hw->dma->start_tx(priv->ioaddr);
 	priv->hw->dma->start_rx(priv->ioaddr);
 
@@ -1783,8 +1791,9 @@ static int stmmac_open(struct net_device *dev)
 	    priv->hw->pcs != STMMAC_PCS_RTBI) {
 		ret = stmmac_init_phy(dev);
 		if (ret) {
-			pr_err("%s: Cannot attach to PHY (error: %d)\n",
-			       __func__, ret);
+			netdev_err(priv->dev,
+				   "%s: Cannot attach to PHY (error: %d)\n",
+				   __func__, ret);
 			return ret;
 		}
 	}
@@ -1798,19 +1807,21 @@ static int stmmac_open(struct net_device *dev)
 
 	ret = alloc_dma_desc_resources(priv);
 	if (ret < 0) {
-		pr_err("%s: DMA descriptors allocation failed\n", __func__);
+		netdev_err(priv->dev, "%s: DMA descriptors allocation failed\n",
+			   __func__);
 		goto dma_desc_error;
 	}
 
 	ret = init_dma_desc_rings(dev, GFP_KERNEL);
 	if (ret < 0) {
-		pr_err("%s: DMA descriptors initialization failed\n", __func__);
+		netdev_err(priv->dev, "%s: DMA descriptors initialization failed\n",
+			   __func__);
 		goto init_error;
 	}
 
 	ret = stmmac_hw_setup(dev, true);
 	if (ret < 0) {
-		pr_err("%s: Hw setup failed\n", __func__);
+		netdev_err(priv->dev, "%s: Hw setup failed\n", __func__);
 		goto init_error;
 	}
 
@@ -1823,8 +1834,9 @@ static int stmmac_open(struct net_device *dev)
 	ret = request_irq(dev->irq, stmmac_interrupt,
 			  IRQF_SHARED, dev->name, dev);
 	if (unlikely(ret < 0)) {
-		pr_err("%s: ERROR: allocating the IRQ %d (error: %d)\n",
-		       __func__, dev->irq, ret);
+		netdev_err(priv->dev,
+			   "%s: ERROR: allocating the IRQ %d (error: %d)\n",
+			   __func__, dev->irq, ret);
 		goto init_error;
 	}
 
@@ -1833,8 +1845,9 @@ static int stmmac_open(struct net_device *dev)
 		ret = request_irq(priv->wol_irq, stmmac_interrupt,
 				  IRQF_SHARED, dev->name, dev);
 		if (unlikely(ret < 0)) {
-			pr_err("%s: ERROR: allocating the WoL IRQ %d (%d)\n",
-			       __func__, priv->wol_irq, ret);
+			netdev_err(priv->dev,
+				   "%s: ERROR: allocating the WoL IRQ %d (%d)\n",
+				   __func__, priv->wol_irq, ret);
 			goto wolirq_error;
 		}
 	}
@@ -1844,8 +1857,9 @@ static int stmmac_open(struct net_device *dev)
 		ret = request_irq(priv->lpi_irq, stmmac_interrupt, IRQF_SHARED,
 				  dev->name, dev);
 		if (unlikely(ret < 0)) {
-			pr_err("%s: ERROR: allocating the LPI IRQ %d (%d)\n",
-			       __func__, priv->lpi_irq, ret);
+			netdev_err(priv->dev,
+				   "%s: ERROR: allocating the LPI IRQ %d (%d)\n",
+				   __func__, priv->lpi_irq, ret);
 			goto lpiirq_error;
 		}
 	}
@@ -2008,7 +2022,9 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (!netif_queue_stopped(dev)) {
 			netif_stop_queue(dev);
 			/* This is a hard error, log it. */
-			pr_err("%s: Tx Ring full when queue awake\n", __func__);
+			netdev_err(priv->dev,
+				   "%s: Tx Ring full when queue awake\n",
+				   __func__);
 		}
 		spin_unlock(&priv->tx_lock);
 		return NETDEV_TX_BUSY;
@@ -2082,7 +2098,8 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	if (unlikely(stmmac_tx_avail(priv) <= (MAX_SKB_FRAGS + 1))) {
 		if (netif_msg_hw(priv))
-			pr_debug("%s: stop transmitted packets\n", __func__);
+			netdev_dbg(priv->dev, "%s: stop transmitted packets\n",
+				   __func__);
 		netif_stop_queue(dev);
 	}
 
@@ -2188,7 +2205,9 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 		if (!netif_queue_stopped(dev)) {
 			netif_stop_queue(dev);
 			/* This is a hard error, log it. */
-			pr_err("%s: Tx Ring full when queue awake\n", __func__);
+			netdev_err(priv->dev,
+				   "%s: Tx Ring full when queue awake\n",
+				   __func__);
 		}
 		return NETDEV_TX_BUSY;
 	}
@@ -2263,9 +2282,10 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (netif_msg_pktdata(priv)) {
 		void *tx_head;
 
-		pr_debug("%s: curr=%d dirty=%d f=%d, e=%d, first=%p, nfrags=%d",
-			 __func__, priv->cur_tx, priv->dirty_tx, first_entry,
-			 entry, first, nfrags);
+		netdev_dbg(priv->dev,
+			   "%s: curr=%d dirty=%d f=%d, e=%d, first=%p, nfrags=%d",
+			   __func__, priv->cur_tx, priv->dirty_tx, first_entry,
+			   entry, first, nfrags);
 
 		if (priv->extend_desc)
 			tx_head = (void *)priv->dma_etx;
@@ -2274,13 +2294,14 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 
 		priv->hw->desc->display_ring(tx_head, DMA_TX_SIZE, false);
 
-		pr_debug(">>> frame to be transmitted: ");
+		netdev_dbg(priv->dev, ">>> frame to be transmitted: ");
 		print_pkt(skb->data, skb->len);
 	}
 
 	if (unlikely(stmmac_tx_avail(priv) <= (MAX_SKB_FRAGS + 1))) {
 		if (netif_msg_hw(priv))
-			pr_debug("%s: stop transmitted packets\n", __func__);
+			netdev_dbg(priv->dev,
+				   "%s: stop transmitted packets\n", __func__);
 		netif_stop_queue(dev);
 	}
 
@@ -2357,7 +2378,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 
 dma_map_err:
 	spin_unlock(&priv->tx_lock);
-	dev_err(priv->device, "Tx dma map failed\n");
+	netdev_err(priv->dev, "Tx DMA map failed\n");
 	dev_kfree_skb(skb);
 	priv->dev->stats.tx_dropped++;
 	return NETDEV_TX_OK;
@@ -2428,7 +2449,7 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv)
 					   DMA_FROM_DEVICE);
 			if (dma_mapping_error(priv->device,
 					      priv->rx_skbuff_dma[entry])) {
-				dev_err(priv->device, "Rx dma map failed\n");
+				netdev_err(priv->dev, "Rx DMA map failed\n");
 				dev_kfree_skb(skb);
 				break;
 			}
@@ -2446,7 +2467,8 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv)
 				priv->rx_zeroc_thresh--;
 
 			if (netif_msg_rx_status(priv))
-				pr_debug("\trefill entry #%d\n", entry);
+				netdev_dbg(priv->dev,
+					   "refill entry #%d\n", entry);
 		}
 		wmb();
 
@@ -2479,7 +2501,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit)
 	if (netif_msg_rx_status(priv)) {
 		void *rx_head;
 
-		pr_debug("%s: descriptor ring:\n", __func__);
+		netdev_dbg(priv->dev, "%s: descriptor ring:\n", __func__);
 		if (priv->extend_desc)
 			rx_head = (void *)priv->dma_erx;
 		else
@@ -2549,9 +2571,9 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit)
 			 *  ignored
 			 */
 			if (frame_len > priv->dma_buf_sz) {
-				pr_err("%s: len %d larger than size (%d)\n",
-				       priv->dev->name, frame_len,
-				       priv->dma_buf_sz);
+				netdev_err(priv->dev,
+					   "len %d larger than size (%d)\n",
+					   frame_len, priv->dma_buf_sz);
 				priv->dev->stats.rx_length_errors++;
 				break;
 			}
@@ -2563,11 +2585,11 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit)
 				frame_len -= ETH_FCS_LEN;
 
 			if (netif_msg_rx_status(priv)) {
-				pr_debug("\tdesc: %p [entry %d] buff=0x%x\n",
-					p, entry, des);
+				netdev_dbg(priv->dev, "\tdesc: %p [entry %d] buff=0x%x\n",
+					   p, entry, des);
 				if (frame_len > ETH_FRAME_LEN)
-					pr_debug("\tframe size %d, COE: %d\n",
-						 frame_len, status);
+					netdev_dbg(priv->dev, "frame size %d, COE: %d\n",
+						   frame_len, status);
 			}
 
 			/* The zero-copy is always used for all the sizes
@@ -2604,8 +2626,9 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit)
 			} else {
 				skb = priv->rx_skbuff[entry];
 				if (unlikely(!skb)) {
-					pr_err("%s: Inconsistent Rx chain\n",
-					       priv->dev->name);
+					netdev_err(priv->dev,
+						   "%s: Inconsistent Rx chain\n",
+						   priv->dev->name);
 					priv->dev->stats.rx_dropped++;
 					break;
 				}
@@ -2623,7 +2646,8 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit)
 			stmmac_get_rx_hwtstamp(priv, entry, skb);
 
 			if (netif_msg_pktdata(priv)) {
-				pr_debug("frame received (%dbytes)", frame_len);
+				netdev_dbg(priv->dev, "frame received (%dbytes)",
+					   frame_len);
 				print_pkt(skb->data, frame_len);
 			}
 
@@ -2720,8 +2744,10 @@ static void stmmac_set_rx_mode(struct net_device *dev)
  */
 static int stmmac_change_mtu(struct net_device *dev, int new_mtu)
 {
+	struct stmmac_priv *priv = netdev_priv(dev);
+
 	if (netif_running(dev)) {
-		pr_err("%s: must be stopped to change its MTU\n", dev->name);
+		netdev_err(priv->dev, "must be stopped to change its MTU\n");
 		return -EBUSY;
 	}
 
@@ -2800,7 +2826,7 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id)
 		pm_wakeup_event(priv->device, 0);
 
 	if (unlikely(!dev)) {
-		pr_err("%s: invalid dev pointer\n", __func__);
+		netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__);
 		return IRQ_NONE;
 	}
 
@@ -3032,8 +3058,7 @@ static int stmmac_init_fs(struct net_device *dev)
 	priv->dbgfs_dir = debugfs_create_dir(dev->name, stmmac_fs_dir);
 
 	if (!priv->dbgfs_dir || IS_ERR(priv->dbgfs_dir)) {
-		pr_err("ERROR %s/%s, debugfs create directory failed\n",
-		       STMMAC_RESOURCE_NAME, dev->name);
+		netdev_err(priv->dev, "ERROR failed to create debugfs directory\n");
 
 		return -ENOMEM;
 	}
@@ -3045,7 +3070,7 @@ static int stmmac_init_fs(struct net_device *dev)
 				    &stmmac_rings_status_fops);
 
 	if (!priv->dbgfs_rings_status || IS_ERR(priv->dbgfs_rings_status)) {
-		pr_info("ERROR creating stmmac ring debugfs file\n");
+		netdev_err(priv->dev, "ERROR creating stmmac ring debugfs file\n");
 		debugfs_remove_recursive(priv->dbgfs_dir);
 
 		return -ENOMEM;
@@ -3057,7 +3082,7 @@ static int stmmac_init_fs(struct net_device *dev)
 					    dev, &stmmac_dma_cap_fops);
 
 	if (!priv->dbgfs_dma_cap || IS_ERR(priv->dbgfs_dma_cap)) {
-		pr_info("ERROR creating stmmac MMC debugfs file\n");
+		netdev_err(priv->dev, "ERROR creating stmmac MMC debugfs file\n");
 		debugfs_remove_recursive(priv->dbgfs_dir);
 
 		return -ENOMEM;
@@ -3129,11 +3154,11 @@ static int stmmac_hw_init(struct stmmac_priv *priv)
 	} else {
 		if (chain_mode) {
 			priv->hw->mode = &chain_mode_ops;
-			pr_info(" Chain mode enabled\n");
+			dev_info(priv->device, "Chain mode enabled\n");
 			priv->mode = STMMAC_CHAIN_MODE;
 		} else {
 			priv->hw->mode = &ring_mode_ops;
-			pr_info(" Ring mode enabled\n");
+			dev_info(priv->device, "Ring mode enabled\n");
 			priv->mode = STMMAC_RING_MODE;
 		}
 	}
@@ -3141,7 +3166,7 @@ static int stmmac_hw_init(struct stmmac_priv *priv)
 	/* Get the HW capability (new GMAC newer than 3.50a) */
 	priv->hw_cap_support = stmmac_get_hw_features(priv);
 	if (priv->hw_cap_support) {
-		pr_info(" DMA HW capability register supported");
+		dev_info(priv->device, "DMA HW capability register supported\n");
 
 		/* We can override some gmac/dma configuration fields: e.g.
 		 * enh_desc, tx_coe (e.g. that are passed through the
@@ -3166,8 +3191,9 @@ static int stmmac_hw_init(struct stmmac_priv *priv)
 		else if (priv->dma_cap.rx_coe_type1)
 			priv->plat->rx_coe = STMMAC_RX_COE_TYPE1;
 
-	} else
-		pr_info(" No HW DMA feature register supported");
+	} else {
+		dev_info(priv->device, "No HW DMA feature register supported\n");
+	}
 
 	/* To use alternate (extended), normal or GMAC4 descriptor structures */
 	if (priv->synopsys_id >= DWMAC_CORE_4_00)
@@ -3177,20 +3203,20 @@ static int stmmac_hw_init(struct stmmac_priv *priv)
 
 	if (priv->plat->rx_coe) {
 		priv->hw->rx_csum = priv->plat->rx_coe;
-		pr_info(" RX Checksum Offload Engine supported\n");
+		dev_info(priv->device, "RX Checksum Offload Engine supported\n");
 		if (priv->synopsys_id < DWMAC_CORE_4_00)
-			pr_info("\tCOE Type %d\n", priv->hw->rx_csum);
+			dev_info(priv->device, "COE Type %d\n", priv->hw->rx_csum);
 	}
 	if (priv->plat->tx_coe)
-		pr_info(" TX Checksum insertion supported\n");
+		dev_info(priv->device, "TX Checksum insertion supported\n");
 
 	if (priv->plat->pmt) {
-		pr_info(" Wake-Up On Lan supported\n");
+		dev_info(priv->device, "Wake-Up On Lan supported\n");
 		device_set_wakeup_capable(priv->device, 1);
 	}
 
 	if (priv->dma_cap.tsoen)
-		pr_info(" TSO supported\n");
+		dev_info(priv->device, "TSO supported\n");
 
 	return 0;
 }
@@ -3249,8 +3275,8 @@ int stmmac_dvr_probe(struct device *device,
 
 	priv->stmmac_clk = devm_clk_get(priv->device, STMMAC_RESOURCE_NAME);
 	if (IS_ERR(priv->stmmac_clk)) {
-		dev_warn(priv->device, "%s: warning: cannot get CSR clock\n",
-			 __func__);
+		netdev_warn(priv->dev, "%s: warning: cannot get CSR clock\n",
+			    __func__);
 		/* If failed to obtain stmmac_clk and specific clk_csr value
 		 * is NOT passed from the platform, probe fail.
 		 */
@@ -3299,7 +3325,7 @@ int stmmac_dvr_probe(struct device *device,
 	if ((priv->plat->tso_en) && (priv->dma_cap.tsoen)) {
 		ndev->hw_features |= NETIF_F_TSO;
 		priv->tso = true;
-		pr_info(" TSO feature enabled\n");
+		dev_info(priv->device, "TSO feature enabled\n");
 	}
 	ndev->features |= ndev->hw_features | NETIF_F_HIGHDMA;
 	ndev->watchdog_timeo = msecs_to_jiffies(watchdog);
@@ -3328,7 +3354,7 @@ int stmmac_dvr_probe(struct device *device,
 	 */
 	if ((priv->synopsys_id >= DWMAC_CORE_3_50) && (!priv->plat->riwt_off)) {
 		priv->use_riwt = 1;
-		pr_info(" Enable RX Mitigation via HW Watchdog Timer\n");
+		netdev_info(priv->dev, "Enable RX Mitigation via HW Watchdog Timer\n");
 	}
 
 	netif_napi_add(ndev, &priv->napi, stmmac_poll, 64);
@@ -3338,7 +3364,8 @@ int stmmac_dvr_probe(struct device *device,
 
 	ret = register_netdev(ndev);
 	if (ret) {
-		pr_err("%s: ERROR %i registering the device\n", __func__, ret);
+		netdev_err(priv->dev, "%s: ERROR %i registering the device\n",
+			   __func__, ret);
 		goto error_netdev_register;
 	}
 
@@ -3361,8 +3388,9 @@ int stmmac_dvr_probe(struct device *device,
 		/* MDIO bus Registration */
 		ret = stmmac_mdio_register(ndev);
 		if (ret < 0) {
-			pr_debug("%s: MDIO bus (id: %d) registration failed",
-				 __func__, priv->plat->bus_id);
+			netdev_err(priv->dev,
+				   "%s: MDIO bus (id: %d) registration failed",
+				   __func__, priv->plat->bus_id);
 			goto error_mdio_register;
 		}
 	}
@@ -3395,7 +3423,7 @@ int stmmac_dvr_remove(struct device *dev)
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct stmmac_priv *priv = netdev_priv(ndev);
 
-	pr_info("%s:\n\tremoving driver", __func__);
+	netdev_info(priv->dev, "%s: removing driver", __func__);
 
 	priv->hw->dma->stop_rx(priv->ioaddr);
 	priv->hw->dma->stop_tx(priv->ioaddr);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index ec29585..e3216e5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -260,7 +260,7 @@ int stmmac_mdio_reset(struct mii_bus *bus)
 #endif
 
 	if (data->phy_reset) {
-		pr_debug("stmmac_mdio_reset: calling phy_reset\n");
+		netdev_dbg(ndev, "stmmac_mdio_reset: calling phy_reset\n");
 		data->phy_reset(priv->plat->bsp_priv);
 	}
 
@@ -325,7 +325,7 @@ int stmmac_mdio_register(struct net_device *ndev)
 	else
 		err = mdiobus_register(new_bus);
 	if (err != 0) {
-		pr_err("%s: Cannot register as MDIO bus\n", new_bus->name);
+		netdev_err(ndev, "Cannot register the MDIO bus\n");
 		goto bus_register_fail;
 	}
 
@@ -372,16 +372,16 @@ int stmmac_mdio_register(struct net_device *ndev)
 				irq_str = irq_num;
 				break;
 			}
-			pr_info("%s: PHY ID %08x at %d IRQ %s (%s)%s\n",
-				ndev->name, phydev->phy_id, addr,
-				irq_str, phydev_name(phydev),
-				act ? " active" : "");
+			netdev_info(ndev, "PHY ID %08x at %d IRQ %s (%s)%s\n",
+				    phydev->phy_id, addr,
+				    irq_str, phydev_name(phydev),
+				    act ? " active" : "");
 			found = 1;
 		}
 	}
 
 	if (!found && !mdio_node) {
-		pr_warn("%s: No PHY found\n", ndev->name);
+		netdev_warn(ndev, "No PHY found\n");
 		mdiobus_unregister(new_bus);
 		mdiobus_free(new_bus);
 		return -ENODEV;
-- 
2.7.3

^ permalink raw reply related

* [PATCH 2/3] net: stmmac: replace hardcoded function name by __func__
From: Corentin Labbe @ 2016-11-16 19:09 UTC (permalink / raw)
  To: peppe.cavallaro, alexandre.torgue; +Cc: netdev, linux-kernel, LABBE Corentin
In-Reply-To: <1479323381-26639-1-git-send-email-clabbe.montjoie@gmail.com>

From: LABBE Corentin <clabbe.montjoie@gmail.com>

Some messages have the function name hardcoded.
It is better to use __func__ instead.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 791daf4..d160bdb 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -849,7 +849,7 @@ static int stmmac_init_phy(struct net_device *dev)
 
 		snprintf(phy_id_fmt, MII_BUS_ID_SIZE + 3, PHY_ID_FMT, bus_id,
 			 priv->plat->phy_addr);
-		netdev_dbg(priv->dev, "stmmac_init_phy: trying to attach to %s\n",
+		netdev_dbg(priv->dev, "%s: trying to attach to %s\n", __func__,
 			   phy_id_fmt);
 
 		phydev = phy_connect(dev, phy_id_fmt, &stmmac_adjust_link,
@@ -890,9 +890,8 @@ static int stmmac_init_phy(struct net_device *dev)
 	if (phydev->is_pseudo_fixed_link)
 		phydev->irq = PHY_POLL;
 
-	netdev_dbg(priv->dev,
-		   "stmmac_init_phy: attached to PHY (UID 0x%x) Link = %d\n",
-		   phydev->phy_id, phydev->link);
+	netdev_dbg(priv->dev, "%s: attached to PHY (UID 0x%x) Link = %d\n",
+		   __func__, phydev->phy_id, phydev->link);
 
 	return 0;
 }
-- 
2.7.3

^ permalink raw reply related

* Re: [PATCH net] sctp: use new rhlist interface on sctp transport rhashtable
From: Neil Horman @ 2016-11-16 19:09 UTC (permalink / raw)
  To: Xin Long
  Cc: network dev, linux-sctp, davem, Marcelo Ricardo Leitner,
	Vlad Yasevich, Herbert Xu, Phil Sutter
In-Reply-To: <CADvbK_eOu5oZJ67r8WJ-Vc0O8fLj5X4y-ROeE2aX-12Xd2jzDw@mail.gmail.com>

On Wed, Nov 16, 2016 at 09:34:52PM +0800, Xin Long wrote:
> On Wed, Nov 16, 2016 at 2:04 AM, Neil Horman <nhorman@tuxdriver.com> wrote:
> > On Tue, Nov 15, 2016 at 11:23:11PM +0800, Xin Long wrote:
> >> Now sctp transport rhashtable uses hash(lport, dport, daddr) as the key
> >> to hash a node to one chain. If in one host thousands of assocs connect
> >> to one server with the same lport and different laddrs (although it's
> >> not a normal case), all the transports would be hashed into the same
> >> chain.
> >>
> >> It may cause to keep returning -EBUSY when inserting a new node, as the
> >> chain is too long and sctp inserts a transport node in a loop, which
> >> could even lead to system hangs there.
> >>
> >> The new rhlist interface works for this case that there are many nodes
> >> with the same key in one chain. It puts them into a list then makes this
> >> list be as a node of the chain.
> >>
> >> This patch is to replace rhashtable_ interface with rhltable_ interface.
> >> Since a chain would not be too long and it would not return -EBUSY with
> >> this fix when inserting a node, the reinsert loop is also removed here.
> >>
> >> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> >
> > Does this really buy us anything in this case though?  If the use case is that a
> > majority of the associations map to the same key, then you might avoid EBUSY for
> > the individual associaion that doesn't map there, but you still have to cope
> > with a huge linear search for the majority of the keys.
> >
> 
> This patch is NOT for improving performance, it is to reorganize
> transports in rhashtable in another way to avoid EBUSY, rhlist is born
> for this.
> 
Never said it was a performance issue, just suggested that avoiding EBUSY
returns on inserts might be handled in other ways.

> Before this patch, the transport insert codes are pretty bad, if it returns
> EBUSY, it would retry in a loop. now this patch avoid this and even
> removed that loop, it's a fix for this issue.
> 
> > Might be more reasonable to mix saddr into the hash function so that your use
> > case gets spread through the hash table more evenly.
> 
> we can not do this:
> 1. it will increase rhashtable's size when using multihome, if a host has
>     N addrs, the size for one assoc will be N times bigger than now.
> 2. the hash node is inside transport, if we mix saddrs, when using multihome
>     one transport would be hashed many times with different saddrs, we
>     would have to define new structure to link transport.
> we do not need to do this:
> 1. as the changelog said, "it's not a normal case", in one host (client), it
> shouldn't connect to the same server with different saddrs.
> 2. now as long as paddr+dport+lport are different, rhashtable can hash
> it evenly.

It's the 'not a normal case' that's getting me.  Making a non-trivial change like
this for a corner use case typically makes me suspicious, but your points
regarding multiple hash entries being needed when saddr is used in a multihome
scenario make sense to me.  And looking at the rhltable code more closely, I
think this makes more sense.

Acked-by: Neil Horman <nhorman@tuxdriver.com>

> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* [PATCH 3/3] net: stmmac: replace if (netif_msg_type) by their netif_xxx counterpart
From: Corentin Labbe @ 2016-11-16 19:09 UTC (permalink / raw)
  To: peppe.cavallaro, alexandre.torgue; +Cc: netdev, linux-kernel, LABBE Corentin
In-Reply-To: <1479323381-26639-1-git-send-email-clabbe.montjoie@gmail.com>

From: LABBE Corentin <clabbe.montjoie@gmail.com>

As suggested by Joe Perches, we can replace all
if (netif_msg_type(priv)) netdev_xxx(priv->dev, ...)
with the simpler macro netif_xxx(priv, type, priv->dev, ...)

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 49 ++++++++++-------------
 1 file changed, 21 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index d160bdb..fbd1cd7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -755,10 +755,9 @@ static void stmmac_adjust_link(struct net_device *dev)
 				stmmac_hw_fix_mac_speed(priv);
 				break;
 			default:
-				if (netif_msg_link(priv))
-					netdev_warn(priv->dev,
-						    "Speed (%d) not 10/100\n",
-						    phydev->speed);
+				netif_warn(priv, link, priv->dev,
+					   "Speed (%d) not 10/100\n",
+					   phydev->speed);
 				break;
 			}
 
@@ -1036,14 +1035,14 @@ static int init_dma_desc_rings(struct net_device *dev, gfp_t flags)
 
 	priv->dma_buf_sz = bfsize;
 
-	if (netif_msg_probe(priv)) {
-		netdev_dbg(priv->dev, "(%s) dma_rx_phy=0x%08x dma_tx_phy=0x%08x\n",
-			   __func__, (u32)priv->dma_rx_phy,
-			   (u32)priv->dma_tx_phy);
+	netif_dbg(priv, probe, priv->dev,
+		  "(%s) dma_rx_phy=0x%08x dma_tx_phy=0x%08x\n",
+		  __func__, (u32)priv->dma_rx_phy, (u32)priv->dma_tx_phy);
+
+	/* RX INITIALIZATION */
+	netif_dbg(priv, probe, priv->dev,
+		  "SKB addresses:\nskb\t\tskb data\tdma data\n");
 
-		/* RX INITIALIZATION */
-		netdev_dbg(priv->dev, "SKB addresses:\nskb\t\tskb data\tdma data\n");
-	}
 	for (i = 0; i < DMA_RX_SIZE; i++) {
 		struct dma_desc *p;
 		if (priv->extend_desc)
@@ -1055,11 +1054,9 @@ static int init_dma_desc_rings(struct net_device *dev, gfp_t flags)
 		if (ret)
 			goto err_init_rx_buffers;
 
-		if (netif_msg_probe(priv))
-			netdev_dbg(priv->dev, "[%p]\t[%p]\t[%x]\n",
-				   priv->rx_skbuff[i],
-				 priv->rx_skbuff[i]->data,
-				 (unsigned int)priv->rx_skbuff_dma[i]);
+		netif_dbg(priv, probe, priv->dev, "[%p]\t[%p]\t[%x]\n",
+			  priv->rx_skbuff[i], priv->rx_skbuff[i]->data,
+			  (unsigned int)priv->rx_skbuff_dma[i]);
 	}
 	priv->cur_rx = 0;
 	priv->dirty_rx = (unsigned int)(i - DMA_RX_SIZE);
@@ -1389,9 +1386,8 @@ static void stmmac_tx_clean(struct stmmac_priv *priv)
 		netif_tx_lock(priv->dev);
 		if (netif_queue_stopped(priv->dev) &&
 		    stmmac_tx_avail(priv) > STMMAC_TX_THRESH) {
-			if (netif_msg_tx_done(priv))
-				netdev_dbg(priv->dev, "%s: restart transmit\n",
-					   __func__);
+			netif_dbg(priv, tx_done, priv->dev,
+				  "%s: restart transmit\n", __func__);
 			netif_wake_queue(priv->dev);
 		}
 		netif_tx_unlock(priv->dev);
@@ -2096,9 +2092,8 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 	priv->cur_tx = STMMAC_GET_ENTRY(priv->cur_tx, DMA_TX_SIZE);
 
 	if (unlikely(stmmac_tx_avail(priv) <= (MAX_SKB_FRAGS + 1))) {
-		if (netif_msg_hw(priv))
-			netdev_dbg(priv->dev, "%s: stop transmitted packets\n",
-				   __func__);
+		netif_dbg(priv, hw, priv->dev, "%s: stop transmitted packets\n",
+			  __func__);
 		netif_stop_queue(dev);
 	}
 
@@ -2298,9 +2293,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	if (unlikely(stmmac_tx_avail(priv) <= (MAX_SKB_FRAGS + 1))) {
-		if (netif_msg_hw(priv))
-			netdev_dbg(priv->dev,
-				   "%s: stop transmitted packets\n", __func__);
+		netif_dbg(priv, hw, priv->dev, "%s: stop transmitted packets\n",
+			  __func__);
 		netif_stop_queue(dev);
 	}
 
@@ -2465,9 +2459,8 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv)
 			if (priv->rx_zeroc_thresh > 0)
 				priv->rx_zeroc_thresh--;
 
-			if (netif_msg_rx_status(priv))
-				netdev_dbg(priv->dev,
-					   "refill entry #%d\n", entry);
+			netif_dbg(priv, rx_status, priv->dev,
+				  "refill entry #%d\n", entry);
 		}
 		wmb();
 
-- 
2.7.3

^ permalink raw reply related

* Re: [BUG] X86: Removing inline decl on arch/x86/include/asm/desc.h.
From: Thomas Gleixner @ 2016-11-16 19:16 UTC (permalink / raw)
  To: Corcodel Marian
  Cc: netdev, Ingo Molnar, H. Peter Anvin,
	Realtek linux nic maintainers, x86, linux-kernel
In-Reply-To: <1479239164-7721-1-git-send-email-asd@marian1000.go.ro>

On Tue, 15 Nov 2016, Corcodel Marian wrote:
>  Inline declarations suppress warning message from compiler but
>  most of these functions was declared static and is not used local on file.

Huch? This is a header file and the functions are marked inline on purpose.

Can you please explain what you are trying to achieve and why you think
that this is a good idea?

Thanks,

	tglx

^ permalink raw reply

* Re: [PATCH net] sctp: use new rhlist interface on sctp transport rhashtable
From: Marcelo Ricardo Leitner @ 2016-11-16 19:17 UTC (permalink / raw)
  To: Xin Long
  Cc: network dev, linux-sctp, davem, Neil Horman, Vlad Yasevich,
	Herbert Xu, phil
In-Reply-To: <0a89ef8506db3bba6a37010bd5622cd145183ab4.1479223391.git.lucien.xin@gmail.com>

On Tue, Nov 15, 2016 at 11:23:11PM +0800, Xin Long wrote:
> Now sctp transport rhashtable uses hash(lport, dport, daddr) as the key
> to hash a node to one chain. If in one host thousands of assocs connect
> to one server with the same lport and different laddrs (although it's
> not a normal case), all the transports would be hashed into the same
> chain.
> 
> It may cause to keep returning -EBUSY when inserting a new node, as the
> chain is too long and sctp inserts a transport node in a loop, which
> could even lead to system hangs there.
> 
> The new rhlist interface works for this case that there are many nodes
> with the same key in one chain. It puts them into a list then makes this
> list be as a node of the chain.
> 
> This patch is to replace rhashtable_ interface with rhltable_ interface.
> Since a chain would not be too long and it would not return -EBUSY with
> this fix when inserting a node, the reinsert loop is also removed here.
> 
> Signed-off-by: Xin Long <lucien.xin@gmail.com>

Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

> ---
>  include/net/sctp/sctp.h    |  2 +-
>  include/net/sctp/structs.h |  4 +-
>  net/sctp/associola.c       |  8 +++-
>  net/sctp/input.c           | 93 ++++++++++++++++++++++++++--------------------
>  net/sctp/socket.c          |  7 +---
>  5 files changed, 64 insertions(+), 50 deletions(-)
> 
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index 31acc3f..f0dcaeb 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -164,7 +164,7 @@ void sctp_backlog_migrate(struct sctp_association *assoc,
>  			  struct sock *oldsk, struct sock *newsk);
>  int sctp_transport_hashtable_init(void);
>  void sctp_transport_hashtable_destroy(void);
> -void sctp_hash_transport(struct sctp_transport *t);
> +int sctp_hash_transport(struct sctp_transport *t);
>  void sctp_unhash_transport(struct sctp_transport *t);
>  struct sctp_transport *sctp_addrs_lookup_transport(
>  				struct net *net,
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index 11c3bf2..c5a2d83 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -124,7 +124,7 @@ extern struct sctp_globals {
>  	/* This is the sctp port control hash.	*/
>  	struct sctp_bind_hashbucket *port_hashtable;
>  	/* This is the hash of all transports. */
> -	struct rhashtable transport_hashtable;
> +	struct rhltable transport_hashtable;
>  
>  	/* Sizes of above hashtables. */
>  	int ep_hashsize;
> @@ -762,7 +762,7 @@ static inline int sctp_packet_empty(struct sctp_packet *packet)
>  struct sctp_transport {
>  	/* A list of transports. */
>  	struct list_head transports;
> -	struct rhash_head node;
> +	struct rhlist_head node;
>  
>  	/* Reference counting. */
>  	atomic_t refcnt;
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index f10d339..68428e1 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -700,11 +700,15 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
>  	/* Set the peer's active state. */
>  	peer->state = peer_state;
>  
> +	/* Add this peer into the transport hashtable */
> +	if (sctp_hash_transport(peer)) {
> +		sctp_transport_free(peer);
> +		return NULL;
> +	}
> +
>  	/* Attach the remote transport to our asoc.  */
>  	list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
>  	asoc->peer.transport_count++;
> -	/* Add this peer into the transport hashtable */
> -	sctp_hash_transport(peer);
>  
>  	/* If we do not yet have a primary path, set one.  */
>  	if (!asoc->peer.primary_path) {
> diff --git a/net/sctp/input.c b/net/sctp/input.c
> index a01a56e..458e506 100644
> --- a/net/sctp/input.c
> +++ b/net/sctp/input.c
> @@ -790,10 +790,9 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
>  
>  /* rhashtable for transport */
>  struct sctp_hash_cmp_arg {
> -	const struct sctp_endpoint	*ep;
> -	const union sctp_addr		*laddr;
> -	const union sctp_addr		*paddr;
> -	const struct net		*net;
> +	const union sctp_addr	*paddr;
> +	const struct net	*net;
> +	u16			lport;
>  };
>  
>  static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
> @@ -801,7 +800,6 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
>  {
>  	struct sctp_transport *t = (struct sctp_transport *)ptr;
>  	const struct sctp_hash_cmp_arg *x = arg->key;
> -	struct sctp_association *asoc;
>  	int err = 1;
>  
>  	if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr))
> @@ -809,19 +807,10 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
>  	if (!sctp_transport_hold(t))
>  		return err;
>  
> -	asoc = t->asoc;
> -	if (!net_eq(sock_net(asoc->base.sk), x->net))
> +	if (!net_eq(sock_net(t->asoc->base.sk), x->net))
> +		goto out;
> +	if (x->lport != htons(t->asoc->base.bind_addr.port))
>  		goto out;
> -	if (x->ep) {
> -		if (x->ep != asoc->ep)
> -			goto out;
> -	} else {
> -		if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port))
> -			goto out;
> -		if (!sctp_bind_addr_match(&asoc->base.bind_addr,
> -					  x->laddr, sctp_sk(asoc->base.sk)))
> -			goto out;
> -	}
>  
>  	err = 0;
>  out:
> @@ -851,11 +840,9 @@ static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed)
>  	const struct sctp_hash_cmp_arg *x = data;
>  	const union sctp_addr *paddr = x->paddr;
>  	const struct net *net = x->net;
> -	u16 lport;
> +	u16 lport = x->lport;
>  	u32 addr;
>  
> -	lport = x->ep ? htons(x->ep->base.bind_addr.port) :
> -			x->laddr->v4.sin_port;
>  	if (paddr->sa.sa_family == AF_INET6)
>  		addr = jhash(&paddr->v6.sin6_addr, 16, seed);
>  	else
> @@ -875,29 +862,32 @@ static const struct rhashtable_params sctp_hash_params = {
>  
>  int sctp_transport_hashtable_init(void)
>  {
> -	return rhashtable_init(&sctp_transport_hashtable, &sctp_hash_params);
> +	return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params);
>  }
>  
>  void sctp_transport_hashtable_destroy(void)
>  {
> -	rhashtable_destroy(&sctp_transport_hashtable);
> +	rhltable_destroy(&sctp_transport_hashtable);
>  }
>  
> -void sctp_hash_transport(struct sctp_transport *t)
> +int sctp_hash_transport(struct sctp_transport *t)
>  {
>  	struct sctp_hash_cmp_arg arg;
> +	int err;
>  
>  	if (t->asoc->temp)
> -		return;
> +		return 0;
>  
> -	arg.ep = t->asoc->ep;
> -	arg.paddr = &t->ipaddr;
>  	arg.net   = sock_net(t->asoc->base.sk);
> +	arg.paddr = &t->ipaddr;
> +	arg.lport = htons(t->asoc->base.bind_addr.port);
>  
> -reinsert:
> -	if (rhashtable_lookup_insert_key(&sctp_transport_hashtable, &arg,
> -					 &t->node, sctp_hash_params) == -EBUSY)
> -		goto reinsert;
> +	err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
> +				  &t->node, sctp_hash_params);
> +	if (err)
> +		pr_err_once("insert transport fail, errno %d\n", err);
> +
> +	return err;
>  }
>  
>  void sctp_unhash_transport(struct sctp_transport *t)
> @@ -905,39 +895,62 @@ void sctp_unhash_transport(struct sctp_transport *t)
>  	if (t->asoc->temp)
>  		return;
>  
> -	rhashtable_remove_fast(&sctp_transport_hashtable, &t->node,
> -			       sctp_hash_params);
> +	rhltable_remove(&sctp_transport_hashtable, &t->node,
> +			sctp_hash_params);
>  }
>  
> +/* return a transport with holding it */
>  struct sctp_transport *sctp_addrs_lookup_transport(
>  				struct net *net,
>  				const union sctp_addr *laddr,
>  				const union sctp_addr *paddr)
>  {
> +	struct rhlist_head *tmp, *list;
> +	struct sctp_transport *t;
>  	struct sctp_hash_cmp_arg arg = {
> -		.ep    = NULL,
> -		.laddr = laddr,
>  		.paddr = paddr,
>  		.net   = net,
> +		.lport = laddr->v4.sin_port,
>  	};
>  
> -	return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
> -				      sctp_hash_params);
> +	list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> +			       sctp_hash_params);
> +
> +	rhl_for_each_entry_rcu(t, tmp, list, node) {
> +		if (!sctp_transport_hold(t))
> +			continue;
> +
> +		if (sctp_bind_addr_match(&t->asoc->base.bind_addr,
> +					 laddr, sctp_sk(t->asoc->base.sk)))
> +			return t;
> +		sctp_transport_put(t);
> +	}
> +
> +	return NULL;
>  }
>  
> +/* return a transport without holding it, as it's only used under sock lock */
>  struct sctp_transport *sctp_epaddr_lookup_transport(
>  				const struct sctp_endpoint *ep,
>  				const union sctp_addr *paddr)
>  {
>  	struct net *net = sock_net(ep->base.sk);
> +	struct rhlist_head *tmp, *list;
> +	struct sctp_transport *t;
>  	struct sctp_hash_cmp_arg arg = {
> -		.ep    = ep,
>  		.paddr = paddr,
>  		.net   = net,
> +		.lport = htons(ep->base.bind_addr.port),
>  	};
>  
> -	return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
> -				      sctp_hash_params);
> +	list = rhltable_lookup(&sctp_transport_hashtable, &arg,
> +			       sctp_hash_params);
> +
> +	rhl_for_each_entry_rcu(t, tmp, list, node)
> +		if (ep == t->asoc->ep)
> +			return t;
> +
> +	return NULL;
>  }
>  
>  /* Look up an association. */
> @@ -951,7 +964,7 @@ static struct sctp_association *__sctp_lookup_association(
>  	struct sctp_association *asoc = NULL;
>  
>  	t = sctp_addrs_lookup_transport(net, local, peer);
> -	if (!t || !sctp_transport_hold(t))
> +	if (!t)
>  		goto out;
>  
>  	asoc = t->asoc;
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index f23ad91..d5f4b4a 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -4392,10 +4392,7 @@ int sctp_transport_walk_start(struct rhashtable_iter *iter)
>  {
>  	int err;
>  
> -	err = rhashtable_walk_init(&sctp_transport_hashtable, iter,
> -				   GFP_KERNEL);
> -	if (err)
> -		return err;
> +	rhltable_walk_enter(&sctp_transport_hashtable, iter);
>  
>  	err = rhashtable_walk_start(iter);
>  	if (err && err != -EAGAIN) {
> @@ -4479,7 +4476,7 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
>  
>  	rcu_read_lock();
>  	transport = sctp_addrs_lookup_transport(net, laddr, paddr);
> -	if (!transport || !sctp_transport_hold(transport))
> +	if (!transport)
>  		goto out;
>  
>  	rcu_read_unlock();
> -- 
> 2.1.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH net-next 0/6] Fixes for the MV88e6xxx interrupt code
From: David Miller @ 2016-11-16 19:21 UTC (permalink / raw)
  To: andrew; +Cc: vivien.didelot, netdev
In-Reply-To: <1479257816-7496-1-git-send-email-andrew@lunn.ch>

From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 16 Nov 2016 01:56:50 +0100

> The interrupt code was never tested with a board whose probing
> resulted in an -EPROBE_DEFER. So the clean up paths never got
> tested. I now do have -EPROBE_DEFER, and things break badly during
> cleanup. These are the fixes.
> 
> This is fixing code in net-next.

Series applied, thanks Andrew.

^ permalink raw reply

* Re: [PATCH net-next 0/6] Fixes for the MV88e6xxx interrupt code
From: David Miller @ 2016-11-16 19:29 UTC (permalink / raw)
  To: andrew; +Cc: vivien.didelot, netdev
In-Reply-To: <20161116.142102.1586244273660874282.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Wed, 16 Nov 2016 14:21:02 -0500 (EST)

> From: Andrew Lunn <andrew@lunn.ch>
> Date: Wed, 16 Nov 2016 01:56:50 +0100
> 
>> The interrupt code was never tested with a board whose probing
>> resulted in an -EPROBE_DEFER. So the clean up paths never got
>> tested. I now do have -EPROBE_DEFER, and things break badly during
>> cleanup. These are the fixes.
>> 
>> This is fixing code in net-next.
> 
> Series applied, thanks Andrew.

Actually, I reverted, there is a bug.

Take a look at how the 'device_irq' local variable is used in
mv88e6xxx_g2_irq_setup.  You assign it to 'err' in an error
path before it is ever set to anything.

I think you meant to use the structure's 'device_irq' member
instead.

^ permalink raw reply

* Re: [PATCH] net: dsa: mv88e6xxx: Respect SPEED_UNFORCED, don't set force bit
From: David Miller @ 2016-11-16 19:34 UTC (permalink / raw)
  To: andrew; +Cc: vivien.didelot, netdev
In-Reply-To: <1479266808-10957-1-git-send-email-andrew@lunn.ch>

From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 16 Nov 2016 04:26:48 +0100

> The SPEED_UNFORCED indicates the MAC & PHY should perform
> auto-negotiation to determine a speed which works. If this is called
> for, don't set the force bit. If it is set, the MAC actually does
> 10Gbps, which the internal PHYs don't support.
> 
> Signed-off-by: Andrew Lunn <andrew@lunn.ch>

What tree is this for?  This is a fix but the patch doesn't apply to
'net'.

^ permalink raw reply

* Re: [patch net-next 6/8] ipv4: fib: Add an API to request a FIB dump
From: Hannes Frederic Sowa @ 2016-11-16 19:43 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: Jiri Pirko, netdev, davem, idosch, eladr, yotamg, nogahf, arkadis,
	ogerlitz, roopa, dsa, nikolay, andy, vivien.didelot, andrew,
	f.fainelli, alexander.h.duyck, kuznet, jmorris, yoshfuji, kaber
In-Reply-To: <20161116185103.h3hio4pyrlk2xeol@splinter>

On 16.11.2016 19:51, Ido Schimmel wrote:
> Hi,
> 
> On Wed, Nov 16, 2016 at 06:35:45PM +0100, Hannes Frederic Sowa wrote:
>> On 16.11.2016 16:18, Ido Schimmel wrote:
>>> On Wed, Nov 16, 2016 at 03:51:01PM +0100, Hannes Frederic Sowa wrote:
>>>> On 16.11.2016 15:09, Jiri Pirko wrote:
>>>>> From: Ido Schimmel <idosch@mellanox.com>
>>>>>
>>>>> Commit b90eb7549499 ("fib: introduce FIB notification infrastructure")
>>>>> introduced a new notification chain to notify listeners (f.e., switchdev
>>>>> drivers) about addition and deletion of routes.
>>>>>
>>>>> However, upon registration to the chain the FIB tables can already be
>>>>> populated, which means potential listeners will have an incomplete view
>>>>> of the tables.
>>>>>
>>>>> Solve that by adding an API to request a FIB dump. The dump itself is
>>>>> done using RCU in order not to starve consumers that need RTNL to make
>>>>> progress.
>>>>>
>>>>> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
>>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>>>>
>>>> Have you looked at potential inconsistencies resulting from RCU walking
>>>> the table and having concurrent inserts?
>>>
>>> Yes. I did try to think about situations in which this approach will
>>> fail, but I could only find problems with concurrent removals, which I
>>> addressed in 5/8. In case of concurrent insertions, even if you missed
>>> the node, you would still get the ENTRY_ADD event to your listener.
>>
>> Theoretically a node could still be installed while the deletion event
>> fired before registering the notifier. E.g. a synchronize_net before
>> dumping could help here?
> 
> If the deletion event fired for some fib alias, then by 5/8 we are
> guaranteed that it was already unlinked from the fib alias list in the
> leaf in which it was contained. So, while it's possible we didn't
> register our listener in time for the deletion event, we won't traverse
> this fib alias while dumping the trie anyway. Did I understand you
> correctly?
> 

Theoretically we can have the same problem for insertion:

You receive a delete event from the notifier that is queued up first but
the dump will still see the entry in the fib due to being managed by RCU
(the notifier running on another CPU).

The problem is that the fib_remove_alias->hlist_del_rcu->WRITE_ONCE is
still not strongly ordered against the local fib dump trie walk.

>> I don't know how you prepare the data structures for inserting them into
>> the hardware, but if ordering matters, the notifier for a delete event
>> can be called before the dump installed the fib entry?
> 
> Right. It's possible for the listener to receive a deletion event for a
> fib entry it doesn't have, in which case it should just ignore it (as
> current listeners do).

Yep, for this specific case.

Bye,
Hannes

^ permalink raw reply

* Re: [PATCH net-next 0/6] Fixes for the MV88e6xxx interrupt code
From: Andrew Lunn @ 2016-11-16 19:45 UTC (permalink / raw)
  To: David Miller; +Cc: vivien.didelot, netdev
In-Reply-To: <20161116.142916.1432429487379929901.davem@davemloft.net>

> Take a look at how the 'device_irq' local variable is used in
> mv88e6xxx_g2_irq_setup.  You assign it to 'err' in an error
> path before it is ever set to anything.
> 
> I think you meant to use the structure's 'device_irq' member
> instead.

Hi David

Agreed. Thanks for the review.

	Andrew

^ permalink raw reply

* Re: [PATCH net-next v3 2/3] net: fsl: Allow most drivers to be built with COMPILE_TEST
From: Florian Fainelli @ 2016-11-16 19:52 UTC (permalink / raw)
  To: kbuild test robot
  Cc: kbuild-all, netdev, davem, mw, arnd, gregory.clement, Shaohui.Xie
In-Reply-To: <201611161135.ksuIHp17%fengguang.wu@intel.com>

On 11/15/2016 07:23 PM, kbuild test robot wrote:
> Hi Florian,
> 
> [auto build test WARNING on net-next/master]
> 
> url:    https://github.com/0day-ci/linux/commits/Florian-Fainelli/net-gianfar_ptp-Rename-FS-bit-to-FIPERST/20161116-095805
> config: sh-allmodconfig (attached as .config)
> compiler: sh4-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
> reproduce:
>         wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # save the attached .config to linux build tree
>         make.cross ARCH=sh 
> 
> All warnings (new ones prefixed by >>):
> 
>    drivers/net/ethernet/freescale/fsl_pq_mdio.c: In function 'fsl_pq_mdio_remove':
>>> drivers/net/ethernet/freescale/fsl_pq_mdio.c:498:27: warning: unused variable 'priv' [-Wunused-variable]
>      struct fsl_pq_mdio_priv *priv = bus->priv;

Humm, this looks bogus, the variable is used see below:

>                               ^~~~
> 
> vim +/priv +498 drivers/net/ethernet/freescale/fsl_pq_mdio.c
> 
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  482  	return 0;
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  483  
> dd3b8a32 drivers/net/ethernet/freescale/fsl_pq_mdio.c Timur Tabi      2012-08-29  484  error:
> dd3b8a32 drivers/net/ethernet/freescale/fsl_pq_mdio.c Timur Tabi      2012-08-29  485  	if (priv->map)
> b3319b10 drivers/net/fsl_pq_mdio.c                    Anton Vorontsov 2009-12-30  486  		iounmap(priv->map);
> dd3b8a32 drivers/net/ethernet/freescale/fsl_pq_mdio.c Timur Tabi      2012-08-29  487  
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  488  	kfree(new_bus);
> dd3b8a32 drivers/net/ethernet/freescale/fsl_pq_mdio.c Timur Tabi      2012-08-29  489  
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  490  	return err;
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  491  }
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  492  
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  493  
> 5078ac79 drivers/net/ethernet/freescale/fsl_pq_mdio.c Timur Tabi      2012-08-29  494  static int fsl_pq_mdio_remove(struct platform_device *pdev)
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  495  {
> 5078ac79 drivers/net/ethernet/freescale/fsl_pq_mdio.c Timur Tabi      2012-08-29  496  	struct device *device = &pdev->dev;
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  497  	struct mii_bus *bus = dev_get_drvdata(device);
> b3319b10 drivers/net/fsl_pq_mdio.c                    Anton Vorontsov 2009-12-30 @498  	struct fsl_pq_mdio_priv *priv = bus->priv;
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  499  
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  500  	mdiobus_unregister(bus);
> 1577ecef drivers/net/fsl_pq_mdio.c                    Andy Fleming    2009-02-04  501  
> b3319b10 drivers/net/fsl_pq_mdio.c                    Anton Vorontsov 2009-12-30  502  	iounmap(priv->map);

Right here.

What compiler version is this?
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next v3 3/3] net: marvell: Allow drivers to be built with COMPILE_TEST
From: Florian Fainelli @ 2016-11-16 20:06 UTC (permalink / raw)
  To: kbuild test robot, gregory.clement, thomas.petazzoni, mw
  Cc: kbuild-all, netdev, davem, mw, arnd, Shaohui.Xie
In-Reply-To: <201611170244.EQJKm0tn%fengguang.wu@intel.com>

On 11/16/2016 11:04 AM, kbuild test robot wrote:
> Hi Florian,
> 
> [auto build test WARNING on net-next/master]
> 
> url:    https://github.com/0day-ci/linux/commits/Florian-Fainelli/net-gianfar_ptp-Rename-FS-bit-to-FIPERST/20161116-095805
> config: s390-allyesconfig (attached as .config)
> compiler: s390x-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
> reproduce:
>         wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # save the attached .config to linux build tree
>         make.cross ARCH=s390 
> 
> All warnings (new ones prefixed by >>):

While we could fix some of these warnings for 64-bit architectures, the
mvneta and mvpp2 drivers would not work there anyway since they assume
physical addresses will always be 32-bit wide and cast such addresses
accordingly.

Should we still silence these warnings?
-- 
Florian

^ permalink raw reply

* Re: [PATCH net] virtio-net: add a missing synchronize_net()
From: David Miller @ 2016-11-16 20:12 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, jasowang, mst
In-Reply-To: <1479277452.8455.156.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 15 Nov 2016 22:24:12 -0800

> From: Eric Dumazet <edumazet@google.com>
> 
> It seems many drivers do not respect napi_hash_del() contract.
> 
> When napi_hash_del() is used before netif_napi_del(), an RCU grace
> period is needed before freeing NAPI object.
> 
> Fixes: 91815639d880 ("virtio-net: rx busy polling support")
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Applied and queued up for -stable, thanks Eric.

^ permalink raw reply

* Re: [BUG] X86: Removing inline decl on arch/x86/include/asm/desc.h.
From: Eric Dumazet @ 2016-11-16 20:15 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: netdev, Ingo Molnar, H. Peter Anvin,
	Realtek linux nic maintainers, x86, linux-kernel
In-Reply-To: <alpine.DEB.2.20.1611162005320.3697@nanos>

On Wed, 2016-11-16 at 20:16 +0100, Thomas Gleixner wrote:
> On Tue, 15 Nov 2016, Corcodel Marian wrote:
> >  Inline declarations suppress warning message from compiler but
> >  most of these functions was declared static and is not used local on file.
> 
> Huch? This is a header file and the functions are marked inline on purpose.
> 
> Can you please explain what you are trying to achieve and why you think
> that this is a good idea?

Corcodel Marian is a bot.
Do not bother, Thomas.
Total Waste of time.

https://www.spinics.net/lists/netdev/msg370788.html

https://www.mail-archive.com/netdev@vger.kernel.org/msg103775.html

^ permalink raw reply

* Re: [patch net-next] mlxsw: spectrum_router: Adjust placement of FIB abort warning
From: David Miller @ 2016-11-16 20:18 UTC (permalink / raw)
  To: jiri; +Cc: netdev, idosch, eladr, yotamg, nogahf, arkadis, ogerlitz
In-Reply-To: <1479286318-6115-1-git-send-email-jiri@resnulli.us>

From: Jiri Pirko <jiri@resnulli.us>
Date: Wed, 16 Nov 2016 09:51:58 +0100

> From: Ido Schimmel <idosch@mellanox.com>
> 
> The recent merge commit bb598c1b8c9b ("Merge
> git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") would cause
> the FIB abort warning to fire whenever we flush the FIB tables - either
> during module removal or actual abort.
> 
> Move it back to its rightful location in the FIB abort function.
> 
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>

Applied, thanks for fixing this up.

^ permalink raw reply

* Re: [PATCH net][v2] bpf: fix range arithmetic for bpf map access
From: Josef Bacik @ 2016-11-16 20:25 UTC (permalink / raw)
  To: Jann Horn
  Cc: Alexei Starovoitov, Alexei Starovoitov, Daniel Borkmann,
	David S. Miller, netdev
In-Reply-To: <CAG48ez2XnOQAc440ErNBATfL8uN1VOmV4M3BJKDW9s9PvWFOtg@mail.gmail.com>

On 11/16/2016 01:41 PM, Jann Horn wrote:
> On Tue, Nov 15, 2016 at 3:20 PM, Josef Bacik <jbacik@fb.com> wrote:
>> On 11/15/2016 08:47 AM, Jann Horn wrote:
>>> In states_equal():
>>> if (rold->type == NOT_INIT ||
>>>    (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
>>> <------------
>>> continue;
>>>
>>> I think this is broken in code like the following:
>>>
>>> int value;
>>> if (condition) {
>>>   value = 1; // visited first by verifier
>>> } else {
>>>   value = 1000000; // visited second by verifier
>>> }
>>> int dummy = 1; // states seem to converge here, but actually don't
>>> map[value] = 1234;
>>>
>>> `value` would be an UNKNOWN_VALUE for both paths, right? So
>>> states_equal() would decide that the states converge after the
>>> conditionally executed code?
>>>
>>
>> Value would be CONST_IMM for both paths, and wouldn't match so they wouldn't
>> converge.  I think I understood your question right, let me know if I'm
>> addressing the wrong part of it.
>
> Okay, true, but what if you load the values from a map and bounds-check them
> instead of hardcoding them? Then they will be of type UNKNOWN_VALUE, right?
> Like this:
>
> int value = map[0];
> if (condition) {
>   value &= 0x1; // visited first by verifier
> } else {
>   // nothing; visited second by verifier
> }
> int dummy = 1; // states seem to converge here, but actually don't
> map[value] = 1234;
>
> And then `rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT` will be
> true in the `dummy = 1` line, and the states converge. Am I missing something?
>

Ah ok yeah I see it now you are right.  This is slightly different from this 
particular problem so I'll send a second patch to address this, sound 
reasonable?  Thanks,

Josef

^ permalink raw reply

* Re: [PATCH net][v2] bpf: fix range arithmetic for bpf map access
From: Jann Horn @ 2016-11-16 20:26 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Alexei Starovoitov, Alexei Starovoitov, Daniel Borkmann,
	David S. Miller, netdev
In-Reply-To: <935e538a-1400-cad0-c933-d4a200e5e0ef@fb.com>

On Wed, Nov 16, 2016 at 9:25 PM, Josef Bacik <jbacik@fb.com> wrote:
> On 11/16/2016 01:41 PM, Jann Horn wrote:
>>
>> On Tue, Nov 15, 2016 at 3:20 PM, Josef Bacik <jbacik@fb.com> wrote:
>>>
>>> On 11/15/2016 08:47 AM, Jann Horn wrote:
>>>>
>>>> In states_equal():
>>>> if (rold->type == NOT_INIT ||
>>>>    (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
>>>> <------------
>>>> continue;
>>>>
>>>> I think this is broken in code like the following:
>>>>
>>>> int value;
>>>> if (condition) {
>>>>   value = 1; // visited first by verifier
>>>> } else {
>>>>   value = 1000000; // visited second by verifier
>>>> }
>>>> int dummy = 1; // states seem to converge here, but actually don't
>>>> map[value] = 1234;
>>>>
>>>> `value` would be an UNKNOWN_VALUE for both paths, right? So
>>>> states_equal() would decide that the states converge after the
>>>> conditionally executed code?
>>>>
>>>
>>> Value would be CONST_IMM for both paths, and wouldn't match so they
>>> wouldn't
>>> converge.  I think I understood your question right, let me know if I'm
>>> addressing the wrong part of it.
>>
>>
>> Okay, true, but what if you load the values from a map and bounds-check
>> them
>> instead of hardcoding them? Then they will be of type UNKNOWN_VALUE,
>> right?
>> Like this:
>>
>> int value = map[0];
>> if (condition) {
>>   value &= 0x1; // visited first by verifier
>> } else {
>>   // nothing; visited second by verifier
>> }
>> int dummy = 1; // states seem to converge here, but actually don't
>> map[value] = 1234;
>>
>> And then `rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT` will be
>> true in the `dummy = 1` line, and the states converge. Am I missing
>> something?
>>
>
> Ah ok yeah I see it now you are right.  This is slightly different from this
> particular problem so I'll send a second patch to address this, sound
> reasonable?  Thanks,

Sure, makes sense.

^ permalink raw reply

* Re: [PATCH net 1/7] net: ethernet: ti: cpsw: fix bad register access in probe error path
From: Grygorii Strashko @ 2016-11-16 20:33 UTC (permalink / raw)
  To: Johan Hovold, Mugunthan V N; +Cc: linux-omap, netdev, linux-kernel
In-Reply-To: <1479306916-27673-2-git-send-email-johan@kernel.org>



On 11/16/2016 08:35 AM, Johan Hovold wrote:
> Make sure to resume the platform device to enable clocks before
> accessing the CPSW registers in the probe error path (e.g. for deferred
> probe).
> 
> Unhandled fault: external abort on non-linefetch (0x1008) at 0xd0872d08
> ...
> [<c04fabcc>] (cpsw_ale_control_set) from [<c04fb8b4>] (cpsw_ale_destroy+0x2c/0x44)
> [<c04fb8b4>] (cpsw_ale_destroy) from [<c04fea58>] (cpsw_probe+0xbd0/0x10c4)
> [<c04fea58>] (cpsw_probe) from [<c047b2a0>] (platform_drv_probe+0x5c/0xc0)
> 
> Note that in the unlikely event of a runtime-resume failure, we'll leak
> the ale struct.
> 
> Fixes: df828598a755 ("netdev: driver: ethernet: Add TI CPSW driver")
> Signed-off-by: Johan Hovold <johan@kernel.org>
> ---
>  drivers/net/ethernet/ti/cpsw.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index c6cff3d2ff05..5bc5e6189661 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -2818,7 +2818,12 @@ static int cpsw_probe(struct platform_device *pdev)
>  	return 0;
>  
>  clean_ale_ret:
> -	cpsw_ale_destroy(cpsw->ale);
> +	if (pm_runtime_get_sync(&pdev->dev) < 0) {
> +		pm_runtime_put_noidle(&pdev->dev);
> +	} else {
> +		cpsw_ale_destroy(cpsw->ale);
> +		pm_runtime_put_sync(&pdev->dev);
> +	}
>  clean_dma_ret:
>  	cpdma_ctlr_destroy(cpsw->dma);
>  clean_runtime_disable_ret:
> 

Wouldn't it be logically simpler to just keep CPSW runtime PM enabled during probe?
As in the diff below (not tested):

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 0548e56..deaac1b 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -2657,13 +2657,12 @@ static int cpsw_probe(struct platform_device *pdev)
                goto clean_runtime_disable_ret;
        }
        cpsw->version = readl(&cpsw->regs->id_ver);
-       pm_runtime_put_sync(&pdev->dev);
 
        res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
        cpsw->wr_regs = devm_ioremap_resource(&pdev->dev, res);
        if (IS_ERR(cpsw->wr_regs)) {
                ret = PTR_ERR(cpsw->wr_regs);
-               goto clean_runtime_disable_ret;
+               goto clean_runtime_put_ret;
        }
 
        memset(&dma_params, 0, sizeof(dma_params));
@@ -2700,7 +2699,7 @@ static int cpsw_probe(struct platform_device *pdev)
        default:
                dev_err(priv->dev, "unknown version 0x%08x\n", cpsw->version);
                ret = -ENODEV;
-               goto clean_runtime_disable_ret;
+               goto clean_runtime_put_ret;
        }
        for (i = 0; i < cpsw->data.slaves; i++) {
                struct cpsw_slave *slave = &cpsw->slaves[i];
@@ -2729,7 +2728,7 @@ static int cpsw_probe(struct platform_device *pdev)
        if (!cpsw->dma) {
                dev_err(priv->dev, "error initializing dma\n");
                ret = -ENOMEM;
-               goto clean_runtime_disable_ret;
+               goto clean_runtime_put_ret;
        }
 
        cpsw->txch[0] = cpdma_chan_create(cpsw->dma, 0, cpsw_tx_handler, 0);
@@ -2831,12 +2830,16 @@ static int cpsw_probe(struct platform_device *pdev)
                }
        }
 
+       pm_runtime_put(&pdev->dev);
+
        return 0;
 
 clean_ale_ret:
        cpsw_ale_destroy(cpsw->ale);
 clean_dma_ret:
        cpdma_ctlr_destroy(cpsw->dma);
+clean_runtime_put_ret:
+       pm_runtime_put_sync(&pdev->dev);
 clean_runtime_disable_ret:
        pm_runtime_disable(&pdev->dev);
 clean_ndev_ret:

-- 
regards,
-grygorii

^ permalink raw reply related

* HELLO
From: Wilfred Kabore @ 2016-11-16 20:39 UTC (permalink / raw)
  To: you

Dear Friend,

Greetings and hope this mail meets you well!

I want you to be my partner in the transfer of the sum of $6.7 Million dollars discovered in my  department in one of the leading Banks (EcoBank Plc) here in Burkina-Faso. I shall give you more information on this proposal when I get your reply, but rest assured that I will give you 40%  of the total sum once the transfer is completed but you have to maintain secrecy of this business deal if you are ready to work with me.

God bless you as I wait for your response.

Dr. Kabore Wilfred

^ permalink raw reply

* Re: [patch net-next 6/8] ipv4: fib: Add an API to request a FIB dump
From: Ido Schimmel @ 2016-11-16 21:06 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Jiri Pirko, netdev, davem, idosch, eladr, yotamg, nogahf, arkadis,
	ogerlitz, roopa, dsa, nikolay, andy, vivien.didelot, andrew,
	f.fainelli, alexander.h.duyck, kuznet, jmorris, yoshfuji, kaber
In-Reply-To: <56d2179d-00ff-bcc5-e365-845179cbe672@stressinduktion.org>

On Wed, Nov 16, 2016 at 08:43:25PM +0100, Hannes Frederic Sowa wrote:
> On 16.11.2016 19:51, Ido Schimmel wrote:
> > On Wed, Nov 16, 2016 at 06:35:45PM +0100, Hannes Frederic Sowa wrote:
> >> On 16.11.2016 16:18, Ido Schimmel wrote:
> >>> On Wed, Nov 16, 2016 at 03:51:01PM +0100, Hannes Frederic Sowa wrote:
> >>>> On 16.11.2016 15:09, Jiri Pirko wrote:
> >>>>> From: Ido Schimmel <idosch@mellanox.com>
> >>>>>
> >>>>> Commit b90eb7549499 ("fib: introduce FIB notification infrastructure")
> >>>>> introduced a new notification chain to notify listeners (f.e., switchdev
> >>>>> drivers) about addition and deletion of routes.
> >>>>>
> >>>>> However, upon registration to the chain the FIB tables can already be
> >>>>> populated, which means potential listeners will have an incomplete view
> >>>>> of the tables.
> >>>>>
> >>>>> Solve that by adding an API to request a FIB dump. The dump itself is
> >>>>> done using RCU in order not to starve consumers that need RTNL to make
> >>>>> progress.
> >>>>>
> >>>>> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> >>>>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> >>>>
> >>>> Have you looked at potential inconsistencies resulting from RCU walking
> >>>> the table and having concurrent inserts?
> >>>
> >>> Yes. I did try to think about situations in which this approach will
> >>> fail, but I could only find problems with concurrent removals, which I
> >>> addressed in 5/8. In case of concurrent insertions, even if you missed
> >>> the node, you would still get the ENTRY_ADD event to your listener.
> >>
> >> Theoretically a node could still be installed while the deletion event
> >> fired before registering the notifier. E.g. a synchronize_net before
> >> dumping could help here?
> > 
> > If the deletion event fired for some fib alias, then by 5/8 we are
> > guaranteed that it was already unlinked from the fib alias list in the
> > leaf in which it was contained. So, while it's possible we didn't
> > register our listener in time for the deletion event, we won't traverse
> > this fib alias while dumping the trie anyway. Did I understand you
> > correctly?
> > 
> 
> Theoretically we can have the same problem for insertion:
> 
> You receive a delete event from the notifier that is queued up first but
> the dump will still see the entry in the fib due to being managed by RCU
> (the notifier running on another CPU).
> 
> The problem is that the fib_remove_alias->hlist_del_rcu->WRITE_ONCE is
> still not strongly ordered against the local fib dump trie walk.

It's pretty late here so I would have to check this out tomorrow
morning. If this is indeed the case (not saying you're wrong, just want
to verify for myself), then I guess 5/8 can be dropped and instead we
should go with Dave's suggestion? I don't see any other way given the
constraints...

Thanks a lot Hannes!

^ permalink raw reply

* Re: net/l2tp:BUG: KASAN: use-after-free in l2tp_ip6_close
From: Guillaume Nault @ 2016-11-16 21:07 UTC (permalink / raw)
  To: Cong Wang; +Cc: Baozeng Ding, Linux Kernel Network Developers
In-Reply-To: <CAM_iQpUxNYpkRetxX88z=iFZiZ1beQqBt+9qFsDTnAYwWCoHfA@mail.gmail.com>

On Wed, Nov 16, 2016 at 11:08:23AM -0800, Cong Wang wrote:
> On Wed, Nov 16, 2016 at 8:30 AM, Guillaume Nault <g.nault@alphalink.fr> wrote:
> > diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
> > index fce25af..982f6c4 100644
> > --- a/net/l2tp/l2tp_ip.c
> > +++ b/net/l2tp/l2tp_ip.c
> > @@ -251,8 +251,6 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
> >         int ret;
> >         int chk_addr_ret;
> >
> > -       if (!sock_flag(sk, SOCK_ZAPPED))
> > -               return -EINVAL;
> >         if (addr_len < sizeof(struct sockaddr_l2tpip))
> >                 return -EINVAL;
> >         if (addr->l2tp_family != AF_INET)
> > @@ -267,6 +265,9 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
> >         read_unlock_bh(&l2tp_ip_lock);
> >
> >         lock_sock(sk);
> > +       if (!sock_flag(sk, SOCK_ZAPPED))
> > +               goto out;
> > +
> >         if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip))
> >                 goto out;
> >
> > diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
> > index ad3468c..9978d01 100644
> > --- a/net/l2tp/l2tp_ip6.c
> > +++ b/net/l2tp/l2tp_ip6.c
> > @@ -269,8 +269,6 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
> >         int addr_type;
> >         int err;
> >
> > -       if (!sock_flag(sk, SOCK_ZAPPED))
> > -               return -EINVAL;
> >         if (addr->l2tp_family != AF_INET6)
> >                 return -EINVAL;
> >         if (addr_len < sizeof(*addr))
> > @@ -296,6 +294,9 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
> >         lock_sock(sk);
> >
> >         err = -EINVAL;
> > +       if (!sock_flag(sk, SOCK_ZAPPED))
> > +               goto out_unlock;
> > +
> >         if (sk->sk_state != TCP_CLOSE)
> >                 goto out_unlock;
> 
> 
> Makes sense, it should prevent a concurrent caller adding the socket
> into bind table
> twice after passing __l2tp_ip_bind_lookup() check.

Yes, and the __l2tp_ip_bind_lookup() call is also racy. But, by
properly checking the SOCK_ZAPPED flag, we probably can remove this
call entirely.

For now, I only wanted to make sure the issue was well identified. I'll
submit a more complete patch for net (with protected SOCK_ZAPPED check
in l2tp_ip_connect() too).

^ permalink raw reply

* Re: [PATCH] rtl8xxxu: Fix for agressive power saving by rtl8723bu wireless IC
From: Jes Sorensen @ 2016-11-16 21:29 UTC (permalink / raw)
  To: John Heenan
  Cc: Barry Day, Rafał Miłecki, Kalle Valo, linux-wireless,
	netdev, linux-kernel
In-Reply-To: <CAAye0QP0FQetaaa-RE23teVgZ335ELE8O5qnzMA7CwQ4T8wc_Q@mail.gmail.com>

John Heenan <john@zgus.com> writes:
> Barry Day has submitted real world reports for the 8192eu and 8192cu.
> This needs to be acknowledged. I have submitted real world reports for
> the 8723bu.

Let's make this a little clearer - first of all, I have asked you to
investigate which part resolves the problem. Rather than 'I randomly
moved something around and it happens to work for me'.

> When it comes down to it, it looks like the kernel code changes are
> really going to be very trivial to fix this problem and we need to
> take the focus off dramatic outbursts over style issues to a strategy
> for getting usable results from real world testing.
>
> Addressing style issues in a dramatic manner to me looks like a mean
> sport for maintainers who line up to easy target first time
> contributors. This mean attitude comes from the top with a well known
> comment about "publicly making fun of people". The polite comments
> over style from Joe Perches and Rafał Miłecki are welcomed.

Once bad code is in place, it is way harder to get rid of it again. It
is *normal* for maintainers to ask contributors to do things
correctly. In addition you have been asked repeatedly by multiple people
to respect coding style, but every patch you posted violated it again in
a different way, instead of spending the little time it would take for
you to get it right.

> An effective strategy would be to insert some printk statements to
> trace what init steps vendor derived drivers do each time
> wpa_supplicant is called and ask real world testers to report their
> results. This is a lot more productive and less error prone than
> laboriously poring over vendor source code. Alternative drivers that
> use vendor code from Realtek are enormously complicated and a huge pain
> to make sense of.
>
> Joe Sorensen's driver code is far easier to make sense of and it is a
> shame Realtek don't come to the party. Joe Sorensen's code takes
> advantage of the excellent work of kernel contributors to the mac80211
> driver.

Now you are pissing on my name - do you really want to be taken
seriously here?

> Previous comments I made about enable_rf, rtl8xxxu_start,
> rtl8xxxu_init_device etc should be clarified. I will leave it for the
> moment as it currently serves no direct useful purpose.

I have made it very clear I want this issue resolved, but I want it
done right.

Jes

^ permalink raw reply

* [PATCH net-next 3/3] RDS: TCP: Force every connection to be initiated by numerically smaller IP address
From: Sowmini Varadhan @ 2016-11-16 21:29 UTC (permalink / raw)
  To: netdev; +Cc: santosh.shilimkar, sowmini.varadhan, davem, rds-devel
In-Reply-To: <cover.1478876910.git.sowmini.varadhan@oracle.com>

When 2 RDS peers initiate an RDS-TCP connection simultaneously,
there is a potential for "duelling syns" on either/both sides.
See commit 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an
outgoing socket in rds_tcp_accept_one()") for a description of this
condition, and the arbitration logic which ensures that the
numerically larger IP address in the TCP connection is bound to the
RDS_TCP_PORT ("canonical ordering").

The rds_connection should not be marked as RDS_CONN_UP until the
arbitration logic has converged, for the following reason. The sender
may start transmitting RDS datagrams as soon as RDS_CONN_UP is set,
and the sender removes all datagrams from the rds_connection's
cp_retrans queue based on TCP acks. If the TCP ack was sent from
a tcp socket that got reset as part of duel arbitration (but
before data was delivered to the receiver's RDS socket layer),
the sender may end up prematurely freeing the datagram, and
the datagram is no longer reliably deliverable.

This patch remedies that condition by making sure that, upon
receipt of 3WH completion state change notification of TCP_ESTABLISHED
in rds_tcp_state_change, we mark the rds_connection as RDS_CONN_UP
if, and only if, the IP addresses and ports for the connection are
canonically ordered. In all other cases, rds_tcp_state_change will
force an rds_conn_path_drop(), and rds_queue_reconnect() on
both peers will restart the connection to ensure canonical ordering.

A side-effect of enforcing this condition in rds_tcp_state_change()
is that rds_tcp_accept_one_path() can now be refactored for simplicity.
It is also no longer possible to encounter an RDS_CONN_UP connection in
the arbitration logic in rds_tcp_accept_one().

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/connection.c  |    1 +
 net/rds/tcp_connect.c |   14 +++++++++++++-
 net/rds/tcp_listen.c  |   29 ++++++++++++-----------------
 3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/net/rds/connection.c b/net/rds/connection.c
index b86e188..fe9d31c 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -683,6 +683,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
 	    !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
 		queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
 }
+EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);

 void rds_conn_connect_if_down(struct rds_connection *conn)
 {
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 05f61c5..d6839d9 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk)
 	case TCP_SYN_RECV:
 		break;
 	case TCP_ESTABLISHED:
-		rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+		/* Force the peer to reconnect so that we have the
+		 * TCP ports going from <smaller-ip>.<transient> to
+		 * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the
+		 * RDS connection as RDS_CONN_UP until the reconnect,
+		 * to avoid RDS datagram loss.
+		 */
+		if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr &&
+		    rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
+					     RDS_CONN_ERROR)) {
+			rds_conn_path_drop(cp);
+		} else {
+			rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
+		}
 		break;
 	case TCP_CLOSE_WAIT:
 	case TCP_CLOSE:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index c9c4968..f74bab3 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -83,25 +83,20 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
 {
 	int i;
 	bool peer_is_smaller = (conn->c_faddr < conn->c_laddr);
-	int npaths = conn->c_npaths;
-
-	if (npaths <= 1) {
-		struct rds_conn_path *cp = &conn->c_path[0];
-		int ret;
-
-		ret = rds_conn_path_transition(cp, RDS_CONN_DOWN,
-					       RDS_CONN_CONNECTING);
-		if (!ret)
-			rds_conn_path_transition(cp, RDS_CONN_ERROR,
-						 RDS_CONN_CONNECTING);
-		return cp->cp_transport_data;
-	}
+	int npaths = max_t(int, 1, conn->c_npaths);

-	/* for mprds, paths with cp_index > 0 MUST be initiated by the peer
+	/* for mprds, all paths MUST be initiated by the peer
 	 * with the smaller address.
 	 */
-	if (!peer_is_smaller)
+	if (!peer_is_smaller) {
+		/* Make sure we initiate at least one path if this
+		 * has not already been done; rds_start_mprds() will
+		 * take care of additional paths, if necessary.
+		 */
+		if (npaths == 1)
+			rds_conn_path_connect_if_down(&conn->c_path[0]);
 		return NULL;
+	}

 	for (i = 0; i < npaths; i++) {
 		struct rds_conn_path *cp = &conn->c_path[i];
@@ -171,8 +166,8 @@ int rds_tcp_accept_one(struct socket *sock)
 	mutex_lock(&rs_tcp->t_conn_path_lock);
 	cp = rs_tcp->t_cpath;
 	conn_state = rds_conn_path_state(cp);
-	if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP &&
-	    conn_state != RDS_CONN_ERROR)
+	WARN_ON(conn_state == RDS_CONN_UP);
+	if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
 		goto rst_nsk;
 	if (rs_tcp->t_sock) {
 		/* Need to resolve a duelling SYN between peers.
-- 
1.7.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox