Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 0/4] net: bcmgenet: switch to TX NAPI
From: Florian Fainelli @ 2014-10-24 20:02 UTC (permalink / raw)
  To: netdev; +Cc: davem, pgynther, Florian Fainelli

Hi David, Petri,

This patch set updates the GENET driver to switch the transmit to NAPI instead
of using a mix of hard interrupt + RX NAPI reclaim.

Florian Fainelli (4):
  net: bcmgenet: consistently use UMAC_IRQ_RXDMA_MASK
  net: bcmgenet: return number of packets completed in TX reclaim
  net: bcmgenet: reclaim transmitted buffers in NAPI context
  net: bcmgenet: update ring producer index and buffer count in xmit

 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 71 ++++++++++++--------------
 1 file changed, 32 insertions(+), 39 deletions(-)

-- 
1.9.1

^ permalink raw reply

* [PATCH net-next 1/4] net: bcmgenet: consistently use UMAC_IRQ_RXDMA_MASK
From: Florian Fainelli @ 2014-10-24 20:02 UTC (permalink / raw)
  To: netdev; +Cc: davem, pgynther, Florian Fainelli
In-Reply-To: <1414180942-2132-1-git-send-email-f.fainelli@gmail.com>

Define a constant which sets all RXDMA interrupts bits, and use it
consistently throughout the driver to check for RX DMA interrupt bits.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index fdc9ec09e453..ee4d5baf09b6 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -47,6 +47,10 @@
 
 #include "bcmgenet.h"
 
+#define UMAC_IRQ_RXDMA_MASK	(UMAC_IRQ_RXDMA_BDONE | UMAC_IRQ_RXDMA_PDONE)
+#define UMAC_IRQ_TXDMA_MASK	(UMAC_IRQ_TXDMA_BDONE | UMAC_IRQ_TXDMA_PDONE)
+#define UMAC_IRQ_RX_TX_MASK	(UMAC_IRQ_RXDMA_MASK | UMAC_IRQ_TXDMA_MASK)
+
 /* Maximum number of hardware queues, downsized if needed */
 #define GENET_MAX_MQ_CNT	4
 
@@ -1543,9 +1547,9 @@ static int init_umac(struct bcmgenet_priv *priv)
 
 	bcmgenet_intr_disable(priv);
 
-	cpu_mask_clear = UMAC_IRQ_RXDMA_BDONE;
+	cpu_mask_clear = UMAC_IRQ_RXDMA_MASK;
 
-	dev_dbg(kdev, "%s:Enabling RXDMA_BDONE interrupt\n", __func__);
+	dev_dbg(kdev, "%s:Enabling RXDMA interrupts\n", __func__);
 
 	/* Monitor cable plug/unplugged event for internal PHY */
 	if (phy_is_internal(priv->phydev)) {
@@ -1883,7 +1887,7 @@ static int bcmgenet_poll(struct napi_struct *napi, int budget)
 				  priv->rx_c_index, RDMA_CONS_INDEX);
 	if (work_done < budget) {
 		napi_complete(napi);
-		bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RXDMA_BDONE,
+		bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RXDMA_MASK,
 					 INTRL2_CPU_MASK_CLEAR);
 	}
 
@@ -1958,13 +1962,13 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id)
 	netif_dbg(priv, intr, priv->dev,
 		  "IRQ=0x%x\n", priv->irq0_stat);
 
-	if (priv->irq0_stat & (UMAC_IRQ_RXDMA_BDONE | UMAC_IRQ_RXDMA_PDONE)) {
+	if (priv->irq0_stat & UMAC_IRQ_RXDMA_MASK) {
 		/* We use NAPI(software interrupt throttling, if
 		 * Rx Descriptor throttling is not used.
 		 * Disable interrupt, will be enabled in the poll method.
 		 */
 		if (likely(napi_schedule_prep(&priv->napi))) {
-			bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RXDMA_BDONE,
+			bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RXDMA_MASK,
 						 INTRL2_CPU_MASK_SET);
 			__napi_schedule(&priv->napi);
 		}
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 2/4] net: bcmgenet: return number of packets completed in TX reclaim
From: Florian Fainelli @ 2014-10-24 20:02 UTC (permalink / raw)
  To: netdev; +Cc: davem, pgynther, Florian Fainelli
In-Reply-To: <1414180942-2132-1-git-send-email-f.fainelli@gmail.com>

In preparation for reclaiming transmitted buffers in NAPI context,
update __bcmgenet_tx_reclaim() and bcmgenet_tx_reclaim() to return the
number of packets completed per call.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index ee4d5baf09b6..70f2fb366375 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -876,14 +876,14 @@ static inline void bcmgenet_tx_ring_int_disable(struct bcmgenet_priv *priv,
 }
 
 /* Unlocked version of the reclaim routine */
-static void __bcmgenet_tx_reclaim(struct net_device *dev,
-				  struct bcmgenet_tx_ring *ring)
+static unsigned int __bcmgenet_tx_reclaim(struct net_device *dev,
+					  struct bcmgenet_tx_ring *ring)
 {
 	struct bcmgenet_priv *priv = netdev_priv(dev);
 	int last_tx_cn, last_c_index, num_tx_bds;
 	struct enet_cb *tx_cb_ptr;
 	struct netdev_queue *txq;
-	unsigned int bds_compl;
+	unsigned int bds_compl, pkts_compl = 0;
 	unsigned int c_index;
 
 	/* Compute how many buffers are transmitted since last xmit call */
@@ -928,6 +928,7 @@ static void __bcmgenet_tx_reclaim(struct net_device *dev,
 		}
 		dev->stats.tx_packets++;
 		ring->free_bds += bds_compl;
+		pkts_compl += bds_compl;
 
 		last_c_index++;
 		last_c_index &= (num_tx_bds - 1);
@@ -936,20 +937,25 @@ static void __bcmgenet_tx_reclaim(struct net_device *dev,
 	if (ring->free_bds > (MAX_SKB_FRAGS + 1))
 		ring->int_disable(priv, ring);
 
-	if (netif_tx_queue_stopped(txq))
+	if (netif_tx_queue_stopped(txq) && pkts_compl)
 		netif_tx_wake_queue(txq);
 
 	ring->c_index = c_index;
+
+	return pkts_compl;
 }
 
-static void bcmgenet_tx_reclaim(struct net_device *dev,
-				struct bcmgenet_tx_ring *ring)
+static unsigned int bcmgenet_tx_reclaim(struct net_device *dev,
+					struct bcmgenet_tx_ring *ring)
 {
 	unsigned long flags;
+	unsigned int pkts_compl;
 
 	spin_lock_irqsave(&ring->lock, flags);
-	__bcmgenet_tx_reclaim(dev, ring);
+	pkts_compl = __bcmgenet_tx_reclaim(dev, ring);
 	spin_unlock_irqrestore(&ring->lock, flags);
+
+	return pkts_compl;
 }
 
 static void bcmgenet_tx_reclaim_all(struct net_device *dev)
-- 
1.9.1

^ permalink raw reply related

* [PATCH] ath6kl: remove incorrect reset_resume handler
From: Alexey Khoroshilov @ 2014-10-24 20:02 UTC (permalink / raw)
  To: Kalle Valo
  Cc: Alexey Khoroshilov, John W. Linville, linux-wireless, netdev,
	linux-kernel, ldv-project

Existing implementation of reset_resume handler just calls
ath6kl_usb_remove() that deallocates all resources.
It can lead to double free, etc. on disconnect.

The patch removes reset_resume handler,
so usb core could conservatively reset the driver.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru>
---
 drivers/net/wireless/ath/ath6kl/usb.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/usb.c b/drivers/net/wireless/ath/ath6kl/usb.c
index a6a5e40b3e98..9da3594fd010 100644
--- a/drivers/net/wireless/ath/ath6kl/usb.c
+++ b/drivers/net/wireless/ath/ath6kl/usb.c
@@ -1193,18 +1193,10 @@ static int ath6kl_usb_pm_resume(struct usb_interface *interface)
 	return 0;
 }
 
-static int ath6kl_usb_pm_reset_resume(struct usb_interface *intf)
-{
-	if (usb_get_intfdata(intf))
-		ath6kl_usb_remove(intf);
-	return 0;
-}
-
 #else
 
 #define ath6kl_usb_pm_suspend NULL
 #define ath6kl_usb_pm_resume NULL
-#define ath6kl_usb_pm_reset_resume NULL
 
 #endif
 
@@ -1222,7 +1214,6 @@ static struct usb_driver ath6kl_usb_driver = {
 	.probe = ath6kl_usb_probe,
 	.suspend = ath6kl_usb_pm_suspend,
 	.resume = ath6kl_usb_pm_resume,
-	.reset_resume = ath6kl_usb_pm_reset_resume,
 	.disconnect = ath6kl_usb_remove,
 	.id_table = ath6kl_usb_ids,
 	.supports_autosuspend = true,
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 3/4] net: bcmgenet: reclaim transmitted buffers in NAPI context
From: Florian Fainelli @ 2014-10-24 20:02 UTC (permalink / raw)
  To: netdev; +Cc: davem, pgynther, Florian Fainelli
In-Reply-To: <1414180942-2132-1-git-send-email-f.fainelli@gmail.com>

The GENET driver is currently reclaiming transmitted buffers from hard
interrupt context in bcmgenet_isr0 as well as NAPI context in
bcmgenet_poll, which is not consistent and not ideal. Instead, update
the driver to reclaim transmitted buffers in NAPI context only and
properly switch the TX path to use interrupt mitigation based on NAPI.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 29 +++++++++-----------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 70f2fb366375..d6f4a7ace05e 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -934,9 +934,6 @@ static unsigned int __bcmgenet_tx_reclaim(struct net_device *dev,
 		last_c_index &= (num_tx_bds - 1);
 	}
 
-	if (ring->free_bds > (MAX_SKB_FRAGS + 1))
-		ring->int_disable(priv, ring);
-
 	if (netif_tx_queue_stopped(txq) && pkts_compl)
 		netif_tx_wake_queue(txq);
 
@@ -1211,10 +1208,8 @@ static netdev_tx_t bcmgenet_xmit(struct sk_buff *skb, struct net_device *dev)
 	bcmgenet_tdma_ring_writel(priv, ring->index,
 				  ring->prod_index, TDMA_PROD_INDEX);
 
-	if (ring->free_bds <= (MAX_SKB_FRAGS + 1)) {
+	if (ring->free_bds <= (MAX_SKB_FRAGS + 1))
 		netif_tx_stop_queue(txq);
-		ring->int_enable(priv, ring);
-	}
 
 out:
 	spin_unlock_irqrestore(&ring->lock, flags);
@@ -1553,9 +1548,9 @@ static int init_umac(struct bcmgenet_priv *priv)
 
 	bcmgenet_intr_disable(priv);
 
-	cpu_mask_clear = UMAC_IRQ_RXDMA_MASK;
+	cpu_mask_clear = UMAC_IRQ_RX_TX_MASK;
 
-	dev_dbg(kdev, "%s:Enabling RXDMA interrupts\n", __func__);
+	dev_dbg(kdev, "%s:Enabling RXDMA & TXDMA interrupts\n", __func__);
 
 	/* Monitor cable plug/unplugged event for internal PHY */
 	if (phy_is_internal(priv->phydev)) {
@@ -1879,10 +1874,10 @@ static int bcmgenet_poll(struct napi_struct *napi, int budget)
 {
 	struct bcmgenet_priv *priv = container_of(napi,
 			struct bcmgenet_priv, napi);
-	unsigned int work_done;
+	unsigned int work_done, tx_work;
 
 	/* tx reclaim */
-	bcmgenet_tx_reclaim(priv->dev, &priv->tx_rings[DESC_INDEX]);
+	tx_work = bcmgenet_tx_reclaim(priv->dev, &priv->tx_rings[DESC_INDEX]);
 
 	work_done = bcmgenet_desc_rx(priv, budget);
 
@@ -1891,9 +1886,9 @@ static int bcmgenet_poll(struct napi_struct *napi, int budget)
 	priv->rx_c_index &= DMA_C_INDEX_MASK;
 	bcmgenet_rdma_ring_writel(priv, DESC_INDEX,
 				  priv->rx_c_index, RDMA_CONS_INDEX);
-	if (work_done < budget) {
+	if (work_done < budget || tx_work == 0) {
 		napi_complete(napi);
-		bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RXDMA_MASK,
+		bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RX_TX_MASK,
 					 INTRL2_CPU_MASK_CLEAR);
 	}
 
@@ -1968,22 +1963,18 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id)
 	netif_dbg(priv, intr, priv->dev,
 		  "IRQ=0x%x\n", priv->irq0_stat);
 
-	if (priv->irq0_stat & UMAC_IRQ_RXDMA_MASK) {
+	if (priv->irq0_stat & UMAC_IRQ_RX_TX_MASK) {
 		/* We use NAPI(software interrupt throttling, if
 		 * Rx Descriptor throttling is not used.
 		 * Disable interrupt, will be enabled in the poll method.
 		 */
 		if (likely(napi_schedule_prep(&priv->napi))) {
-			bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RXDMA_MASK,
+			bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RX_TX_MASK,
 						 INTRL2_CPU_MASK_SET);
 			__napi_schedule(&priv->napi);
 		}
 	}
-	if (priv->irq0_stat &
-			(UMAC_IRQ_TXDMA_BDONE | UMAC_IRQ_TXDMA_PDONE)) {
-		/* Tx reclaim */
-		bcmgenet_tx_reclaim(priv->dev, &priv->tx_rings[DESC_INDEX]);
-	}
+
 	if (priv->irq0_stat & (UMAC_IRQ_PHY_DET_R |
 				UMAC_IRQ_PHY_DET_F |
 				UMAC_IRQ_LINK_UP |
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next 4/4] net: bcmgenet: update ring producer index and buffer count in xmit
From: Florian Fainelli @ 2014-10-24 20:02 UTC (permalink / raw)
  To: netdev; +Cc: davem, pgynther, Florian Fainelli
In-Reply-To: <1414180942-2132-1-git-send-email-f.fainelli@gmail.com>

There is no need to have both bcmgenet_xmit_single() and
bcmgenet_xmit_frag() perform a free_bds decrement and a prod_index
increment by one. In case one of these functions fails to map a SKB or
fragment for transmit, we will return and exit bcmgenet_xmit() with an
error.

We can therefore safely use our local copy of nr_frags to know by how
much we should decrement the number of free buffers available, and by
how much the producer count must be incremented and do this in the tail
of bcmgenet_xmit().

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index d6f4a7ace05e..3cd8c25a1120 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -1012,11 +1012,6 @@ static int bcmgenet_xmit_single(struct net_device *dev,
 
 	dmadesc_set(priv, tx_cb_ptr->bd_addr, mapping, length_status);
 
-	/* Decrement total BD count and advance our write pointer */
-	ring->free_bds -= 1;
-	ring->prod_index += 1;
-	ring->prod_index &= DMA_P_INDEX_MASK;
-
 	return 0;
 }
 
@@ -1054,11 +1049,6 @@ static int bcmgenet_xmit_frag(struct net_device *dev,
 		    (frag->size << DMA_BUFLENGTH_SHIFT) | dma_desc_flags |
 		    (priv->hw_params->qtag_mask << DMA_TX_QTAG_SHIFT));
 
-
-	ring->free_bds -= 1;
-	ring->prod_index += 1;
-	ring->prod_index &= DMA_P_INDEX_MASK;
-
 	return 0;
 }
 
@@ -1202,9 +1192,11 @@ static netdev_tx_t bcmgenet_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	skb_tx_timestamp(skb);
 
-	/* we kept a software copy of how much we should advance the TDMA
-	 * producer index, now write it down to the hardware
-	 */
+	/* Decrement total BD count and advance our write pointer */
+	ring->free_bds -= nr_frags + 1;
+	ring->prod_index += nr_frags + 1;
+	ring->prod_index &= DMA_P_INDEX_MASK;
+
 	bcmgenet_tdma_ring_writel(priv, ring->index,
 				  ring->prod_index, TDMA_PROD_INDEX);
 
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH net] ptp: restore the makefile for building the test program.
From: David Miller @ 2014-10-24 20:08 UTC (permalink / raw)
  To: richardcochran; +Cc: netdev, pefoley2
In-Reply-To: <edb4b32ca0326b648d905620d95297cef5ed9391.1414006463.git.rcochran@linutronix.de>

From: Richard Cochran <richardcochran@gmail.com>
Date: Wed, 22 Oct 2014 21:35:15 +0200

> This patch brings back the makefile called testptp.mk which was removed
> in commit adb19fb66eee (Documentation: add makefiles for more targets).
> 
> While the idea of that commit was to improve build coverage of the
> examples, the new Makefile is unable to cross compile the testptp program.
> In contrast, the deleted makefile was able to do this just fine.
> 
> This patch fixes the regression by restoring the original makefile.
> 
> Signed-off-by: Richard Cochran <richardcochran@gmail.com>

Applied, thanks Richard.

^ permalink raw reply

* Re: localed stuck in recent 3.18 git in copy_net_ns?
From: Paul E. McKenney @ 2014-10-24 20:15 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Yanko Kaneti, Josh Boyer, Eric W. Biederman, Cong Wang,
	Kevin Fenzi, netdev, Linux-Kernel@Vger. Kernel. Org, mroos
In-Reply-To: <20141024185753.GY4977@linux.vnet.ibm.com>

On Fri, Oct 24, 2014 at 11:57:53AM -0700, Paul E. McKenney wrote:
> On Fri, Oct 24, 2014 at 11:49:48AM -0700, Jay Vosburgh wrote:
> > Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > 
> > >On Fri, Oct 24, 2014 at 08:35:26PM +0300, Yanko Kaneti wrote:
> > >> On Fri-10/24/14-2014 10:20, Paul E. McKenney wrote:
> > >> > On Fri, Oct 24, 2014 at 08:09:31PM +0300, Yanko Kaneti wrote:
> > >> > > On Fri-10/24/14-2014 09:54, Paul E. McKenney wrote:
> > >> > > > On Fri, Oct 24, 2014 at 07:29:43PM +0300, Yanko Kaneti wrote:
> > >> > > > > On Fri-10/24/14-2014 08:40, Paul E. McKenney wrote:
> > >> > > > > > On Fri, Oct 24, 2014 at 12:08:57PM +0300, Yanko Kaneti wrote:
> > >> > > > > > > On Thu-10/23/14-2014 15:04, Paul E. McKenney wrote:
> > >> > > > > > > > On Fri, Oct 24, 2014 at 12:45:40AM +0300, Yanko Kaneti wrote:
> > >> > > > > > > > > 
> > >> > > > > > > > > On Thu, 2014-10-23 at 13:05 -0700, Paul E. McKenney wrote:
> > >> > > > > > > > > > On Thu, Oct 23, 2014 at 10:51:59PM +0300, Yanko Kaneti wrote:
> > >> > > > 
> > >> > > > [ . . . ]
> > >> > > > 
> > >> > > > > > > Ok, unless I've messsed up something major, bisecting points to:
> > >> > > > > > > 
> > >> > > > > > > 35ce7f29a44a rcu: Create rcuo kthreads only for onlined CPUs
> > >> > > > > > > 
> > >> > > > > > > Makes any sense ?
> > >> > > > > > 
> > >> > > > > > Good question.  ;-)
> > >> > > > > > 
> > >> > > > > > Are any of your online CPUs missing rcuo kthreads?  There should be
> > >> > > > > > kthreads named rcuos/0, rcuos/1, rcuos/2, and so on for each online CPU.
> > >> > > > > 
> > >> > > > > Its a Phenom II X6. With 3.17 and linux-tip with 35ce7f29a44a reverted, the rcuos are 8
> > >> > > > > and the modprobe ppp_generic testcase reliably works, libvirt also manages
> > >> > > > > to setup its bridge.
> > >> > > > > 
> > >> > > > > Just with linux-tip , the rcuos are 6 but the failure is as reliable as
> > >> > > > > before.
> > >> > > 
> > >> > > > Thank you, very interesting.  Which 6 of the rcuos are present?
> > >> > > 
> > >> > > Well, the rcuos are 0 to 5. Which sounds right for a 6 core CPU like this   
> > >> > > Phenom II.
> > >> > 
> > >> > Ah, you get 8 without the patch because it creates them for potential
> > >> > CPUs as well as real ones.  OK, got it.
> > >> > 
> > >> > > > > Awating instructions: :)
> > >> > > > 
> > >> > > > Well, I thought I understood the problem until you found that only 6 of
> > >> > > > the expected 8 rcuos are present with linux-tip without the revert.  ;-)
> > >> > > > 
> > >> > > > I am putting together a patch for the part of the problem that I think
> > >> > > > I understand, of course, but it would help a lot to know which two of
> > >> > > > the rcuos are missing.  ;-)
> > >> > > 
> > >> > > Ready to test
> > >> > 
> > >> > Well, if you are feeling aggressive, give the following patch a spin.
> > >> > I am doing sanity tests on it in the meantime.
> > >> 
> > >> Doesn't seem to make a difference here
> > >
> > >OK, inspection isn't cutting it, so time for tracing.  Does the system
> > >respond to user input?  If so, please enable rcu:rcu_barrier ftrace before
> > >the problem occurs, then dump the trace buffer after the problem occurs.
> > 
> > 	My system is up and responsive when the problem occurs, so this
> > shouldn't be a problem.
> 
> Nice!  ;-)
> 
> > 	Do you want the ftrace with your patch below, or unmodified tip
> > of tree?
> 
> Let's please start with the patch.

And I should hasten to add that you need to set CONFIG_RCU_TRACE=y
for these tracepoints to be enabled.

							Thanx, Paul

> > 	-J
> > 
> > 
> > >							Thanx, Paul
> > >
> > >> > ------------------------------------------------------------------------
> > >> > 
> > >> > diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> > >> > index 29fb23f33c18..927c17b081c7 100644
> > >> > --- a/kernel/rcu/tree_plugin.h
> > >> > +++ b/kernel/rcu/tree_plugin.h
> > >> > @@ -2546,9 +2546,13 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
> > >> >  			rdp->nocb_leader = rdp_spawn;
> > >> >  			if (rdp_last && rdp != rdp_spawn)
> > >> >  				rdp_last->nocb_next_follower = rdp;
> > >> > -			rdp_last = rdp;
> > >> > -			rdp = rdp->nocb_next_follower;
> > >> > -			rdp_last->nocb_next_follower = NULL;
> > >> > +			if (rdp == rdp_spawn) {
> > >> > +				rdp = rdp->nocb_next_follower;
> > >> > +			} else {
> > >> > +				rdp_last = rdp;
> > >> > +				rdp = rdp->nocb_next_follower;
> > >> > +				rdp_last->nocb_next_follower = NULL;
> > >> > +			}
> > >> >  		} while (rdp);
> > >> >  		rdp_spawn->nocb_next_follower = rdp_old_leader;
> > >> >  	}
> > >> > 
> > 
> > ---
> > 	-Jay Vosburgh, jay.vosburgh@canonical.com
> > 

^ permalink raw reply

* Re: [net-next 0/9][pull request] Intel Wired LAN Driver Updates 2014-10-23
From: David Miller @ 2014-10-24 20:43 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: netdev, nhorman, sassmann, jogreene
In-Reply-To: <1414123806-20049-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 23 Oct 2014 21:09:57 -0700

> This series contains updates to i40e and i40evf.
> 
> Jesse modifies the i40e driver to only notify the firmware on link up/down
> and qualified module events.  Also simplified the job of managing link
> state by using the admin queue receive event for link events as a signal
> to tell the driver to update link state.
> 
> Jeff (me) cleans up the inconsistent use of tabs for indentation in the admin
> queue command header file.
> 
> Neerav converts the use of udelay() to usleep_range().
> 
> Anjali fixes a bug where receive would stop after some stress by adding
> a sleep and restart as well as moving the setting of flow control because
> it should be done at a PF level and not a VSI level.
> 
> Mitch adds code to handle link events when updating the PF switch, which
> allows link information to be properly provided to VFS in all cases.
> 
> Catherine adds driver support for 10GBaseT and bumps driver version.

I've pulled, but I absolutely don't like that change that moved the
variables out of the loop.

Putting variables in the inner-most scope of their use is always the
best choice because:

1) Auditing is easier, because you are constraining the scope in
   which you have to check if the variable is setup properly
   before it is used.

2) You are helping the compiler out, because you have done the
   majority of solving the graph coloring problem for it and the
   compiler therefore has less work to do.

Please don't make changes like this in the future, and instead do the
opposite, by moving more variable declarations into their innermost
possible scope.

Thanks.

^ permalink raw reply

* Re: [net-next 0/9][pull request] Intel Wired LAN Driver Updates 2014-10-23
From: Jeff Kirsher @ 2014-10-24 20:45 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, nhorman, sassmann, jogreene
In-Reply-To: <20141024.164327.535125094046571848.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 1970 bytes --]

On Fri, 2014-10-24 at 16:43 -0400, David Miller wrote:
> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Date: Thu, 23 Oct 2014 21:09:57 -0700
> 
> > This series contains updates to i40e and i40evf.
> > 
> > Jesse modifies the i40e driver to only notify the firmware on link up/down
> > and qualified module events.  Also simplified the job of managing link
> > state by using the admin queue receive event for link events as a signal
> > to tell the driver to update link state.
> > 
> > Jeff (me) cleans up the inconsistent use of tabs for indentation in the admin
> > queue command header file.
> > 
> > Neerav converts the use of udelay() to usleep_range().
> > 
> > Anjali fixes a bug where receive would stop after some stress by adding
> > a sleep and restart as well as moving the setting of flow control because
> > it should be done at a PF level and not a VSI level.
> > 
> > Mitch adds code to handle link events when updating the PF switch, which
> > allows link information to be properly provided to VFS in all cases.
> > 
> > Catherine adds driver support for 10GBaseT and bumps driver version.
> 
> I've pulled, but I absolutely don't like that change that moved the
> variables out of the loop.
> 
> Putting variables in the inner-most scope of their use is always the
> best choice because:
> 
> 1) Auditing is easier, because you are constraining the scope in
>    which you have to check if the variable is setup properly
>    before it is used.
> 
> 2) You are helping the compiler out, because you have done the
>    majority of solving the graph coloring problem for it and the
>    compiler therefore has less work to do.
> 
> Please don't make changes like this in the future, and instead do the
> opposite, by moving more variable declarations into their innermost
> possible scope.
> 
> Thanks.

Noted, and I have forwarded on your request on to the team so that we
are all on the same page.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PATCH net] net/sched: Fix use of wild pointer in mq_destroy() when qdisc_alloc fail
From: Cong Wang @ 2014-10-24 20:52 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: John Fastabend, wang.bo116, David Miller, netdev, cui.yunfeng
In-Reply-To: <20141024191418.GA8842@acer.localdomain>

On Fri, Oct 24, 2014 at 12:14 PM, Patrick McHardy <kaber@trash.net> wrote:
> On Fri, Oct 24, 2014 at 11:13:56AM -0700, Cong Wang wrote:
>> On Fri, Oct 24, 2014 at 10:49 AM, John Fastabend
>> <john.fastabend@gmail.com> wrote:
>> >
>> > Patch looks fine, another way to fix this would be drop the
>> > mq_destroy() call in the error path. I'm not convinced one
>> > is any better than the other but maybe some other folks have
>> > opinions, it seems a bit wrong to call mq_destroy twice so in
>> > that sense it may be a bit nicer to drop the mq_destroy().
>>
>> Dropping mq_destroy() in error path is indeed better,
>> because upper layer does cleanup intentionally.
>> Look at what other qdisc's do. :)
>
> I would argue that the qdisc_destroy() call in qdisc_create_dflt()
> is wrong, it should instead free the qdisc and release the module
> reference manually as done in qdisc_create().
>
> qdisc_destroy() should only be called for fully initialized qdiscs.

Probably, but at least ->destroy() should be called, looking at
those calling qdisc_watchdog_init(), they are supposed to call
qdisc_watchdog_cancel() when >init() fails after that.

->destroy() is supposed to be able to clean up even partially
initialized qdisc's. So, for qdisc_create_dflt() we should probably
just call ->destroy().

Reading the code again, seems it is inconsistent with qdisc_create(),
where ->destroy() is skipped when ->init() fails. Hmm, we have
a bigger problem here.

I am working on a patch now. Thanks for pointing this out.

^ permalink raw reply

* Re: localed stuck in recent 3.18 git in copy_net_ns?
From: Yanko Kaneti @ 2014-10-24 21:25 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Josh Boyer, Eric W. Biederman, Cong Wang, Kevin Fenzi, netdev,
	Linux-Kernel@Vger. Kernel. Org, jay.vosburgh, mroos
In-Reply-To: <20141024183226.GW4977@linux.vnet.ibm.com>

On Fri-10/24/14-2014 11:32, Paul E. McKenney wrote:
> On Fri, Oct 24, 2014 at 08:35:26PM +0300, Yanko Kaneti wrote:
> > On Fri-10/24/14-2014 10:20, Paul E. McKenney wrote:
> > > On Fri, Oct 24, 2014 at 08:09:31PM +0300, Yanko Kaneti wrote:
> > > > On Fri-10/24/14-2014 09:54, Paul E. McKenney wrote:
> > > > > On Fri, Oct 24, 2014 at 07:29:43PM +0300, Yanko Kaneti wrote:
> > > > > > On Fri-10/24/14-2014 08:40, Paul E. McKenney wrote:
> > > > > > > On Fri, Oct 24, 2014 at 12:08:57PM +0300, Yanko Kaneti wrote:
> > > > > > > > On Thu-10/23/14-2014 15:04, Paul E. McKenney wrote:
> > > > > > > > > On Fri, Oct 24, 2014 at 12:45:40AM +0300, Yanko Kaneti wrote:
> > > > > > > > > > 
> > > > > > > > > > On Thu, 2014-10-23 at 13:05 -0700, Paul E. McKenney wrote:
> > > > > > > > > > > On Thu, Oct 23, 2014 at 10:51:59PM +0300, Yanko Kaneti wrote:
> > > > > 
> > > > > [ . . . ]
> > > > > 
> > > > > > > > Ok, unless I've messsed up something major, bisecting points to:
> > > > > > > > 
> > > > > > > > 35ce7f29a44a rcu: Create rcuo kthreads only for onlined CPUs
> > > > > > > > 
> > > > > > > > Makes any sense ?
> > > > > > > 
> > > > > > > Good question.  ;-)
> > > > > > > 
> > > > > > > Are any of your online CPUs missing rcuo kthreads?  There should be
> > > > > > > kthreads named rcuos/0, rcuos/1, rcuos/2, and so on for each online CPU.
> > > > > > 
> > > > > > Its a Phenom II X6. With 3.17 and linux-tip with 35ce7f29a44a reverted, the rcuos are 8
> > > > > > and the modprobe ppp_generic testcase reliably works, libvirt also manages
> > > > > > to setup its bridge.
> > > > > > 
> > > > > > Just with linux-tip , the rcuos are 6 but the failure is as reliable as
> > > > > > before.
> > > > 
> > > > > Thank you, very interesting.  Which 6 of the rcuos are present?
> > > > 
> > > > Well, the rcuos are 0 to 5. Which sounds right for a 6 core CPU like this   
> > > > Phenom II.
> > > 
> > > Ah, you get 8 without the patch because it creates them for potential
> > > CPUs as well as real ones.  OK, got it.
> > > 
> > > > > > Awating instructions: :)
> > > > > 
> > > > > Well, I thought I understood the problem until you found that only 6 of
> > > > > the expected 8 rcuos are present with linux-tip without the revert.  ;-)
> > > > > 
> > > > > I am putting together a patch for the part of the problem that I think
> > > > > I understand, of course, but it would help a lot to know which two of
> > > > > the rcuos are missing.  ;-)
> > > > 
> > > > Ready to test
> > > 
> > > Well, if you are feeling aggressive, give the following patch a spin.
> > > I am doing sanity tests on it in the meantime.
> > 
> > Doesn't seem to make a difference here
> 
> OK, inspection isn't cutting it, so time for tracing.  Does the system
> respond to user input?  If so, please enable rcu:rcu_barrier ftrace before
> the problem occurs, then dump the trace buffer after the problem occurs.

Sorry for being unresposive here, but I know next to nothing about tracing
or most things about the kernel, so I have some cathing up to do.

In the meantime some layman observations while I tried to find what exactly
triggers the problem.
- Even in runlevel 1 I can reliably trigger the problem by starting libvirtd
- libvirtd seems to be very active in using all sorts of kernel facilities
  that are modules on fedora so it seems to cause many simultaneous kworker 
  calls to modprobe
- there are 8 kworker/u16 from 0 to 7
- one of these kworkers always deadlocks, while there appear to be two
  kworker/u16:6 - the seventh

  6 vs 8 as in 6 rcuos where before they were always 8

Just observations from someone who still doesn't know what the u16
kworkers are..

-- Yanko



 
> 							Thanx, Paul
> 
> > > ------------------------------------------------------------------------
> > > 
> > > diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> > > index 29fb23f33c18..927c17b081c7 100644
> > > --- a/kernel/rcu/tree_plugin.h
> > > +++ b/kernel/rcu/tree_plugin.h
> > > @@ -2546,9 +2546,13 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
> > >  			rdp->nocb_leader = rdp_spawn;
> > >  			if (rdp_last && rdp != rdp_spawn)
> > >  				rdp_last->nocb_next_follower = rdp;
> > > -			rdp_last = rdp;
> > > -			rdp = rdp->nocb_next_follower;
> > > -			rdp_last->nocb_next_follower = NULL;
> > > +			if (rdp == rdp_spawn) {
> > > +				rdp = rdp->nocb_next_follower;
> > > +			} else {
> > > +				rdp_last = rdp;
> > > +				rdp = rdp->nocb_next_follower;
> > > +				rdp_last->nocb_next_follower = NULL;
> > > +			}
> > >  		} while (rdp);
> > >  		rdp_spawn->nocb_next_follower = rdp_old_leader;
> > >  	}
> > > 
> > 
> 

^ permalink raw reply

* RE: ixgbe driver fails occasionally since ee98b577e7711d5890ded2c7b05578a29512bd39
From: Tantilov, Emil S @ 2014-10-24 21:33 UTC (permalink / raw)
  To: Scott Harrison; +Cc: netdev@vger.kernel.org
In-Reply-To: <20141024091828.GA1488@cisco.com>

>-----Original Message-----
>From: Scott Harrison [mailto:scoharr2@cisco.com]
>Sent: Friday, October 24, 2014 2:18 AM
>To: Tantilov, Emil S
>Cc: netdev@vger.kernel.org
>Subject: Re: ixgbe driver fails occasionally since ee98b577e7711d5890ded2c7b05578a29512bd39
>
>Emil,
>
>Sorry, I should have looked harder it happens as part of ifup eth0, we have
>"/sbin/ethtool -s eth0 autoneg on" in the interfaces file.

Well you can avoid this issue by removing this command since it is not needed for anything.
The interface will auto-negotiate by default.

We were able to reproduce this issue and will work on a fix.

Thanks a lot for reporting it.

Emil

>
>HTH.
>
>Scott.

^ permalink raw reply

* Re: [PATCH net-next 1/4] net: bcmgenet: consistently use UMAC_IRQ_RXDMA_MASK
From: Petri Gynther @ 2014-10-24 21:39 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, David Miller
In-Reply-To: <1414180942-2132-2-git-send-email-f.fainelli@gmail.com>

Hi Florian,

On Fri, Oct 24, 2014 at 1:02 PM, Florian Fainelli <f.fainelli@gmail.com> wrote:
> Define a constant which sets all RXDMA interrupts bits, and use it
> consistently throughout the driver to check for RX DMA interrupt bits.
>
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>  drivers/net/ethernet/broadcom/genet/bcmgenet.c | 14 +++++++++-----
>  1 file changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> index fdc9ec09e453..ee4d5baf09b6 100644
> --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> @@ -47,6 +47,10 @@
>
>  #include "bcmgenet.h"
>
> +#define UMAC_IRQ_RXDMA_MASK    (UMAC_IRQ_RXDMA_BDONE | UMAC_IRQ_RXDMA_PDONE)
> +#define UMAC_IRQ_TXDMA_MASK    (UMAC_IRQ_TXDMA_BDONE | UMAC_IRQ_TXDMA_PDONE)
> +#define UMAC_IRQ_RX_TX_MASK    (UMAC_IRQ_RXDMA_MASK | UMAC_IRQ_TXDMA_MASK)
> +

How about putting these in bcmgenet.h, just below the other UMAC_IRQ_*
definitions?

>  /* Maximum number of hardware queues, downsized if needed */
>  #define GENET_MAX_MQ_CNT       4
>
> @@ -1543,9 +1547,9 @@ static int init_umac(struct bcmgenet_priv *priv)
>
>         bcmgenet_intr_disable(priv);
>
> -       cpu_mask_clear = UMAC_IRQ_RXDMA_BDONE;
> +       cpu_mask_clear = UMAC_IRQ_RXDMA_MASK;
>
> -       dev_dbg(kdev, "%s:Enabling RXDMA_BDONE interrupt\n", __func__);
> +       dev_dbg(kdev, "%s:Enabling RXDMA interrupts\n", __func__);
>
>         /* Monitor cable plug/unplugged event for internal PHY */
>         if (phy_is_internal(priv->phydev)) {
> @@ -1883,7 +1887,7 @@ static int bcmgenet_poll(struct napi_struct *napi, int budget)
>                                   priv->rx_c_index, RDMA_CONS_INDEX);
>         if (work_done < budget) {
>                 napi_complete(napi);
> -               bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RXDMA_BDONE,
> +               bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RXDMA_MASK,
>                                          INTRL2_CPU_MASK_CLEAR);
>         }
>
> @@ -1958,13 +1962,13 @@ static irqreturn_t bcmgenet_isr0(int irq, void *dev_id)
>         netif_dbg(priv, intr, priv->dev,
>                   "IRQ=0x%x\n", priv->irq0_stat);
>
> -       if (priv->irq0_stat & (UMAC_IRQ_RXDMA_BDONE | UMAC_IRQ_RXDMA_PDONE)) {
> +       if (priv->irq0_stat & UMAC_IRQ_RXDMA_MASK) {
>                 /* We use NAPI(software interrupt throttling, if
>                  * Rx Descriptor throttling is not used.
>                  * Disable interrupt, will be enabled in the poll method.
>                  */
>                 if (likely(napi_schedule_prep(&priv->napi))) {
> -                       bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RXDMA_BDONE,
> +                       bcmgenet_intrl2_0_writel(priv, UMAC_IRQ_RXDMA_MASK,
>                                                  INTRL2_CPU_MASK_SET);
>                         __napi_schedule(&priv->napi);
>                 }
> --
> 1.9.1
>

^ permalink raw reply

* Re: [net v2] iptunnel: Fix iptunnel_xmit return code for stats maintenance
From: Pravin Shelar @ 2014-10-24 21:41 UTC (permalink / raw)
  To: Andy Zhou; +Cc: David Miller, netdev
In-Reply-To: <1414127459-16128-1-git-send-email-azhou@nicira.com>

On Thu, Oct 23, 2014 at 10:10 PM, Andy Zhou <azhou@nicira.com> wrote:
> iptunnel_xmit() currently always return >= 0 instead of proper error
> code, that is used to maintain stats. For example, current return code
> conflicts with how iptunnel_xmit_stats() maintains stats.
>
> Unfortunately, the return code can not be changed without readjusting
> how SKB memory is managed through the call chain.  The following two
> rules are adopted for this patch:
>
> 1) Proper error code are always propagate back through the call chain
>    so that the caller can maintain stats.
>
> 2) Tunnel xmit functions always free resources, e.g. skb and route
>    entry.
>
> Signed-off-by: Andy Zhou <azhou@nicira.com>
>
> -----
> V1->v2:  Address pravin's review comments:
>          * fix error path memory leak in gre_tnl_send()
>          * Keep error counting consistent between openvswitch vport
>            and iptunnel_xmit_stats()
>          Sending out for net, rather than net-next, as a bug fix.
> ---
>  drivers/net/vxlan.c            |   21 +++++++++++++--------
>  include/net/ip_tunnels.h       |    7 +++++++
>  net/ipv4/geneve.c              |    8 ++++++--
>  net/ipv4/ip_tunnel_core.c      |   14 +++++++++++---
>  net/openvswitch/vport-geneve.c |    5 ++---
>  net/openvswitch/vport-gre.c    |    7 +++++--
>  net/openvswitch/vport-vxlan.c  |    6 +++---
>  net/openvswitch/vport.c        |    1 -
>  8 files changed, 47 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index ca30982..93348cb 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -1626,8 +1626,10 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
>         bool udp_sum = !vs->sock->sk->sk_no_check_tx;
>
>         skb = udp_tunnel_handle_offloads(skb, udp_sum);
> -       if (IS_ERR(skb))
> -               return -EINVAL;
> +       if (IS_ERR(skb)) {
> +               err = -EINVAL;
> +               goto error;
> +       }
>
>         min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
>                         + VXLAN_HLEN + sizeof(struct iphdr)
> @@ -1636,13 +1638,15 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
>         /* Need space for new headers (invalidates iph ptr) */
>         err = skb_cow_head(skb, min_headroom);
>         if (unlikely(err))
> -               return err;
> +               goto error;
>
>         if (vlan_tx_tag_present(skb)) {
>                 if (WARN_ON(!__vlan_put_tag(skb,
>                                             skb->vlan_proto,
> -                                           vlan_tx_tag_get(skb))))
> -                       return -ENOMEM;
> +                                           vlan_tx_tag_get(skb)))) {
> +                       err = -ENOMEM;
> +                       goto error;
> +               }
>
I just noticed that __vlan_put_tag() frees skb in case of error, so
goto results in double free.
maybe it is time we improve __vlan_put_tag() API, so that it is easier to use.

>                 skb->vlan_tci = 0;
>         }
> @@ -1655,6 +1659,10 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
>
>         return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
>                                    ttl, df, src_port, dst_port, xnet);
> +error:
> +       kfree_skb(skb);
> +       ip_rt_put(rt);
> +       return err;
>  }
>  EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
>
> @@ -1786,9 +1794,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>                                      tos, ttl, df, src_port, dst_port,
>                                      htonl(vni << 8),
>                                      !net_eq(vxlan->net, dev_net(vxlan->dev)));
> -
> -               if (err < 0)
> -                       goto rt_tx_error;
>                 iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
>  #if IS_ENABLED(CONFIG_IPV6)
>         } else {
> diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
> index 5bc6ede..80bcf2e 100644
> --- a/include/net/ip_tunnels.h
> +++ b/include/net/ip_tunnels.h
> @@ -174,6 +174,13 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
>  }
>
>  int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
> +
> +/* Transmit a packet over IP tunnel
> + * Returns:
> + *     0 Congestion notification received
> + *     >0  Number of bytes in the packet successfully sent
> + *     <0 packet dropped due to error
> + */
>  int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
>                   __be32 src, __be32 dst, __u8 proto,
>                   __u8 tos, __u8 ttl, __be16 df, bool xnet);
> diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
> index 065cd94..90fea48 100644
> --- a/net/ipv4/geneve.c
> +++ b/net/ipv4/geneve.c
> @@ -129,14 +129,14 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
>
>         err = skb_cow_head(skb, min_headroom);
>         if (unlikely(err))
> -               return err;
> +               goto error;
>
>         if (vlan_tx_tag_present(skb)) {
>                 if (unlikely(!__vlan_put_tag(skb,
>                                              skb->vlan_proto,
>                                              vlan_tx_tag_get(skb)))) {
>                         err = -ENOMEM;
> -                       return err;
> +                       goto error;
>                 }
>                 skb->vlan_tci = 0;
>         }
> @@ -146,6 +146,10 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
>
>         return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst,
>                                    tos, ttl, df, src_port, dst_port, xnet);
> +error:
> +       kfree_skb(skb);
> +       ip_rt_put(rt);
> +       return err;
>  }
>  EXPORT_SYMBOL_GPL(geneve_xmit_skb);
>
> diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
> index 88c386c..b3ba4a3 100644
> --- a/net/ipv4/ip_tunnel_core.c
> +++ b/net/ipv4/ip_tunnel_core.c
> @@ -77,9 +77,17 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
>         __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
>
>         err = ip_local_out_sk(sk, skb);
> -       if (unlikely(net_xmit_eval(err)))
> -               pkt_len = 0;
> -       return pkt_len;
> +
> +       /* Deal with positive error numbers. Filter out NET_XMIT_CN */
> +       if (err > 0)
> +               return net_xmit_errno(err);
> +
> +       /* Success, return number of bytes transmitted */
> +       if (err == 0)
> +               err = pkt_len;
> +
> +       /* Return pkt_len or an error code */
> +       return err;
>  }
>  EXPORT_SYMBOL_GPL(iptunnel_xmit);
>
> diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
> index 106a9d8..34276fb 100644
> --- a/net/openvswitch/vport-geneve.c
> +++ b/net/openvswitch/vport-geneve.c
> @@ -206,15 +206,14 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
>         tunnel_id_to_vni(tun_key->tun_id, vni);
>         skb->ignore_df = 1;
>
> -       err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
> +       return  geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
>                               tun_key->ipv4_dst, tun_key->ipv4_tos,
>                               tun_key->ipv4_ttl, df, sport, dport,
>                               tun_key->tun_flags, vni,
>                               tun_info->options_len, (u8 *)tun_info->options,
>                               false);
> -       if (err < 0)
> -               ip_rt_put(rt);
>  error:
> +       kfree_skb(skb);
>         return err;
>  }
>
> diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
> index 108b82d..c0ec43f 100644
> --- a/net/openvswitch/vport-gre.c
> +++ b/net/openvswitch/vport-gre.c
> @@ -154,8 +154,10 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
>         fl.flowi4_proto = IPPROTO_GRE;
>
>         rt = ip_route_output_key(net, &fl);
> -       if (IS_ERR(rt))
> -               return PTR_ERR(rt);
> +       if (IS_ERR(rt)) {
> +               err = PTR_ERR(rt);
> +               goto error;
> +       }
>
>         tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags);
>
> @@ -200,6 +202,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
>  err_free_rt:
>         ip_rt_put(rt);
>  error:
> +       kfree_skb(skb);
>         return err;
>  }
>
> diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
> index 2735e01..ace849a 100644
> --- a/net/openvswitch/vport-vxlan.c
> +++ b/net/openvswitch/vport-vxlan.c
> @@ -174,15 +174,15 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
>
>         src_port = udp_flow_src_port(net, skb, 0, 0, true);
>
> -       err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
> +       return vxlan_xmit_skb(vxlan_port->vs, rt, skb,
>                              fl.saddr, tun_key->ipv4_dst,
>                              tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
>                              src_port, dst_port,
>                              htonl(be64_to_cpu(tun_key->tun_id) << 8),
>                              false);
> -       if (err < 0)
> -               ip_rt_put(rt);
> +
>  error:
> +       kfree_skb(skb);
>         return err;
>  }
>
> diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
> index 6015802..da24d32 100644
> --- a/net/openvswitch/vport.c
> +++ b/net/openvswitch/vport.c
> @@ -482,7 +482,6 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
>                 u64_stats_update_end(&stats->syncp);
>         } else if (sent < 0) {
>                 ovs_vport_record_error(vport, VPORT_E_TX_ERROR);
> -               kfree_skb(skb);
>         } else
>                 ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
>
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v2 0/2] DSA tagging mismatches
From: Andrew Lunn @ 2014-10-24 21:44 UTC (permalink / raw)
  To: davem; +Cc: f.fainelli, netdev, Andrew Lunn

The second patch is a fix, which should be applied to -rc. It is
possible to get a DSA configuration which does not work. The patch
stops this happening.

The first patch detects this situation, and errors out the probe of
DSA, making it more obvious something is wrong. It is not required to
apply it -rc.

v2 fixes the use case pointed out by Florian, that a switch driver
may use DSA_TAG_PROTO_NONE which the patch did not correctly handle.

Andrew Lunn (2):
  net: dsa: Error out on tagging protocol mismatches
  dsa: mv88e6171: Fix tagging protocol/Kconfig

 drivers/net/dsa/mv88e6171.c | 2 +-
 net/dsa/dsa.c               | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

-- 
2.1.1

^ permalink raw reply

* [PATCH v2 2/2] dsa: mv88e6171: Fix tagging protocol/Kconfig
From: Andrew Lunn @ 2014-10-24 21:44 UTC (permalink / raw)
  To: davem; +Cc: f.fainelli, netdev, Andrew Lunn
In-Reply-To: <1414187045-8326-1-git-send-email-andrew@lunn.ch>

The mv88e6171 can support two different tagging protocols, DSA and
EDSA. The switch driver structure only allows one protocol to be
enumerated, and DSA was chosen. However the Kconfig entry ensures the
EDSA tagging code is built. With a minimal configuration, we then end
up with a mismatch. The probe is successful, EDSA tagging is used, but
the switch is configured for DSA, resulting in mangled packets.

Change the switch driver structure to enumerate EDSA, fixing the
mismatch.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Fixes: 42f272539487 ("net: DSA: Marvell mv88e6171 switch driver")
---
 drivers/net/dsa/mv88e6171.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
index 1020a7af67cf..78d8e876f3aa 100644
--- a/drivers/net/dsa/mv88e6171.c
+++ b/drivers/net/dsa/mv88e6171.c
@@ -395,7 +395,7 @@ static int mv88e6171_get_sset_count(struct dsa_switch *ds)
 }

 struct dsa_switch_driver mv88e6171_switch_driver = {
-	.tag_protocol		= DSA_TAG_PROTO_DSA,
+	.tag_protocol		= DSA_TAG_PROTO_EDSA,
 	.priv_size		= sizeof(struct mv88e6xxx_priv_state),
 	.probe			= mv88e6171_probe,
 	.setup			= mv88e6171_setup,
-- 
2.1.1

^ permalink raw reply related

* [PATCH v2 1/2] net: dsa: Error out on tagging protocol mismatches
From: Andrew Lunn @ 2014-10-24 21:44 UTC (permalink / raw)
  To: davem; +Cc: f.fainelli, netdev, Andrew Lunn, alexander.h.duyck
In-Reply-To: <1414187045-8326-1-git-send-email-andrew@lunn.ch>

If there is a mismatch between enabled tagging protocols and the
protocol the switch supports, error out, rather than continue with a
situation which is unlikely to work.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
cc: alexander.h.duyck@intel.com
---

v2: Handle the use case of DSA_TAG_PROTO_NONE

 net/dsa/dsa.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 22f34cf4cb27..6317b41c99b0 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -174,8 +174,11 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,
 			dst->rcv = brcm_netdev_ops.rcv;
 			break;
 #endif
-		default:
+		case DSA_TAG_PROTO_NONE:
 			break;
+		default:
+			ret = -ENOPROTOOPT;
+			goto out;
 		}
 
 		dst->tag_protocol = drv->tag_protocol;
-- 
2.1.1

^ permalink raw reply related

* Re: [PATCH net] net/sched: Fix use of wild pointer in mq_destroy() when qdisc_alloc fail
From: Patrick McHardy @ 2014-10-24 21:45 UTC (permalink / raw)
  To: Cong Wang; +Cc: John Fastabend, wang.bo116, David Miller, netdev, cui.yunfeng
In-Reply-To: <CAHA+R7PDxYeVuQZz=VSci8_7sZ9smb5c59ivOS=B6GemJwtcCg@mail.gmail.com>

On Fri, Oct 24, 2014 at 01:52:00PM -0700, Cong Wang wrote:
> On Fri, Oct 24, 2014 at 12:14 PM, Patrick McHardy <kaber@trash.net> wrote:
> > On Fri, Oct 24, 2014 at 11:13:56AM -0700, Cong Wang wrote:
> >> On Fri, Oct 24, 2014 at 10:49 AM, John Fastabend
> >> <john.fastabend@gmail.com> wrote:
> >> >
> >> > Patch looks fine, another way to fix this would be drop the
> >> > mq_destroy() call in the error path. I'm not convinced one
> >> > is any better than the other but maybe some other folks have
> >> > opinions, it seems a bit wrong to call mq_destroy twice so in
> >> > that sense it may be a bit nicer to drop the mq_destroy().
> >>
> >> Dropping mq_destroy() in error path is indeed better,
> >> because upper layer does cleanup intentionally.
> >> Look at what other qdisc's do. :)
> >
> > I would argue that the qdisc_destroy() call in qdisc_create_dflt()
> > is wrong, it should instead free the qdisc and release the module
> > reference manually as done in qdisc_create().
> >
> > qdisc_destroy() should only be called for fully initialized qdiscs.
> 
> Probably, but at least ->destroy() should be called, looking at
> those calling qdisc_watchdog_init(), they are supposed to call
> qdisc_watchdog_cancel() when >init() fails after that.

In which cases does it actually fail after that? Usually this is
called once initialization is complete.

> ->destroy() is supposed to be able to clean up even partially
> initialized qdisc's. So, for qdisc_create_dflt() we should probably
> just call ->destroy().

No, why do you think that? ->init() is supposed to clean up after
itself if it fails, both qdisc_destroy() and ->destroy are only
supposed to be called after ->init() succeeded.

Its simply symetrical, as everywhere else in the kernel. If a sub-init
funtion fails, it should clean up and return an error. We don't
destroy things we've never successfully initialized, they're supposed
to clean up after themselves.

> Reading the code again, seems it is inconsistent with qdisc_create(),
> where ->destroy() is skipped when ->init() fails. Hmm, we have
> a bigger problem here.
> 
> I am working on a patch now. Thanks for pointing this out.

No, that is not inconsistent, it is consistent with what we're doing
everywhere else in the kernel, what the qdisc layer has always done
and how qdiscs expect it.

Where are you seeing a problem now? Its a simple fix:
qdisc_create_dflt() should behave similar to qdisc_create().

If any ->init() functions than don't properly clean up on error, that's
an additional bug.

^ permalink raw reply

* Re: localed stuck in recent 3.18 git in copy_net_ns?
From: Paul E. McKenney @ 2014-10-24 21:49 UTC (permalink / raw)
  To: Yanko Kaneti
  Cc: Josh Boyer, Eric W. Biederman, Cong Wang, Kevin Fenzi, netdev,
	Linux-Kernel@Vger. Kernel. Org, jay.vosburgh, mroos, tj
In-Reply-To: <20141024212557.GA15537@declera.com>

On Sat, Oct 25, 2014 at 12:25:57AM +0300, Yanko Kaneti wrote:
> On Fri-10/24/14-2014 11:32, Paul E. McKenney wrote:
> > On Fri, Oct 24, 2014 at 08:35:26PM +0300, Yanko Kaneti wrote:
> > > On Fri-10/24/14-2014 10:20, Paul E. McKenney wrote:

[ . . . ]

> > > > Well, if you are feeling aggressive, give the following patch a spin.
> > > > I am doing sanity tests on it in the meantime.
> > > 
> > > Doesn't seem to make a difference here
> > 
> > OK, inspection isn't cutting it, so time for tracing.  Does the system
> > respond to user input?  If so, please enable rcu:rcu_barrier ftrace before
> > the problem occurs, then dump the trace buffer after the problem occurs.
> 
> Sorry for being unresposive here, but I know next to nothing about tracing
> or most things about the kernel, so I have some cathing up to do.
> 
> In the meantime some layman observations while I tried to find what exactly
> triggers the problem.
> - Even in runlevel 1 I can reliably trigger the problem by starting libvirtd
> - libvirtd seems to be very active in using all sorts of kernel facilities
>   that are modules on fedora so it seems to cause many simultaneous kworker 
>   calls to modprobe
> - there are 8 kworker/u16 from 0 to 7
> - one of these kworkers always deadlocks, while there appear to be two
>   kworker/u16:6 - the seventh

Adding Tejun on CC in case this duplication of kworker/u16:6 is important.

>   6 vs 8 as in 6 rcuos where before they were always 8
> 
> Just observations from someone who still doesn't know what the u16
> kworkers are..

Could you please run the following diagnostic patch?  This will help
me see if I have managed to miswire the rcuo kthreads.  It should
print some information at task-hang time.

							Thanx, Paul

------------------------------------------------------------------------

rcu: Dump no-CBs CPU state at task-hung time

Strictly diagnostic commit for rcu_barrier() hang.  Not for inclusion.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 0e5366200154..34048140577b 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -157,4 +157,8 @@ static inline bool rcu_is_watching(void)
 
 #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
 
+static inline void rcu_show_nocb_setup(void)
+{
+}
+
 #endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 52953790dcca..0b813bdb971b 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -97,4 +97,6 @@ extern int rcu_scheduler_active __read_mostly;
 
 bool rcu_is_watching(void);
 
+void rcu_show_nocb_setup(void);
+
 #endif /* __LINUX_RCUTREE_H */
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 06db12434d72..e6e4d0f6b063 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -118,6 +118,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 		" disables this message.\n");
 	sched_show_task(t);
 	debug_show_held_locks(t);
+	rcu_show_nocb_setup();
 
 	touch_nmi_watchdog();
 
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 240fa9094f83..6b373e79ce0e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1513,6 +1513,7 @@ rcu_torture_cleanup(void)
 {
 	int i;
 
+	rcu_show_nocb_setup();
 	rcutorture_record_test_transition();
 	if (torture_cleanup_begin()) {
 		if (cur_ops->cb_barrier != NULL)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 927c17b081c7..285b3f6fb229 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2699,6 +2699,31 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
 
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 
+void rcu_show_nocb_setup(void)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+	int cpu;
+	struct rcu_data *rdp;
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp) {
+		pr_alert("rcu_show_nocb_setup(): %s nocb state:\n", rsp->name);
+		for_each_possible_cpu(cpu) {
+			if (!rcu_is_nocb_cpu(cpu))
+				continue;
+			rdp = per_cpu_ptr(rsp->rda, cpu);
+			pr_alert("%3d: %p l:%p n:%p %c%c%c\n",
+				 cpu,
+				 rdp, rdp->nocb_leader, rdp->nocb_next_follower,
+				 ".N"[!!rdp->nocb_head],
+				 ".G"[!!rdp->nocb_gp_head],
+				 ".F"[!!rdp->nocb_follower_head]);
+		}
+	}
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+}
+EXPORT_SYMBOL_GPL(rcu_show_nocb_setup);
+
 /*
  * An adaptive-ticks CPU can potentially execute in kernel mode for an
  * arbitrarily long period of time with the scheduling-clock tick turned

^ permalink raw reply related

* Re: [PATCH] ovs: Turn vports with dependencies into separate modules
From: Thomas Graf @ 2014-10-24 21:57 UTC (permalink / raw)
  To: Pravin Shelar; +Cc: dev@openvswitch.org, netdev
In-Reply-To: <CALnjE+o+aSE0cGnS60p_4ff6mhddK8oB0e0xMi+L0p1aa-P9vA@mail.gmail.com>

On 10/24/14 at 10:47am, Pravin Shelar wrote:
> On Wed, Oct 22, 2014 at 8:29 AM, Thomas Graf <tgraf@suug.ch> wrote:
> > The internal and netdev vport remain part of openvswitch.ko. Encap
> > vports including vxlan, gre, and geneve can be built as separate
> > modules and are loaded on demand. Modules can be unloaded after use.
> > Datapath ports keep a reference to the vport module during their
> > lifetime.
> >
> > Allows to remove the error prone maintenance of the global list
> > vport_ops_list.
> >
> How error prone is this interface, can you give example? Set of ovs
> vport type is been pretty stable, so am not sure if we need loadable
> module support for vports implementations.

I was refering to how many other kernel APIs have been designed, a
registration API allowing a vport to be implemented exclusively in the
scope of a single file tends to be cleaner than having to touch multiple
files and maintaining an init list.

It also allows for OVS to be built into vmlinuz while vports can
remain as modules even if vxlan itself is built as a module.

As for new vports, GUE and LIS are candidates, encrypted VXLAN might
look for support and there are several VXLAN extensions currently
proposed as IETF drafts which might require new vports.

^ permalink raw reply

* Re: localed stuck in recent 3.18 git in copy_net_ns?
From: Jay Vosburgh @ 2014-10-24 22:02 UTC (permalink / raw)
  To: paulmck
  Cc: Yanko Kaneti, Josh Boyer, Eric W. Biederman, Cong Wang,
	Kevin Fenzi, netdev, Linux-Kernel@Vger. Kernel. Org, mroos, tj
In-Reply-To: <20141024214927.GA4977@linux.vnet.ibm.com>

Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

>On Sat, Oct 25, 2014 at 12:25:57AM +0300, Yanko Kaneti wrote:
>> On Fri-10/24/14-2014 11:32, Paul E. McKenney wrote:
>> > On Fri, Oct 24, 2014 at 08:35:26PM +0300, Yanko Kaneti wrote:
>> > > On Fri-10/24/14-2014 10:20, Paul E. McKenney wrote:
>
>[ . . . ]
>
>> > > > Well, if you are feeling aggressive, give the following patch a spin.
>> > > > I am doing sanity tests on it in the meantime.
>> > > 
>> > > Doesn't seem to make a difference here
>> > 
>> > OK, inspection isn't cutting it, so time for tracing.  Does the system
>> > respond to user input?  If so, please enable rcu:rcu_barrier ftrace before
>> > the problem occurs, then dump the trace buffer after the problem occurs.
>> 
>> Sorry for being unresposive here, but I know next to nothing about tracing
>> or most things about the kernel, so I have some cathing up to do.
>> 
>> In the meantime some layman observations while I tried to find what exactly
>> triggers the problem.
>> - Even in runlevel 1 I can reliably trigger the problem by starting libvirtd
>> - libvirtd seems to be very active in using all sorts of kernel facilities
>>   that are modules on fedora so it seems to cause many simultaneous kworker 
>>   calls to modprobe
>> - there are 8 kworker/u16 from 0 to 7
>> - one of these kworkers always deadlocks, while there appear to be two
>>   kworker/u16:6 - the seventh
>
>Adding Tejun on CC in case this duplication of kworker/u16:6 is important.
>
>>   6 vs 8 as in 6 rcuos where before they were always 8
>> 
>> Just observations from someone who still doesn't know what the u16
>> kworkers are..
>
>Could you please run the following diagnostic patch?  This will help
>me see if I have managed to miswire the rcuo kthreads.  It should
>print some information at task-hang time.

	I can give this a spin after the ftrace (now that I've got
CONFIG_RCU_TRACE turned on).

	I've got an ftrace capture from unmodified -net, it looks like
this:

    ovs-vswitchd-902   [000] ....   471.778441: rcu_barrier: rcu_sched Begin cpu -1 remaining 0 # 0
    ovs-vswitchd-902   [000] ....   471.778452: rcu_barrier: rcu_sched Check cpu -1 remaining 0 # 0
    ovs-vswitchd-902   [000] ....   471.778452: rcu_barrier: rcu_sched Inc1 cpu -1 remaining 0 # 1
    ovs-vswitchd-902   [000] ....   471.778453: rcu_barrier: rcu_sched OnlineNoCB cpu 0 remaining 1 # 1
    ovs-vswitchd-902   [000] ....   471.778453: rcu_barrier: rcu_sched OnlineNoCB cpu 1 remaining 2 # 1
    ovs-vswitchd-902   [000] ....   471.778453: rcu_barrier: rcu_sched OnlineNoCB cpu 2 remaining 3 # 1
    ovs-vswitchd-902   [000] ....   471.778454: rcu_barrier: rcu_sched OnlineNoCB cpu 3 remaining 4 # 1
    ovs-vswitchd-902   [000] ....   471.778454: rcu_barrier: rcu_sched Inc2 cpu -1 remaining 4 # 2
         rcuos/0-9     [000] ..s.   471.793150: rcu_barrier: rcu_sched CB cpu -1 remaining 3 # 2
         rcuos/1-18    [001] ..s.   471.793308: rcu_barrier: rcu_sched CB cpu -1 remaining 2 # 2

	I let it sit through several "hung task" cycles but that was all
there was for rcu:rcu_barrier.

	I should have ftrace with the patch as soon as the kernel is
done building, then I can try the below patch (I'll start it building
now).

	-J




>							Thanx, Paul
>
>------------------------------------------------------------------------
>
>rcu: Dump no-CBs CPU state at task-hung time
>
>Strictly diagnostic commit for rcu_barrier() hang.  Not for inclusion.
>
>Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>
>diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
>index 0e5366200154..34048140577b 100644
>--- a/include/linux/rcutiny.h
>+++ b/include/linux/rcutiny.h
>@@ -157,4 +157,8 @@ static inline bool rcu_is_watching(void)
> 
> #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
> 
>+static inline void rcu_show_nocb_setup(void)
>+{
>+}
>+
> #endif /* __LINUX_RCUTINY_H */
>diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
>index 52953790dcca..0b813bdb971b 100644
>--- a/include/linux/rcutree.h
>+++ b/include/linux/rcutree.h
>@@ -97,4 +97,6 @@ extern int rcu_scheduler_active __read_mostly;
> 
> bool rcu_is_watching(void);
> 
>+void rcu_show_nocb_setup(void);
>+
> #endif /* __LINUX_RCUTREE_H */
>diff --git a/kernel/hung_task.c b/kernel/hung_task.c
>index 06db12434d72..e6e4d0f6b063 100644
>--- a/kernel/hung_task.c
>+++ b/kernel/hung_task.c
>@@ -118,6 +118,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
> 		" disables this message.\n");
> 	sched_show_task(t);
> 	debug_show_held_locks(t);
>+	rcu_show_nocb_setup();
> 
> 	touch_nmi_watchdog();
> 
>diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
>index 240fa9094f83..6b373e79ce0e 100644
>--- a/kernel/rcu/rcutorture.c
>+++ b/kernel/rcu/rcutorture.c
>@@ -1513,6 +1513,7 @@ rcu_torture_cleanup(void)
> {
> 	int i;
> 
>+	rcu_show_nocb_setup();
> 	rcutorture_record_test_transition();
> 	if (torture_cleanup_begin()) {
> 		if (cur_ops->cb_barrier != NULL)
>diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
>index 927c17b081c7..285b3f6fb229 100644
>--- a/kernel/rcu/tree_plugin.h
>+++ b/kernel/rcu/tree_plugin.h
>@@ -2699,6 +2699,31 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
> 
> #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
> 
>+void rcu_show_nocb_setup(void)
>+{
>+#ifdef CONFIG_RCU_NOCB_CPU
>+	int cpu;
>+	struct rcu_data *rdp;
>+	struct rcu_state *rsp;
>+
>+	for_each_rcu_flavor(rsp) {
>+		pr_alert("rcu_show_nocb_setup(): %s nocb state:\n", rsp->name);
>+		for_each_possible_cpu(cpu) {
>+			if (!rcu_is_nocb_cpu(cpu))
>+				continue;
>+			rdp = per_cpu_ptr(rsp->rda, cpu);
>+			pr_alert("%3d: %p l:%p n:%p %c%c%c\n",
>+				 cpu,
>+				 rdp, rdp->nocb_leader, rdp->nocb_next_follower,
>+				 ".N"[!!rdp->nocb_head],
>+				 ".G"[!!rdp->nocb_gp_head],
>+				 ".F"[!!rdp->nocb_follower_head]);
>+		}
>+	}
>+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
>+}
>+EXPORT_SYMBOL_GPL(rcu_show_nocb_setup);
>+
> /*
>  * An adaptive-ticks CPU can potentially execute in kernel mode for an
>  * arbitrarily long period of time with the scheduling-clock tick turned
>

---
	-Jay Vosburgh, jay.vosburgh@canonical.com

^ permalink raw reply

* Re: localed stuck in recent 3.18 git in copy_net_ns?
From: Paul E. McKenney @ 2014-10-24 22:16 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Yanko Kaneti, Josh Boyer, Eric W. Biederman, Cong Wang,
	Kevin Fenzi, netdev, Linux-Kernel@Vger. Kernel. Org, mroos, tj
In-Reply-To: <8451.1414188124@famine>

On Fri, Oct 24, 2014 at 03:02:04PM -0700, Jay Vosburgh wrote:
> Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> >On Sat, Oct 25, 2014 at 12:25:57AM +0300, Yanko Kaneti wrote:
> >> On Fri-10/24/14-2014 11:32, Paul E. McKenney wrote:
> >> > On Fri, Oct 24, 2014 at 08:35:26PM +0300, Yanko Kaneti wrote:
> >> > > On Fri-10/24/14-2014 10:20, Paul E. McKenney wrote:
> >
> >[ . . . ]
> >
> >> > > > Well, if you are feeling aggressive, give the following patch a spin.
> >> > > > I am doing sanity tests on it in the meantime.
> >> > > 
> >> > > Doesn't seem to make a difference here
> >> > 
> >> > OK, inspection isn't cutting it, so time for tracing.  Does the system
> >> > respond to user input?  If so, please enable rcu:rcu_barrier ftrace before
> >> > the problem occurs, then dump the trace buffer after the problem occurs.
> >> 
> >> Sorry for being unresposive here, but I know next to nothing about tracing
> >> or most things about the kernel, so I have some cathing up to do.
> >> 
> >> In the meantime some layman observations while I tried to find what exactly
> >> triggers the problem.
> >> - Even in runlevel 1 I can reliably trigger the problem by starting libvirtd
> >> - libvirtd seems to be very active in using all sorts of kernel facilities
> >>   that are modules on fedora so it seems to cause many simultaneous kworker 
> >>   calls to modprobe
> >> - there are 8 kworker/u16 from 0 to 7
> >> - one of these kworkers always deadlocks, while there appear to be two
> >>   kworker/u16:6 - the seventh
> >
> >Adding Tejun on CC in case this duplication of kworker/u16:6 is important.
> >
> >>   6 vs 8 as in 6 rcuos where before they were always 8
> >> 
> >> Just observations from someone who still doesn't know what the u16
> >> kworkers are..
> >
> >Could you please run the following diagnostic patch?  This will help
> >me see if I have managed to miswire the rcuo kthreads.  It should
> >print some information at task-hang time.
> 
> 	I can give this a spin after the ftrace (now that I've got
> CONFIG_RCU_TRACE turned on).
> 
> 	I've got an ftrace capture from unmodified -net, it looks like
> this:
> 
>     ovs-vswitchd-902   [000] ....   471.778441: rcu_barrier: rcu_sched Begin cpu -1 remaining 0 # 0
>     ovs-vswitchd-902   [000] ....   471.778452: rcu_barrier: rcu_sched Check cpu -1 remaining 0 # 0
>     ovs-vswitchd-902   [000] ....   471.778452: rcu_barrier: rcu_sched Inc1 cpu -1 remaining 0 # 1
>     ovs-vswitchd-902   [000] ....   471.778453: rcu_barrier: rcu_sched OnlineNoCB cpu 0 remaining 1 # 1
>     ovs-vswitchd-902   [000] ....   471.778453: rcu_barrier: rcu_sched OnlineNoCB cpu 1 remaining 2 # 1
>     ovs-vswitchd-902   [000] ....   471.778453: rcu_barrier: rcu_sched OnlineNoCB cpu 2 remaining 3 # 1
>     ovs-vswitchd-902   [000] ....   471.778454: rcu_barrier: rcu_sched OnlineNoCB cpu 3 remaining 4 # 1

OK, so it looks like your system has four CPUs, and rcu_barrier() placed
callbacks on them all.

>     ovs-vswitchd-902   [000] ....   471.778454: rcu_barrier: rcu_sched Inc2 cpu -1 remaining 4 # 2

The above removes the extra count used to avoid races between posting new
callbacks and completion of previously posted callbacks.

>          rcuos/0-9     [000] ..s.   471.793150: rcu_barrier: rcu_sched CB cpu -1 remaining 3 # 2
>          rcuos/1-18    [001] ..s.   471.793308: rcu_barrier: rcu_sched CB cpu -1 remaining 2 # 2

Two of the four callbacks fired, but the other two appear to be AWOL.
And rcu_barrier() won't return until they all fire.

> 	I let it sit through several "hung task" cycles but that was all
> there was for rcu:rcu_barrier.
> 
> 	I should have ftrace with the patch as soon as the kernel is
> done building, then I can try the below patch (I'll start it building
> now).

Sounds very good, looking forward to hearing of the results.

							Thanx, Paul

^ permalink raw reply

* Re: [PATCH net] net/sched: Fix use of wild pointer in mq_destroy() when qdisc_alloc fail
From: Cong Wang @ 2014-10-24 22:17 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: John Fastabend, wang.bo116, David Miller, netdev, cui.yunfeng
In-Reply-To: <20141024214511.GA10290@acer.localdomain>

On Fri, Oct 24, 2014 at 2:45 PM, Patrick McHardy <kaber@trash.net> wrote:
> On Fri, Oct 24, 2014 at 01:52:00PM -0700, Cong Wang wrote:
>> On Fri, Oct 24, 2014 at 12:14 PM, Patrick McHardy <kaber@trash.net> wrote:
>> > On Fri, Oct 24, 2014 at 11:13:56AM -0700, Cong Wang wrote:
>> >> On Fri, Oct 24, 2014 at 10:49 AM, John Fastabend
>> >> <john.fastabend@gmail.com> wrote:
>> >> >
>> >> > Patch looks fine, another way to fix this would be drop the
>> >> > mq_destroy() call in the error path. I'm not convinced one
>> >> > is any better than the other but maybe some other folks have
>> >> > opinions, it seems a bit wrong to call mq_destroy twice so in
>> >> > that sense it may be a bit nicer to drop the mq_destroy().
>> >>
>> >> Dropping mq_destroy() in error path is indeed better,
>> >> because upper layer does cleanup intentionally.
>> >> Look at what other qdisc's do. :)
>> >
>> > I would argue that the qdisc_destroy() call in qdisc_create_dflt()
>> > is wrong, it should instead free the qdisc and release the module
>> > reference manually as done in qdisc_create().
>> >
>> > qdisc_destroy() should only be called for fully initialized qdiscs.
>>
>> Probably, but at least ->destroy() should be called, looking at
>> those calling qdisc_watchdog_init(), they are supposed to call
>> qdisc_watchdog_cancel() when >init() fails after that.
>
> In which cases does it actually fail after that? Usually this is
> called once initialization is complete.

How about tbf_change() in tbf_init()? If tbf_change() fails,
watchdog is still there if we don't call ->destroy(). Yes,
I know the timer is started, the point is we do miss something
clean up, even trivial.

tbf is not the only one who calls xxx_change() in xxx_init().

>
>> ->destroy() is supposed to be able to clean up even partially
>> initialized qdisc's. So, for qdisc_create_dflt() we should probably
>> just call ->destroy().
>
> No, why do you think that? ->init() is supposed to clean up after
> itself if it fails, both qdisc_destroy() and ->destroy are only
> supposed to be called after ->init() succeeded.


Why? This would end up with calling xxx_destroy() in every
failure path in ->init(). Moving it up to caller makes the code cleaner
and smaller.

>
> Its simply symetrical, as everywhere else in the kernel. If a sub-init
> funtion fails, it should clean up and return an error. We don't
> destroy things we've never successfully initialized, they're supposed
> to clean up after themselves.

Most (if not all) ->destroy() are able to clean partially initialized qdisc,
I don't see why it  could be a problem here.

We don't have to keep with other kernel subsystem as long as it makes
sense, net_sched subsystem is pretty much self-contained.

^ permalink raw reply

* Re: [PATCH net-next 2/4] net: bcmgenet: return number of packets completed in TX reclaim
From: Petri Gynther @ 2014-10-24 22:20 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, David Miller
In-Reply-To: <1414180942-2132-3-git-send-email-f.fainelli@gmail.com>

Hi Florian,

On Fri, Oct 24, 2014 at 1:02 PM, Florian Fainelli <f.fainelli@gmail.com> wrote:
> In preparation for reclaiming transmitted buffers in NAPI context,
> update __bcmgenet_tx_reclaim() and bcmgenet_tx_reclaim() to return the
> number of packets completed per call.
>
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>  drivers/net/ethernet/broadcom/genet/bcmgenet.c | 20 +++++++++++++-------
>  1 file changed, 13 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> index ee4d5baf09b6..70f2fb366375 100644
> --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
> @@ -876,14 +876,14 @@ static inline void bcmgenet_tx_ring_int_disable(struct bcmgenet_priv *priv,
>  }
>
>  /* Unlocked version of the reclaim routine */
> -static void __bcmgenet_tx_reclaim(struct net_device *dev,
> -                                 struct bcmgenet_tx_ring *ring)
> +static unsigned int __bcmgenet_tx_reclaim(struct net_device *dev,
> +                                         struct bcmgenet_tx_ring *ring)
>  {
>         struct bcmgenet_priv *priv = netdev_priv(dev);
>         int last_tx_cn, last_c_index, num_tx_bds;
>         struct enet_cb *tx_cb_ptr;
>         struct netdev_queue *txq;
> -       unsigned int bds_compl;
> +       unsigned int bds_compl, pkts_compl = 0;

bcmgenet_desc_rx() uses "rxpktprocessed", so I'd go with "txpktprocessed" here.

>         unsigned int c_index;
>
>         /* Compute how many buffers are transmitted since last xmit call */
> @@ -928,6 +928,7 @@ static void __bcmgenet_tx_reclaim(struct net_device *dev,
>                 }
>                 dev->stats.tx_packets++;
>                 ring->free_bds += bds_compl;
> +               pkts_compl += bds_compl;

This change doesn't look correct. The number of cleaned packets is not
necessarily equal to the number of cleaned TxBDs.

I think that the while-loop should be:

        while (last_tx_cn-- > 0) {
                tx_cb_ptr = ring->cbs + last_c_index;

                if (tx_cb_ptr->skb) {
                        pkts_compl++;
                        dev->stats.tx_packets++;
                        dev->stats.tx_bytes += tx_cb_ptr->skb->len;
                        dma_unmap_single(&dev->dev,
                                         dma_unmap_addr(tx_cb_ptr, dma_addr),
                                         tx_cb_ptr->skb->len,
                                         DMA_TO_DEVICE);
                        bcmgenet_free_cb(tx_cb_ptr);
                } else if (dma_unmap_addr(tx_cb_ptr, dma_addr)) {
                        dev->stats.tx_bytes +=
                                dma_unmap_len(tx_cb_ptr, dma_len);
                        dma_unmap_page(&dev->dev,
                                       dma_unmap_addr(tx_cb_ptr, dma_addr),
                                       dma_unmap_len(tx_cb_ptr, dma_len),
                                       DMA_TO_DEVICE);
                        dma_unmap_addr_set(tx_cb_ptr, dma_addr, 0);
                }
                ring->free_bds++;
                last_c_index++;
                last_c_index &= (num_tx_bds - 1);
        }

>
>                 last_c_index++;
>                 last_c_index &= (num_tx_bds - 1);
> @@ -936,20 +937,25 @@ static void __bcmgenet_tx_reclaim(struct net_device *dev,
>         if (ring->free_bds > (MAX_SKB_FRAGS + 1))
>                 ring->int_disable(priv, ring);
>
> -       if (netif_tx_queue_stopped(txq))
> +       if (netif_tx_queue_stopped(txq) && pkts_compl)

bcmgenet_xmit() stops the Tx queue when:
ring->free_bds <= (MAX_SKB_FRAGS + 1)

So, shouldn't this check be:
netif_tx_queue_stopped(txq) && (ring->free_bds > (MAX_SKB_FRAGS + 1))

Having said that --
Why does the driver stop the Tx queue when there are still
(MAX_SKB_FRAGS + 1) TxBDs left?
It leaves 17 or so TxBDs unused on the Tx ring, and most of the
packets are not fragmented.

I feel that bcmgenet_xmit() should stop the Tx queue only when there
is 1 TxBD left after putting a new packet on the ring.

>                 netif_tx_wake_queue(txq);
>
>         ring->c_index = c_index;
> +
> +       return pkts_compl;
>  }
>
> -static void bcmgenet_tx_reclaim(struct net_device *dev,
> -                               struct bcmgenet_tx_ring *ring)
> +static unsigned int bcmgenet_tx_reclaim(struct net_device *dev,
> +                                       struct bcmgenet_tx_ring *ring)
>  {
>         unsigned long flags;
> +       unsigned int pkts_compl;
>
>         spin_lock_irqsave(&ring->lock, flags);
> -       __bcmgenet_tx_reclaim(dev, ring);
> +       pkts_compl = __bcmgenet_tx_reclaim(dev, ring);
>         spin_unlock_irqrestore(&ring->lock, flags);
> +
> +       return pkts_compl;
>  }
>
>  static void bcmgenet_tx_reclaim_all(struct net_device *dev)
> --
> 1.9.1
>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox