Netdev List
 help / color / mirror / Atom feed
* [PATCH v5 net-next 10/15] nfp: slice .ndo_open() and .ndo_stop() up
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1460054388-471-1-git-send-email-jakub.kicinski@netronome.com>

Divide .ndo_open() and .ndo_stop() into logical, callable
chunks.  No functional changes.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 218 +++++++++++++--------
 1 file changed, 136 insertions(+), 82 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 342335d09fb2..6c1ed8914416 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1672,6 +1672,82 @@ nfp_net_vec_write_ring_data(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
 	nn_writeb(nn, NFP_NET_CFG_TXR_VEC(idx), r_vec->irq_idx);
 }
 
+static int __nfp_net_set_config_and_enable(struct nfp_net *nn)
+{
+	u32 new_ctrl, update = 0;
+	unsigned int r;
+	int err;
+
+	new_ctrl = nn->ctrl;
+
+	if (nn->cap & NFP_NET_CFG_CTRL_RSS) {
+		nfp_net_rss_write_key(nn);
+		nfp_net_rss_write_itbl(nn);
+		nn_writel(nn, NFP_NET_CFG_RSS_CTRL, nn->rss_cfg);
+		update |= NFP_NET_CFG_UPDATE_RSS;
+	}
+
+	if (nn->cap & NFP_NET_CFG_CTRL_IRQMOD) {
+		nfp_net_coalesce_write_cfg(nn);
+
+		new_ctrl |= NFP_NET_CFG_CTRL_IRQMOD;
+		update |= NFP_NET_CFG_UPDATE_IRQMOD;
+	}
+
+	for (r = 0; r < nn->num_r_vecs; r++)
+		nfp_net_vec_write_ring_data(nn, &nn->r_vecs[r], r);
+
+	nn_writeq(nn, NFP_NET_CFG_TXRS_ENABLE, nn->num_tx_rings == 64 ?
+		  0xffffffffffffffffULL : ((u64)1 << nn->num_tx_rings) - 1);
+
+	nn_writeq(nn, NFP_NET_CFG_RXRS_ENABLE, nn->num_rx_rings == 64 ?
+		  0xffffffffffffffffULL : ((u64)1 << nn->num_rx_rings) - 1);
+
+	nfp_net_write_mac_addr(nn, nn->netdev->dev_addr);
+
+	nn_writel(nn, NFP_NET_CFG_MTU, nn->netdev->mtu);
+	nn_writel(nn, NFP_NET_CFG_FLBUFSZ, nn->fl_bufsz);
+
+	/* Enable device */
+	new_ctrl |= NFP_NET_CFG_CTRL_ENABLE;
+	update |= NFP_NET_CFG_UPDATE_GEN;
+	update |= NFP_NET_CFG_UPDATE_MSIX;
+	update |= NFP_NET_CFG_UPDATE_RING;
+	if (nn->cap & NFP_NET_CFG_CTRL_RINGCFG)
+		new_ctrl |= NFP_NET_CFG_CTRL_RINGCFG;
+
+	nn_writel(nn, NFP_NET_CFG_CTRL, new_ctrl);
+	err = nfp_net_reconfig(nn, update);
+
+	nn->ctrl = new_ctrl;
+
+	/* Since reconfiguration requests while NFP is down are ignored we
+	 * have to wipe the entire VXLAN configuration and reinitialize it.
+	 */
+	if (nn->ctrl & NFP_NET_CFG_CTRL_VXLAN) {
+		memset(&nn->vxlan_ports, 0, sizeof(nn->vxlan_ports));
+		memset(&nn->vxlan_usecnt, 0, sizeof(nn->vxlan_usecnt));
+		vxlan_get_rx_port(nn->netdev);
+	}
+
+	return err;
+}
+
+/**
+ * nfp_net_set_config_and_enable() - Write control BAR and enable NFP
+ * @nn:      NFP Net device to reconfigure
+ */
+static int nfp_net_set_config_and_enable(struct nfp_net *nn)
+{
+	int err;
+
+	err = __nfp_net_set_config_and_enable(nn);
+	if (err)
+		nfp_net_clear_config_and_disable(nn);
+
+	return err;
+}
+
 /**
  * nfp_net_start_vec() - Start ring vector
  * @nn:      NFP Net device structure
@@ -1692,20 +1768,33 @@ nfp_net_start_vec(struct nfp_net *nn, struct nfp_net_r_vector *r_vec)
 	enable_irq(irq_vec);
 }
 
+/**
+ * nfp_net_open_stack() - Start the device from stack's perspective
+ * @nn:      NFP Net device to reconfigure
+ */
+static void nfp_net_open_stack(struct nfp_net *nn)
+{
+	unsigned int r;
+
+	for (r = 0; r < nn->num_r_vecs; r++)
+		nfp_net_start_vec(nn, &nn->r_vecs[r]);
+
+	netif_tx_wake_all_queues(nn->netdev);
+
+	enable_irq(nn->irq_entries[NFP_NET_CFG_LSC].vector);
+	nfp_net_read_link_status(nn);
+}
+
 static int nfp_net_netdev_open(struct net_device *netdev)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
 	int err, r;
-	u32 update = 0;
-	u32 new_ctrl;
 
 	if (nn->ctrl & NFP_NET_CFG_CTRL_ENABLE) {
 		nn_err(nn, "Dev is already enabled: 0x%08x\n", nn->ctrl);
 		return -EBUSY;
 	}
 
-	new_ctrl = nn->ctrl;
-
 	/* Step 1: Allocate resources for rings and the like
 	 * - Request interrupts
 	 * - Allocate RX and TX ring resources
@@ -1758,20 +1847,6 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 	if (err)
 		goto err_free_rings;
 
-	if (nn->cap & NFP_NET_CFG_CTRL_RSS) {
-		nfp_net_rss_write_key(nn);
-		nfp_net_rss_write_itbl(nn);
-		nn_writel(nn, NFP_NET_CFG_RSS_CTRL, nn->rss_cfg);
-		update |= NFP_NET_CFG_UPDATE_RSS;
-	}
-
-	if (nn->cap & NFP_NET_CFG_CTRL_IRQMOD) {
-		nfp_net_coalesce_write_cfg(nn);
-
-		new_ctrl |= NFP_NET_CFG_CTRL_IRQMOD;
-		update |= NFP_NET_CFG_UPDATE_IRQMOD;
-	}
-
 	/* Step 2: Configure the NFP
 	 * - Enable rings from 0 to tx_rings/rx_rings - 1.
 	 * - Write MAC address (in case it changed)
@@ -1779,43 +1854,9 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 	 * - Set the Freelist buffer size
 	 * - Enable the FW
 	 */
-	for (r = 0; r < nn->num_r_vecs; r++)
-		nfp_net_vec_write_ring_data(nn, &nn->r_vecs[r], r);
-
-	nn_writeq(nn, NFP_NET_CFG_TXRS_ENABLE, nn->num_tx_rings == 64 ?
-		  0xffffffffffffffffULL : ((u64)1 << nn->num_tx_rings) - 1);
-
-	nn_writeq(nn, NFP_NET_CFG_RXRS_ENABLE, nn->num_rx_rings == 64 ?
-		  0xffffffffffffffffULL : ((u64)1 << nn->num_rx_rings) - 1);
-
-	nfp_net_write_mac_addr(nn, netdev->dev_addr);
-
-	nn_writel(nn, NFP_NET_CFG_MTU, netdev->mtu);
-	nn_writel(nn, NFP_NET_CFG_FLBUFSZ, nn->fl_bufsz);
-
-	/* Enable device */
-	new_ctrl |= NFP_NET_CFG_CTRL_ENABLE;
-	update |= NFP_NET_CFG_UPDATE_GEN;
-	update |= NFP_NET_CFG_UPDATE_MSIX;
-	update |= NFP_NET_CFG_UPDATE_RING;
-	if (nn->cap & NFP_NET_CFG_CTRL_RINGCFG)
-		new_ctrl |= NFP_NET_CFG_CTRL_RINGCFG;
-
-	nn_writel(nn, NFP_NET_CFG_CTRL, new_ctrl);
-	err = nfp_net_reconfig(nn, update);
+	err = nfp_net_set_config_and_enable(nn);
 	if (err)
-		goto err_clear_config;
-
-	nn->ctrl = new_ctrl;
-
-	/* Since reconfiguration requests while NFP is down are ignored we
-	 * have to wipe the entire VXLAN configuration and reinitialize it.
-	 */
-	if (nn->ctrl & NFP_NET_CFG_CTRL_VXLAN) {
-		memset(&nn->vxlan_ports, 0, sizeof(nn->vxlan_ports));
-		memset(&nn->vxlan_usecnt, 0, sizeof(nn->vxlan_usecnt));
-		vxlan_get_rx_port(netdev);
-	}
+		goto err_free_rings;
 
 	/* Step 3: Enable for kernel
 	 * - put some freelist descriptors on each RX ring
@@ -1823,18 +1864,10 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 	 * - enable all TX queues
 	 * - set link state
 	 */
-	for (r = 0; r < nn->num_r_vecs; r++)
-		nfp_net_start_vec(nn, &nn->r_vecs[r]);
-
-	netif_tx_wake_all_queues(netdev);
-
-	enable_irq(nn->irq_entries[NFP_NET_CFG_LSC].vector);
-	nfp_net_read_link_status(nn);
+	nfp_net_open_stack(nn);
 
 	return 0;
 
-err_clear_config:
-	nfp_net_clear_config_and_disable(nn);
 err_free_rings:
 	r = nn->num_r_vecs;
 err_free_prev_vecs:
@@ -1858,36 +1891,31 @@ err_free_exn:
 }
 
 /**
- * nfp_net_netdev_close() - Called when the device is downed
- * @netdev:      netdev structure
+ * nfp_net_close_stack() - Quiesce the stack (part of close)
+ * @nn:	     NFP Net device to reconfigure
  */
-static int nfp_net_netdev_close(struct net_device *netdev)
+static void nfp_net_close_stack(struct nfp_net *nn)
 {
-	struct nfp_net *nn = netdev_priv(netdev);
-	int r;
-
-	if (!(nn->ctrl & NFP_NET_CFG_CTRL_ENABLE)) {
-		nn_err(nn, "Dev is not up: 0x%08x\n", nn->ctrl);
-		return 0;
-	}
+	unsigned int r;
 
-	/* Step 1: Disable RX and TX rings from the Linux kernel perspective
-	 */
 	disable_irq(nn->irq_entries[NFP_NET_CFG_LSC].vector);
-	netif_carrier_off(netdev);
+	netif_carrier_off(nn->netdev);
 	nn->link_up = false;
 
 	for (r = 0; r < nn->num_r_vecs; r++)
 		napi_disable(&nn->r_vecs[r].napi);
 
-	netif_tx_disable(netdev);
+	netif_tx_disable(nn->netdev);
+}
 
-	/* Step 2: Tell NFP
-	 */
-	nfp_net_clear_config_and_disable(nn);
+/**
+ * nfp_net_close_free_all() - Free all runtime resources
+ * @nn:      NFP Net device to reconfigure
+ */
+static void nfp_net_close_free_all(struct nfp_net *nn)
+{
+	unsigned int r;
 
-	/* Step 3: Free resources
-	 */
 	for (r = 0; r < nn->num_r_vecs; r++) {
 		nfp_net_rx_ring_reset(nn->r_vecs[r].rx_ring);
 		nfp_net_rx_ring_bufs_free(nn, nn->r_vecs[r].rx_ring);
@@ -1902,6 +1930,32 @@ static int nfp_net_netdev_close(struct net_device *netdev)
 
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_LSC, NFP_NET_IRQ_LSC_IDX);
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_EXN, NFP_NET_IRQ_EXN_IDX);
+}
+
+/**
+ * nfp_net_netdev_close() - Called when the device is downed
+ * @netdev:      netdev structure
+ */
+static int nfp_net_netdev_close(struct net_device *netdev)
+{
+	struct nfp_net *nn = netdev_priv(netdev);
+
+	if (!(nn->ctrl & NFP_NET_CFG_CTRL_ENABLE)) {
+		nn_err(nn, "Dev is not up: 0x%08x\n", nn->ctrl);
+		return 0;
+	}
+
+	/* Step 1: Disable RX and TX rings from the Linux kernel perspective
+	 */
+	nfp_net_close_stack(nn);
+
+	/* Step 2: Tell NFP
+	 */
+	nfp_net_clear_config_and_disable(nn);
+
+	/* Step 3: Free resources
+	 */
+	nfp_net_close_free_all(nn);
 
 	nn_dbg(nn, "%s down", netdev->name);
 	return 0;
-- 
1.9.1

^ permalink raw reply related

* [PATCH v5 net-next 09/15] nfp: move filling ring information to FW config
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1460054388-471-1-git-send-email-jakub.kicinski@netronome.com>

nfp_net_[rt]x_ring_{alloc,free} should only allocate or free
ring resources without touching the device.  Move setting
parameters in the BAR to separate functions.  This will make
it possible to reuse alloc/free functions to allocate new
rings while the device is running.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 50 ++++++++++++++--------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index a6a917fe8e31..342335d09fb2 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1387,10 +1387,6 @@ static void nfp_net_tx_ring_free(struct nfp_net_tx_ring *tx_ring)
 	struct nfp_net *nn = r_vec->nfp_net;
 	struct pci_dev *pdev = nn->pdev;
 
-	nn_writeq(nn, NFP_NET_CFG_TXR_ADDR(tx_ring->idx), 0);
-	nn_writeb(nn, NFP_NET_CFG_TXR_SZ(tx_ring->idx), 0);
-	nn_writeb(nn, NFP_NET_CFG_TXR_VEC(tx_ring->idx), 0);
-
 	kfree(tx_ring->txbufs);
 
 	if (tx_ring->txds)
@@ -1430,11 +1426,6 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring *tx_ring)
 	if (!tx_ring->txbufs)
 		goto err_alloc;
 
-	/* Write the DMA address, size and MSI-X info to the device */
-	nn_writeq(nn, NFP_NET_CFG_TXR_ADDR(tx_ring->idx), tx_ring->dma);
-	nn_writeb(nn, NFP_NET_CFG_TXR_SZ(tx_ring->idx), ilog2(tx_ring->cnt));
-	nn_writeb(nn, NFP_NET_CFG_TXR_VEC(tx_ring->idx), r_vec->irq_idx);
-
 	netif_set_xps_queue(nn->netdev, &r_vec->affinity_mask, tx_ring->idx);
 
 	nn_dbg(nn, "TxQ%02d: QCidx=%02d cnt=%d dma=%#llx host=%p\n",
@@ -1458,10 +1449,6 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
 	struct nfp_net *nn = r_vec->nfp_net;
 	struct pci_dev *pdev = nn->pdev;
 
-	nn_writeq(nn, NFP_NET_CFG_RXR_ADDR(rx_ring->idx), 0);
-	nn_writeb(nn, NFP_NET_CFG_RXR_SZ(rx_ring->idx), 0);
-	nn_writeb(nn, NFP_NET_CFG_RXR_VEC(rx_ring->idx), 0);
-
 	kfree(rx_ring->rxbufs);
 
 	if (rx_ring->rxds)
@@ -1501,11 +1488,6 @@ static int nfp_net_rx_ring_alloc(struct nfp_net_rx_ring *rx_ring)
 	if (!rx_ring->rxbufs)
 		goto err_alloc;
 
-	/* Write the DMA address, size and MSI-X info to the device */
-	nn_writeq(nn, NFP_NET_CFG_RXR_ADDR(rx_ring->idx), rx_ring->dma);
-	nn_writeb(nn, NFP_NET_CFG_RXR_SZ(rx_ring->idx), ilog2(rx_ring->cnt));
-	nn_writeb(nn, NFP_NET_CFG_RXR_VEC(rx_ring->idx), r_vec->irq_idx);
-
 	nn_dbg(nn, "RxQ%02d: FlQCidx=%02d RxQCidx=%02d cnt=%d dma=%#llx host=%p\n",
 	       rx_ring->idx, rx_ring->fl_qcidx, rx_ring->rx_qcidx,
 	       rx_ring->cnt, (unsigned long long)rx_ring->dma, rx_ring->rxds);
@@ -1630,6 +1612,17 @@ static void nfp_net_write_mac_addr(struct nfp_net *nn, const u8 *mac)
 		  get_unaligned_be16(nn->netdev->dev_addr + 4) << 16);
 }
 
+static void nfp_net_vec_clear_ring_data(struct nfp_net *nn, unsigned int idx)
+{
+	nn_writeq(nn, NFP_NET_CFG_RXR_ADDR(idx), 0);
+	nn_writeb(nn, NFP_NET_CFG_RXR_SZ(idx), 0);
+	nn_writeb(nn, NFP_NET_CFG_RXR_VEC(idx), 0);
+
+	nn_writeq(nn, NFP_NET_CFG_TXR_ADDR(idx), 0);
+	nn_writeb(nn, NFP_NET_CFG_TXR_SZ(idx), 0);
+	nn_writeb(nn, NFP_NET_CFG_TXR_VEC(idx), 0);
+}
+
 /**
  * nfp_net_clear_config_and_disable() - Clear control BAR and disable NFP
  * @nn:      NFP Net device to reconfigure
@@ -1637,6 +1630,7 @@ static void nfp_net_write_mac_addr(struct nfp_net *nn, const u8 *mac)
 static void nfp_net_clear_config_and_disable(struct nfp_net *nn)
 {
 	u32 new_ctrl, update;
+	unsigned int r;
 	int err;
 
 	new_ctrl = nn->ctrl;
@@ -1658,9 +1652,26 @@ static void nfp_net_clear_config_and_disable(struct nfp_net *nn)
 		return;
 	}
 
+	for (r = 0; r < nn->num_r_vecs; r++)
+		nfp_net_vec_clear_ring_data(nn, r);
+
 	nn->ctrl = new_ctrl;
 }
 
+static void
+nfp_net_vec_write_ring_data(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
+			    unsigned int idx)
+{
+	/* Write the DMA address, size and MSI-X info to the device */
+	nn_writeq(nn, NFP_NET_CFG_RXR_ADDR(idx), r_vec->rx_ring->dma);
+	nn_writeb(nn, NFP_NET_CFG_RXR_SZ(idx), ilog2(r_vec->rx_ring->cnt));
+	nn_writeb(nn, NFP_NET_CFG_RXR_VEC(idx), r_vec->irq_idx);
+
+	nn_writeq(nn, NFP_NET_CFG_TXR_ADDR(idx), r_vec->tx_ring->dma);
+	nn_writeb(nn, NFP_NET_CFG_TXR_SZ(idx), ilog2(r_vec->tx_ring->cnt));
+	nn_writeb(nn, NFP_NET_CFG_TXR_VEC(idx), r_vec->irq_idx);
+}
+
 /**
  * nfp_net_start_vec() - Start ring vector
  * @nn:      NFP Net device structure
@@ -1768,6 +1779,9 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 	 * - Set the Freelist buffer size
 	 * - Enable the FW
 	 */
+	for (r = 0; r < nn->num_r_vecs; r++)
+		nfp_net_vec_write_ring_data(nn, &nn->r_vecs[r], r);
+
 	nn_writeq(nn, NFP_NET_CFG_TXRS_ENABLE, nn->num_tx_rings == 64 ?
 		  0xffffffffffffffffULL : ((u64)1 << nn->num_tx_rings) - 1);
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH v5 net-next 08/15] nfp: preallocate RX buffers early in .ndo_open
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1460054388-471-1-git-send-email-jakub.kicinski@netronome.com>

We want .ndo_open() to have the following structure:
 - allocate resources;
 - configure HW/FW;
 - enable the device from the stack's perspective.
Therefore allocating the buffers that fill the RX rings needs
to be moved to the beginning of .ndo_open().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 34 +++++++---------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 0c3c37ad28a4..a6a917fe8e31 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1666,28 +1666,19 @@ static void nfp_net_clear_config_and_disable(struct nfp_net *nn)
  * @nn:      NFP Net device structure
  * @r_vec:   Ring vector to be started
  */
-static int nfp_net_start_vec(struct nfp_net *nn, struct nfp_net_r_vector *r_vec)
+static void
+nfp_net_start_vec(struct nfp_net *nn, struct nfp_net_r_vector *r_vec)
 {
 	unsigned int irq_vec;
-	int err = 0;
 
 	irq_vec = nn->irq_entries[r_vec->irq_idx].vector;
 
 	disable_irq(irq_vec);
 
-	err = nfp_net_rx_ring_bufs_alloc(r_vec->nfp_net, r_vec->rx_ring);
-	if (err) {
-		nn_err(nn, "RV%02d: couldn't allocate enough buffers\n",
-		       r_vec->irq_idx);
-		goto out;
-	}
 	nfp_net_rx_ring_fill_freelist(r_vec->rx_ring);
-
 	napi_enable(&r_vec->napi);
-out:
-	enable_irq(irq_vec);
 
-	return err;
+	enable_irq(irq_vec);
 }
 
 static int nfp_net_netdev_open(struct net_device *netdev)
@@ -1742,6 +1733,10 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 		err = nfp_net_rx_ring_alloc(nn->r_vecs[r].rx_ring);
 		if (err)
 			goto err_free_tx_ring_p;
+
+		err = nfp_net_rx_ring_bufs_alloc(nn, nn->r_vecs[r].rx_ring);
+		if (err)
+			goto err_flush_rx_ring_p;
 	}
 
 	err = netif_set_real_num_tx_queues(netdev, nn->num_tx_rings);
@@ -1814,11 +1809,8 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 	 * - enable all TX queues
 	 * - set link state
 	 */
-	for (r = 0; r < nn->num_r_vecs; r++) {
-		err = nfp_net_start_vec(nn, &nn->r_vecs[r]);
-		if (err)
-			goto err_disable_napi;
-	}
+	for (r = 0; r < nn->num_r_vecs; r++)
+		nfp_net_start_vec(nn, &nn->r_vecs[r]);
 
 	netif_tx_wake_all_queues(netdev);
 
@@ -1827,18 +1819,14 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 
 	return 0;
 
-err_disable_napi:
-	while (r--) {
-		napi_disable(&nn->r_vecs[r].napi);
-		nfp_net_rx_ring_reset(nn->r_vecs[r].rx_ring);
-		nfp_net_rx_ring_bufs_free(nn, nn->r_vecs[r].rx_ring);
-	}
 err_clear_config:
 	nfp_net_clear_config_and_disable(nn);
 err_free_rings:
 	r = nn->num_r_vecs;
 err_free_prev_vecs:
 	while (r--) {
+		nfp_net_rx_ring_bufs_free(nn, nn->r_vecs[r].rx_ring);
+err_flush_rx_ring_p:
 		nfp_net_rx_ring_free(nn->r_vecs[r].rx_ring);
 err_free_tx_ring_p:
 		nfp_net_tx_ring_free(nn->r_vecs[r].tx_ring);
-- 
1.9.1

^ permalink raw reply related

* [PATCH v5 net-next 07/15] nfp: reorganize initial filling of RX rings
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1460054388-471-1-git-send-email-jakub.kicinski@netronome.com>

Separate allocation of buffers from giving them to FW.
Thanks to this it will be possible to move allocation
earlier on the .ndo_open() path and reuse buffers during
runtime reconfiguration.

Similarly to the TX side, clean up the spill of functionality
from flush to freeing the ring.  Unlike on the TX side,
RX ring reset does not free buffers from the ring.
Ring reset means only that FW pointers are zeroed and
buffers on the ring must be placed in [0, cnt - 1)
positions.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 119 ++++++++++++++-------
 1 file changed, 78 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 61f243760ee0..0c3c37ad28a4 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1020,62 +1020,100 @@ static void nfp_net_rx_give_one(struct nfp_net_rx_ring *rx_ring,
 }
 
 /**
- * nfp_net_rx_flush() - Free any buffers currently on the RX ring
- * @rx_ring:  RX ring to remove buffers from
+ * nfp_net_rx_ring_reset() - Reflect in SW state of freelist after disable
+ * @rx_ring:	RX ring structure
  *
- * Assumes that the device is stopped
+ * Warning: Do *not* call if ring buffers were never put on the FW freelist
+ *	    (i.e. device was not enabled)!
  */
-static void nfp_net_rx_flush(struct nfp_net_rx_ring *rx_ring)
+static void nfp_net_rx_ring_reset(struct nfp_net_rx_ring *rx_ring)
 {
-	struct nfp_net *nn = rx_ring->r_vec->nfp_net;
-	struct pci_dev *pdev = nn->pdev;
-	int idx;
+	unsigned int wr_idx, last_idx;
 
-	while (rx_ring->rd_p != rx_ring->wr_p) {
-		idx = rx_ring->rd_p % rx_ring->cnt;
+	/* Move the empty entry to the end of the list */
+	wr_idx = rx_ring->wr_p % rx_ring->cnt;
+	last_idx = rx_ring->cnt - 1;
+	rx_ring->rxbufs[wr_idx].dma_addr = rx_ring->rxbufs[last_idx].dma_addr;
+	rx_ring->rxbufs[wr_idx].skb = rx_ring->rxbufs[last_idx].skb;
+	rx_ring->rxbufs[last_idx].dma_addr = 0;
+	rx_ring->rxbufs[last_idx].skb = NULL;
 
-		if (rx_ring->rxbufs[idx].skb) {
-			dma_unmap_single(&pdev->dev,
-					 rx_ring->rxbufs[idx].dma_addr,
-					 nn->fl_bufsz, DMA_FROM_DEVICE);
-			dev_kfree_skb_any(rx_ring->rxbufs[idx].skb);
-			rx_ring->rxbufs[idx].dma_addr = 0;
-			rx_ring->rxbufs[idx].skb = NULL;
-		}
+	memset(rx_ring->rxds, 0, sizeof(*rx_ring->rxds) * rx_ring->cnt);
+	rx_ring->wr_p = 0;
+	rx_ring->rd_p = 0;
+	rx_ring->wr_ptr_add = 0;
+}
 
-		memset(&rx_ring->rxds[idx], 0, sizeof(rx_ring->rxds[idx]));
+/**
+ * nfp_net_rx_ring_bufs_free() - Free any buffers currently on the RX ring
+ * @nn:		NFP Net device
+ * @rx_ring:	RX ring to remove buffers from
+ *
+ * Assumes that the device is stopped and buffers are in [0, ring->cnt - 1)
+ * entries.  After device is disabled nfp_net_rx_ring_reset() must be called
+ * to restore required ring geometry.
+ */
+static void
+nfp_net_rx_ring_bufs_free(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring)
+{
+	struct pci_dev *pdev = nn->pdev;
+	unsigned int i;
 
-		rx_ring->rd_p++;
+	for (i = 0; i < rx_ring->cnt - 1; i++) {
+		/* NULL skb can only happen when initial filling of the ring
+		 * fails to allocate enough buffers and calls here to free
+		 * already allocated ones.
+		 */
+		if (!rx_ring->rxbufs[i].skb)
+			continue;
+
+		dma_unmap_single(&pdev->dev, rx_ring->rxbufs[i].dma_addr,
+				 nn->fl_bufsz, DMA_FROM_DEVICE);
+		dev_kfree_skb_any(rx_ring->rxbufs[i].skb);
+		rx_ring->rxbufs[i].dma_addr = 0;
+		rx_ring->rxbufs[i].skb = NULL;
 	}
 }
 
 /**
- * nfp_net_rx_fill_freelist() - Attempt filling freelist with RX buffers
- * @rx_ring: RX ring to fill
- *
- * Try to fill as many buffers as possible into freelist.  Return
- * number of buffers added.
- *
- * Return: Number of freelist buffers added.
+ * nfp_net_rx_ring_bufs_alloc() - Fill RX ring with buffers (don't give to FW)
+ * @nn:		NFP Net device
+ * @rx_ring:	RX ring to allocate buffers for
  */
-static int nfp_net_rx_fill_freelist(struct nfp_net_rx_ring *rx_ring)
+static int
+nfp_net_rx_ring_bufs_alloc(struct nfp_net *nn, struct nfp_net_rx_ring *rx_ring)
 {
-	struct sk_buff *skb;
-	dma_addr_t dma_addr;
+	struct nfp_net_rx_buf *rxbufs;
+	unsigned int i;
+
+	rxbufs = rx_ring->rxbufs;
 
-	while (nfp_net_rx_space(rx_ring)) {
-		skb = nfp_net_rx_alloc_one(rx_ring, &dma_addr);
-		if (!skb) {
-			nfp_net_rx_flush(rx_ring);
+	for (i = 0; i < rx_ring->cnt - 1; i++) {
+		rxbufs[i].skb =
+			nfp_net_rx_alloc_one(rx_ring, &rxbufs[i].dma_addr);
+		if (!rxbufs[i].skb) {
+			nfp_net_rx_ring_bufs_free(nn, rx_ring);
 			return -ENOMEM;
 		}
-		nfp_net_rx_give_one(rx_ring, skb, dma_addr);
 	}
 
 	return 0;
 }
 
 /**
+ * nfp_net_rx_ring_fill_freelist() - Give buffers from the ring to FW
+ * @rx_ring: RX ring to fill
+ */
+static void nfp_net_rx_ring_fill_freelist(struct nfp_net_rx_ring *rx_ring)
+{
+	unsigned int i;
+
+	for (i = 0; i < rx_ring->cnt - 1; i++)
+		nfp_net_rx_give_one(rx_ring, rx_ring->rxbufs[i].skb,
+				    rx_ring->rxbufs[i].dma_addr);
+}
+
+/**
  * nfp_net_rx_csum_has_errors() - group check if rxd has any csum errors
  * @flags: RX descriptor flags field in CPU byte order
  */
@@ -1431,10 +1469,6 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
 				  rx_ring->rxds, rx_ring->dma);
 
 	rx_ring->cnt = 0;
-	rx_ring->wr_p = 0;
-	rx_ring->rd_p = 0;
-	rx_ring->wr_ptr_add = 0;
-
 	rx_ring->rxbufs = NULL;
 	rx_ring->rxds = NULL;
 	rx_ring->dma = 0;
@@ -1641,12 +1675,13 @@ static int nfp_net_start_vec(struct nfp_net *nn, struct nfp_net_r_vector *r_vec)
 
 	disable_irq(irq_vec);
 
-	err = nfp_net_rx_fill_freelist(r_vec->rx_ring);
+	err = nfp_net_rx_ring_bufs_alloc(r_vec->nfp_net, r_vec->rx_ring);
 	if (err) {
 		nn_err(nn, "RV%02d: couldn't allocate enough buffers\n",
 		       r_vec->irq_idx);
 		goto out;
 	}
+	nfp_net_rx_ring_fill_freelist(r_vec->rx_ring);
 
 	napi_enable(&r_vec->napi);
 out:
@@ -1795,7 +1830,8 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 err_disable_napi:
 	while (r--) {
 		napi_disable(&nn->r_vecs[r].napi);
-		nfp_net_rx_flush(nn->r_vecs[r].rx_ring);
+		nfp_net_rx_ring_reset(nn->r_vecs[r].rx_ring);
+		nfp_net_rx_ring_bufs_free(nn, nn->r_vecs[r].rx_ring);
 	}
 err_clear_config:
 	nfp_net_clear_config_and_disable(nn);
@@ -1851,7 +1887,8 @@ static int nfp_net_netdev_close(struct net_device *netdev)
 	/* Step 3: Free resources
 	 */
 	for (r = 0; r < nn->num_r_vecs; r++) {
-		nfp_net_rx_flush(nn->r_vecs[r].rx_ring);
+		nfp_net_rx_ring_reset(nn->r_vecs[r].rx_ring);
+		nfp_net_rx_ring_bufs_free(nn, nn->r_vecs[r].rx_ring);
 		nfp_net_tx_ring_reset(nn, nn->r_vecs[r].tx_ring);
 		nfp_net_rx_ring_free(nn->r_vecs[r].rx_ring);
 		nfp_net_tx_ring_free(nn->r_vecs[r].tx_ring);
-- 
1.9.1

^ permalink raw reply related

* [PATCH v5 net-next 06/15] nfp: cleanup tx ring flush and rename to reset
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1460054388-471-1-git-send-email-jakub.kicinski@netronome.com>

Since we never used flush without freeing the ring later,
the functionality of the two operations is mixed.
Rename flush to ring reset and move all the things which
have to be done after FW ring state is cleared there.
While at it, do some clean-ups.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 81 ++++++++++------------
 1 file changed, 37 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 66fab7162b7c..61f243760ee0 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -867,61 +867,59 @@ static void nfp_net_tx_complete(struct nfp_net_tx_ring *tx_ring)
 }
 
 /**
- * nfp_net_tx_flush() - Free any untransmitted buffers currently on the TX ring
- * @tx_ring:     TX ring structure
+ * nfp_net_tx_ring_reset() - Free any untransmitted buffers and reset pointers
+ * @nn:		NFP Net device
+ * @tx_ring:	TX ring structure
  *
  * Assumes that the device is stopped
  */
-static void nfp_net_tx_flush(struct nfp_net_tx_ring *tx_ring)
+static void
+nfp_net_tx_ring_reset(struct nfp_net *nn, struct nfp_net_tx_ring *tx_ring)
 {
-	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
-	struct nfp_net *nn = r_vec->nfp_net;
-	struct pci_dev *pdev = nn->pdev;
 	const struct skb_frag_struct *frag;
 	struct netdev_queue *nd_q;
-	struct sk_buff *skb;
-	int nr_frags;
-	int fidx;
-	int idx;
+	struct pci_dev *pdev = nn->pdev;
 
 	while (tx_ring->rd_p != tx_ring->wr_p) {
-		idx = tx_ring->rd_p % tx_ring->cnt;
+		int nr_frags, fidx, idx;
+		struct sk_buff *skb;
 
+		idx = tx_ring->rd_p % tx_ring->cnt;
 		skb = tx_ring->txbufs[idx].skb;
-		if (skb) {
-			nr_frags = skb_shinfo(skb)->nr_frags;
-			fidx = tx_ring->txbufs[idx].fidx;
-
-			if (fidx == -1) {
-				/* unmap head */
-				dma_unmap_single(&pdev->dev,
-						 tx_ring->txbufs[idx].dma_addr,
-						 skb_headlen(skb),
-						 DMA_TO_DEVICE);
-			} else {
-				/* unmap fragment */
-				frag = &skb_shinfo(skb)->frags[fidx];
-				dma_unmap_page(&pdev->dev,
-					       tx_ring->txbufs[idx].dma_addr,
-					       skb_frag_size(frag),
-					       DMA_TO_DEVICE);
-			}
-
-			/* check for last gather fragment */
-			if (fidx == nr_frags - 1)
-				dev_kfree_skb_any(skb);
-
-			tx_ring->txbufs[idx].dma_addr = 0;
-			tx_ring->txbufs[idx].skb = NULL;
-			tx_ring->txbufs[idx].fidx = -2;
+		nr_frags = skb_shinfo(skb)->nr_frags;
+		fidx = tx_ring->txbufs[idx].fidx;
+
+		if (fidx == -1) {
+			/* unmap head */
+			dma_unmap_single(&pdev->dev,
+					 tx_ring->txbufs[idx].dma_addr,
+					 skb_headlen(skb), DMA_TO_DEVICE);
+		} else {
+			/* unmap fragment */
+			frag = &skb_shinfo(skb)->frags[fidx];
+			dma_unmap_page(&pdev->dev,
+				       tx_ring->txbufs[idx].dma_addr,
+				       skb_frag_size(frag), DMA_TO_DEVICE);
 		}
 
-		memset(&tx_ring->txds[idx], 0, sizeof(tx_ring->txds[idx]));
+		/* check for last gather fragment */
+		if (fidx == nr_frags - 1)
+			dev_kfree_skb_any(skb);
+
+		tx_ring->txbufs[idx].dma_addr = 0;
+		tx_ring->txbufs[idx].skb = NULL;
+		tx_ring->txbufs[idx].fidx = -2;
 
 		tx_ring->qcp_rd_p++;
 		tx_ring->rd_p++;
 	}
 
+	memset(tx_ring->txds, 0, sizeof(*tx_ring->txds) * tx_ring->cnt);
+	tx_ring->wr_p = 0;
+	tx_ring->rd_p = 0;
+	tx_ring->qcp_rd_p = 0;
+	tx_ring->wr_ptr_add = 0;
+
 	nd_q = netdev_get_tx_queue(nn->netdev, tx_ring->idx);
 	netdev_tx_reset_queue(nd_q);
 }
@@ -1362,11 +1360,6 @@ static void nfp_net_tx_ring_free(struct nfp_net_tx_ring *tx_ring)
 				  tx_ring->txds, tx_ring->dma);
 
 	tx_ring->cnt = 0;
-	tx_ring->wr_p = 0;
-	tx_ring->rd_p = 0;
-	tx_ring->qcp_rd_p = 0;
-	tx_ring->wr_ptr_add = 0;
-
 	tx_ring->txbufs = NULL;
 	tx_ring->txds = NULL;
 	tx_ring->dma = 0;
@@ -1859,7 +1852,7 @@ static int nfp_net_netdev_close(struct net_device *netdev)
 	 */
 	for (r = 0; r < nn->num_r_vecs; r++) {
 		nfp_net_rx_flush(nn->r_vecs[r].rx_ring);
-		nfp_net_tx_flush(nn->r_vecs[r].tx_ring);
+		nfp_net_tx_ring_reset(nn, nn->r_vecs[r].tx_ring);
 		nfp_net_rx_ring_free(nn->r_vecs[r].rx_ring);
 		nfp_net_tx_ring_free(nn->r_vecs[r].tx_ring);
 		nfp_net_cleanup_vector(nn, &nn->r_vecs[r]);
-- 
1.9.1

^ permalink raw reply related

* [PATCH v5 net-next 05/15] nfp: allocate ring SW structs dynamically
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1460054388-471-1-git-send-email-jakub.kicinski@netronome.com>

To be able to switch rings more easily on config changes,
allocate them dynamically, separately from the nfp_net structure.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h       |  6 ++---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 28 +++++++++++++++++-----
 .../net/ethernet/netronome/nfp/nfp_net_debugfs.c   | 20 +++++++++-------
 3 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 75683fb26734..fc005c982b7d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -472,6 +472,9 @@ struct nfp_net {
 
 	u32 rx_offset;
 
+	struct nfp_net_tx_ring *tx_rings;
+	struct nfp_net_rx_ring *rx_rings;
+
 #ifdef CONFIG_PCI_IOV
 	unsigned int num_vfs;
 	struct vf_data_storage *vfinfo;
@@ -504,9 +507,6 @@ struct nfp_net {
 	int txd_cnt;
 	int rxd_cnt;
 
-	struct nfp_net_tx_ring tx_rings[NFP_NET_MAX_TX_RINGS];
-	struct nfp_net_rx_ring rx_rings[NFP_NET_MAX_RX_RINGS];
-
 	u8 num_irqs;
 	u8 num_r_vecs;
 	struct nfp_net_r_vector r_vecs[NFP_NET_MAX_TX_RINGS];
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 7cd20fcd631a..66fab7162b7c 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -413,12 +413,6 @@ static void nfp_net_irqs_assign(struct net_device *netdev)
 		r_vec->irq_idx = NFP_NET_NON_Q_VECTORS + r;
 
 		cpumask_set_cpu(r, &r_vec->affinity_mask);
-
-		r_vec->tx_ring = &nn->tx_rings[r];
-		nfp_net_tx_ring_init(r_vec->tx_ring, r_vec, r);
-
-		r_vec->rx_ring = &nn->rx_rings[r];
-		nfp_net_rx_ring_init(r_vec->rx_ring, r_vec, r);
 	}
 }
 
@@ -1503,6 +1497,12 @@ nfp_net_prepare_vector(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
 	struct msix_entry *entry = &nn->irq_entries[r_vec->irq_idx];
 	int err;
 
+	r_vec->tx_ring = &nn->tx_rings[idx];
+	nfp_net_tx_ring_init(r_vec->tx_ring, r_vec, idx);
+
+	r_vec->rx_ring = &nn->rx_rings[idx];
+	nfp_net_rx_ring_init(r_vec->rx_ring, r_vec, idx);
+
 	snprintf(r_vec->name, sizeof(r_vec->name),
 		 "%s-rxtx-%d", nn->netdev->name, idx);
 	err = request_irq(entry->vector, r_vec->handler, 0, r_vec->name, r_vec);
@@ -1693,6 +1693,15 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 		goto err_free_exn;
 	disable_irq(nn->irq_entries[NFP_NET_CFG_LSC].vector);
 
+	nn->rx_rings = kcalloc(nn->num_rx_rings, sizeof(*nn->rx_rings),
+			       GFP_KERNEL);
+	if (!nn->rx_rings)
+		goto err_free_lsc;
+	nn->tx_rings = kcalloc(nn->num_tx_rings, sizeof(*nn->tx_rings),
+			       GFP_KERNEL);
+	if (!nn->tx_rings)
+		goto err_free_rx_rings;
+
 	for (r = 0; r < nn->num_r_vecs; r++) {
 		err = nfp_net_prepare_vector(nn, &nn->r_vecs[r], r);
 		if (err)
@@ -1807,6 +1816,10 @@ err_free_tx_ring_p:
 err_cleanup_vec_p:
 		nfp_net_cleanup_vector(nn, &nn->r_vecs[r]);
 	}
+	kfree(nn->tx_rings);
+err_free_rx_rings:
+	kfree(nn->rx_rings);
+err_free_lsc:
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_LSC, NFP_NET_IRQ_LSC_IDX);
 err_free_exn:
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_EXN, NFP_NET_IRQ_EXN_IDX);
@@ -1852,6 +1865,9 @@ static int nfp_net_netdev_close(struct net_device *netdev)
 		nfp_net_cleanup_vector(nn, &nn->r_vecs[r]);
 	}
 
+	kfree(nn->rx_rings);
+	kfree(nn->tx_rings);
+
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_LSC, NFP_NET_IRQ_LSC_IDX);
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_EXN, NFP_NET_IRQ_EXN_IDX);
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c b/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c
index 4c97c713121c..f86a1f13d27b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_debugfs.c
@@ -40,8 +40,9 @@ static struct dentry *nfp_dir;
 
 static int nfp_net_debugfs_rx_q_read(struct seq_file *file, void *data)
 {
-	struct nfp_net_rx_ring *rx_ring = file->private;
 	int fl_rd_p, fl_wr_p, rx_rd_p, rx_wr_p, rxd_cnt;
+	struct nfp_net_r_vector *r_vec = file->private;
+	struct nfp_net_rx_ring *rx_ring;
 	struct nfp_net_rx_desc *rxd;
 	struct sk_buff *skb;
 	struct nfp_net *nn;
@@ -49,9 +50,10 @@ static int nfp_net_debugfs_rx_q_read(struct seq_file *file, void *data)
 
 	rtnl_lock();
 
-	if (!rx_ring->r_vec || !rx_ring->r_vec->nfp_net)
+	if (!r_vec->nfp_net || !r_vec->rx_ring)
 		goto out;
-	nn = rx_ring->r_vec->nfp_net;
+	nn = r_vec->nfp_net;
+	rx_ring = r_vec->rx_ring;
 	if (!netif_running(nn->netdev))
 		goto out;
 
@@ -115,7 +117,8 @@ static const struct file_operations nfp_rx_q_fops = {
 
 static int nfp_net_debugfs_tx_q_read(struct seq_file *file, void *data)
 {
-	struct nfp_net_tx_ring *tx_ring = file->private;
+	struct nfp_net_r_vector *r_vec = file->private;
+	struct nfp_net_tx_ring *tx_ring;
 	struct nfp_net_tx_desc *txd;
 	int d_rd_p, d_wr_p, txd_cnt;
 	struct sk_buff *skb;
@@ -124,9 +127,10 @@ static int nfp_net_debugfs_tx_q_read(struct seq_file *file, void *data)
 
 	rtnl_lock();
 
-	if (!tx_ring->r_vec || !tx_ring->r_vec->nfp_net)
+	if (!r_vec->nfp_net || !r_vec->tx_ring)
 		goto out;
-	nn = tx_ring->r_vec->nfp_net;
+	nn = r_vec->nfp_net;
+	tx_ring = r_vec->tx_ring;
 	if (!netif_running(nn->netdev))
 		goto out;
 
@@ -207,13 +211,13 @@ void nfp_net_debugfs_adapter_add(struct nfp_net *nn)
 	for (i = 0; i < nn->num_rx_rings; i++) {
 		sprintf(int_name, "%d", i);
 		debugfs_create_file(int_name, S_IRUSR, rx,
-				    &nn->rx_rings[i], &nfp_rx_q_fops);
+				    &nn->r_vecs[i], &nfp_rx_q_fops);
 	}
 
 	for (i = 0; i < nn->num_tx_rings; i++) {
 		sprintf(int_name, "%d", i);
 		debugfs_create_file(int_name, S_IRUSR, tx,
-				    &nn->tx_rings[i], &nfp_tx_q_fops);
+				    &nn->r_vecs[i], &nfp_tx_q_fops);
 	}
 }
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH v5 net-next 04/15] nfp: make *x_ring_init do all the init
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1460054388-471-1-git-send-email-jakub.kicinski@netronome.com>

nfp_net_[rt]x_ring_init functions used to be called from probe
path only and some of their functionality was spilled to the
call site.  In order to reuse them for ring reconfiguration
we need them to do all the init.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 28 ++++++++++++++--------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 8692587904c5..7cd20fcd631a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -347,12 +347,18 @@ static irqreturn_t nfp_net_irq_exn(int irq, void *data)
 /**
  * nfp_net_tx_ring_init() - Fill in the boilerplate for a TX ring
  * @tx_ring:  TX ring structure
+ * @r_vec:    IRQ vector servicing this ring
+ * @idx:      Ring index
  */
-static void nfp_net_tx_ring_init(struct nfp_net_tx_ring *tx_ring)
+static void
+nfp_net_tx_ring_init(struct nfp_net_tx_ring *tx_ring,
+		     struct nfp_net_r_vector *r_vec, unsigned int idx)
 {
-	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
 	struct nfp_net *nn = r_vec->nfp_net;
 
+	tx_ring->idx = idx;
+	tx_ring->r_vec = r_vec;
+
 	tx_ring->qcidx = tx_ring->idx * nn->stride_tx;
 	tx_ring->qcp_q = nn->tx_bar + NFP_QCP_QUEUE_OFF(tx_ring->qcidx);
 }
@@ -360,12 +366,18 @@ static void nfp_net_tx_ring_init(struct nfp_net_tx_ring *tx_ring)
 /**
  * nfp_net_rx_ring_init() - Fill in the boilerplate for a RX ring
  * @rx_ring:  RX ring structure
+ * @r_vec:    IRQ vector servicing this ring
+ * @idx:      Ring index
  */
-static void nfp_net_rx_ring_init(struct nfp_net_rx_ring *rx_ring)
+static void
+nfp_net_rx_ring_init(struct nfp_net_rx_ring *rx_ring,
+		     struct nfp_net_r_vector *r_vec, unsigned int idx)
 {
-	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
 	struct nfp_net *nn = r_vec->nfp_net;
 
+	rx_ring->idx = idx;
+	rx_ring->r_vec = r_vec;
+
 	rx_ring->fl_qcidx = rx_ring->idx * nn->stride_rx;
 	rx_ring->rx_qcidx = rx_ring->fl_qcidx + (nn->stride_rx - 1);
 
@@ -403,14 +415,10 @@ static void nfp_net_irqs_assign(struct net_device *netdev)
 		cpumask_set_cpu(r, &r_vec->affinity_mask);
 
 		r_vec->tx_ring = &nn->tx_rings[r];
-		nn->tx_rings[r].idx = r;
-		nn->tx_rings[r].r_vec = r_vec;
-		nfp_net_tx_ring_init(r_vec->tx_ring);
+		nfp_net_tx_ring_init(r_vec->tx_ring, r_vec, r);
 
 		r_vec->rx_ring = &nn->rx_rings[r];
-		nn->rx_rings[r].idx = r;
-		nn->rx_rings[r].r_vec = r_vec;
-		nfp_net_rx_ring_init(r_vec->rx_ring);
+		nfp_net_rx_ring_init(r_vec->rx_ring, r_vec, r);
 	}
 }
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH v5 net-next 03/15] nfp: break up nfp_net_{alloc|free}_rings
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1460054388-471-1-git-send-email-jakub.kicinski@netronome.com>

nfp_net_{alloc|free}_rings contained a strange mix of allocations
and vector initialization.  Remove it, declare vector init as
a separate function and handle allocations explicitly.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 126 ++++++++-------------
 1 file changed, 47 insertions(+), 79 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 5da1199e7afb..8692587904c5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1488,91 +1488,40 @@ err_alloc:
 	return -ENOMEM;
 }
 
-static void __nfp_net_free_rings(struct nfp_net *nn, unsigned int n_free)
+static int
+nfp_net_prepare_vector(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
+		       int idx)
 {
-	struct nfp_net_r_vector *r_vec;
-	struct msix_entry *entry;
+	struct msix_entry *entry = &nn->irq_entries[r_vec->irq_idx];
+	int err;
 
-	while (n_free--) {
-		r_vec = &nn->r_vecs[n_free];
-		entry = &nn->irq_entries[r_vec->irq_idx];
+	snprintf(r_vec->name, sizeof(r_vec->name),
+		 "%s-rxtx-%d", nn->netdev->name, idx);
+	err = request_irq(entry->vector, r_vec->handler, 0, r_vec->name, r_vec);
+	if (err) {
+		nn_err(nn, "Error requesting IRQ %d\n", entry->vector);
+		return err;
+	}
 
-		nfp_net_rx_ring_free(r_vec->rx_ring);
-		nfp_net_tx_ring_free(r_vec->tx_ring);
+	/* Setup NAPI */
+	netif_napi_add(nn->netdev, &r_vec->napi,
+		       nfp_net_poll, NAPI_POLL_WEIGHT);
 
-		irq_set_affinity_hint(entry->vector, NULL);
-		free_irq(entry->vector, r_vec);
+	irq_set_affinity_hint(entry->vector, &r_vec->affinity_mask);
 
-		netif_napi_del(&r_vec->napi);
-	}
-}
+	nn_dbg(nn, "RV%02d: irq=%03d/%03d\n", idx, entry->vector, entry->entry);
 
-/**
- * nfp_net_free_rings() - Free all ring resources
- * @nn:      NFP Net device to reconfigure
- */
-static void nfp_net_free_rings(struct nfp_net *nn)
-{
-	__nfp_net_free_rings(nn, nn->num_r_vecs);
+	return 0;
 }
 
-/**
- * nfp_net_alloc_rings() - Allocate resources for RX and TX rings
- * @nn:      NFP Net device to reconfigure
- *
- * Return: 0 on success or negative errno on error.
- */
-static int nfp_net_alloc_rings(struct nfp_net *nn)
+static void
+nfp_net_cleanup_vector(struct nfp_net *nn, struct nfp_net_r_vector *r_vec)
 {
-	struct nfp_net_r_vector *r_vec;
-	struct msix_entry *entry;
-	int err;
-	int r;
+	struct msix_entry *entry = &nn->irq_entries[r_vec->irq_idx];
 
-	for (r = 0; r < nn->num_r_vecs; r++) {
-		r_vec = &nn->r_vecs[r];
-		entry = &nn->irq_entries[r_vec->irq_idx];
-
-		/* Setup NAPI */
-		netif_napi_add(nn->netdev, &r_vec->napi,
-			       nfp_net_poll, NAPI_POLL_WEIGHT);
-
-		snprintf(r_vec->name, sizeof(r_vec->name),
-			 "%s-rxtx-%d", nn->netdev->name, r);
-		err = request_irq(entry->vector, r_vec->handler, 0,
-				  r_vec->name, r_vec);
-		if (err) {
-			nn_dbg(nn, "Error requesting IRQ %d\n", entry->vector);
-			goto err_napi_del;
-		}
-
-		irq_set_affinity_hint(entry->vector, &r_vec->affinity_mask);
-
-		nn_dbg(nn, "RV%02d: irq=%03d/%03d\n",
-		       r, entry->vector, entry->entry);
-
-		/* Allocate TX ring resources */
-		err = nfp_net_tx_ring_alloc(r_vec->tx_ring);
-		if (err)
-			goto err_free_irq;
-
-		/* Allocate RX ring resources */
-		err = nfp_net_rx_ring_alloc(r_vec->rx_ring);
-		if (err)
-			goto err_free_tx;
-	}
-
-	return 0;
-
-err_free_tx:
-	nfp_net_tx_ring_free(r_vec->tx_ring);
-err_free_irq:
 	irq_set_affinity_hint(entry->vector, NULL);
-	free_irq(entry->vector, r_vec);
-err_napi_del:
 	netif_napi_del(&r_vec->napi);
-	__nfp_net_free_rings(nn, r);
-	return err;
+	free_irq(entry->vector, r_vec);
 }
 
 /**
@@ -1736,9 +1685,19 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 		goto err_free_exn;
 	disable_irq(nn->irq_entries[NFP_NET_CFG_LSC].vector);
 
-	err = nfp_net_alloc_rings(nn);
-	if (err)
-		goto err_free_lsc;
+	for (r = 0; r < nn->num_r_vecs; r++) {
+		err = nfp_net_prepare_vector(nn, &nn->r_vecs[r], r);
+		if (err)
+			goto err_free_prev_vecs;
+
+		err = nfp_net_tx_ring_alloc(nn->r_vecs[r].tx_ring);
+		if (err)
+			goto err_cleanup_vec_p;
+
+		err = nfp_net_rx_ring_alloc(nn->r_vecs[r].rx_ring);
+		if (err)
+			goto err_free_tx_ring_p;
+	}
 
 	err = netif_set_real_num_tx_queues(netdev, nn->num_tx_rings);
 	if (err)
@@ -1831,8 +1790,15 @@ err_disable_napi:
 err_clear_config:
 	nfp_net_clear_config_and_disable(nn);
 err_free_rings:
-	nfp_net_free_rings(nn);
-err_free_lsc:
+	r = nn->num_r_vecs;
+err_free_prev_vecs:
+	while (r--) {
+		nfp_net_rx_ring_free(nn->r_vecs[r].rx_ring);
+err_free_tx_ring_p:
+		nfp_net_tx_ring_free(nn->r_vecs[r].tx_ring);
+err_cleanup_vec_p:
+		nfp_net_cleanup_vector(nn, &nn->r_vecs[r]);
+	}
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_LSC, NFP_NET_IRQ_LSC_IDX);
 err_free_exn:
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_EXN, NFP_NET_IRQ_EXN_IDX);
@@ -1873,9 +1839,11 @@ static int nfp_net_netdev_close(struct net_device *netdev)
 	for (r = 0; r < nn->num_r_vecs; r++) {
 		nfp_net_rx_flush(nn->r_vecs[r].rx_ring);
 		nfp_net_tx_flush(nn->r_vecs[r].tx_ring);
+		nfp_net_rx_ring_free(nn->r_vecs[r].rx_ring);
+		nfp_net_tx_ring_free(nn->r_vecs[r].tx_ring);
+		nfp_net_cleanup_vector(nn, &nn->r_vecs[r]);
 	}
 
-	nfp_net_free_rings(nn);
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_LSC, NFP_NET_IRQ_LSC_IDX);
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_EXN, NFP_NET_IRQ_EXN_IDX);
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH v5 net-next 01/15] nfp: correct RX buffer length calculation
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1460054388-471-1-git-send-email-jakub.kicinski@netronome.com>

When calculating the RX buffer length we need to account
for up to 2 VLAN tags.  Rounding up to 1k is a relic of
a distant past and can be removed.  While at it also remove
trivial print statement.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 43c618bafdb6..0dae81454e77 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1911,9 +1911,6 @@ static void nfp_net_set_rx_mode(struct net_device *netdev)
 static int nfp_net_change_mtu(struct net_device *netdev, int new_mtu)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
-	u32 tmp;
-
-	nn_dbg(nn, "New MTU = %d\n", new_mtu);
 
 	if (new_mtu < 68 || new_mtu > nn->max_mtu) {
 		nn_err(nn, "New MTU (%d) is not valid\n", new_mtu);
@@ -1921,10 +1918,7 @@ static int nfp_net_change_mtu(struct net_device *netdev, int new_mtu)
 	}
 
 	netdev->mtu = new_mtu;
-
-	/* Freelist buffer size rounded up to the nearest 1K */
-	tmp = new_mtu + ETH_HLEN + VLAN_HLEN + NFP_NET_MAX_PREPEND;
-	nn->fl_bufsz = roundup(tmp, 1024);
+	nn->fl_bufsz = NFP_NET_MAX_PREPEND + ETH_HLEN + VLAN_HLEN * 2 + new_mtu;
 
 	/* restart if running */
 	if (netif_running(netdev)) {
-- 
1.9.1

^ permalink raw reply related

* [PATCH v5 net-next 02/15] nfp: move link state interrupt request/free calls
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski
In-Reply-To: <1460054388-471-1-git-send-email-jakub.kicinski@netronome.com>

We need to be able to disable the link state interrupt when
the device is brought down.  We used to just free the IRQ
at the beginning of .ndo_stop().  As we now move towards
more ordered .ndo_open()/.ndo_stop() paths LSC allocation
should be placed in the "allocate resource" section.

Since the IRQ can't be freed early in .ndo_stop(), it is
disabled instead.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 23 +++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 0dae81454e77..5da1199e7afb 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1729,10 +1729,16 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 				      NFP_NET_IRQ_EXN_IDX, nn->exn_handler);
 	if (err)
 		return err;
+	err = nfp_net_aux_irq_request(nn, NFP_NET_CFG_LSC, "%s-lsc",
+				      nn->lsc_name, sizeof(nn->lsc_name),
+				      NFP_NET_IRQ_LSC_IDX, nn->lsc_handler);
+	if (err)
+		goto err_free_exn;
+	disable_irq(nn->irq_entries[NFP_NET_CFG_LSC].vector);
 
 	err = nfp_net_alloc_rings(nn);
 	if (err)
-		goto err_free_exn;
+		goto err_free_lsc;
 
 	err = netif_set_real_num_tx_queues(netdev, nn->num_tx_rings);
 	if (err)
@@ -1812,19 +1818,11 @@ static int nfp_net_netdev_open(struct net_device *netdev)
 
 	netif_tx_wake_all_queues(netdev);
 
-	err = nfp_net_aux_irq_request(nn, NFP_NET_CFG_LSC, "%s-lsc",
-				      nn->lsc_name, sizeof(nn->lsc_name),
-				      NFP_NET_IRQ_LSC_IDX, nn->lsc_handler);
-	if (err)
-		goto err_stop_tx;
+	enable_irq(nn->irq_entries[NFP_NET_CFG_LSC].vector);
 	nfp_net_read_link_status(nn);
 
 	return 0;
 
-err_stop_tx:
-	netif_tx_disable(netdev);
-	for (r = 0; r < nn->num_r_vecs; r++)
-		nfp_net_tx_flush(nn->r_vecs[r].tx_ring);
 err_disable_napi:
 	while (r--) {
 		napi_disable(&nn->r_vecs[r].napi);
@@ -1834,6 +1832,8 @@ err_clear_config:
 	nfp_net_clear_config_and_disable(nn);
 err_free_rings:
 	nfp_net_free_rings(nn);
+err_free_lsc:
+	nfp_net_aux_irq_free(nn, NFP_NET_CFG_LSC, NFP_NET_IRQ_LSC_IDX);
 err_free_exn:
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_EXN, NFP_NET_IRQ_EXN_IDX);
 	return err;
@@ -1855,7 +1855,7 @@ static int nfp_net_netdev_close(struct net_device *netdev)
 
 	/* Step 1: Disable RX and TX rings from the Linux kernel perspective
 	 */
-	nfp_net_aux_irq_free(nn, NFP_NET_CFG_LSC, NFP_NET_IRQ_LSC_IDX);
+	disable_irq(nn->irq_entries[NFP_NET_CFG_LSC].vector);
 	netif_carrier_off(netdev);
 	nn->link_up = false;
 
@@ -1876,6 +1876,7 @@ static int nfp_net_netdev_close(struct net_device *netdev)
 	}
 
 	nfp_net_free_rings(nn);
+	nfp_net_aux_irq_free(nn, NFP_NET_CFG_LSC, NFP_NET_IRQ_LSC_IDX);
 	nfp_net_aux_irq_free(nn, NFP_NET_CFG_EXN, NFP_NET_IRQ_EXN_IDX);
 
 	nn_dbg(nn, "%s down", netdev->name);
-- 
1.9.1

^ permalink raw reply related

* [PATCH v5 net-next 00/15] MTU/buffer reconfig changes
From: Jakub Kicinski @ 2016-04-07 18:39 UTC (permalink / raw)
  To: netdev; +Cc: Jakub Kicinski

Hi!

I re-discussed MPLS/MTU internally, dropped it from patch 1,
re-tested everything, found out I forgot about debugfs pointers,
fixed that as well.

v5:
 - don't reserve space in RX buffers for MPLS label stack
   (patch 1);
 - fix debugfs pointers to ring structures (patch 5).
v4:
 - cut down on unrelated patches;
 - don't "close" the device on error path.

--- v4 cover letter

Previous series included some not entirely related patches,
this one is cut down.  Main issue I'm trying to solve here
is that .ndo_change_mtu() in nfpvf driver is doing full
close/open to reallocate buffers - which if open fails
can result in device being basically closed even though
the interface is started.  As suggested by you I try to move
towards a paradigm where the resources are allocated first
and the MTU change is only done once I'm certain (almost)
nothing can fail.  Almost because I need to communicate 
with FW and that can always time out.

Patch 1 fixes small issue.  Next 10 patches reorganize things
so that I can easily allocate new rings and sets of buffers
while the device is running.  Patches 13 and 15 reshape the
.ndo_change_mtu() and ethtool's ring-resize operation into
desired form.


Jakub Kicinski (15):
  nfp: correct RX buffer length calculation
  nfp: move link state interrupt request/free calls
  nfp: break up nfp_net_{alloc|free}_rings
  nfp: make *x_ring_init do all the init
  nfp: allocate ring SW structs dynamically
  nfp: cleanup tx ring flush and rename to reset
  nfp: reorganize initial filling of RX rings
  nfp: preallocate RX buffers early in .ndo_open
  nfp: move filling ring information to FW config
  nfp: slice .ndo_open() and .ndo_stop() up
  nfp: sync ring state during FW reconfiguration
  nfp: propagate list buffer size in struct rx_ring
  nfp: convert .ndo_change_mtu() to prepare/commit paradigm
  nfp: pass ring count as function parameter
  nfp: allow ring size reconfiguration at runtime

 drivers/net/ethernet/netronome/nfp/nfp_net.h       |  10 +-
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 903 ++++++++++++++-------
 .../net/ethernet/netronome/nfp/nfp_net_debugfs.c   |  20 +-
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  30 +-
 4 files changed, 627 insertions(+), 336 deletions(-)

-- 
1.9.1

^ permalink raw reply

* Re: [RFC PATCH net 3/4] ipv6: datagram: Update dst cache of a connected datagram sk during pmtu update
From: Cong Wang @ 2016-04-07 18:37 UTC (permalink / raw)
  To: Martin KaFai Lau; +Cc: netdev, Eric Dumazet, Wei Wang, Kernel Team
In-Reply-To: <20160406184851.GA14894@kafai-mba.local>

On Wed, Apr 6, 2016 at 11:49 AM, Martin KaFai Lau <kafai@fb.com> wrote:
> On Wed, Apr 06, 2016 at 10:58:23AM -0700, Cong Wang wrote:
>> On Tue, Apr 5, 2016 at 5:11 PM, Martin KaFai Lau <kafai@fb.com> wrote:
>> > On Mon, Apr 04, 2016 at 01:45:02PM -0700, Cong Wang wrote:
>> >> I see your point, but calling __ip6_datagram_connect() seems overkill
>> >> here, we don't need to update so many things in the pmtu update context,
>> >> at least IPv4 doesn't do that either. I don't think you have to do that.
>> >>
>> >> So why just updating the dst cache (also some addr cache) here is not
>> >> enough?
>> > I am not sure I understand.  I could be missing something.
>> >
>> > This patch uses ip6_datagram_dst_update() to do the route lookup and
>> > sk->sk_dst_cache update.  ip6_datagram_dst_update() is
>> > created in the first two refactoring patches and is also used by
>> > __ip6_datagram_connect().
>> >
>> > Which operations in ip6_datagram_dst_update() could be saved
>> > during the pmtu update?
>>
>> I thought you call the same ip6_datagram_dst_update() for both
>> pmtu update and __ip6_datagram_connect(), but you actually skip
>> some sk operations for pmtu case, which means you don't need
>> to worry about parallel ip6_datagram_connect().
>>
>> IPv6 UDP sendmsg() path stores the dst without sock lock anyway,
>> we don't cope with a concurrent connect() on another cpu.
> A parallel sendmsg and connect could be an issue.  The user is connecting
> to a new dest while another parallel sendmsg is sending to (could be the old
> dest, new dest or somewhere between old and new dest?)
>
> However, it is the userland making and it will be another patch if we want
> to protect this case too.

Yeah, it is a different problem, but no one complains about it yet.

>
> In pmtu update, the kernel is doing the lookup and update without
> userland being aware of it.
>
>> But still, I don't see this is a problem here, because even if we store
>> an obsolete address in cache, it would be corrected later.
> The sendmsg() path will correct it (relookup and update sk_dst_cache) but not
> the getsockopt(IPV6_MTU) path which is what this patch is trying to fix: Update
> a _valid_ dst to sk->sk_dst_cache.

You are lost in discussion, I never object to update sk_dst_cache, what
we disagree here is merely if we need to lock the sock in pmtu update
context.

I still think it is okay without the lock, because even if you take the lock,
the pmtu update could still happen after you release it, so there is no
essential difference here. The only reason I can think of for taking
the sock lock is protecting parallel pmtu update, but it looks safe for
this case too.

So which case do you want to protect by taking the sock lock?

^ permalink raw reply

* Re: [PATCH net-next 0/7] sctp: support sctp_diag in kernel
From: Marcelo Ricardo Leitner @ 2016-04-07 18:34 UTC (permalink / raw)
  To: Xin Long; +Cc: network dev, linux-sctp, Vlad Yasevich, daniel, davem
In-Reply-To: <cover.1459829123.git.lucien.xin@gmail.com>

On Tue, Apr 05, 2016 at 12:06:25PM +0800, Xin Long wrote:
> This patchset will add sctp_diag module to implement diag interface on
> sctp in kernel.
...

Other than the const thing and the point noticed by Neil which need
to be fixed, patchset looks good to me.

  Marcelo

^ permalink raw reply

* Re: veth regression with "don’t modify ip_summed; doing so treats packets with bad checksums as good."
From: Ben Greear @ 2016-04-07 18:32 UTC (permalink / raw)
  To: Vijay Pandurangan; +Cc: Cong Wang, netdev, Evan Jones, Cong Wang
In-Reply-To: <CAKUBDd_2sC45FW72g1nWzM0k0+EeBUk0LoOMzVwuBfLWHUh-KA@mail.gmail.com>

On 04/07/2016 08:11 AM, Vijay Pandurangan wrote:
> On Fri, Mar 25, 2016 at 7:46 PM, Ben Greear <greearb@candelatech.com> wrote:
>> A real NIC can either do hardware checksums, or it cannot.  If it
>> cannot, then the host must do it on the CPU for both transmit and
>> receive.
>>
>> Veth is not a real NIC, and it cannot do hardware checksum offloading.
>>
>> So, we either lie and pretend it does, or we eat massive amounts
>> of CPU usage to calculate and check checksums when sending across
>> a veth pair.
>>
>
> That's a good point. Does anyone know what the overhead actually is these days?

You could try setting up a system with ixgbe or similar, and then manually
disable csum offload using ethtool, and see how that performs in comparison
to hardware offload?

>> But, if I am purposely corrupting a frame destined for veth, then the only
>> reason
>> I would want the stack to check the checksums is if I were testing my own
>> stack's checksum logic, and that seems to be a pretty limited use.
>
>
> In the common case you're 100% right.  OTOH, there's something
> disconcerting about an abstraction layer lying and behaving
> unexpectedly.  Most traffic that originates on a machine can have its
> checksums safely ignored.  Whatever the reason is (maybe, as you say
> you're testing checksums – on the other hand maybe there's a bug in
> your code somewhere), I really feel like we should try to figure out a
> way to ensure that this optimization is at the very least opt-in…

I'm fine with allowing a user to force software-csum on veth devices
if someone wants to code that up, but forcing sw-csum for local frames
on veth devices should be disabled by default.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

* Re: How to get creatior PID information for the local tcp connection
From: Eric Dumazet @ 2016-04-07 18:31 UTC (permalink / raw)
  To: Vishnu Pratap Singh; +Cc: dccp, linux-kernel, netdev, vishnu.ps@samsung.com
In-Reply-To: <1460053602.6473.415.camel@edumazet-glaptop3.roam.corp.google.com>

On Thu, 2016-04-07 at 11:26 -0700, Eric Dumazet wrote:
> On Thu, 2016-04-07 at 23:01 +0530, Vishnu Pratap Singh wrote:
> > Hi,
> > 
> > 
> > Issue -  How to get PID information for the local tcp connection
> > 
> > 
> > 
> > i want to get the creator PID for each socket in user space for local
> > tcp connection, i see in kernel there is support for returning PID with
> > "SO_PEERCRED" ioctl to work across namespaces. it uses struct pid and
> > struct cred to store the peer credentials on struct sock.
> > cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); Above
> > function stores the PID information in ucred->pid = pid_vnr(pid); and
> > same is returned via "SO_PEERCRED" ioctl .
> > 
> > But for local tcp connection i get pid as 0, is there any way i can
> > get the PID information. Any help or suggestion will be highly
> > helpful.
> > 
> > 
> 
> man 7 socket
> 
>        SO_PEERCRED
>               Return the credentials of the foreign  process  connected  to  this  socket.
>               This  is  possible  only  for  connected  AF_UNIX stream sockets and AF_UNIX
>               stream and datagram socket pairs created using socketpair(2);  see  unix(7).
>               The  returned  credentials  are those that were in effect at the time of the
>               call to connect(2) or socketpair(2).  The argument  is  a  ucred  structure;
>               define  the  GNU_SOURCE  feature test macro to obtain the definition of that
>               structure from <sys/socket.h>.  This socket option is read-only.
> 

Sorry, I hit "Send" too fast.

This is not implemented for TCP yet.

You'll have to take a look at iproute2 package, since "ss -tp" is able
to find this information, by looking at all /proc/{pid}/fd/*  files and
the socket inode number the kernel gives through inet_diag

Not scalable if you have millions of sockets...

^ permalink raw reply

* Re: [PATCH net-next 5/7] sctp: reuse the some transport traversal functions in proc
From: Marcelo Ricardo Leitner @ 2016-04-07 18:29 UTC (permalink / raw)
  To: Neil Horman
  Cc: Xin Long, network dev, linux-sctp, Vlad Yasevich, daniel, davem
In-Reply-To: <20160407130930.GA4573@hmsreliant.think-freely.org>

On Thu, Apr 07, 2016 at 09:09:30AM -0400, Neil Horman wrote:
> On Tue, Apr 05, 2016 at 12:06:30PM +0800, Xin Long wrote:
> > There are some transport traversal functions for sctp_diag; we can also
> > use them for sctp_proc, because it has a similar need to traverse
> > transports.
> > 
> > Signed-off-by: Xin Long <lucien.xin@gmail.com>
> > ---
> >  net/sctp/proc.c | 80 +++++++++++++--------------------------------------------
> >  1 file changed, 18 insertions(+), 62 deletions(-)
> > 
> > diff --git a/net/sctp/proc.c b/net/sctp/proc.c
> > index 5cfac8d..dd8492f 100644
> > --- a/net/sctp/proc.c
> > +++ b/net/sctp/proc.c
> > @@ -282,80 +282,31 @@ struct sctp_ht_iter {
> >  	struct rhashtable_iter hti;
> >  };
> >  
> > -static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq)
> > -{
> > -	struct sctp_ht_iter *iter = seq->private;
> > -	struct sctp_transport *t;
> > -
> > -	t = rhashtable_walk_next(&iter->hti);
> > -	for (; t; t = rhashtable_walk_next(&iter->hti)) {
> > -		if (IS_ERR(t)) {
> > -			if (PTR_ERR(t) == -EAGAIN)
> > -				continue;
> > -			break;
> > -		}
> > -
> > -		if (net_eq(sock_net(t->asoc->base.sk), seq_file_net(seq)) &&
> > -		    t->asoc->peer.primary_path == t)
> > -			break;
> > -	}
> > -
> > -	return t;
> > -}
> > -
> 
> this may just be a nit, but you defined the new sctp_transport_get_next in patch
> 2 of this series, and didn't remove this private version until here.  Is that
> going to cause some behavioral issue, if someone builds a kernel between patch 2

Yes, it causes issues:

...net/sctp/proc.c:285:31: error: conflicting types for ‘sctp_transport_get_next’
 static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq)
                               ^

> and 7?  Seems like perhaps those two patches should be merged.

Agreed.

  Marcelo

^ permalink raw reply

* Re: How to get creatior PID information for the local tcp connection
From: Eric Dumazet @ 2016-04-07 18:26 UTC (permalink / raw)
  To: Vishnu Pratap Singh; +Cc: dccp, linux-kernel, netdev, vishnu.ps@samsung.com
In-Reply-To: <CADCeU4NfNucPCqemSxr4SsMUgdOhxzM41zjsRDKfrhsFyMH0+Q@mail.gmail.com>

On Thu, 2016-04-07 at 23:01 +0530, Vishnu Pratap Singh wrote:
> Hi,
> 
> 
> Issue -  How to get PID information for the local tcp connection
> 
> 
> 
> i want to get the creator PID for each socket in user space for local
> tcp connection, i see in kernel there is support for returning PID with
> "SO_PEERCRED" ioctl to work across namespaces. it uses struct pid and
> struct cred to store the peer credentials on struct sock.
> cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); Above
> function stores the PID information in ucred->pid = pid_vnr(pid); and
> same is returned via "SO_PEERCRED" ioctl .
> 
> But for local tcp connection i get pid as 0, is there any way i can
> get the PID information. Any help or suggestion will be highly
> helpful.
> 
> 

man 7 socket

       SO_PEERCRED
              Return the credentials of the foreign  process  connected  to  this  socket.
              This  is  possible  only  for  connected  AF_UNIX stream sockets and AF_UNIX
              stream and datagram socket pairs created using socketpair(2);  see  unix(7).
              The  returned  credentials  are those that were in effect at the time of the
              call to connect(2) or socketpair(2).  The argument  is  a  ucred  structure;
              define  the  GNU_SOURCE  feature test macro to obtain the definition of that
              structure from <sys/socket.h>.  This socket option is read-only.

^ permalink raw reply

* [PATCH] net-next: mediatek: add support for IRQ grouping
From: John Crispin @ 2016-04-07 18:24 UTC (permalink / raw)
  To: David S. Miller
  Cc: Felix Fietkau, Matthias Brugger, netdev, linux-mediatek,
	linux-kernel, John Crispin

The ethernet core has 3 IRQs. Using the IRQ grouping registers we are able
to separate TX and RX IRQs, which allows us to service them on separate
cores. This patch splits the IRQ handler into 2 separate functions, one
for TX and another for RX. The TX housekeeping is split out of the NAPI
handler. Instead we use a tasklet to handle housekeeping.

Signed-off-by: John Crispin <blogic@openwrt.org>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c |  115 +++++++++++++++++----------
 drivers/net/ethernet/mediatek/mtk_eth_soc.h |   12 ++-
 2 files changed, 86 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 8163047..6387516 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -756,7 +756,7 @@ drop:
 }
 
 static int mtk_poll_rx(struct napi_struct *napi, int budget,
-		       struct mtk_eth *eth, u32 rx_intr)
+		       struct mtk_eth *eth)
 {
 	struct mtk_rx_ring *ring = &eth->rx_ring;
 	int idx = ring->calc_idx;
@@ -842,12 +842,12 @@ release_desc:
 	}
 
 	if (done < budget)
-		mtk_w32(eth, rx_intr, MTK_QMTK_INT_STATUS);
+		mtk_w32(eth, MTK_RX_DONE_INT, MTK_QMTK_INT_STATUS);
 
 	return done;
 }
 
-static int mtk_poll_tx(struct mtk_eth *eth, int budget, bool *tx_again)
+static int mtk_poll_tx(struct mtk_eth *eth, int budget)
 {
 	struct mtk_tx_ring *ring = &eth->tx_ring;
 	struct mtk_tx_dma *desc;
@@ -910,9 +910,7 @@ static int mtk_poll_tx(struct mtk_eth *eth, int budget, bool *tx_again)
 	}
 
 	/* read hw index again make sure no new tx packet */
-	if (cpu != dma || cpu != mtk_r32(eth, MTK_QTX_DRX_PTR))
-		*tx_again = true;
-	else
+	if (cpu == dma && cpu == mtk_r32(eth, MTK_QTX_DRX_PTR))
 		mtk_w32(eth, MTK_TX_DONE_INT, MTK_QMTK_INT_STATUS);
 
 	if (!total)
@@ -924,27 +922,27 @@ static int mtk_poll_tx(struct mtk_eth *eth, int budget, bool *tx_again)
 	return total;
 }
 
+static void mtk_clean_tx_tasklet(unsigned long arg)
+{
+	struct mtk_eth *eth = (struct mtk_eth *)arg;
+
+	if (mtk_poll_tx(eth, MTK_NAPI_WEIGHT) > 0)
+		tasklet_schedule(&eth->tx_clean_tasklet);
+	else
+		mtk_irq_enable(eth, MTK_TX_DONE_INT);
+}
+
 static int mtk_poll(struct napi_struct *napi, int budget)
 {
 	struct mtk_eth *eth = container_of(napi, struct mtk_eth, rx_napi);
-	u32 status, status2, mask, tx_intr, rx_intr, status_intr;
-	int tx_done, rx_done;
-	bool tx_again = false;
+	u32 status, status2, mask, status_intr;
+	int rx_done = 0;
 
 	status = mtk_r32(eth, MTK_QMTK_INT_STATUS);
 	status2 = mtk_r32(eth, MTK_INT_STATUS2);
-	tx_intr = MTK_TX_DONE_INT;
-	rx_intr = MTK_RX_DONE_INT;
 	status_intr = (MTK_GDM1_AF | MTK_GDM2_AF);
-	tx_done = 0;
-	rx_done = 0;
-	tx_again = 0;
 
-	if (status & tx_intr)
-		tx_done = mtk_poll_tx(eth, budget, &tx_again);
-
-	if (status & rx_intr)
-		rx_done = mtk_poll_rx(napi, budget, eth, rx_intr);
+	rx_done = mtk_poll_rx(napi, budget, eth);
 
 	if (unlikely(status2 & status_intr)) {
 		mtk_stats_update(eth);
@@ -953,20 +951,20 @@ static int mtk_poll(struct napi_struct *napi, int budget)
 
 	if (unlikely(netif_msg_intr(eth))) {
 		mask = mtk_r32(eth, MTK_QDMA_INT_MASK);
-		netdev_info(eth->netdev[0],
-			    "done tx %d, rx %d, intr 0x%08x/0x%x\n",
-			    tx_done, rx_done, status, mask);
+		dev_info(eth->dev,
+			 "done rx %d, intr 0x%08x/0x%x\n",
+			 rx_done, status, mask);
 	}
 
-	if (tx_again || rx_done == budget)
+	if (rx_done == budget)
 		return budget;
 
 	status = mtk_r32(eth, MTK_QMTK_INT_STATUS);
-	if (status & (tx_intr | rx_intr))
+	if (status & MTK_RX_DONE_INT)
 		return budget;
 
 	napi_complete(napi);
-	mtk_irq_enable(eth, tx_intr | rx_intr);
+	mtk_irq_enable(eth, MTK_RX_DONE_INT);
 
 	return rx_done;
 }
@@ -1195,22 +1193,43 @@ static void mtk_tx_timeout(struct net_device *dev)
 	schedule_work(&eth->pending_work);
 }
 
-static irqreturn_t mtk_handle_irq(int irq, void *_eth)
+static irqreturn_t mtk_handle_irq_rx(int irq, void *_eth)
 {
 	struct mtk_eth *eth = _eth;
 	u32 status;
 
 	status = mtk_r32(eth, MTK_QMTK_INT_STATUS);
+	status &= ~MTK_TX_DONE_INT;
+
 	if (unlikely(!status))
 		return IRQ_NONE;
 
-	if (likely(status & (MTK_RX_DONE_INT | MTK_TX_DONE_INT))) {
+	if (status & MTK_RX_DONE_INT) {
 		if (likely(napi_schedule_prep(&eth->rx_napi)))
 			__napi_schedule(&eth->rx_napi);
-	} else {
-		mtk_w32(eth, status, MTK_QMTK_INT_STATUS);
+		mtk_irq_disable(eth, MTK_RX_DONE_INT);
 	}
-	mtk_irq_disable(eth, (MTK_RX_DONE_INT | MTK_TX_DONE_INT));
+	mtk_w32(eth, status, MTK_QMTK_INT_STATUS);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mtk_handle_irq_tx(int irq, void *_eth)
+{
+	struct mtk_eth *eth = _eth;
+	u32 status;
+
+	status = mtk_r32(eth, MTK_QMTK_INT_STATUS);
+	status &= ~MTK_RX_DONE_INT;
+
+	if (unlikely(!status))
+		return IRQ_NONE;
+
+	if (status & MTK_TX_DONE_INT) {
+		tasklet_schedule(&eth->tx_clean_tasklet);
+		mtk_irq_disable(eth, MTK_TX_DONE_INT);
+	}
+	mtk_w32(eth, status, MTK_QMTK_INT_STATUS);
 
 	return IRQ_HANDLED;
 }
@@ -1223,7 +1242,7 @@ static void mtk_poll_controller(struct net_device *dev)
 	u32 int_mask = MTK_TX_DONE_INT | MTK_RX_DONE_INT;
 
 	mtk_irq_disable(eth, int_mask);
-	mtk_handle_irq(dev->irq, dev);
+	mtk_handle_irq(dev->irq[0], dev);
 	mtk_irq_enable(eth, int_mask);
 }
 #endif
@@ -1344,7 +1363,11 @@ static int __init mtk_hw_init(struct mtk_eth *eth)
 	/* Enable RX VLan Offloading */
 	mtk_w32(eth, 1, MTK_CDMP_EG_CTRL);
 
-	err = devm_request_irq(eth->dev, eth->irq, mtk_handle_irq, 0,
+	err = devm_request_irq(eth->dev, eth->irq[1], mtk_handle_irq_tx, 0,
+			       dev_name(eth->dev), eth);
+	if (err)
+		return err;
+	err = devm_request_irq(eth->dev, eth->irq[2], mtk_handle_irq_rx, 0,
 			       dev_name(eth->dev), eth);
 	if (err)
 		return err;
@@ -1360,7 +1383,11 @@ static int __init mtk_hw_init(struct mtk_eth *eth)
 	mtk_w32(eth, 0, MTK_RST_GL);
 
 	/* FE int grouping */
-	mtk_w32(eth, 0, MTK_FE_INT_GRP);
+	mtk_w32(eth, MTK_TX_DONE_INT, MTK_PDMA_INT_GRP1);
+	mtk_w32(eth, MTK_RX_DONE_INT, MTK_PDMA_INT_GRP2);
+	mtk_w32(eth, MTK_TX_DONE_INT, MTK_QDMA_INT_GRP1);
+	mtk_w32(eth, MTK_RX_DONE_INT, MTK_QDMA_INT_GRP2);
+	mtk_w32(eth, 0x21021000, MTK_FE_INT_GRP);
 
 	for (i = 0; i < 2; i++) {
 		u32 val = mtk_r32(eth, MTK_GDMA_FWD_CFG(i));
@@ -1408,7 +1435,9 @@ static void mtk_uninit(struct net_device *dev)
 	phy_disconnect(mac->phy_dev);
 	mtk_mdio_cleanup(eth);
 	mtk_irq_disable(eth, ~0);
-	free_irq(dev->irq, dev);
+	free_irq(eth->irq[0], dev);
+	free_irq(eth->irq[1], dev);
+	free_irq(eth->irq[2], dev);
 }
 
 static int mtk_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
@@ -1682,10 +1711,10 @@ static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np)
 		dev_err(eth->dev, "error bringing up device\n");
 		goto free_netdev;
 	}
-	eth->netdev[id]->irq = eth->irq;
+	eth->netdev[id]->irq = eth->irq[0];
 	netif_info(eth, probe, eth->netdev[id],
 		   "mediatek frame engine at 0x%08lx, irq %d\n",
-		   eth->netdev[id]->base_addr, eth->netdev[id]->irq);
+		   eth->netdev[id]->base_addr, eth->irq[0]);
 
 	return 0;
 
@@ -1702,6 +1731,7 @@ static int mtk_probe(struct platform_device *pdev)
 	struct mtk_soc_data *soc;
 	struct mtk_eth *eth;
 	int err;
+	int i;
 
 	match = of_match_device(of_mtk_match, &pdev->dev);
 	soc = (struct mtk_soc_data *)match->data;
@@ -1736,10 +1766,12 @@ static int mtk_probe(struct platform_device *pdev)
 		return PTR_ERR(eth->rstc);
 	}
 
-	eth->irq = platform_get_irq(pdev, 0);
-	if (eth->irq < 0) {
-		dev_err(&pdev->dev, "no IRQ resource found\n");
-		return -ENXIO;
+	for (i = 0; i < 3; i++) {
+		eth->irq[i] = platform_get_irq(pdev, i);
+		if (eth->irq[i] < 0) {
+			dev_err(&pdev->dev, "no IRQ%d resource found\n", i);
+			return -ENXIO;
+		}
 	}
 
 	eth->clk_ethif = devm_clk_get(&pdev->dev, "ethif");
@@ -1783,6 +1815,9 @@ static int mtk_probe(struct platform_device *pdev)
 	netif_napi_add(&eth->dummy_dev, &eth->rx_napi, mtk_poll,
 		       MTK_NAPI_WEIGHT);
 
+	tasklet_init(&eth->tx_clean_tasklet,
+		     mtk_clean_tx_tasklet, (unsigned long)eth);
+
 	platform_set_drvdata(pdev, eth);
 
 	return 0;
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
index eed626d..4cfb40c 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -68,6 +68,10 @@
 /* Unicast Filter MAC Address Register - High */
 #define MTK_GDMA_MAC_ADRH(x)	(0x50C + (x * 0x1000))
 
+/* PDMA Interrupt grouping registers */
+#define MTK_PDMA_INT_GRP1	0xa50
+#define MTK_PDMA_INT_GRP2	0xa54
+
 /* QDMA TX Queue Configuration Registers */
 #define MTK_QTX_CFG(x)		(0x1800 + (x * 0x10))
 #define QDMA_RES_THRES		4
@@ -124,6 +128,11 @@
 #define MTK_TX_DONE_INT		(MTK_TX_DONE_INT0 | MTK_TX_DONE_INT1 | \
 				 MTK_TX_DONE_INT2 | MTK_TX_DONE_INT3)
 
+/* QDMA Interrupt grouping registers */
+#define MTK_QDMA_INT_GRP1	0x1a20
+#define MTK_QDMA_INT_GRP2	0x1a24
+#define MTK_RLS_DONE_INT	BIT(0)
+
 /* QDMA Interrupt Status Register */
 #define MTK_QDMA_INT_MASK	0x1A1C
 
@@ -374,7 +383,7 @@ struct mtk_eth {
 	struct net_device		dummy_dev;
 	struct net_device		*netdev[MTK_MAX_DEVS];
 	struct mtk_mac			*mac[MTK_MAX_DEVS];
-	int				irq;
+	int				irq[3];
 	u32				msg_enable;
 	unsigned long			sysclk;
 	struct regmap			*ethsys;
@@ -391,6 +400,7 @@ struct mtk_eth {
 	struct clk			*clk_gp2;
 	struct mii_bus			*mii_bus;
 	struct work_struct		pending_work;
+	struct tasklet_struct		tx_clean_tasklet;
 };
 
 /* struct mtk_mac -	the structure that holds the info about the MACs of the
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH net-next 3/7] sctp: export some functions for sctp_diag in inet_diag
From: Marcelo Ricardo Leitner @ 2016-04-07 18:17 UTC (permalink / raw)
  To: Xin Long; +Cc: network dev, linux-sctp, Vlad Yasevich, daniel, davem
In-Reply-To: <d16e3f12f689f28d8c3b62c7c2a15916177bd7c0.1459829123.git.lucien.xin@gmail.com>

On Tue, Apr 05, 2016 at 12:06:28PM +0800, Xin Long wrote:
> inet_diag_msg_common_fill is used to fill the diag msg common info,
> we need to use it in sctp_diag as well, so export it.
> 
> We also add inet_diag_get_handler() to access inet_diag_table in sctp
> diag.
> 
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---
>  net/ipv4/inet_diag.c | 9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
> index bd591eb..29121a6 100644
> --- a/net/ipv4/inet_diag.c
> +++ b/net/ipv4/inet_diag.c
> @@ -66,7 +66,13 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
>  	mutex_unlock(&inet_diag_table_mutex);
>  }
>  
> -static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
> +struct inet_diag_handler *inet_diag_get_handler(int proto)

It needs to return it as const, as inet_diag_table is also declared as
const, so we don't lose the qualifier.

> +{
> +	return inet_diag_table[proto];
> +}
> +EXPORT_SYMBOL_GPL(inet_diag_get_handler);
> +
> +void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
>  {
>  	r->idiag_family = sk->sk_family;
>  
> @@ -89,6 +95,7 @@ static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
>  	r->id.idiag_dst[0] = sk->sk_daddr;
>  	}
>  }
> +EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
>  
>  static size_t inet_sk_attr_size(void)
>  {
> -- 
> 2.1.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* [PATCH net-next] net: ipv6: Use passed in table for nexthop lookups
From: David Ahern @ 2016-04-07 18:11 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern

Similar to 3bfd847203c6 ("net: Use passed in table for nexthop lookups")
for IPv4, if the route spec contains a table id use that to lookup the
next hop first and fall back to a full lookup if it fails (per the fix
4c9bcd117918b ("net: Fix nexthop lookups")).

Example:

    root@kenny:~# ip -6 ro ls table red
    local 2100:1::1 dev lo  proto none  metric 0  pref medium
    2100:1::/120 dev eth1  proto kernel  metric 256  pref medium
    local 2100:2::1 dev lo  proto none  metric 0  pref medium
    2100:2::/120 dev eth2  proto kernel  metric 256  pref medium
    local fe80::e0:f9ff:fe09:3cac dev lo  proto none  metric 0  pref medium
    local fe80::e0:f9ff:fe1c:b974 dev lo  proto none  metric 0  pref medium
    fe80::/64 dev eth1  proto kernel  metric 256  pref medium
    fe80::/64 dev eth2  proto kernel  metric 256  pref medium
    ff00::/8 dev red  metric 256  pref medium
    ff00::/8 dev eth1  metric 256  pref medium
    ff00::/8 dev eth2  metric 256  pref medium
    unreachable default dev lo  metric 240  error -113 pref medium

    root@kenny:~# ip -6 ro add table red 2100:3::/64 via 2100:1::64
    RTNETLINK answers: No route to host

Route add fails even though 2100:1::64 is a reachable next hop:
    root@kenny:~# ping6 -I red  2100:1::64
    ping6: Warning: source address might be selected on device other than red.
    PING 2100:1::64(2100:1::64) from 2100:1::1 red: 56 data bytes
    64 bytes from 2100:1::64: icmp_seq=1 ttl=64 time=1.33 ms

With this patch:
    root@kenny:~# ip -6 ro add table red 2100:3::/64 via 2100:1::64
    root@kenny:~# ip -6 ro ls table red
    local 2100:1::1 dev lo  proto none  metric 0  pref medium
    2100:1::/120 dev eth1  proto kernel  metric 256  pref medium
    local 2100:2::1 dev lo  proto none  metric 0  pref medium
    2100:2::/120 dev eth2  proto kernel  metric 256  pref medium
    2100:3::/64 via 2100:1::64 dev eth1  metric 1024  pref medium
    local fe80::e0:f9ff:fe09:3cac dev lo  proto none  metric 0  pref medium
    local fe80::e0:f9ff:fe1c:b974 dev lo  proto none  metric 0  pref medium
    fe80::/64 dev eth1  proto kernel  metric 256  pref medium
    fe80::/64 dev eth2  proto kernel  metric 256  pref medium
    ff00::/8 dev red  metric 256  pref medium
    ff00::/8 dev eth1  metric 256  pref medium
    ff00::/8 dev eth2  metric 256  pref medium
    unreachable default dev lo  metric 240  error -113 pref medium

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 net/ipv6/route.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1d8871a5ed20..3e699dc199f3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1928,7 +1928,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
 		rt->rt6i_gateway = *gw_addr;
 
 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
-			struct rt6_info *grt;
+			struct rt6_info *grt = NULL;
 
 			/* IPv6 strictly inhibits using not link-local
 			   addresses as nexthop address.
@@ -1940,7 +1940,38 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
 			if (!(gwa_type & IPV6_ADDR_UNICAST))
 				goto out;
 
-			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
+			if (cfg->fc_table) {
+				struct flowi6 fl6 = {
+					.flowi6_oif = cfg->fc_ifindex,
+					.daddr = *gw_addr,
+					.saddr = cfg->fc_prefsrc,
+				};
+				struct fib6_table *table;
+				int flags = 0;
+
+				err = -EHOSTUNREACH;
+				table = fib6_get_table(net, cfg->fc_table);
+				if (!table)
+					goto out;
+
+				if (!ipv6_addr_any(&cfg->fc_prefsrc))
+					flags |= RT6_LOOKUP_F_HAS_SADDR;
+
+				grt = ip6_pol_route(net, table, cfg->fc_ifindex,
+						    &fl6, flags);
+
+				/* if table lookup failed, fall back
+				 * to full lookup
+				 */
+				if (grt == net->ipv6.ip6_null_entry) {
+					ip6_rt_put(grt);
+					grt = NULL;
+				}
+			}
+
+			if (!grt)
+				grt = rt6_lookup(net, gw_addr, NULL,
+						 cfg->fc_ifindex, 1);
 
 			err = -EHOSTUNREACH;
 			if (!grt)
-- 
2.1.4

^ permalink raw reply related

* [PATCH] net: vrf: Fix dev refcnt leak due to IPv6 prefix route
From: David Ahern @ 2016-04-07 18:10 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern

ifupdown2 found a kernel bug with IPv6 routes and movement from the main
table to the VRF table. Sequence of events:

Create the interface and add addresses:
    ip link add dev eth4.105 link eth4 type vlan id 105
    ip addr add dev eth4.105 8.105.105.10/24
    ip -6 addr add dev eth4.105 2008:105:105::10/64

At this point IPv6 has inserted a prefix route in the main table even
though the interface is 'down'. From there the VRF device is created:
    ip link add dev vrf105 type vrf table 105
    ip addr add dev vrf105 9.9.105.10/32
    ip -6 addr add dev vrf105 2000:9:105::10/128
    ip link set vrf105 up

Then the interface is enslaved, while still in the 'down' state:
    ip link set dev eth4.105 master vrf105

Since the device is down, the VRF driver cycling the device does not
send the NETDEV_UP and NETDEV_DOWN events but rather the NETDEV_CHANGE
event, which does not flush the routes inserted prior.

When the link is brought up
    ip link set dev eth4.105 up

the prefix route is added to the VRF table, but the route is not
removed from the main table.

Fix by handling the NETDEV_CHANGEUPPER event similar to what was implemented
for IPv4 in 7f49e7a38b77 ("net: Flush local routes when device changes vrf
association")

Fixes: 35402e3136634 ("net: Add IPv6 support to VRF device")

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 net/ipv6/addrconf.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 27aed1afcf81..2db2116d3e6b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3255,6 +3255,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_notifier_changeupper_info *info;
 	struct inet6_dev *idev = __in6_dev_get(dev);
 	int run_pending = 0;
 	int err;
@@ -3413,6 +3414,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 		if (idev)
 			addrconf_type_change(dev, event);
 		break;
+
+	case NETDEV_CHANGEUPPER:
+		info = ptr;
+
+		/* flush all routes if dev is linked to or unlinked from
+		 * an L3 master device (e.g., VRF)
+		 */
+		if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+			addrconf_ifdown(dev, 0);
 	}
 
 	return NOTIFY_OK;
-- 
2.1.4

^ permalink raw reply related

* [PATCH] net: vrf: Fix dst reference counting
From: David Ahern @ 2016-04-07 18:10 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern

Vivek reported a kernel exception deleting a VRF with an active
connection through it. The root cause is that the socket has a cached
reference to a dst that is destroyed. Converting the dst_destroy to
dst_release and letting proper reference counting kick in does not
work as the dst has a reference to the device which needs to be released
as well.

I talked to Hannes about this at netdev and he pointed out the ipv4 and
ipv6 dst handling has dst_ifdown for just this scenario. Rather than
continuing with the reinvented dst wheel in VRF just remove it and
leverage the ipv4 and ipv6 versions.

Fixes: 193125dbd8eb2 ("net: Introduce VRF device driver")
Fixes: 35402e3136634 ("net: Add IPv6 support to VRF device")

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 drivers/net/vrf.c       | 177 +++++-------------------------------------------
 include/net/ip6_route.h |   3 +
 include/net/route.h     |   3 +
 net/ipv4/route.c        |   7 +-
 net/ipv6/route.c        |   7 +-
 5 files changed, 30 insertions(+), 167 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 9a9fabb900c1..8a8f1e58b415 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -60,41 +60,6 @@ struct pcpu_dstats {
 	struct u64_stats_sync	syncp;
 };
 
-static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie)
-{
-	return dst;
-}
-
-static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
-	return ip_local_out(net, sk, skb);
-}
-
-static unsigned int vrf_v4_mtu(const struct dst_entry *dst)
-{
-	/* TO-DO: return max ethernet size? */
-	return dst->dev->mtu;
-}
-
-static void vrf_dst_destroy(struct dst_entry *dst)
-{
-	/* our dst lives forever - or until the device is closed */
-}
-
-static unsigned int vrf_default_advmss(const struct dst_entry *dst)
-{
-	return 65535 - 40;
-}
-
-static struct dst_ops vrf_dst_ops = {
-	.family		= AF_INET,
-	.local_out	= vrf_ip_local_out,
-	.check		= vrf_ip_check,
-	.mtu		= vrf_v4_mtu,
-	.destroy	= vrf_dst_destroy,
-	.default_advmss	= vrf_default_advmss,
-};
-
 /* neighbor handling is done with actual device; do not want
  * to flip skb->dev for those ndisc packets. This really fails
  * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
@@ -349,46 +314,6 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie)
-{
-	return dst;
-}
-
-static struct dst_ops vrf_dst_ops6 = {
-	.family		= AF_INET6,
-	.local_out	= ip6_local_out,
-	.check		= vrf_ip6_check,
-	.mtu		= vrf_v4_mtu,
-	.destroy	= vrf_dst_destroy,
-	.default_advmss	= vrf_default_advmss,
-};
-
-static int init_dst_ops6_kmem_cachep(void)
-{
-	vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache",
-						     sizeof(struct rt6_info),
-						     0,
-						     SLAB_HWCACHE_ALIGN,
-						     NULL);
-
-	if (!vrf_dst_ops6.kmem_cachep)
-		return -ENOMEM;
-
-	return 0;
-}
-
-static void free_dst_ops6_kmem_cachep(void)
-{
-	kmem_cache_destroy(vrf_dst_ops6.kmem_cachep);
-}
-
-static int vrf_input6(struct sk_buff *skb)
-{
-	skb->dev->stats.rx_errors++;
-	kfree_skb(skb);
-	return 0;
-}
-
 /* modelled after ip6_finish_output2 */
 static int vrf_finish_output6(struct net *net, struct sock *sk,
 			      struct sk_buff *skb)
@@ -429,67 +354,34 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 }
 
-static void vrf_rt6_destroy(struct net_vrf *vrf)
+static void vrf_rt6_release(struct net_vrf *vrf)
 {
-	dst_destroy(&vrf->rt6->dst);
-	free_percpu(vrf->rt6->rt6i_pcpu);
+	dst_release(&vrf->rt6->dst);
 	vrf->rt6 = NULL;
 }
 
 static int vrf_rt6_create(struct net_device *dev)
 {
 	struct net_vrf *vrf = netdev_priv(dev);
-	struct dst_entry *dst;
+	struct net *net = dev_net(dev);
 	struct rt6_info *rt6;
-	int cpu;
 	int rc = -ENOMEM;
 
-	rt6 = dst_alloc(&vrf_dst_ops6, dev, 0,
-			DST_OBSOLETE_NONE,
-			(DST_HOST | DST_NOPOLICY | DST_NOXFRM));
+	rt6 = ip6_dst_alloc(net, dev,
+			    DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE);
 	if (!rt6)
 		goto out;
 
-	dst = &rt6->dst;
-
-	rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL);
-	if (!rt6->rt6i_pcpu) {
-		dst_destroy(dst);
-		goto out;
-	}
-	for_each_possible_cpu(cpu) {
-		struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu);
-		*p =  NULL;
-	}
-
-	memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst));
-
-	INIT_LIST_HEAD(&rt6->rt6i_siblings);
-	INIT_LIST_HEAD(&rt6->rt6i_uncached);
-
-	rt6->dst.input	= vrf_input6;
 	rt6->dst.output	= vrf_output6;
-
-	rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id);
-
-	atomic_set(&rt6->dst.__refcnt, 2);
-
+	rt6->rt6i_table = fib6_get_table(net, vrf->tb_id);
+	dst_hold(&rt6->dst);
 	vrf->rt6 = rt6;
 	rc = 0;
 out:
 	return rc;
 }
 #else
-static int init_dst_ops6_kmem_cachep(void)
-{
-	return 0;
-}
-
-static void free_dst_ops6_kmem_cachep(void)
-{
-}
-
-static void vrf_rt6_destroy(struct net_vrf *vrf)
+static void vrf_rt6_release(struct net_vrf *vrf)
 {
 }
 
@@ -557,11 +449,11 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
 
-static void vrf_rtable_destroy(struct net_vrf *vrf)
+static void vrf_rtable_release(struct net_vrf *vrf)
 {
 	struct dst_entry *dst = (struct dst_entry *)vrf->rth;
 
-	dst_destroy(dst);
+	dst_release(dst);
 	vrf->rth = NULL;
 }
 
@@ -570,22 +462,10 @@ static struct rtable *vrf_rtable_create(struct net_device *dev)
 	struct net_vrf *vrf = netdev_priv(dev);
 	struct rtable *rth;
 
-	rth = dst_alloc(&vrf_dst_ops, dev, 2,
-			DST_OBSOLETE_NONE,
-			(DST_HOST | DST_NOPOLICY | DST_NOXFRM));
+	rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0);
 	if (rth) {
 		rth->dst.output	= vrf_output;
-		rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
-		rth->rt_flags	= 0;
-		rth->rt_type	= RTN_UNICAST;
-		rth->rt_is_input = 0;
-		rth->rt_iif	= 0;
-		rth->rt_pmtu	= 0;
-		rth->rt_gateway	= 0;
-		rth->rt_uses_gateway = 0;
 		rth->rt_table_id = vrf->tb_id;
-		INIT_LIST_HEAD(&rth->rt_uncached);
-		rth->rt_uncached_list = NULL;
 	}
 
 	return rth;
@@ -673,8 +553,8 @@ static void vrf_dev_uninit(struct net_device *dev)
 	struct net_device *port_dev;
 	struct list_head *iter;
 
-	vrf_rtable_destroy(vrf);
-	vrf_rt6_destroy(vrf);
+	vrf_rtable_release(vrf);
+	vrf_rt6_release(vrf);
 
 	netdev_for_each_lower_dev(dev, port_dev, iter)
 		vrf_del_slave(dev, port_dev);
@@ -704,7 +584,7 @@ static int vrf_dev_init(struct net_device *dev)
 	return 0;
 
 out_rth:
-	vrf_rtable_destroy(vrf);
+	vrf_rtable_release(vrf);
 out_stats:
 	free_percpu(dev->dstats);
 	dev->dstats = NULL;
@@ -737,7 +617,7 @@ static struct rtable *vrf_get_rtable(const struct net_device *dev,
 		struct net_vrf *vrf = netdev_priv(dev);
 
 		rth = vrf->rth;
-		atomic_inc(&rth->dst.__refcnt);
+		dst_hold(&rth->dst);
 	}
 
 	return rth;
@@ -788,7 +668,7 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev,
 		struct net_vrf *vrf = netdev_priv(dev);
 
 		rt = vrf->rt6;
-		atomic_inc(&rt->dst.__refcnt);
+		dst_hold(&rt->dst);
 	}
 
 	return (struct dst_entry *)rt;
@@ -946,19 +826,6 @@ static int __init vrf_init_module(void)
 {
 	int rc;
 
-	vrf_dst_ops.kmem_cachep =
-		kmem_cache_create("vrf_ip_dst_cache",
-				  sizeof(struct rtable), 0,
-				  SLAB_HWCACHE_ALIGN,
-				  NULL);
-
-	if (!vrf_dst_ops.kmem_cachep)
-		return -ENOMEM;
-
-	rc = init_dst_ops6_kmem_cachep();
-	if (rc != 0)
-		goto error2;
-
 	register_netdevice_notifier(&vrf_notifier_block);
 
 	rc = rtnl_link_register(&vrf_link_ops);
@@ -969,22 +836,10 @@ static int __init vrf_init_module(void)
 
 error:
 	unregister_netdevice_notifier(&vrf_notifier_block);
-	free_dst_ops6_kmem_cachep();
-error2:
-	kmem_cache_destroy(vrf_dst_ops.kmem_cachep);
 	return rc;
 }
 
-static void __exit vrf_cleanup_module(void)
-{
-	rtnl_link_unregister(&vrf_link_ops);
-	unregister_netdevice_notifier(&vrf_notifier_block);
-	kmem_cache_destroy(vrf_dst_ops.kmem_cachep);
-	free_dst_ops6_kmem_cachep();
-}
-
 module_init(vrf_init_module);
-module_exit(vrf_cleanup_module);
 MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
 MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
 MODULE_LICENSE("GPL");
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 295d291269e2..54c779416eec 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -101,6 +101,9 @@ void fib6_force_start_gc(struct net *net);
 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 				    const struct in6_addr *addr, bool anycast);
 
+struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
+			       int flags);
+
 /*
  *	support functions for ND
  *
diff --git a/include/net/route.h b/include/net/route.h
index 9b0a523bb428..6de665bf1750 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -209,6 +209,9 @@ unsigned int inet_addr_type_dev_table(struct net *net,
 void ip_rt_multicast_event(struct in_device *);
 int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg);
 void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
+struct rtable *rt_dst_alloc(struct net_device *dev,
+			     unsigned int flags, u16 type,
+			     bool nopolicy, bool noxfrm, bool will_cache);
 
 struct in_ifaddr;
 void fib_add_ifaddr(struct in_ifaddr *);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 02c62299d717..2852bdf73540 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1438,9 +1438,9 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #endif
 }
 
-static struct rtable *rt_dst_alloc(struct net_device *dev,
-				   unsigned int flags, u16 type,
-				   bool nopolicy, bool noxfrm, bool will_cache)
+struct rtable *rt_dst_alloc(struct net_device *dev,
+			    unsigned int flags, u16 type,
+			    bool nopolicy, bool noxfrm, bool will_cache)
 {
 	struct rtable *rt;
 
@@ -1468,6 +1468,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
 
 	return rt;
 }
+EXPORT_SYMBOL(rt_dst_alloc);
 
 /* called in rcu_read_lock() section */
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ed446639219c..1d8871a5ed20 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -338,9 +338,9 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
 	return rt;
 }
 
-static struct rt6_info *ip6_dst_alloc(struct net *net,
-				      struct net_device *dev,
-				      int flags)
+struct rt6_info *ip6_dst_alloc(struct net *net,
+			       struct net_device *dev,
+			       int flags)
 {
 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
 
@@ -364,6 +364,7 @@ static struct rt6_info *ip6_dst_alloc(struct net *net,
 
 	return rt;
 }
+EXPORT_SYMBOL(ip6_dst_alloc);
 
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
-- 
2.1.4

^ permalink raw reply related

* [PATCH 7/9] net: mediatek: fix TX locking
From: John Crispin @ 2016-04-07 17:57 UTC (permalink / raw)
  To: David S. Miller
  Cc: Felix Fietkau, Matthias Brugger,
	Sean Wang (王志亘), netdev, linux-mediatek,
	linux-kernel, John Crispin
In-Reply-To: <1460051876-53135-1-git-send-email-blogic@openwrt.org>

Inside the TX path there is a lock inside the tx_map function. This is
however too late. The patch moves the lock to the start of the xmit
function right before the free count check of the DMA ring happens.
If we do not do this, the code becomes racy leading to TX stalls and
dropped packets. This happens as there are 2 netdevs running on the
same physical DMA ring.

Signed-off-by: John Crispin <blogic@openwrt.org>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c |   20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 60b66ab..8434355 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -536,7 +536,6 @@ static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev,
 	struct mtk_eth *eth = mac->hw;
 	struct mtk_tx_dma *itxd, *txd;
 	struct mtk_tx_buf *tx_buf;
-	unsigned long flags;
 	dma_addr_t mapped_addr;
 	unsigned int nr_frags;
 	int i, n_desc = 1;
@@ -568,11 +567,6 @@ static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev,
 	if (unlikely(dma_mapping_error(&dev->dev, mapped_addr)))
 		return -ENOMEM;
 
-	/* normally we can rely on the stack not calling this more than once,
-	 * however we have 2 queues running ont he same ring so we need to lock
-	 * the ring access
-	 */
-	spin_lock_irqsave(&eth->page_lock, flags);
 	WRITE_ONCE(itxd->txd1, mapped_addr);
 	tx_buf->flags |= MTK_TX_FLAGS_SINGLE0;
 	dma_unmap_addr_set(tx_buf, dma_addr0, mapped_addr);
@@ -632,8 +626,6 @@ static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev,
 	WRITE_ONCE(itxd->txd3, (TX_DMA_SWC | TX_DMA_PLEN0(skb_headlen(skb)) |
 				(!nr_frags * TX_DMA_LS0)));
 
-	spin_unlock_irqrestore(&eth->page_lock, flags);
-
 	netdev_sent_queue(dev, skb->len);
 	skb_tx_timestamp(skb);
 
@@ -661,8 +653,6 @@ err_dma:
 		itxd = mtk_qdma_phys_to_virt(ring, itxd->txd2);
 	} while (itxd != txd);
 
-	spin_unlock_irqrestore(&eth->page_lock, flags);
-
 	return -ENOMEM;
 }
 
@@ -712,14 +702,22 @@ static int mtk_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct mtk_eth *eth = mac->hw;
 	struct mtk_tx_ring *ring = &eth->tx_ring;
 	struct net_device_stats *stats = &dev->stats;
+	unsigned long flags;
 	bool gso = false;
 	int tx_num;
 
+	/* normally we can rely on the stack not calling this more than once,
+	 * however we have 2 queues running ont he same ring so we need to lock
+	 * the ring access
+	 */
+	spin_lock_irqsave(&eth->page_lock, flags);
+
 	tx_num = mtk_cal_txd_req(skb);
 	if (unlikely(atomic_read(&ring->free_count) <= tx_num)) {
 		mtk_stop_queue(eth);
 		netif_err(eth, tx_queued, dev,
 			  "Tx Ring full when queue awake!\n");
+		spin_unlock_irqrestore(&eth->page_lock, flags);
 		return NETDEV_TX_BUSY;
 	}
 
@@ -747,10 +745,12 @@ static int mtk_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			     ring->thresh))
 			mtk_wake_queue(eth);
 	}
+	spin_unlock_irqrestore(&eth->page_lock, flags);
 
 	return NETDEV_TX_OK;
 
 drop:
+	spin_unlock_irqrestore(&eth->page_lock, flags);
 	stats->tx_dropped++;
 	dev_kfree_skb(skb);
 	return NETDEV_TX_OK;
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 1/9] net: mediatek: update the IRQ part of the binding document
From: John Crispin @ 2016-04-07 17:57 UTC (permalink / raw)
  To: David S. Miller
  Cc: Felix Fietkau, Matthias Brugger,
	Sean Wang (王志亘), netdev, linux-mediatek,
	linux-kernel, John Crispin, devicetree

The current binding document only describes a single interrupt. Update the
document by adding the 2 other interrupts.

The driver currently only uses a single interrupt. The HW is, however, able
to use IRQ grouping to split TX and RX onto separate GIC IRQs.

Signed-off-by: John Crispin <blogic@openwrt.org>
Cc: devicetree@vger.kernel.org
---
 Documentation/devicetree/bindings/net/mediatek-net.txt |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/mediatek-net.txt b/Documentation/devicetree/bindings/net/mediatek-net.txt
index 5ca7929..2f142be 100644
--- a/Documentation/devicetree/bindings/net/mediatek-net.txt
+++ b/Documentation/devicetree/bindings/net/mediatek-net.txt
@@ -9,7 +9,7 @@ have dual GMAC each represented by a child node..
 Required properties:
 - compatible: Should be "mediatek,mt7623-eth"
 - reg: Address and length of the register set for the device
-- interrupts: Should contain the frame engines interrupt
+- interrupts: Should contain the three frame engines interrupts
 - clocks: the clock used by the core
 - clock-names: the names of the clock listed in the clocks property. These are
 	"ethif", "esw", "gp2", "gp1"
@@ -42,7 +42,9 @@ eth: ethernet@1b100000 {
 		 <&ethsys CLK_ETHSYS_GP2>,
 		 <&ethsys CLK_ETHSYS_GP1>;
 	clock-names = "ethif", "esw", "gp2", "gp1";
-	interrupts = <GIC_SPI 200 IRQ_TYPE_LEVEL_LOW>;
+	interrupts = <GIC_SPI 200 IRQ_TYPE_LEVEL_LOW
+		      GIC_SPI 199 IRQ_TYPE_LEVEL_LOW
+		      GIC_SPI 198 IRQ_TYPE_LEVEL_LOW>;
 	power-domains = <&scpsys MT2701_POWER_DOMAIN_ETH>;
 	resets = <&ethsys MT2701_ETHSYS_ETH_RST>;
 	reset-names = "eth";
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH 9/9] net: mediatek: do not set the QID field in the TX DMA descriptors
From: John Crispin @ 2016-04-07 17:57 UTC (permalink / raw)
  To: David S. Miller
  Cc: Felix Fietkau, netdev-u79uwXL29TY76Z2rM5mHXA,
	Sean Wang (王志亘),
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mediatek-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r, Matthias Brugger,
	John Crispin
In-Reply-To: <1460051876-53135-1-git-send-email-blogic-p3rKhJxN3npAfugRpC6u6w@public.gmane.org>

The QID field gets set to the MAC id. This made the DMA linked list queue
the traffic of each MAC on a different internal queue. However, during
long-term testing we found that this causes traffic stalls, as the
multi-queue setup requires a more complete initialisation which is not yet
part of the upstream driver.

This patch removes the code setting the QID field, resulting in all
traffic ending up in queue 0 which works without any special setup.

Signed-off-by: John Crispin <blogic-p3rKhJxN3npAfugRpC6u6w@public.gmane.org>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index f9f8851..8163047 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -603,8 +603,7 @@ static int mtk_tx_map(struct sk_buff *skb, struct net_device *dev,
 			WRITE_ONCE(txd->txd1, mapped_addr);
 			WRITE_ONCE(txd->txd3, (TX_DMA_SWC |
 					       TX_DMA_PLEN0(frag_map_size) |
-					       last_frag * TX_DMA_LS0) |
-					       mac->id);
+					       last_frag * TX_DMA_LS0));
 			WRITE_ONCE(txd->txd4, 0);
 
 			tx_buf->skb = (struct sk_buff *)MTK_DMA_DUMMY_DESC;
-- 
1.7.10.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox