Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v2 07/13] amd-xgbe: Optimize DMA channel interrupt enablement
From: Tom Lendacky @ 2017-08-18 14:03 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20170818140209.14804.94997.stgit@tlendack-t1.amdoffice.net>

Currently whenever the driver needs to enable or disable interrupts for
a DMA channel it reads the interrupt enable register (IER), updates the
value and then writes the new value back to the IER. Since the hardware
does not change the IER, software can track this value and elimiate the
need to read it each time.

Add the IER value to the channel related data structure and use that as
the base for enabling and disabling interrupts, thus removing the need
for the MMIO read.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c |   77 ++++++++++++++----------------
 drivers/net/ethernet/amd/xgbe/xgbe.h     |    4 +-
 2 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index bb60507..75a479c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -605,7 +605,6 @@ static void xgbe_config_flow_control(struct xgbe_prv_data *pdata)
 static void xgbe_enable_dma_interrupts(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_channel *channel;
-	unsigned int dma_ch_isr, dma_ch_ier;
 	unsigned int i;
 
 	/* Set the interrupt mode if supported */
@@ -617,20 +616,20 @@ static void xgbe_enable_dma_interrupts(struct xgbe_prv_data *pdata)
 		channel = pdata->channel[i];
 
 		/* Clear all the interrupts which are set */
-		dma_ch_isr = XGMAC_DMA_IOREAD(channel, DMA_CH_SR);
-		XGMAC_DMA_IOWRITE(channel, DMA_CH_SR, dma_ch_isr);
+		XGMAC_DMA_IOWRITE(channel, DMA_CH_SR,
+				  XGMAC_DMA_IOREAD(channel, DMA_CH_SR));
 
 		/* Clear all interrupt enable bits */
-		dma_ch_ier = 0;
+		channel->curr_ier = 0;
 
 		/* Enable following interrupts
 		 *   NIE  - Normal Interrupt Summary Enable
 		 *   AIE  - Abnormal Interrupt Summary Enable
 		 *   FBEE - Fatal Bus Error Enable
 		 */
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, NIE, 1);
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, AIE, 1);
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, FBEE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, NIE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, AIE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, FBEE, 1);
 
 		if (channel->tx_ring) {
 			/* Enable the following Tx interrupts
@@ -639,7 +638,8 @@ static void xgbe_enable_dma_interrupts(struct xgbe_prv_data *pdata)
 			 *          mode)
 			 */
 			if (!pdata->per_channel_irq || pdata->channel_irq_mode)
-				XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, TIE, 1);
+				XGMAC_SET_BITS(channel->curr_ier,
+					       DMA_CH_IER, TIE, 1);
 		}
 		if (channel->rx_ring) {
 			/* Enable following Rx interrupts
@@ -648,12 +648,13 @@ static void xgbe_enable_dma_interrupts(struct xgbe_prv_data *pdata)
 			 *          per channel interrupts in edge triggered
 			 *          mode)
 			 */
-			XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, RBUE, 1);
+			XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, RBUE, 1);
 			if (!pdata->per_channel_irq || pdata->channel_irq_mode)
-				XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, RIE, 1);
+				XGMAC_SET_BITS(channel->curr_ier,
+					       DMA_CH_IER, RIE, 1);
 		}
 
-		XGMAC_DMA_IOWRITE(channel, DMA_CH_IER, dma_ch_ier);
+		XGMAC_DMA_IOWRITE(channel, DMA_CH_IER, channel->curr_ier);
 	}
 }
 
@@ -1964,44 +1965,40 @@ static int xgbe_is_last_desc(struct xgbe_ring_desc *rdesc)
 static int xgbe_enable_int(struct xgbe_channel *channel,
 			   enum xgbe_int int_id)
 {
-	unsigned int dma_ch_ier;
-
-	dma_ch_ier = XGMAC_DMA_IOREAD(channel, DMA_CH_IER);
-
 	switch (int_id) {
 	case XGMAC_INT_DMA_CH_SR_TI:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, TIE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, TIE, 1);
 		break;
 	case XGMAC_INT_DMA_CH_SR_TPS:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, TXSE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, TXSE, 1);
 		break;
 	case XGMAC_INT_DMA_CH_SR_TBU:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, TBUE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, TBUE, 1);
 		break;
 	case XGMAC_INT_DMA_CH_SR_RI:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, RIE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, RIE, 1);
 		break;
 	case XGMAC_INT_DMA_CH_SR_RBU:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, RBUE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, RBUE, 1);
 		break;
 	case XGMAC_INT_DMA_CH_SR_RPS:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, RSE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, RSE, 1);
 		break;
 	case XGMAC_INT_DMA_CH_SR_TI_RI:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, TIE, 1);
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, RIE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, TIE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, RIE, 1);
 		break;
 	case XGMAC_INT_DMA_CH_SR_FBE:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, FBEE, 1);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, FBEE, 1);
 		break;
 	case XGMAC_INT_DMA_ALL:
-		dma_ch_ier |= channel->saved_ier;
+		channel->curr_ier |= channel->saved_ier;
 		break;
 	default:
 		return -1;
 	}
 
-	XGMAC_DMA_IOWRITE(channel, DMA_CH_IER, dma_ch_ier);
+	XGMAC_DMA_IOWRITE(channel, DMA_CH_IER, channel->curr_ier);
 
 	return 0;
 }
@@ -2009,45 +2006,41 @@ static int xgbe_enable_int(struct xgbe_channel *channel,
 static int xgbe_disable_int(struct xgbe_channel *channel,
 			    enum xgbe_int int_id)
 {
-	unsigned int dma_ch_ier;
-
-	dma_ch_ier = XGMAC_DMA_IOREAD(channel, DMA_CH_IER);
-
 	switch (int_id) {
 	case XGMAC_INT_DMA_CH_SR_TI:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, TIE, 0);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, TIE, 0);
 		break;
 	case XGMAC_INT_DMA_CH_SR_TPS:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, TXSE, 0);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, TXSE, 0);
 		break;
 	case XGMAC_INT_DMA_CH_SR_TBU:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, TBUE, 0);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, TBUE, 0);
 		break;
 	case XGMAC_INT_DMA_CH_SR_RI:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, RIE, 0);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, RIE, 0);
 		break;
 	case XGMAC_INT_DMA_CH_SR_RBU:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, RBUE, 0);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, RBUE, 0);
 		break;
 	case XGMAC_INT_DMA_CH_SR_RPS:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, RSE, 0);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, RSE, 0);
 		break;
 	case XGMAC_INT_DMA_CH_SR_TI_RI:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, TIE, 0);
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, RIE, 0);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, TIE, 0);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, RIE, 0);
 		break;
 	case XGMAC_INT_DMA_CH_SR_FBE:
-		XGMAC_SET_BITS(dma_ch_ier, DMA_CH_IER, FBEE, 0);
+		XGMAC_SET_BITS(channel->curr_ier, DMA_CH_IER, FBEE, 0);
 		break;
 	case XGMAC_INT_DMA_ALL:
-		channel->saved_ier = dma_ch_ier & XGBE_DMA_INTERRUPT_MASK;
-		dma_ch_ier &= ~XGBE_DMA_INTERRUPT_MASK;
+		channel->saved_ier = channel->curr_ier;
+		channel->curr_ier = 0;
 		break;
 	default:
 		return -1;
 	}
 
-	XGMAC_DMA_IOWRITE(channel, DMA_CH_IER, dma_ch_ier);
+	XGMAC_DMA_IOWRITE(channel, DMA_CH_IER, channel->curr_ier);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 9a80f20..58bb455 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -182,8 +182,6 @@
 #define XGBE_IRQ_MODE_EDGE	0
 #define XGBE_IRQ_MODE_LEVEL	1
 
-#define XGBE_DMA_INTERRUPT_MASK	0x31c7
-
 #define XGMAC_MIN_PACKET	60
 #define XGMAC_STD_PACKET_MTU	1500
 #define XGMAC_MAX_STD_PACKET	1518
@@ -462,6 +460,8 @@ struct xgbe_channel {
 	/* Netdev related settings */
 	struct napi_struct napi;
 
+	/* Per channel interrupt enablement tracker */
+	unsigned int curr_ier;
 	unsigned int saved_ier;
 
 	unsigned int tx_timer_active;

^ permalink raw reply related

* [PATCH net-next v2 06/13] amd-xgbe: Add additional dynamic debug messages
From: Tom Lendacky @ 2017-08-18 14:03 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20170818140209.14804.94997.stgit@tlendack-t1.amdoffice.net>

Add some additional dynamic debug message to the driver. The new messages
will provide additional information about the PCS window calculation.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c |    4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
index 1e56ad7..3e5833c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
@@ -292,6 +292,10 @@ static int xgbe_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	pdata->xpcs_window_size = 1 << (pdata->xpcs_window_size + 7);
 	pdata->xpcs_window_mask = pdata->xpcs_window_size - 1;
 	if (netif_msg_probe(pdata)) {
+		dev_dbg(dev, "xpcs window def  = %#010x\n",
+			pdata->xpcs_window_def_reg);
+		dev_dbg(dev, "xpcs window sel  = %#010x\n",
+			pdata->xpcs_window_sel_reg);
 		dev_dbg(dev, "xpcs window      = %#010x\n",
 			pdata->xpcs_window);
 		dev_dbg(dev, "xpcs window size = %#010x\n",

^ permalink raw reply related

* [PATCH net-next v2 05/13] amd-xgbe: Add support to handle device renaming
From: Tom Lendacky @ 2017-08-18 14:02 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20170818140209.14804.94997.stgit@tlendack-t1.amdoffice.net>

Many of the names used by the driver are based upon the name of the device
found during device probe.  Move the formatting of the names into the
device open function so that any renaming that occurs before the device is
brought up will be accounted for.  This also means moving the creation of
some named workqueues into the device open path.

Add support to register for net events so that if a device is renamed
the corresponding debugfs directory can be renamed.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c |   25 +++++++++
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c     |   44 +++++++++++++++-
 drivers/net/ethernet/amd/xgbe/xgbe-main.c    |   72 +++++++++++---------------
 drivers/net/ethernet/amd/xgbe/xgbe.h         |    5 +-
 4 files changed, 100 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
index 7546b66..7d128be 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
@@ -527,3 +527,28 @@ void xgbe_debugfs_exit(struct xgbe_prv_data *pdata)
 	debugfs_remove_recursive(pdata->xgbe_debugfs);
 	pdata->xgbe_debugfs = NULL;
 }
+
+void xgbe_debugfs_rename(struct xgbe_prv_data *pdata)
+{
+	struct dentry *pfile;
+	char *buf;
+
+	if (!pdata->xgbe_debugfs)
+		return;
+
+	buf = kasprintf(GFP_KERNEL, "amd-xgbe-%s", pdata->netdev->name);
+	if (!buf)
+		return;
+
+	if (!strcmp(pdata->xgbe_debugfs->d_name.name, buf))
+		goto out;
+
+	pfile = debugfs_rename(pdata->xgbe_debugfs->d_parent,
+			       pdata->xgbe_debugfs,
+			       pdata->xgbe_debugfs->d_parent, buf);
+	if (!pfile)
+		netdev_err(pdata->netdev, "debugfs_rename failed\n");
+
+out:
+	kfree(buf);
+}
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 2fd9b80..d6d29d8 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -887,7 +887,7 @@ static int xgbe_request_irqs(struct xgbe_prv_data *pdata)
 		     (unsigned long)pdata);
 
 	ret = devm_request_irq(pdata->dev, pdata->dev_irq, xgbe_isr, 0,
-			       netdev->name, pdata);
+			       netdev_name(netdev), pdata);
 	if (ret) {
 		netdev_alert(netdev, "error requesting irq %d\n",
 			     pdata->dev_irq);
@@ -1589,16 +1589,42 @@ static int xgbe_open(struct net_device *netdev)
 
 	DBGPR("-->xgbe_open\n");
 
+	/* Create the various names based on netdev name */
+	snprintf(pdata->an_name, sizeof(pdata->an_name) - 1, "%s-pcs",
+		 netdev_name(netdev));
+
+	snprintf(pdata->ecc_name, sizeof(pdata->ecc_name) - 1, "%s-ecc",
+		 netdev_name(netdev));
+
+	snprintf(pdata->i2c_name, sizeof(pdata->i2c_name) - 1, "%s-i2c",
+		 netdev_name(netdev));
+
+	/* Create workqueues */
+	pdata->dev_workqueue =
+		create_singlethread_workqueue(netdev_name(netdev));
+	if (!pdata->dev_workqueue) {
+		netdev_err(netdev, "device workqueue creation failed\n");
+		return -ENOMEM;
+	}
+
+	pdata->an_workqueue =
+		create_singlethread_workqueue(pdata->an_name);
+	if (!pdata->an_workqueue) {
+		netdev_err(netdev, "phy workqueue creation failed\n");
+		ret = -ENOMEM;
+		goto err_dev_wq;
+	}
+
 	/* Reset the phy settings */
 	ret = xgbe_phy_reset(pdata);
 	if (ret)
-		return ret;
+		goto err_an_wq;
 
 	/* Enable the clocks */
 	ret = clk_prepare_enable(pdata->sysclk);
 	if (ret) {
 		netdev_alert(netdev, "dma clk_prepare_enable failed\n");
-		return ret;
+		goto err_an_wq;
 	}
 
 	ret = clk_prepare_enable(pdata->ptpclk);
@@ -1651,6 +1677,12 @@ static int xgbe_open(struct net_device *netdev)
 err_sysclk:
 	clk_disable_unprepare(pdata->sysclk);
 
+err_an_wq:
+	destroy_workqueue(pdata->an_workqueue);
+
+err_dev_wq:
+	destroy_workqueue(pdata->dev_workqueue);
+
 	return ret;
 }
 
@@ -1674,6 +1706,12 @@ static int xgbe_close(struct net_device *netdev)
 	clk_disable_unprepare(pdata->ptpclk);
 	clk_disable_unprepare(pdata->sysclk);
 
+	flush_workqueue(pdata->an_workqueue);
+	destroy_workqueue(pdata->an_workqueue);
+
+	flush_workqueue(pdata->dev_workqueue);
+	destroy_workqueue(pdata->dev_workqueue);
+
 	set_bit(XGBE_DOWN, &pdata->dev_state);
 
 	DBGPR("<--xgbe_close\n");
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
index 53a425c..c5ff385 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
@@ -120,6 +120,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/io.h>
+#include <linux/notifier.h>
 
 #include "xgbe.h"
 #include "xgbe-common.h"
@@ -399,35 +400,6 @@ int xgbe_config_netdev(struct xgbe_prv_data *pdata)
 		return ret;
 	}
 
-	/* Create the PHY/ANEG name based on netdev name */
-	snprintf(pdata->an_name, sizeof(pdata->an_name) - 1, "%s-pcs",
-		 netdev_name(netdev));
-
-	/* Create the ECC name based on netdev name */
-	snprintf(pdata->ecc_name, sizeof(pdata->ecc_name) - 1, "%s-ecc",
-		 netdev_name(netdev));
-
-	/* Create the I2C name based on netdev name */
-	snprintf(pdata->i2c_name, sizeof(pdata->i2c_name) - 1, "%s-i2c",
-		 netdev_name(netdev));
-
-	/* Create workqueues */
-	pdata->dev_workqueue =
-		create_singlethread_workqueue(netdev_name(netdev));
-	if (!pdata->dev_workqueue) {
-		netdev_err(netdev, "device workqueue creation failed\n");
-		ret = -ENOMEM;
-		goto err_netdev;
-	}
-
-	pdata->an_workqueue =
-		create_singlethread_workqueue(pdata->an_name);
-	if (!pdata->an_workqueue) {
-		netdev_err(netdev, "phy workqueue creation failed\n");
-		ret = -ENOMEM;
-		goto err_wq;
-	}
-
 	if (IS_REACHABLE(CONFIG_PTP_1588_CLOCK))
 		xgbe_ptp_register(pdata);
 
@@ -439,14 +411,6 @@ int xgbe_config_netdev(struct xgbe_prv_data *pdata)
 		  pdata->rx_ring_count);
 
 	return 0;
-
-err_wq:
-	destroy_workqueue(pdata->dev_workqueue);
-
-err_netdev:
-	unregister_netdev(netdev);
-
-	return ret;
 }
 
 void xgbe_deconfig_netdev(struct xgbe_prv_data *pdata)
@@ -461,18 +425,42 @@ void xgbe_deconfig_netdev(struct xgbe_prv_data *pdata)
 	unregister_netdev(netdev);
 
 	pdata->phy_if.phy_exit(pdata);
+}
 
-	flush_workqueue(pdata->an_workqueue);
-	destroy_workqueue(pdata->an_workqueue);
+static int xgbe_netdev_event(struct notifier_block *nb, unsigned long event,
+			     void *data)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(data);
+	struct xgbe_prv_data *pdata = netdev_priv(netdev);
 
-	flush_workqueue(pdata->dev_workqueue);
-	destroy_workqueue(pdata->dev_workqueue);
+	if (netdev->netdev_ops != xgbe_get_netdev_ops())
+		goto out;
+
+	switch (event) {
+	case NETDEV_CHANGENAME:
+		xgbe_debugfs_rename(pdata);
+		break;
+
+	default:
+		break;
+	}
+
+out:
+	return NOTIFY_DONE;
 }
 
+static struct notifier_block xgbe_netdev_notifier = {
+	.notifier_call = xgbe_netdev_event,
+};
+
 static int __init xgbe_mod_init(void)
 {
 	int ret;
 
+	ret = register_netdevice_notifier(&xgbe_netdev_notifier);
+	if (ret)
+		return ret;
+
 	ret = xgbe_platform_init();
 	if (ret)
 		return ret;
@@ -489,6 +477,8 @@ static void __exit xgbe_mod_exit(void)
 	xgbe_pci_exit();
 
 	xgbe_platform_exit();
+
+	unregister_netdevice_notifier(&xgbe_netdev_notifier);
 }
 
 module_init(xgbe_mod_init);
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index e9282c9..9a80f20 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -130,6 +130,7 @@
 #include <linux/completion.h>
 #include <linux/cpumask.h>
 #include <linux/interrupt.h>
+#include <linux/dcache.h>
 
 #define XGBE_DRV_NAME		"amd-xgbe"
 #define XGBE_DRV_VERSION	"1.0.3"
@@ -1172,7 +1173,6 @@ struct xgbe_prv_data {
 	struct tasklet_struct tasklet_i2c;
 	struct tasklet_struct tasklet_an;
 
-#ifdef CONFIG_DEBUG_FS
 	struct dentry *xgbe_debugfs;
 
 	unsigned int debugfs_xgmac_reg;
@@ -1183,7 +1183,6 @@ struct xgbe_prv_data {
 	unsigned int debugfs_xprop_reg;
 
 	unsigned int debugfs_xi2c_reg;
-#endif
 };
 
 /* Function prototypes*/
@@ -1232,9 +1231,11 @@ void xgbe_dump_rx_desc(struct xgbe_prv_data *, struct xgbe_ring *,
 #ifdef CONFIG_DEBUG_FS
 void xgbe_debugfs_init(struct xgbe_prv_data *);
 void xgbe_debugfs_exit(struct xgbe_prv_data *);
+void xgbe_debugfs_rename(struct xgbe_prv_data *pdata);
 #else
 static inline void xgbe_debugfs_init(struct xgbe_prv_data *pdata) {}
 static inline void xgbe_debugfs_exit(struct xgbe_prv_data *pdata) {}
+static inline void xgbe_debugfs_rename(struct xgbe_prv_data *pdata) {}
 #endif /* CONFIG_DEBUG_FS */
 
 /* NOTE: Uncomment for function trace log messages in KERNEL LOG */

^ permalink raw reply related

* [PATCH net-next v2 04/13] amd-xgbe: Update TSO packet statistics accuracy
From: Tom Lendacky @ 2017-08-18 14:02 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20170818140209.14804.94997.stgit@tlendack-t1.amdoffice.net>

When transmitting a TSO packet, the driver only increments the TSO packet
statistic by one rather than the number of total packets that were sent.
Update the driver to record the total number of packets that resulted from
TSO transmit.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index 06f953e..bb60507 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -1740,7 +1740,7 @@ static void xgbe_dev_xmit(struct xgbe_channel *channel)
 		XGMAC_SET_BITS_LE(rdesc->desc3, TX_NORMAL_DESC3, TCPHDRLEN,
 				  packet->tcp_header_len / 4);
 
-		pdata->ext_stats.tx_tso_packets++;
+		pdata->ext_stats.tx_tso_packets += packet->tx_packets;
 	} else {
 		/* Enable CRC and Pad Insertion */
 		XGMAC_SET_BITS_LE(rdesc->desc3, TX_NORMAL_DESC3, CPC, 0);

^ permalink raw reply related

* [PATCH net-next v2 03/13] amd-xgbe: Be sure driver shuts down cleanly on module removal
From: Tom Lendacky @ 2017-08-18 14:02 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20170818140209.14804.94997.stgit@tlendack-t1.amdoffice.net>

Sometimes when the driver is being unloaded while the devices are still
up the driver can issue errors.  This is based on timing and the double
invocation of some routines.  The phy_exit() call needs to be run after
the network device has been closed and unregistered from the system.
Also, the phy_exit() does not need to invoke phy_stop() since that will
be called as part of the device closing, so remove that call.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-main.c |    4 ++--
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |    2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
index 500147d..53a425c 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
@@ -458,6 +458,8 @@ void xgbe_deconfig_netdev(struct xgbe_prv_data *pdata)
 	if (IS_REACHABLE(CONFIG_PTP_1588_CLOCK))
 		xgbe_ptp_unregister(pdata);
 
+	unregister_netdev(netdev);
+
 	pdata->phy_if.phy_exit(pdata);
 
 	flush_workqueue(pdata->an_workqueue);
@@ -465,8 +467,6 @@ void xgbe_deconfig_netdev(struct xgbe_prv_data *pdata)
 
 	flush_workqueue(pdata->dev_workqueue);
 	destroy_workqueue(pdata->dev_workqueue);
-
-	unregister_netdev(netdev);
 }
 
 static int __init xgbe_mod_init(void)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 2222bbf8..2409202 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -1533,8 +1533,6 @@ static int xgbe_phy_best_advertised_speed(struct xgbe_prv_data *pdata)
 
 static void xgbe_phy_exit(struct xgbe_prv_data *pdata)
 {
-	xgbe_phy_stop(pdata);
-
 	pdata->phy_if.phy_impl.exit(pdata);
 }
 

^ permalink raw reply related

* [PATCH net-next v2 02/13] amd-xgbe: Set the MII control width for the MAC interface
From: Tom Lendacky @ 2017-08-18 14:02 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20170818140209.14804.94997.stgit@tlendack-t1.amdoffice.net>

When running in SGMII mode at speeds below 1000Mbps, the auto-negotition
control register must set the MII control width for the MAC interface
to be 8-bits wide.  By default the width is 4-bits.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-common.h |    1 +
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |    2 ++
 2 files changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
index 9795419..d07edf9 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
@@ -1339,6 +1339,7 @@
 #define XGBE_AN_CL37_PCS_MODE_BASEX	0x00
 #define XGBE_AN_CL37_PCS_MODE_SGMII	0x04
 #define XGBE_AN_CL37_TX_CONFIG_MASK	0x08
+#define XGBE_AN_CL37_MII_CTRL_8BIT	0x0100
 
 /* Bit setting and getting macros
  *  The get macro will extract the current bit field value from within
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 8068491..2222bbf8 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -982,6 +982,8 @@ static void xgbe_an37_init(struct xgbe_prv_data *pdata)
 		break;
 	}
 
+	reg |= XGBE_AN_CL37_MII_CTRL_8BIT;
+
 	XMDIO_WRITE(pdata, MDIO_MMD_VEND2, MDIO_VEND2_AN_CTRL, reg);
 
 	netif_dbg(pdata, link, pdata->netdev, "CL37 AN (%s) initialized\n",

^ permalink raw reply related

* [PATCH net-next v2 01/13] amd-xgbe: Set the MDIO mode for 10000Base-T configuration
From: Tom Lendacky @ 2017-08-18 14:02 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20170818140209.14804.94997.stgit@tlendack-t1.amdoffice.net>

Currently the MDIO mode is set to none for the 10000Base-T, which is
incorrect.  The MDIO mode should for this configuration should be
clause 45.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 04b5c14..81c45fa 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -2921,7 +2921,7 @@ static int xgbe_phy_init(struct xgbe_prv_data *pdata)
 			phy_data->start_mode = XGBE_MODE_KR;
 		}
 
-		phy_data->phydev_mode = XGBE_MDIO_MODE_NONE;
+		phy_data->phydev_mode = XGBE_MDIO_MODE_CL45;
 		break;
 
 	/* 10GBase-R support */

^ permalink raw reply related

* [PATCH net-next v2 00/13] amd-xgbe: AMD XGBE driver updates 2017-08-17
From: Tom Lendacky @ 2017-08-18 14:02 UTC (permalink / raw)
  To: netdev; +Cc: David Miller

The following updates are included in this driver update series:

- Set the MDIO mode to clause 45 for the 10GBase-T configuration
- Set the MII control width to 8-bits for speeds less than 1Gbps
- Fix an issue to related to module removal when the devices are up
- Fix ethtool statistics related to packet counting of TSO packets
- Add support for device renaming
- Add additional dynamic debug output for the PCS window calculation
- Optimize reading of DMA channel interrupt enablement register
- Add additional dynamic debug output about the hardware features
- Add per queue Tx and Rx ethtool statistics
- Add a macro to clear ethtool_link_ksettings modes
- Convert the driver to use the ethtool_link_ksettings
- Add support for VXLAN offload capabilities
- Add additional ethtool statistics related to VXLAN

This patch series is based on net-next.

---

Changes since v1:
- Remove added debugfs support for reading SFP eeprom and phydev
  registers


Tom Lendacky (13):
      amd-xgbe: Set the MDIO mode for 10000Base-T configuration
      amd-xgbe: Set the MII control width for the MAC interface
      amd-xgbe: Be sure driver shuts down cleanly on module removal
      amd-xgbe: Update TSO packet statistics accuracy
      amd-xgbe: Add support to handle device renaming
      amd-xgbe: Add additional dynamic debug messages
      amd-xgbe: Optimize DMA channel interrupt enablement
      amd-xgbe: Add hardware features debug output
      amd-xgbe: Add per queue Tx and Rx statistics
      net: ethtool: Add macro to clear a link mode setting
      amd-xgbe: Convert to using the new link mode settings
      amd-xgbe: Add support for VXLAN offload capabilities
      amd-xgbe: Add additional ethtool statistics


 drivers/net/ethernet/amd/xgbe/xgbe-common.h  |   25 +
 drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c |   25 +
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c     |  198 ++++++++---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c     |  487 ++++++++++++++++++++++++++
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |   86 +++--
 drivers/net/ethernet/amd/xgbe/xgbe-main.c    |   97 +++--
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c    |   81 +++-
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c     |    4 
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c  |   54 ++-
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c  |  352 ++++++++++---------
 drivers/net/ethernet/amd/xgbe/xgbe.h         |   91 ++++-
 include/linux/ethtool.h                      |   11 +
 12 files changed, 1158 insertions(+), 353 deletions(-)

-- 
Tom Lendacky

^ permalink raw reply

* [PATCH] mlx5: ensure 0 is returned when vport is zero
From: Colin King @ 2017-08-18 13:49 UTC (permalink / raw)
  To: Saeed Mahameed, Matan Barak, Leon Romanovsky, netdev, linux-rdma
  Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Currently, if vport is zero then then an uninialized return status
in err is returned.  Since the only return status at the end of the
function esw_add_uc_addr is zero for the current set of return paths
we may as well just return 0 rather than err to fix this issue.

Detected by CoverityScan, CID#1452698 ("Uninitialized scalar variable")

Fixes: eeb66cdb6826 ("net/mlx5: Separate between E-Switch and MPFS")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 6d9fb6ac6e9b..c77f4c0c7769 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -401,7 +401,7 @@ static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 	esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM fr(%p)\n",
 		  vport, mac, vaddr->flow_rule);
 
-	return err;
+	return 0;
 }
 
 static int esw_del_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net-next] net: hns3: Add support to change MTU in hardware & netdev
From: Andrew Lunn @ 2017-08-18 13:31 UTC (permalink / raw)
  To: Salil Mehta
  Cc: davem, yisen.zhuang, lipeng321, dan.carpenter, mehta.salil.lnk,
	netdev, linux-kernel, linux-rdma, linuxarm
In-Reply-To: <20170818113558.71928-1-salil.mehta@huawei.com>

On Fri, Aug 18, 2017 at 12:35:58PM +0100, Salil Mehta wrote:
> This patch adds the following support to the HNS3 driver:
> 1. Support to change the Maximum Transmission Unit of a
>    netdevice and of a port in hardware .
> 2. Initializes the supported MTU range for the netdevice.
> 
> Signed-off-by: lipeng <lipeng321@huawei.com>
> Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
> ---
>  .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 46 ++++++++++++++++++++++
>  .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h |  1 +
>  2 files changed, 47 insertions(+)
> 
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
> index e731f87..8e3711e 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
> @@ -1278,11 +1278,53 @@ static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
>  	return ret;
>  }
>  
> +static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu)
> +{
> +	struct hns3_nic_priv *priv = netdev_priv(netdev);
> +	struct hnae3_handle *h = priv->ae_handle;
> +	bool if_running = netif_running(netdev);
> +	int ret;
> +
> +	/* no change in MTU */
> +	if (new_mtu == netdev->mtu)
> +		return 0;

Hi Salil

It is worth reading the core code:

http://elixir.free-electrons.com/linux/latest/source/net/core/dev.c#L6713

 +
> +	if (!h->ae_algo->ops->set_mtu)
> +		return -ENOTSUPP;
> +
> +	/* if this was called with netdev up then bring netdevice down */
> +	if (if_running) {
> +		(void)hns3_nic_net_stop(netdev);
> +		msleep(100);
> +	}
> +
> +	ret = h->ae_algo->ops->set_mtu(h, new_mtu);
> +	if (ret) {
> +		netdev_err(netdev, "failed to change MTU in hardware %d\n",
> +			   ret);
> +		return ret;
> +	}
> +
> +	/* assign newly changed mtu to netdevice as well */
> +	netdev->mtu = new_mtu;

http://elixir.free-electrons.com/linux/latest/source/net/core/dev.c#L6698

> +
> +	/* if the netdev was running earlier, bring it up again */
> +	if (if_running) {
> +		if (hns3_nic_net_open(netdev)) {
> +			netdev_err(netdev, "MTU, couldnt up netdev again\n");
> +			ret = -EINVAL;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
>  static const struct net_device_ops hns3_nic_netdev_ops = {
>  	.ndo_open		= hns3_nic_net_open,
>  	.ndo_stop		= hns3_nic_net_stop,
>  	.ndo_start_xmit		= hns3_nic_net_xmit,
>  	.ndo_set_mac_address	= hns3_nic_net_set_mac_address,
> +	.ndo_change_mtu		= hns3_nic_change_mtu,
>  	.ndo_set_features	= hns3_nic_set_features,
>  	.ndo_get_stats64	= hns3_nic_get_stats64,
>  	.ndo_setup_tc		= hns3_nic_setup_tc,
> @@ -2752,6 +2794,10 @@ static int hns3_client_init(struct hnae3_handle *handle)
>  		goto out_reg_netdev_fail;
>  	}
>  
> +	/* MTU range: 68 - 9706 */
> +	netdev->min_mtu = ETH_MIN_MTU;

http://elixir.free-electrons.com/linux/latest/source/net/ethernet/eth.c#L361

> +	netdev->max_mtu = HNS3_MAX_MTU - (ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN);
> +
>  	return ret;
>  
>  out_reg_netdev_fail:
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h
> index a6e8f15..7e87461 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h
> @@ -76,6 +76,7 @@ enum hns3_nic_state {
>  #define HNS3_RING_NAME_LEN			16
>  #define HNS3_BUFFER_SIZE_2048			2048
>  #define HNS3_RING_MAX_PENDING			32768
> +#define HNS3_MAX_MTU				9728

It seems odd that it does not already exists somewhere. The core does
not enforce the MTU.  You could be passed a frame which is bigger. So
you should check before trying to pass something to the hardware which
the hardware cannot handle.

    Andrew

^ permalink raw reply

* Re: [PATCH net-next v4] openvswitch: enable NSH support
From: Jiri Benc @ 2017-08-18 13:31 UTC (permalink / raw)
  To: Yi Yang; +Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA, e
In-Reply-To: <20170818152601.3760aaec@griffin>

On Fri, 18 Aug 2017 15:26:01 +0200, Jiri Benc wrote:
> Out of time for today, will continue the review next week. Again, feel
> free to send a new version meanwhile or wait for the rest of the
> review, whatever works better for you.

One more thing: before you send a new version, be sure and double check
that you addressed *all* of the feedback. In this version, you still
have not addressed a few things I gave feedback on in the previous
version. This wastes everyone's time.

It doesn't mean I'm always right, of course, but you either explain why
I'm wrong or change the code. It's generally not acceptable to ignore
feedback.

Thank you,

 Jiri

^ permalink raw reply

* Re: [PATCH net-next v4] openvswitch: enable NSH support
From: Jiri Benc @ 2017-08-18 13:26 UTC (permalink / raw)
  To: Yi Yang; +Cc: netdev, dev, blp, e, jan.scheurich
In-Reply-To: <1503041071-68753-1-git-send-email-yi.y.yang@intel.com>

On Fri, 18 Aug 2017 15:24:31 +0800, Yi Yang wrote:
> +struct nsh_md2_tlv {
> +	__be16 md_class;
> +	u8 type;
> +	u8 length;
> +	/* Followed by variable-length data. */
> +};

What was wrong with the u8[] field that was present at the end of the
struct in the previous version of the patch?

> +#define NSH_M_TYPE2_MAX_LEN 256

This is defined twice, please delete this define and keep the one lower
in the file.

> +#define NSH_DST_PORT    4790     /* UDP Port for NSH on VXLAN. */

This is a VXLAN-GPE port, it has nothing to do with NSH (except that
VXLAN-GPE can contain a NSH packet). It's also unused. Please remove it.

> +/* NSH Metadata Length. */
> +#define NSH_M_TYPE1_MDLEN 16

This is unused and it seems it's not much useful anyway,
sizeof(struct nsh_md1_ctx) provides the same value. Please remove this
define.

> +#define NSH_MD1_CTX(nsh_hdr_ptr) (&(nsh_hdr_ptr)->md1)
> +
> +#define NSH_MD2_CTX(nsh_hdr_ptr) (&(nsh_hdr_ptr)->md2)

Please remove these two. They are unused and would just obscure things
anyway.

> +static inline struct nsh_md1_ctx *nsh_md1_ctx(struct nsh_hdr *nsh)
> +{
> +	return &nsh->md1;
> +}
> +
> +static inline struct nsh_md2_tlv *nsh_md2_ctx(struct nsh_hdr *nsh)
> +{
> +	return &nsh->md2;
> +}

And remove these too, for the same reason. Just use nsh->md1 when you
need the metadata, there's no reason for these helper functions. They
just obscure things.

> +static inline void nsh_set_flags_and_ttl(struct nsh_hdr *nsh, u8 flags, u8 ttl)
> +{
> +	nsh->ver_flags_ttl_len
> +		= htons((ntohs(nsh->ver_flags_ttl_len)
> +			& ~(NSH_FLAGS_MASK | NSH_TTL_MASK))
> +			| ((flags << NSH_FLAGS_SHIFT) & NSH_FLAGS_MASK)
> +			| ((ttl << NSH_TTL_SHIFT) & NSH_TTL_MASK));
> +}
> +
> +static inline void nsh_set_flags_ttl_len(struct nsh_hdr *nsh, u8 flags,
> +					 u8 ttl, u8 len)
> +{
> +	nsh->ver_flags_ttl_len
> +		= htons((ntohs(nsh->ver_flags_ttl_len)
> +			& ~(NSH_FLAGS_MASK | NSH_TTL_MASK | NSH_LEN_MASK))
> +			| ((flags << NSH_FLAGS_SHIFT) & NSH_FLAGS_MASK)
> +			| ((ttl << NSH_TTL_SHIFT) & NSH_TTL_MASK)
> +			| ((len << NSH_LEN_SHIFT) & NSH_LEN_MASK));
> +}

Okay. Could those two perhaps use a common function?

static inline void __nsh_set_flags(struct nsh_hdr *nsh, u16 value, u16 mask)
{
	nsh->ver_flags_ttl_len = nsh->ver_flags_ttl_len & ~htons(mask)
							| htons(value);
}

static inline void nsh_set_flags_and_ttl(struct nsh_hdr *nsh, u8 flags, u8 ttl)
{
	__nsh_set_flags(nsh, flags << NSH_FLAGS_SHIFT | ttl << NSH_TTL_SHIFT,
			NSH_FLAGS_MASK | NSH_TTL_MASK);
}

etc.

> +static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key,
> +		    const struct nsh_hdr *nsh_src)
> +{
[...]
> +	if (!skb->inner_protocol)
> +		skb_set_inner_protocol(skb, skb->protocol);

I was wondering about this during the reviews of the previous versions.
Now I've given this more thought but I still don't see it - why is the
inner_protocol set here?

> +	case OVS_KEY_ATTR_NSH: {
> +		struct ovs_key_nsh nsh;
> +		struct ovs_key_nsh nsh_mask;
> +		size_t size = nla_len(a) / 2;
> +		struct nlattr attr[1 + size / sizeof(struct nlattr) + 1];
> +		struct nlattr mask[1 + size / sizeof(struct nlattr) + 1];
> +
> +		attr->nla_type = nla_type(a);
> +		mask->nla_type = attr->nla_type;
> +		attr->nla_len = NLA_HDRLEN + size;
> +		mask->nla_len = attr->nla_len;
> +		memcpy(attr + 1, (char *)(a + 1), size);
> +		memcpy(mask + 1, (char *)(a + 1) + size, size);

No, please. See my reply to the previous version for how to do this in
a less hacky way.

> +		case OVS_ACTION_ATTR_PUSH_NSH: {
> +			u8 buffer[256];
> +			struct nsh_hdr *nsh_hdr = (struct nsh_hdr *)buffer;
> +			const struct nsh_hdr *nsh_src = nsh_hdr;
> +
> +			nsh_hdr_from_nlattr(nla_data(a), nsh_hdr);

This is very dangerous security wise. You have to protect against
buffer overflow, one way or other. The current code may not overflow
(I have not checked that, though) but a future addition may break the
assumption without being obvious it's a problem.

Note that the previous version had exactly the same problem but it was
hidden and I didn't notice it. Which means that getting rid of that
push_nsh_para struct was a very good thing, the code is more clean and
more obvious now.

> +static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
> +{
> +	struct nsh_hdr *nsh = (struct nsh_hdr *)skb_network_header(skb);
> +	u8 version, length;
> +	int err;
> +
> +	err = check_header(skb, NSH_BASE_HDR_LEN);
> +	if (unlikely(err))
> +		return err;
> +
> +	memset(&key->nsh, 0, sizeof(struct ovs_key_nsh));

This is unnecessary and expensive. We're initializing all the fields
below.

> +	version = nsh_get_ver(nsh);
> +	length = nsh_hdr_len(nsh);

You have to reload nsh after pskb_may_pull (which is called by
check_header).

> +	if (version != 0)
> +		return -EINVAL;
> +
> +	if (nsh->md_type == NSH_M_TYPE1 && length != NSH_M_TYPE1_LEN)
> +		return -EINVAL;
> +
> +	if (nsh->md_type == NSH_M_TYPE2 && length < NSH_BASE_HDR_LEN)
> +		return -EINVAL;

This might better be merged to the switch below. Or are you concerned
about potentially expensive pskb_may_pull with unchecked length? In
that case, it would be better to convert to switch and reject on
unknown md_types.

> +	err = check_header(skb, length);
> +	if (unlikely(err))
> +		return err;
> +
> +	key->nsh.flags = nsh_get_flags(nsh);

Again, need to reload nsh.

> +	key->nsh.ttl = nsh_get_ttl(nsh);
> +	key->nsh.mdtype = nsh->md_type;
> +	key->nsh.np = nsh->next_proto;
> +	key->nsh.path_hdr = nsh->path_hdr;
> +	switch (key->nsh.mdtype) {
> +	case NSH_M_TYPE1:
> +		memcpy(key->nsh.context, nsh->md1.context,
> +		       sizeof(nsh->md1));
> +		break;
> +	case NSH_M_TYPE2:
> +		/* Don't support MD type 2 metedata parsing yet */
> +		break;
> +	default:
> +		return -EINVAL;
> +	}

This is the switch I mentioned above.

> +struct ovs_key_nsh {
> +	__u8 flags;
> +	__u8 ttl;
> +	__u8 mdtype;
> +	__u8 np;

Just u8, please, this is kernel internal.

> +size_t ovs_nsh_key_attr_size(void)
> +{
> +	/* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider
> +	 * updating this function.
> +	 */
> +	return  nla_total_size(8)      /* OVS_NSH_KEY_ATTR_BASE */

NSH_BASE_HDR_LEN, perhaps? Not that much important, though.

> +		switch (type) {
> +		case OVS_NSH_KEY_ATTR_BASE: {
> +			const struct ovs_nsh_key_base *base =
> +				(struct ovs_nsh_key_base *)nla_data(a);
> +			flags = base->flags;
> +			ttl = base->ttl;
> +			nsh->next_proto = base->np;
> +			nsh->md_type = base->mdtype;
> +			nsh->path_hdr = base->path_hdr;

Wouldn't it be nicer if the fields of struct ovs_nsh_key_base and of
struct nsh_hdr had the same names?

> +		case OVS_NSH_KEY_ATTR_MD1: {
> +			const struct ovs_nsh_key_md1 *md1 =
> +				(struct ovs_nsh_key_md1 *)nla_data(a);
> +			struct nsh_md1_ctx *md1_dst = nsh_md1_ctx(nsh);
> +
> +			has_md1 = true;
> +			mdlen = nla_len(a);
> +			memcpy(md1_dst, md1, mdlen);

How can we be sure there's enough room in the nsh buffer? See also my
previous remark.

> +			break;
> +		}
> +		case OVS_NSH_KEY_ATTR_MD2: {
> +			const struct u8 *md2 = nla_data(a);
> +			struct nsh_md2_tlv *md2_dst = nsh_md2_ctx(nsh);
> +
> +			has_md2 = true;
> +			mdlen = nla_len(a);
> +			if ((mdlen > NSH_M_TYPE2_MD_MAX_LEN) ||
> +			    (mdlen == 0)) {
> +				OVS_NLERR(
> +				    1,
> +				    "length %d of nsh attr %d is invalid",
> +				    mdlen,
> +				    type
> +				);
> +				return -EINVAL;
> +			}
> +			memcpy(md2_dst, md2, mdlen);

And, more importantly, here. It seems that it's currently capped at
256 bytes by the mdlen check yet it's too fragile. Either add a
parameter with the nsh buffer size or find other way to make this more
robust. Otherwise we're going to hunt a buffer overflow in a year.

> +	if ((has_md1 && nsh->md_type != NSH_M_TYPE1) ||
> +	    (has_md2 && nsh->md_type != NSH_M_TYPE2)) {
> +		OVS_NLERR(1,
> +			  "nsh attribute has unmatched MD type %d.",
> +			  nsh->md_type);
> +		return -EINVAL;
> +	}

What if both type 1 and type 2 attributes were specified? Or neither?
This condition does not catch that.

> +	/* nsh header length  = NSH_BASE_HDR_LEN + mdlen */
> +	nsh_set_flags_ttl_len(nsh, flags, ttl,
> +			      (NSH_BASE_HDR_LEN + mdlen) >> 2);

Just specify the len. It's the job of the helper function to convert it
to whatever format is needed in the header. (I'm talking about the
">> 2". That should not be done by the caller but by the helper
function.)

Out of time for today, will continue the review next week. Again, feel
free to send a new version meanwhile or wait for the rest of the
review, whatever works better for you.

 Jiri

^ permalink raw reply

* [PATCHv3 net-next] gre: introduce native tunnel support for ERSPAN
From: William Tu @ 2017-08-18 13:24 UTC (permalink / raw)
  To: netdev; +Cc: Meenakshi Vohra, Alexey Kuznetsov, Hideaki YOSHIFUJI

The patch adds ERSPAN type II tunnel support.  The implementation
is based on the draft at [1].  One of the purposes is for Linux
box to be able to receive ERSPAN monitoring traffic sent from
the Cisco switch, by creating a ERSPAN tunnel device.
In addition, the patch also adds ERSPAN TX, so traffic can
also be encapsulated into ERSPAN and sent out.

The implementation reuses the key as ERSPAN session ID, and
field 'erspan' as ERSPAN Index fields:
./ip link add dev ers11 type erspan seq key 100 erspan 123 \
			local 172.16.1.200 remote 172.16.1.100

To use the above device as ERSPAN receiver, on configure
Nexus 5000 switch as below:

monitor session 1 type erspan-source
  erspan-id 100
  vrf default
  destination ip 172.16.1.200
  source interface Ethernet1/11 both
  source interface Ethernet1/12 both
  no shut
monitor erspan origin ip-address 172.16.1.100 global

[1] https://tools.ietf.org/html/draft-foschiano-erspan-01
[2] iproute2 patch: http://marc.info/?l=linux-netdev&m=150306086924951&w=2
[3] test script: http://marc.info/?l=linux-netdev&m=150231021807304&w=2

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: Meenakshi Vohra <mvohra@vmware.com>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
---
v2->v3:
  add skb_may_pull check at erspan_rcv
  a couple of minor fixes and checks for index value
v1->v2:
  Add missing erspan.h header
---
 include/net/erspan.h           |  61 ++++++++++
 include/net/ip_tunnels.h       |   3 +
 include/uapi/linux/if_ether.h  |   1 +
 include/uapi/linux/if_tunnel.h |   1 +
 net/ipv4/ip_gre.c              | 267 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 333 insertions(+)
 create mode 100644 include/net/erspan.h

diff --git a/include/net/erspan.h b/include/net/erspan.h
new file mode 100644
index 000000000000..ca94fc86865e
--- /dev/null
+++ b/include/net/erspan.h
@@ -0,0 +1,61 @@
+#ifndef __LINUX_ERSPAN_H
+#define __LINUX_ERSPAN_H
+
+/*
+ * GRE header for ERSPAN encapsulation (8 octets [34:41]) -- 8 bytes
+ *       0                   1                   2                   3
+ *      0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |0|0|0|1|0|00000|000000000|00000|    Protocol Type for ERSPAN   |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *     |      Sequence Number (increments per packet per session)      |
+ *     +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *  Note that in the above GRE header [RFC1701] out of the C, R, K, S,
+ *  s, Recur, Flags, Version fields only S (bit 03) is set to 1. The
+ *  other fields are set to zero, so only a sequence number follows.
+ *
+ *  ERSPAN Type II header (8 octets [42:49])
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |  Ver  |          VLAN         | COS | En|T|    Session ID     |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |      Reserved         |                  Index                |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB
+ */
+
+#define ERSPAN_VERSION	0x1
+
+#define VER_MASK	0xf000
+#define VLAN_MASK	0x0fff
+#define COS_MASK	0xe000
+#define EN_MASK		0x1800
+#define T_MASK		0x0400
+#define ID_MASK		0x03ff
+#define INDEX_MASK	0xfffff
+
+enum erspan_encap_type {
+	ERSPAN_ENCAP_NOVLAN = 0x0,	/* originally without VLAN tag */
+	ERSPAN_ENCAP_ISL = 0x1,		/* originally ISL encapsulated */
+	ERSPAN_ENCAP_8021Q = 0x2,	/* originally 802.1Q encapsulated */
+	ERSPAN_ENCAP_INFRAME = 0x3,	/* VLAN tag perserved in frame */
+};
+
+struct erspan_metadata {
+	__be32 index;   /* type II */
+};
+
+struct erspanhdr {
+	__be16 ver_vlan;
+#define VER_OFFSET  12
+	__be16 session_id;
+#define COS_OFFSET  13
+#define EN_OFFSET   11
+#define T_OFFSET    10
+	struct erspan_metadata md;
+};
+
+#endif
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 520809912f03..625c29329372 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -115,6 +115,9 @@ struct ip_tunnel {
 	u32		o_seqno;	/* The last output seqno */
 	int		tun_hlen;	/* Precalculated header length */
 
+	/* This field used only by ERSPAN */
+	u32		index;		/* ERSPAN type II index */
+
 	struct dst_cache dst_cache;
 
 	struct ip_tunnel_parm parms;
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 5bc9bfd816b7..efeb1190c2ca 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -66,6 +66,7 @@
 #define ETH_P_ATALK	0x809B		/* Appletalk DDP		*/
 #define ETH_P_AARP	0x80F3		/* Appletalk AARP		*/
 #define ETH_P_8021Q	0x8100          /* 802.1Q VLAN Extended Header  */
+#define ETH_P_ERSPAN	0x88BE		/* ERSPAN type II		*/
 #define ETH_P_IPX	0x8137		/* IPX over DIX			*/
 #define ETH_P_IPV6	0x86DD		/* IPv6 over bluebook		*/
 #define ETH_P_PAUSE	0x8808		/* IEEE Pause frames. See 802.3 31B */
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 6792d1967d31..2e520883c054 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -134,6 +134,7 @@ enum {
 	IFLA_GRE_COLLECT_METADATA,
 	IFLA_GRE_IGNORE_DF,
 	IFLA_GRE_FWMARK,
+	IFLA_GRE_ERSPAN_INDEX,
 	__IFLA_GRE_MAX,
 };
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 7a7829e839c2..a6ec77429a6a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -48,6 +48,7 @@
 #include <net/rtnetlink.h>
 #include <net/gre.h>
 #include <net/dst_metadata.h>
+#include <net/erspan.h>
 
 /*
    Problems & solutions
@@ -115,6 +116,7 @@ static int ipgre_tunnel_init(struct net_device *dev);
 
 static unsigned int ipgre_net_id __read_mostly;
 static unsigned int gre_tap_net_id __read_mostly;
+static unsigned int erspan_net_id __read_mostly;
 
 static void ipgre_err(struct sk_buff *skb, u32 info,
 		      const struct tnl_ptk_info *tpi)
@@ -246,6 +248,56 @@ static void gre_err(struct sk_buff *skb, u32 info)
 	ipgre_err(skb, info, &tpi);
 }
 
+static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+		      int gre_hdr_len)
+{
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn;
+	struct ip_tunnel *tunnel;
+	struct metadata_dst *tun_dst = NULL;
+	const struct iphdr *iph;
+	struct erspanhdr *ershdr;
+	__be32 index;
+	__be32 session_id;
+	int len;
+
+	itn = net_generic(net, erspan_net_id);
+	iph = ip_hdr(skb);
+	len =  iph->ihl * 4 + gre_hdr_len + sizeof(*ershdr);
+
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return -ENOMEM;
+
+	iph = ip_hdr(skb);
+	ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);
+
+	/* The original GRE header does not have key field,
+	 * Use ERSPAN 10-bit session ID as key.
+	 */
+	session_id = cpu_to_be32(ntohs(ershdr->session_id));
+	tpi->key = session_id;
+	index = ershdr->md.index;
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
+				  tpi->flags | TUNNEL_KEY,
+				  iph->saddr, iph->daddr, tpi->key);
+
+	if (tunnel) {
+		if (__iptunnel_pull_header(skb,
+					   gre_hdr_len + sizeof(*ershdr),
+					   htons(ETH_P_TEB),
+					   false, false) < 0)
+			goto drop;
+
+		tunnel->index = ntohl(index);
+		skb_reset_mac_header(skb);
+		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
+		return PACKET_RCVD;
+	}
+drop:
+	kfree_skb(skb);
+	return PACKET_RCVD;
+}
+
 static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
 {
@@ -328,6 +380,11 @@ static int gre_rcv(struct sk_buff *skb)
 	if (hdr_len < 0)
 		goto drop;
 
+	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
+		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
+			return 0;
+	}
+
 	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
 		return 0;
 
@@ -503,6 +560,79 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 	return NETDEV_TX_OK;
 }
 
+static inline u8 tos_to_cos(u8 tos)
+{
+	u8 dscp, cos;
+
+	dscp = tos >> 2;
+	cos = dscp >> 3;
+	return cos;
+}
+
+static void erspan_build_header(struct sk_buff *skb,
+				__be32 id, u32 index, bool truncate)
+{
+	struct erspanhdr *ershdr;
+	struct iphdr *iphdr = ip_hdr(skb);
+	struct ethhdr *eth = eth_hdr(skb);
+	struct qtag_prefix {
+		__be16 eth_type;
+		__be16 tci;
+	} *qp;
+	u16 vlan_tci = 0;
+	enum erspan_encap_type enc_type = ERSPAN_ENCAP_NOVLAN;
+
+	/* If mirrored packet has vlan tag, extract tci and
+	 *  perserve vlan header in the mirrored frame.
+	 */
+	if (eth->h_proto == htons(ETH_P_8021Q)) {
+		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
+		vlan_tci = ntohs(qp->tci);
+		enc_type = ERSPAN_ENCAP_INFRAME;
+	}
+
+	skb_push(skb, sizeof(*ershdr));
+	ershdr = (struct erspanhdr *)skb->data;
+	memset(ershdr, 0, sizeof(*ershdr));
+
+	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
+				 (ERSPAN_VERSION << VER_OFFSET));
+	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
+			   ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
+			   (enc_type << EN_OFFSET & EN_MASK) |
+			   ((truncate << T_OFFSET) & T_MASK));
+	ershdr->md.index = htonl(index & INDEX_MASK);
+}
+
+static netdev_tx_t erspan_xmit(struct sk_buff *skb,
+			       struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	bool truncate = false;
+
+	if (gre_handle_offloads(skb, false))
+		goto free_skb;
+
+	if (skb_cow_head(skb, dev->needed_headroom))
+		goto free_skb;
+
+	if (skb->len > dev->mtu) {
+		pskb_trim(skb, dev->mtu);
+		truncate = true;
+	}
+
+	/* Push ERSPAN header */
+	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
+	tunnel->parms.o_flags &= ~TUNNEL_KEY;
+	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
+	return NETDEV_TX_OK;
+
+free_skb:
+	kfree_skb(skb);
+	dev->stats.tx_dropped++;
+	return NETDEV_TX_OK;
+}
+
 static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 				struct net_device *dev)
 {
@@ -828,6 +958,39 @@ static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
 	return ipgre_tunnel_validate(tb, data, extack);
 }
 
+static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
+			   struct netlink_ext_ack *extack)
+{
+	int ret;
+	__be16 flags = 0;
+
+	if (!data)
+		return 0;
+
+	ret = ipgre_tap_validate(tb, data, extack);
+	if (ret)
+		return ret;
+
+	/* ERSPAN should only have GRE sequence and key flag */
+	flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+	flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+	if (flags != (GRE_SEQ | GRE_KEY))
+		return -EINVAL;
+
+	/* ERSPAN Session ID only has 10-bit. Since we reuse
+	 * 32-bit key field as ID, check it's range.
+	 */
+	if (data[IFLA_GRE_IKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	if (data[IFLA_GRE_OKEY] &&
+	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int ipgre_netlink_parms(struct net_device *dev,
 				struct nlattr *data[],
 				struct nlattr *tb[],
@@ -892,6 +1055,13 @@ static int ipgre_netlink_parms(struct net_device *dev,
 	if (data[IFLA_GRE_FWMARK])
 		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
 
+	if (data[IFLA_GRE_ERSPAN_INDEX]) {
+		t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+
+		if (t->index & ~INDEX_MASK)
+			return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -949,6 +1119,36 @@ static const struct net_device_ops gre_tap_netdev_ops = {
 	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
 };
 
+static int erspan_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	int t_hlen;
+
+	tunnel->tun_hlen = 8;
+	tunnel->parms.iph.protocol = IPPROTO_GRE;
+	t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr);
+
+	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
+	dev->mtu = ETH_DATA_LEN - t_hlen - 4;
+	dev->features		|= GRE_FEATURES;
+	dev->hw_features	|= GRE_FEATURES;
+	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
+
+	return ip_tunnel_init(dev);
+}
+
+static const struct net_device_ops erspan_netdev_ops = {
+	.ndo_init		= erspan_tunnel_init,
+	.ndo_uninit		= ip_tunnel_uninit,
+	.ndo_start_xmit		= erspan_xmit,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= ip_tunnel_change_mtu,
+	.ndo_get_stats64	= ip_tunnel_get_stats64,
+	.ndo_get_iflink		= ip_tunnel_get_iflink,
+	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
+};
+
 static void ipgre_tap_setup(struct net_device *dev)
 {
 	ether_setup(dev);
@@ -1041,6 +1241,8 @@ static size_t ipgre_get_size(const struct net_device *dev)
 		nla_total_size(1) +
 		/* IFLA_GRE_FWMARK */
 		nla_total_size(4) +
+		/* IFLA_GRE_ERSPAN_INDEX */
+		nla_total_size(4) +
 		0;
 }
 
@@ -1083,12 +1285,25 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			goto nla_put_failure;
 	}
 
+	if (t->index)
+		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
+			goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static void erspan_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+	dev->netdev_ops = &erspan_netdev_ops;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	ip_tunnel_setup(dev, erspan_net_id);
+}
+
 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
@@ -1107,6 +1322,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
 	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
 	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
+	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
 };
 
 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
@@ -1139,6 +1355,21 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
 	.get_link_net	= ip_tunnel_get_link_net,
 };
 
+static struct rtnl_link_ops erspan_link_ops __read_mostly = {
+	.kind		= "erspan",
+	.maxtype	= IFLA_GRE_MAX,
+	.policy		= ipgre_policy,
+	.priv_size	= sizeof(struct ip_tunnel),
+	.setup		= erspan_setup,
+	.validate	= erspan_validate,
+	.newlink	= ipgre_newlink,
+	.changelink	= ipgre_changelink,
+	.dellink	= ip_tunnel_dellink,
+	.get_size	= ipgre_get_size,
+	.fill_info	= ipgre_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
+};
+
 struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
 					u8 name_assign_type)
 {
@@ -1202,6 +1433,26 @@ static struct pernet_operations ipgre_tap_net_ops = {
 	.size = sizeof(struct ip_tunnel_net),
 };
 
+static int __net_init erspan_init_net(struct net *net)
+{
+	return ip_tunnel_init_net(net, erspan_net_id,
+				  &erspan_link_ops, "erspan0");
+}
+
+static void __net_exit erspan_exit_net(struct net *net)
+{
+	struct ip_tunnel_net *itn = net_generic(net, erspan_net_id);
+
+	ip_tunnel_delete_net(itn, &erspan_link_ops);
+}
+
+static struct pernet_operations erspan_net_ops = {
+	.init = erspan_init_net,
+	.exit = erspan_exit_net,
+	.id   = &erspan_net_id,
+	.size = sizeof(struct ip_tunnel_net),
+};
+
 static int __init ipgre_init(void)
 {
 	int err;
@@ -1216,6 +1467,10 @@ static int __init ipgre_init(void)
 	if (err < 0)
 		goto pnet_tap_faied;
 
+	err = register_pernet_device(&erspan_net_ops);
+	if (err < 0)
+		goto pnet_erspan_failed;
+
 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
 	if (err < 0) {
 		pr_info("%s: can't add protocol\n", __func__);
@@ -1230,13 +1485,21 @@ static int __init ipgre_init(void)
 	if (err < 0)
 		goto tap_ops_failed;
 
+	err = rtnl_link_register(&erspan_link_ops);
+	if (err < 0)
+		goto erspan_link_failed;
+
 	return 0;
 
+erspan_link_failed:
+	rtnl_link_unregister(&ipgre_tap_ops);
 tap_ops_failed:
 	rtnl_link_unregister(&ipgre_link_ops);
 rtnl_link_failed:
 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
 add_proto_failed:
+	unregister_pernet_device(&erspan_net_ops);
+pnet_erspan_failed:
 	unregister_pernet_device(&ipgre_tap_net_ops);
 pnet_tap_faied:
 	unregister_pernet_device(&ipgre_net_ops);
@@ -1247,9 +1510,11 @@ static void __exit ipgre_fini(void)
 {
 	rtnl_link_unregister(&ipgre_tap_ops);
 	rtnl_link_unregister(&ipgre_link_ops);
+	rtnl_link_unregister(&erspan_link_ops);
 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
 	unregister_pernet_device(&ipgre_tap_net_ops);
 	unregister_pernet_device(&ipgre_net_ops);
+	unregister_pernet_device(&erspan_net_ops);
 }
 
 module_init(ipgre_init);
@@ -1257,5 +1522,7 @@ module_exit(ipgre_fini);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_RTNL_LINK("gre");
 MODULE_ALIAS_RTNL_LINK("gretap");
+MODULE_ALIAS_RTNL_LINK("erspan");
 MODULE_ALIAS_NETDEV("gre0");
 MODULE_ALIAS_NETDEV("gretap0");
+MODULE_ALIAS_NETDEV("erspan0");
-- 
2.7.4

^ permalink raw reply related

* Re: [RFC] about net: Fix inconsistent teardown and release of private netdev state.
From: Eric Dumazet @ 2017-08-18 13:13 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20170817.222131.548007722474107806.davem@davemloft.net>

On Thu, 2017-08-17 at 22:21 -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 17 Aug 2017 15:30:40 -0700
> 
> > So we do not really know if we need to clean up or not.
> 
> We always know, the answer is that whenever register_netdev() fails we
> never need to perform any cleanup which is done by priv_destructor.
> 
> > Any idea how to fix the issue ?
> 
> Your patch is exactly how we should fix this, but without the comment.
> The logic is straightforward.
> 
> If register_netdevice() fails any resources handled by priv_destructor
> are cleaned up, it is guaranteed.

Not in current code.

There are some failures which do a "goto out;" 

out:
	return ret;


In these cases, priv_destructor is not called.

So we need multiple fixes I think :/

^ permalink raw reply

* [PATCH] netxen: fix incorrect loop counter decrement
From: Colin King @ 2017-08-18 13:12 UTC (permalink / raw)
  To: Manish Chopra, Rahul Verma, Dept-GELinuxNICDev, netdev; +Cc: linux-kernel

From: Colin Ian King <colin.king@canonical.com>

The loop counter k is currently being decremented from zero which
is incorrect. Fix this by incrementing k instead

Detected by CoverityScan, CID#401847 ("Infinite loop")

Fixes: 83f18a557c6d ("netxen_nic: fw dump support")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c
index 66ff15d08bad..0a66389c06c2 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c
@@ -2311,7 +2311,7 @@ netxen_md_rdqueue(struct netxen_adapter *adapter,
 				 loop_cnt++) {
 		NX_WR_DUMP_REG(select_addr, adapter->ahw.pci_base0, queue_id);
 		read_addr = queueEntry->read_addr;
-		for (k = 0; k < read_cnt; k--) {
+		for (k = 0; k < read_cnt; k++) {
 			NX_RD_DUMP_REG(read_addr, adapter->ahw.pci_base0,
 							&read_value);
 			*data_buff++ = read_value;
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net-next v1 05/14] amd-xgbe: Add additional debugfs support
From: Tom Lendacky @ 2017-08-18 12:58 UTC (permalink / raw)
  To: David Miller, andrew; +Cc: netdev
In-Reply-To: <20170817.220203.1607309698344574831.davem@davemloft.net>

On 8/18/2017 12:02 AM, David Miller wrote:
> From: Andrew Lunn <andrew@lunn.ch>
> Date: Fri, 18 Aug 2017 02:30:57 +0200
> 
>> On Thu, Aug 17, 2017 at 07:02:50PM -0500, Tom Lendacky wrote:
>>> Add additional debugfs support for reading / writing registers of any
>>> attached external phy devices as well as the SFP eeprom data.
>>
>> Hi Tom
>>
>> What is wrong with using the standard APIs for this?
>>
>> ethtool --moduile-info
>>
>> ioctls SIOCGMIIREG and SIOCSMIIREG.
> 
> Yeah debugfs is a horrible choice for this.
> 
> debugfs in general should be strongly avoided.  We have rich eneough
> facilities to export just about anything that is actually appropriate
> and useful, and where we do not existing facilities should be extended
> as needed rather than ignored.

I was intending this to be purely debugging and that's why I chose
debugfs as opposed to ethtool.  I can switch over to using ethtool
though.  I'll resubmit the series without this patch and look at
moving to this to ethtool as a separate patch later.

Thanks,
Tom

> 

^ permalink raw reply

* Re: Something hitting my total number of connections to the server
From: Eric Dumazet @ 2017-08-18 12:57 UTC (permalink / raw)
  To: Akshat Kakkar; +Cc: netdev
In-Reply-To: <CAA5aLPjNL0tDXjnPkTcBF8x+HqauQfWQVqWqKggLGqSdsg3Gog@mail.gmail.com>

On Fri, 2017-08-18 at 18:14 +0530, Akshat Kakkar wrote:
> On Fri, Aug 18, 2017 at 5:36 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Fri, 2017-08-18 at 14:44 +0530, Akshat Kakkar wrote:
> >> On Thu, Aug 17, 2017 at 5:06 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >> > On Thu, 2017-08-17 at 14:35 +0530, Akshat Kakkar wrote:
> >> >
> >> >> I upgraded to 4.4 but still experiencing same issue.
> >> >> Please help.
> >> >
> >> > Still too old kernel, shoot again ;)
> >> >
> >> >
> >>
> >>
> >> Sorry but that's the maximum I can try as of now as its the LT version.
> >>
> >> Besides, this issue was not present in 2.6.32 but came with 3.10 and
> >> still there in 4.4, so I doubt if it has to do with some kernel and/or
> >> kernel parameters much as you guys are good enough not to keep an
> >> issue for so long (around 3 years).
> >>
> >> So please help.
> >
> > netdev is the developer list.
> >
> > We deal with recent kernels only. Because we already spent time fixing
> > all these issues, we are not going to spend time fixing old kernels.
> >
> > Please to your distro provider to backport the needed patches.
> >
> >
> >
> I appreciate that.
> Can you just recall if there was any such issue which was fixed after 4.4.

More than one hundred patches yes.

Sorry, someone else than me will have to build a list of these patches.

^ permalink raw reply

* [PATCHv3 iproute2 net-next] gre: add support for ERSPAN tunnel
From: William Tu @ 2017-08-18 12:54 UTC (permalink / raw)
  To: netdev; +Cc: Meenakshi Vohra, Stephen Hemminger, Alexey Kuznetsov

The patch adds ERSPAN type II tunnel support.  The implementation
is based on the draft at https://tools.ietf.org/html/draft-foschiano-erspan-01
One of the purposes is for Linux box to be able to receive ERSPAN
monitoring traffic sent from the Cisco switch, by creating a ERSPAN
tunnel device.  In addition, the patch also adds ERSPAN TX, so traffic
can also be encapsulated into ERSPAN and sent out.

The implementation reuses the key as ERSPAN session ID, and
field 'erspan' as ERSPAN Index fields:
./ip link add dev ers11 type erspan seq key 100 erspan 123 \
		local 172.16.1.200 remote 172.16.1.100

Signed-off-by: William Tu <u9012063@gmail.com>
Signed-off-by: Meenakshi Vohra <mvohra@vmware.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
---
v2->v3:
  - make erspan index 0 as reserved, only set to kernel
    when index is non-zero
  - endianness: make the index host byte order

v1->v2:
  Add manual entry for ERSPAN.
  Check 20-bit ERSPAN index field.
---
 include/linux/if_tunnel.h |  1 +
 ip/ipaddress.c            |  2 +-
 ip/iplink.c               |  5 +++--
 ip/link_gre.c             | 29 ++++++++++++++++++++++++++++-
 man/man8/ip-address.8.in  |  1 +
 man/man8/ip-link.8.in     | 19 ++++++++++++++++---
 6 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h
index 7375335a0773..21834cac4c0d 100644
--- a/include/linux/if_tunnel.h
+++ b/include/linux/if_tunnel.h
@@ -134,6 +134,7 @@ enum {
 	IFLA_GRE_COLLECT_METADATA,
 	IFLA_GRE_IGNORE_DF,
 	IFLA_GRE_FWMARK,
+	IFLA_GRE_ERSPAN_INDEX,
 	__IFLA_GRE_MAX,
 };
 
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 4d37c5e04507..f7296991e483 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -76,7 +76,7 @@ static void usage(void)
 	fprintf(stderr, "LFT := forever | SECONDS\n");
 	fprintf(stderr, "TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | macvtap |\n");
 	fprintf(stderr, "          bridge | bond | ipoib | ip6tnl | ipip | sit | vxlan | lowpan |\n");
-	fprintf(stderr, "          gre | gretap | ip6gre | ip6gretap | vti | nlmon | can |\n");
+	fprintf(stderr, "          gre | gretap | erspan | ip6gre | ip6gretap | vti | nlmon | can |\n");
 	fprintf(stderr, "          bond_slave | ipvlan | geneve | bridge_slave | vrf | hsr | macsec }\n");
 
 	exit(-1);
diff --git a/ip/iplink.c b/ip/iplink.c
index 5aff2fde38da..62430110bfab 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -112,8 +112,9 @@ void iplink_usage(void)
 			"\n"
 			"TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | macvtap |\n"
 			"          bridge | bond | team | ipoib | ip6tnl | ipip | sit | vxlan |\n"
-			"          gre | gretap | ip6gre | ip6gretap | vti | nlmon | team_slave |\n"
-			"          bond_slave | ipvlan | geneve | bridge_slave | vrf | macsec }\n");
+			"          gre | gretap | erspan | ip6gre | ip6gretap | vti | nlmon |\n"
+			"          team_slave | bond_slave | ipvlan | geneve | bridge_slave |\n"
+			"          vrf | macsec }\n");
 	}
 	exit(-1);
 }
diff --git a/ip/link_gre.c b/ip/link_gre.c
index c2ec5f26902f..56c3fb8819db 100644
--- a/ip/link_gre.c
+++ b/ip/link_gre.c
@@ -26,7 +26,7 @@
 static void print_usage(FILE *f)
 {
 	fprintf(f,
-		"Usage: ... { gre | gretap } [ remote ADDR ]\n"
+		"Usage: ... { gre | gretap | erspan } [ remote ADDR ]\n"
 		"                            [ local ADDR ]\n"
 		"                            [ [i|o]seq ]\n"
 		"                            [ [i|o]key KEY ]\n"
@@ -44,6 +44,7 @@ static void print_usage(FILE *f)
 		"                            [ [no]encap-csum6 ]\n"
 		"                            [ [no]encap-remcsum ]\n"
 		"                            [ fwmark MARK ]\n"
+		"                            [ erspan IDX ]\n"
 		"\n"
 		"Where: ADDR := { IP_ADDRESS | any }\n"
 		"       TOS  := { NUMBER | inherit }\n"
@@ -96,6 +97,7 @@ static int gre_parse_opt(struct link_util *lu, int argc, char **argv,
 	__u8 metadata = 0;
 	__u8 ignore_df = 0;
 	__u32 fwmark = 0;
+	__u32 erspan_idx = 0;
 
 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
 		if (rtnl_talk(&rth, &req.n, &req.n, sizeof(req)) < 0) {
@@ -172,6 +174,9 @@ get_failed:
 
 		if (greinfo[IFLA_GRE_FWMARK])
 			fwmark = rta_getattr_u32(greinfo[IFLA_GRE_FWMARK]);
+
+		if (greinfo[IFLA_GRE_ERSPAN_INDEX])
+			erspan_idx = rta_getattr_u32(greinfo[IFLA_GRE_ERSPAN_INDEX]);
 	}
 
 	while (argc > 0) {
@@ -328,6 +333,12 @@ get_failed:
 			NEXT_ARG();
 			if (get_u32(&fwmark, *argv, 0))
 				invarg("invalid fwmark\n", *argv);
+		} else if (strcmp(*argv, "erspan") == 0) {
+			NEXT_ARG();
+			if (get_u32(&erspan_idx, *argv, 0))
+				invarg("invalid erspan index\n", *argv);
+			if (erspan_idx & ~((1<<20) - 1) || erspan_idx == 0)
+				invarg("erspan index must be > 0 and <= 20-bit\n", *argv);
 		} else
 			usage();
 		argc--; argv++;
@@ -359,6 +370,8 @@ get_failed:
 		addattr_l(n, 1024, IFLA_GRE_TTL, &ttl, 1);
 		addattr_l(n, 1024, IFLA_GRE_TOS, &tos, 1);
 		addattr32(n, 1024, IFLA_GRE_FWMARK, fwmark);
+		if (erspan_idx != 0)
+			addattr32(n, 1024, IFLA_GRE_ERSPAN_INDEX, erspan_idx);
 	} else {
 		addattr_l(n, 1024, IFLA_GRE_COLLECT_METADATA, NULL, 0);
 	}
@@ -473,6 +486,12 @@ static void gre_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
 	if (tb[IFLA_GRE_IGNORE_DF] && rta_getattr_u8(tb[IFLA_GRE_IGNORE_DF]))
 		fputs("ignore-df ", f);
 
+	if (tb[IFLA_GRE_ERSPAN_INDEX]) {
+		__u32 erspan_idx = rta_getattr_u32(tb[IFLA_GRE_ERSPAN_INDEX]);
+
+		fprintf(f, "erspan_index %u ", erspan_idx);
+	}
+
 	if (tb[IFLA_GRE_ENCAP_TYPE] &&
 	    rta_getattr_u16(tb[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE) {
 		__u16 type = rta_getattr_u16(tb[IFLA_GRE_ENCAP_TYPE]);
@@ -538,3 +557,11 @@ struct link_util gretap_link_util = {
 	.print_opt = gre_print_opt,
 	.print_help = gre_print_help,
 };
+
+struct link_util erspan_link_util = {
+	.id = "erspan",
+	.maxattr = IFLA_GRE_MAX,
+	.parse_opt = gre_parse_opt,
+	.print_opt = gre_print_opt,
+	.print_help = gre_print_help,
+};
diff --git a/man/man8/ip-address.8.in b/man/man8/ip-address.8.in
index 43385813a134..988a79652f6f 100644
--- a/man/man8/ip-address.8.in
+++ b/man/man8/ip-address.8.in
@@ -120,6 +120,7 @@ ip-address \- protocol address management
 .BR sit " |"
 .BR gre " |"
 .BR gretap " |"
+.BR erspan " |"
 .BR ip6gre " |"
 .BR ip6gretap " |"
 .BR vti " |"
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index c0207281905d..851b308cbe1a 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -202,6 +202,7 @@ ip-link \- network device configuration
 .BR sit " |"
 .BR gre " |"
 .BR gretap " |"
+.BR erspan " |"
 .BR ip6gre " |"
 .BR ip6gretap " |"
 .BR vti " |"
@@ -297,6 +298,9 @@ Link types:
 .BR gretap
 - Virtual L2 tunnel interface GRE over IPv4
 .sp
+.BR erspan
+- Encapsulated Remote SPAN over GRE and IPv4
+.sp
 .BR ip6gre
 - Virtual tunnel interface GRE over IPv6
 .sp
@@ -643,13 +647,13 @@ keyword.
 .in -8
 
 .TP
-GRE, IPIP, SIT Type Support
+GRE, IPIP, SIT, ERSPAN Type Support
 For a link of types
-.I GRE/IPIP/SIT
+.I GRE/IPIP/SIT/ERSPAN
 the following additional arguments are supported:
 
 .BI "ip link add " DEVICE
-.BR type " { " gre " | " ipip " | " sit " }"
+.BR type " { " gre " | " ipip " | " sit " | " erspan " }"
 .BI " remote " ADDR " local " ADDR
 [
 .BR encap " { " fou " | " gue " | " none " }"
@@ -663,6 +667,8 @@ the following additional arguments are supported:
 .I " [no]encap-remcsum "
 ] [
 .I " mode " { ip6ip | ipip | mplsip | any } "
+] [
+.BR erspan " \fIIDX "
 ]
 
 .in +8
@@ -707,6 +713,13 @@ MPLS-Over-IPv4, "any" indicates IPv6, IPv4 or MPLS Over IPv4. Supported for
 SIT where the default is "ip6ip" and IPIP where the default is "ipip".
 IPv6-Over-IPv4 is not supported for IPIP.
 
+.sp
+.BR erspan " \fIIDX "
+- specifies the ERSPAN index field.
+.IR IDX
+indicates a 20 bit index/port number associated with the ERSPAN
+traffic's source port and direction.
+
 .in -8
 
 .TP
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next] tcp: export drops counter to /proc/net/tcp{,6}
From: Stéphan Gorget @ 2017-08-18 12:50 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, Jeethu Rao, David S . Miller, Alexei Starovoitov,
	Eric Dumazet, kernel-team, linux-kernel
In-Reply-To: <1503058444.4936.169.camel@edumazet-glaptop3.roam.corp.google.com>

On 8/18/17 1:14 PM, Eric Dumazet wrote:
> On Fri, 2017-08-18 at 03:21 -0700, Stéphan Gorget wrote:
>> Those counters are exported for raw and udp but not for tcp, though they
>> are incremented.
>>
>> An example where it is useful is chasing listen overflow. Listen overflow
>> are counted as a global counter in LINUX_MIB_LISTENOVERFLOWS accessible
>> in /proc/net/netstat but there is no way to find related drops in the
>> information exported for tcp. With this patch it will make possible to
>> correlate growth of LINUX_MIB_LISTENOVERFLOWS with growth of drops for
>> a tcp socket.
> 
> Simply use iproute2/ss tool to access this information in a very
> efficient way (like filtering done in the kernel, instead having to
> parse a gigantic /proc output)
> 
> lpaa5:~# ss -tm state listening src :22

Thanks for the quick response, I'll use ss with those options that's
exactly what I needed.

^ permalink raw reply

* Re: Something hitting my total number of connections to the server
From: Akshat Kakkar @ 2017-08-18 12:44 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1503057984.4936.163.camel@edumazet-glaptop3.roam.corp.google.com>

On Fri, Aug 18, 2017 at 5:36 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Fri, 2017-08-18 at 14:44 +0530, Akshat Kakkar wrote:
>> On Thu, Aug 17, 2017 at 5:06 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> > On Thu, 2017-08-17 at 14:35 +0530, Akshat Kakkar wrote:
>> >
>> >> I upgraded to 4.4 but still experiencing same issue.
>> >> Please help.
>> >
>> > Still too old kernel, shoot again ;)
>> >
>> >
>>
>>
>> Sorry but that's the maximum I can try as of now as its the LT version.
>>
>> Besides, this issue was not present in 2.6.32 but came with 3.10 and
>> still there in 4.4, so I doubt if it has to do with some kernel and/or
>> kernel parameters much as you guys are good enough not to keep an
>> issue for so long (around 3 years).
>>
>> So please help.
>
> netdev is the developer list.
>
> We deal with recent kernels only. Because we already spent time fixing
> all these issues, we are not going to spend time fixing old kernels.
>
> Please to your distro provider to backport the needed patches.
>
>
>
I appreciate that.
Can you just recall if there was any such issue which was fixed after 4.4.

^ permalink raw reply

* [PATCH net] ipv6: accept 64k - 1 packet length in ip6_find_1stfragopt()
From: Stefano Brivio @ 2017-08-18 12:40 UTC (permalink / raw)
  To: netdev; +Cc: Sabrina Dubroca, Hannes Frederic Sowa

A packet length of exactly IPV6_MAXPLEN is allowed, we should
refuse parsing options only if the size is 64KiB or more.

While at it, remove one extra variable and one assignment which
were also introduced by the commit that introduced the size
check. Checking the sum 'offset + len' and only later adding
'len' to 'offset' doesn't provide any advantage over directly
summing to 'offset' and checking it.

Fixes: 6399f1fae4ec ("ipv6: avoid overflow of offset in ip6_find_1stfragopt")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 net/ipv6/output_core.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index abb2c307fbe8..a338bbc33cf3 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -86,7 +86,6 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 
 	while (offset <= packet_len) {
 		struct ipv6_opt_hdr *exthdr;
-		unsigned int len;
 
 		switch (**nexthdr) {
 
@@ -112,10 +111,9 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 
 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 						 offset);
-		len = ipv6_optlen(exthdr);
-		if (len + offset >= IPV6_MAXPLEN)
+		offset += ipv6_optlen(exthdr);
+		if (offset > IPV6_MAXPLEN)
 			return -EINVAL;
-		offset += len;
 		*nexthdr = &exthdr->nexthdr;
 	}
 
-- 
2.9.4

^ permalink raw reply related

* Re: [PATCH 2/2] xdp: adjust xdp redirect tracepoint to include return error code
From: Daniel Borkmann @ 2017-08-18 12:38 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, John Fastabend; +Cc: netdev
In-Reply-To: <20170818142946.1bad7203@redhat.com>

On 08/18/2017 02:29 PM, Jesper Dangaard Brouer wrote:
> On Thu, 17 Aug 2017 12:43:18 -0700
> John Fastabend <john.fastabend@gmail.com> wrote:
>
>>>>> @@ -2532,12 +2535,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
>>>>>   	ri->map = NULL;
>>>>>   	if (unlikely(!fwd)) {
>>>>>   		bpf_warn_invalid_xdp_redirect(index);
>>
>> I think we should drop the warn_invalid now that we have a tracepoint.
>> The tracepoint is much nicer for debugging vs a warning for what might
>> be a valid case depending on xdp program.
>
> I agree.  I'll do that in a follow up patch. I'll likely remove the
> bpf_warn_invalid_xdp_redirect() function completely.

+1

> We also have bpf_warn_invalid_xdp_action() but that might be relevant
> to keep around(?).

Keeping this is fine, imo.

^ permalink raw reply

* [PATCH net-next] net: check type when freeing metadata dst
From: David Lamparter @ 2017-08-18 12:31 UTC (permalink / raw)
  To: netdev
  Cc: David Lamparter, Jakub Kicinski, Sridhar Samudrala, Simon Horman,
	David S . Miller
In-Reply-To: <a983211c-530a-8f73-ed00-c95840ce3d08@cogentembedded.com>

Commit 3fcece12bc1b ("net: store port/representator id in metadata_dst")
added a new type field to metadata_dst, but metadata_dst_free() wasn't
updated to check it before freeing the METADATA_IP_TUNNEL specific dst
cache entry.

This is not currently causing problems since it's far enough back in the
struct to be zeroed for the only other type currently in existance
(METADATA_HW_PORT_MUX), but nevertheless it's not correct.

Fixes: 3fcece12bc1b ("net: store port/representator id in metadata_dst")
Signed-off-by: David Lamparter <equinox@diac24.net>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: Sridhar Samudrala <sridhar.samudrala@intel.com>
Cc: Simon Horman <horms@verge.net.au>
Cc: David S. Miller <davem@davemloft.net>
---
 net/core/dst.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/dst.c b/net/core/dst.c
index 00aa972ad1a1..7954d1710d97 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -299,7 +299,8 @@ EXPORT_SYMBOL_GPL(metadata_dst_alloc);
 void metadata_dst_free(struct metadata_dst *md_dst)
 {
 #ifdef CONFIG_DST_CACHE
-	dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
+	if (md_dst->type == METADATA_IP_TUNNEL)
+		dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
 #endif
 	kfree(md_dst);
 }
-- 
2.13.0

^ permalink raw reply related

* Re: [PATCH 2/2] xdp: adjust xdp redirect tracepoint to include return error code
From: Jesper Dangaard Brouer @ 2017-08-18 12:29 UTC (permalink / raw)
  To: John Fastabend; +Cc: netdev, brouer
In-Reply-To: <5995F1D6.8040103@gmail.com>

On Thu, 17 Aug 2017 12:43:18 -0700
John Fastabend <john.fastabend@gmail.com> wrote:

> >>> @@ -2532,12 +2535,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
> >>>  	ri->map = NULL;
> >>>  	if (unlikely(!fwd)) {
> >>>  		bpf_warn_invalid_xdp_redirect(index);  
> 
> I think we should drop the warn_invalid now that we have a tracepoint.
> The tracepoint is much nicer for debugging vs a warning for what might
> be a valid case depending on xdp program.

I agree.  I'll do that in a follow up patch. I'll likely remove the
bpf_warn_invalid_xdp_redirect() function completely.

We also have bpf_warn_invalid_xdp_action() but that might be relevant
to keep around(?).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH net-next] net: check type when freeing metadata dst
From: Sergei Shtylyov @ 2017-08-18 12:25 UTC (permalink / raw)
  To: David Lamparter, netdev
  Cc: Jakub Kicinski, Sridhar Samudrala, Simon Horman, David S . Miller
In-Reply-To: <20170818121651.516877-1-equinox@diac24.net>

Hello!

On 08/18/2017 03:16 PM, David Lamparter wrote:

> There is a new metadata dst type field added in "net: store
> port/representator id in metadata_dst", but metadata_dst_free() wasn't
> updated to check it before freeing the METADATA_IP_TUNNEL specific dst
> cache entry.
> 
> This is not currently causing problems since it's far enough back in the
> struct to be zeroed for the only other type currently in existance
> (METADATA_HW_PORT_MUX), but nevertheless it's not correct.
> 
> Fixes: 3fcece12bc1b6dcdf0986f2cd9e8f63b1f9b6aa0

    Please see Documentation/process/submitting-patches.rst on how this tag 
(and the commit citing in your description as well) should look like.

> Signed-off-by: David Lamparter <equinox@diac24.net>
> Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
> Cc: Sridhar Samudrala <sridhar.samudrala@intel.com>
> Cc: Simon Horman <horms@verge.net.au>
> Cc: David S. Miller <davem@davemloft.net>

[...]

MBR, Sergei

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox