Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v2 2/9] qlcnic: Enhance ethtool to display ring indices and interrupt mask
From: Himanshu Madhani @ 2013-10-11 18:42 UTC (permalink / raw)
  To: davem; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Pratik Pujar, himanshu.madhani
In-Reply-To: <cover.1381538863.git.himanshu.madhani@qlogic.com>

From: Pratik Pujar <pratik.pujar@qlogic.com>

o Updated ethtool -d <ethX> option to display ring indices for Transmit(Tx),
  Receive(Rx), and Status(St) rings.
o Updated ethtool -d <ethX> option to display ring interrupt mask for Transmit(Tx),
  and Status(St) rings.

Signed-off-by: Pratik Pujar <pratik.pujar@qlogic.com>
Signed-off-by: Himanshu Madhani <himanshu.madhani@qlogic.com>
---
 .../net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c    |  8 ++--
 .../net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c    | 54 ++++++++++++++++------
 2 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
index 66e94dc..c2df4ce 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
@@ -3267,12 +3267,12 @@ int qlcnic_83xx_reg_test(struct qlcnic_adapter *adapter)
 	return 0;
 }
 
-int qlcnic_83xx_get_regs_len(struct qlcnic_adapter *adapter)
+inline int qlcnic_83xx_get_regs_len(struct qlcnic_adapter *adapter)
 {
 	return (ARRAY_SIZE(qlcnic_83xx_ext_reg_tbl) *
-		sizeof(adapter->ahw->ext_reg_tbl)) +
-		(ARRAY_SIZE(qlcnic_83xx_reg_tbl) +
-		sizeof(adapter->ahw->reg_tbl));
+		sizeof(*adapter->ahw->ext_reg_tbl)) +
+		(ARRAY_SIZE(qlcnic_83xx_reg_tbl) *
+		sizeof(*adapter->ahw->reg_tbl));
 }
 
 int qlcnic_83xx_get_registers(struct qlcnic_adapter *adapter, u32 *regs_buff)
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
index ebe4c86..66355b7 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c
@@ -187,8 +187,8 @@ static int qlcnic_dev_statistics_len(struct qlcnic_adapter *adapter)
 		return -1;
 }
 
-#define QLCNIC_RING_REGS_COUNT	20
-#define QLCNIC_RING_REGS_LEN	(QLCNIC_RING_REGS_COUNT * sizeof(u32))
+#define	QLCNIC_TX_INTR_NOT_CONFIGURED	0X78563412
+
 #define QLCNIC_MAX_EEPROM_LEN   1024
 
 static const u32 diag_registers[] = {
@@ -219,7 +219,15 @@ static const u32 ext_diag_registers[] = {
 };
 
 #define QLCNIC_MGMT_API_VERSION	2
-#define QLCNIC_ETHTOOL_REGS_VER	3
+#define QLCNIC_ETHTOOL_REGS_VER	4
+
+static inline int qlcnic_get_ring_regs_len(struct qlcnic_adapter *adapter)
+{
+	int ring_regs_cnt = (adapter->max_drv_tx_rings * 5) +
+			    (adapter->max_rds_rings * 2) +
+			    (adapter->max_sds_rings * 3) + 5;
+	return ring_regs_cnt * sizeof(u32);
+}
 
 static int qlcnic_get_regs_len(struct net_device *dev)
 {
@@ -231,7 +239,9 @@ static int qlcnic_get_regs_len(struct net_device *dev)
 	else
 		len = sizeof(ext_diag_registers) + sizeof(diag_registers);
 
-	return QLCNIC_RING_REGS_LEN + len + QLCNIC_DEV_INFO_SIZE + 1;
+	len += ((QLCNIC_DEV_INFO_SIZE + 2) * sizeof(u32));
+	len += qlcnic_get_ring_regs_len(adapter);
+	return len;
 }
 
 static int qlcnic_get_eeprom_len(struct net_device *dev)
@@ -493,6 +503,8 @@ qlcnic_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *p)
 	struct qlcnic_adapter *adapter = netdev_priv(dev);
 	struct qlcnic_recv_context *recv_ctx = adapter->recv_ctx;
 	struct qlcnic_host_sds_ring *sds_ring;
+	struct qlcnic_host_rds_ring *rds_rings;
+	struct qlcnic_host_tx_ring *tx_ring;
 	u32 *regs_buff = p;
 	int ring, i = 0;
 
@@ -512,21 +524,35 @@ qlcnic_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *p)
 	if (!test_bit(__QLCNIC_DEV_UP, &adapter->state))
 		return;
 
-	regs_buff[i++] = 0xFFEFCDAB; /* Marker btw regs and ring count*/
-
-	regs_buff[i++] = 1; /* No. of tx ring */
-	regs_buff[i++] = le32_to_cpu(*(adapter->tx_ring->hw_consumer));
-	regs_buff[i++] = readl(adapter->tx_ring->crb_cmd_producer);
-
-	regs_buff[i++] = 2; /* No. of rx ring */
-	regs_buff[i++] = readl(recv_ctx->rds_rings[0].crb_rcv_producer);
-	regs_buff[i++] = readl(recv_ctx->rds_rings[1].crb_rcv_producer);
+	/* Marker btw regs and TX ring count */
+	regs_buff[i++] = 0xFFEFCDAB;
+
+	regs_buff[i++] = adapter->max_drv_tx_rings; /* No. of TX ring */
+	for (ring = 0; ring < adapter->max_drv_tx_rings; ring++) {
+		tx_ring = &adapter->tx_ring[ring];
+		regs_buff[i++] = le32_to_cpu(*(tx_ring->hw_consumer));
+		regs_buff[i++] = tx_ring->sw_consumer;
+		regs_buff[i++] = readl(tx_ring->crb_cmd_producer);
+		regs_buff[i++] = tx_ring->producer;
+		if (tx_ring->crb_intr_mask)
+			regs_buff[i++] = readl(tx_ring->crb_intr_mask);
+		else
+			regs_buff[i++] = QLCNIC_TX_INTR_NOT_CONFIGURED;
+	}
 
-	regs_buff[i++] = adapter->max_sds_rings;
+	regs_buff[i++] = adapter->max_rds_rings; /* No. of RX ring */
+	for (ring = 0; ring < adapter->max_rds_rings; ring++) {
+		rds_rings = &recv_ctx->rds_rings[ring];
+		regs_buff[i++] = readl(rds_rings->crb_rcv_producer);
+		regs_buff[i++] = rds_rings->producer;
+	}
 
+	regs_buff[i++] = adapter->max_sds_rings; /* No. of SDS ring */
 	for (ring = 0; ring < adapter->max_sds_rings; ring++) {
 		sds_ring = &(recv_ctx->sds_rings[ring]);
 		regs_buff[i++] = readl(sds_ring->crb_sts_consumer);
+		regs_buff[i++] = sds_ring->consumer;
+		regs_buff[i++] = readl(sds_ring->crb_intr_mask);
 	}
 }
 
-- 
1.8.1.4

^ permalink raw reply related

* [PATCH net-next v2 4/9] qlcnic: Update ethtool standard pause settings.
From: Himanshu Madhani @ 2013-10-11 18:42 UTC (permalink / raw)
  To: davem; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Jitendra Kalsaria,
	himanshu.madhani
In-Reply-To: <cover.1381538863.git.himanshu.madhani@qlogic.com>

From: Jitendra Kalsaria <jitendra.kalsaria@qlogic.com>

Update ethtool standard pause parameter settings and display

Signed-off-by: Jitendra Kalsaria <jitendra.kalsaria@qlogic.com>
Signed-off-by: Himanshu Madhani <himanshu.madhani@qlogic.com>
---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 18 +++++++++++++++---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h |  3 +++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
index c2df4ce..268fda6 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
@@ -3369,10 +3369,21 @@ void qlcnic_83xx_get_pauseparam(struct qlcnic_adapter *adapter,
 	}
 	config = ahw->port_config;
 	if (config & QLC_83XX_CFG_STD_PAUSE) {
-		if (config & QLC_83XX_CFG_STD_TX_PAUSE)
+		switch (MSW(config)) {
+		case QLC_83XX_TX_PAUSE:
+			pause->tx_pause = 1;
+			break;
+		case QLC_83XX_RX_PAUSE:
+			pause->rx_pause = 1;
+			break;
+		case QLC_83XX_TX_RX_PAUSE:
+		default:
+			/* Backward compatibility for existing
+			 * flash definitions
+			 */
 			pause->tx_pause = 1;
-		if (config & QLC_83XX_CFG_STD_RX_PAUSE)
 			pause->rx_pause = 1;
+		}
 	}
 
 	if (QLC_83XX_AUTONEG(config))
@@ -3415,7 +3426,8 @@ int qlcnic_83xx_set_pauseparam(struct qlcnic_adapter *adapter,
 		ahw->port_config &= ~QLC_83XX_CFG_STD_RX_PAUSE;
 		ahw->port_config |= QLC_83XX_CFG_STD_TX_PAUSE;
 	} else if (!pause->rx_pause && !pause->tx_pause) {
-		ahw->port_config &= ~QLC_83XX_CFG_STD_TX_RX_PAUSE;
+		ahw->port_config &= ~(QLC_83XX_CFG_STD_TX_RX_PAUSE |
+				      QLC_83XX_CFG_STD_PAUSE);
 	}
 	status = qlcnic_83xx_set_port_config(adapter);
 	if (status) {
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h
index 533e150..2883b57 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h
@@ -363,6 +363,9 @@ enum qlcnic_83xx_states {
 #define QLC_83XX_LINK_EEE(data)		((data) & BIT_13)
 #define QLC_83XX_DCBX(data)			(((data) >> 28) & 7)
 #define QLC_83XX_AUTONEG(data)			((data) & BIT_15)
+#define QLC_83XX_TX_PAUSE			0x10
+#define QLC_83XX_RX_PAUSE			0x20
+#define QLC_83XX_TX_RX_PAUSE			0x30
 #define QLC_83XX_CFG_STD_PAUSE			(1 << 5)
 #define QLC_83XX_CFG_STD_TX_PAUSE		(1 << 20)
 #define QLC_83XX_CFG_STD_RX_PAUSE		(2 << 20)
-- 
1.8.1.4

^ permalink raw reply related

* [PATCH net-next v2 1/9] qlcnic: Print informational messages only once during driver load.
From: Himanshu Madhani @ 2013-10-11 18:42 UTC (permalink / raw)
  To: davem
  Cc: netdev, Dept_NX_Linux_NIC_Driver, Sucheta Chakraborty,
	himanshu.madhani
In-Reply-To: <cover.1381538863.git.himanshu.madhani@qlogic.com>

From: Sucheta Chakraborty <sucheta.chakraborty@qlogic.com>

Signed-off-by: Sucheta Chakraborty <sucheta.chakraborty@qlogic.com>
Signed-off-by: Himanshu Madhani <himanshu.madhani@qlogic.com>
---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic.h        |  1 +
 .../net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c    | 12 -----------
 .../net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c  | 25 ++++++++++++++++++----
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c   |  1 +
 4 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h
index 81bf836..a3c4379 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h
@@ -1199,6 +1199,7 @@ struct qlcnic_npar_info {
 	u8	promisc_mode;
 	u8	offload_flags;
 	u8      pci_func;
+	u8      mac[ETH_ALEN];
 };
 
 struct qlcnic_eswitch {
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
index 3ca00e0..66e94dc 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
@@ -2321,19 +2321,7 @@ int qlcnic_83xx_get_pci_info(struct qlcnic_adapter *adapter,
 			i++;
 			memcpy(pci_info->mac + sizeof(u32), &cmd.rsp.arg[i], 2);
 			i = i + 3;
-			if (ahw->op_mode == QLCNIC_MGMT_FUNC)
-				dev_info(dev, "id = %d active = %d type = %d\n"
-					 "\tport = %d min bw = %d max bw = %d\n"
-					 "\tmac_addr =  %pM\n", pci_info->id,
-					 pci_info->active, pci_info->type,
-					 pci_info->default_port,
-					 pci_info->tx_min_bw,
-					 pci_info->tx_max_bw, pci_info->mac);
 		}
-		if (ahw->op_mode == QLCNIC_MGMT_FUNC)
-			dev_info(dev, "Max functions = %d, active functions = %d\n",
-				 ahw->max_pci_func, ahw->act_pci_func);
-
 	} else {
 		dev_err(dev, "Failed to get PCI Info, error = %d\n", err);
 		err = -EIO;
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c
index 0248a4c..63cdddf 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c
@@ -94,13 +94,30 @@ qlcnic_83xx_config_vnic_buff_descriptors(struct qlcnic_adapter *adapter)
  **/
 static int qlcnic_83xx_init_mgmt_vnic(struct qlcnic_adapter *adapter)
 {
-	int err = -EIO;
+	struct qlcnic_hardware_context *ahw = adapter->ahw;
+	struct device *dev = &adapter->pdev->dev;
+	struct qlcnic_npar_info *npar;
+	int i, err = -EIO;
 
 	qlcnic_83xx_get_minidump_template(adapter);
+
 	if (!(adapter->flags & QLCNIC_ADAPTER_INITIALIZED)) {
 		if (qlcnic_init_pci_info(adapter))
 			return err;
 
+		npar = adapter->npars;
+
+		for (i = 0; i < ahw->act_pci_func; i++, npar++) {
+			dev_info(dev, "id = %d active = %d type = %d\n"
+				 "\tport = %d min bw = %d max bw = %d\n"
+				 "\tmac_addr =  %pM\n", npar->pci_func,
+				 npar->active, npar->type, npar->phy_port,
+				 npar->min_bw, npar->max_bw, npar->mac);
+		}
+
+		dev_info(dev, "Max functions = %d, active functions = %d\n",
+			 ahw->max_pci_func, ahw->act_pci_func);
+
 		if (qlcnic_83xx_set_vnic_opmode(adapter))
 			return err;
 
@@ -115,12 +132,12 @@ static int qlcnic_83xx_init_mgmt_vnic(struct qlcnic_adapter *adapter)
 		return err;
 
 	qlcnic_83xx_config_vnic_buff_descriptors(adapter);
-	adapter->ahw->msix_supported = !!qlcnic_use_msi_x;
+	ahw->msix_supported = qlcnic_use_msi_x ? 1 : 0;
 	adapter->flags |= QLCNIC_ADAPTER_INITIALIZED;
 	qlcnic_83xx_enable_vnic_mode(adapter, 1);
 
-	dev_info(&adapter->pdev->dev, "HAL Version: %d, Management function\n",
-		 adapter->ahw->fw_hal_version);
+	dev_info(dev, "HAL Version: %d, Management function\n",
+		 ahw->fw_hal_version);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index f07f2b0..55e8b23 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -875,6 +875,7 @@ int qlcnic_init_pci_info(struct qlcnic_adapter *adapter)
 		adapter->npars[j].min_bw = pci_info[i].tx_min_bw;
 		adapter->npars[j].max_bw = pci_info[i].tx_max_bw;
 
+		memcpy(&adapter->npars[j].mac, &pci_info[i].mac, ETH_ALEN);
 		j++;
 	}
 
-- 
1.8.1.4

^ permalink raw reply related

* [PATCH net-next v2 0/9] qlcnic: fixes and ethtool enhancements.
From: Himanshu Madhani @ 2013-10-11 18:42 UTC (permalink / raw)
  To: davem; +Cc: netdev, Dept_NX_Linux_NIC_Driver, Himanshu Madhani

From: Himanshu Madhani <himanshu.madhani@qlogic.com>

This patch series contains

o patch to fix regression introduced by commit
  aa4a1f7df7cbb98797c9f4edfde3c726e2b3841f.
o updates to ethtool for pause settings and enhance
  register dump to display mask and ring indices.
o cleanup in DCB code and remove redundant eSwitch enablement command.
o fixed firmware dump collection logic to skip unknown entries.

Changes from v1 -> v2

o Dropped patch to register device if adapter is in FAILED state for more rework.
o Updated patch to display ring indices via ethtool per Ben Hutchings's comment.
o Update patch for DCB cleanup per Stephen Hemminger's comment.

Please apply to net-next.

Thanks,
Himanshu

Himanshu Madhani (2):
  qlcnic: Validate Tx queue only for 82xx adapters.
  qlcnic: update version to 5.3.51

Jitendra Kalsaria (1):
  qlcnic: Update ethtool standard pause settings.

Pratik Pujar (2):
  qlcnic: Enhance ethtool to display ring indices and interrupt mask
  qlcnic: Firmware dump collection when auto recovery is disabled.

Shahed Shaikh (1):
  qlcnic: Skip unknown entry type while collecting firmware dump

Sony Chacko (1):
  qlcnic: Remove redundant eSwitch enable commands

Sucheta Chakraborty (2):
  qlcnic: Print informational messages only once during driver load.
  qlcnic: dcb code cleanup and refactoring.

 drivers/net/ethernet/qlogic/qlcnic/qlcnic.h        | 101 +----------
 .../net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c    |  40 ++---
 .../net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h    |   5 +-
 .../net/ethernet/qlogic/qlcnic/qlcnic_83xx_init.c  |  20 ++-
 .../net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c  |  48 +++---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.c    | 184 ++++++++++-----------
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h    | 109 ++++++++++--
 .../net/ethernet/qlogic/qlcnic/qlcnic_ethtool.c    |  56 +++++--
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c     |   2 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c   |  53 +++---
 .../net/ethernet/qlogic/qlcnic/qlcnic_minidump.c   |  41 +++--
 .../ethernet/qlogic/qlcnic/qlcnic_sriov_common.c   |   9 +-
 12 files changed, 344 insertions(+), 324 deletions(-)

-- 
1.8.1.4

^ permalink raw reply

* Re: [PATCH next 2/6] be2net: pass if_id for v1 and V2 versions of TX_CREATE cmd
From: David Miller @ 2013-10-11 19:02 UTC (permalink / raw)
  To: Sathya.Perla; +Cc: netdev
In-Reply-To: <CF9D1877D81D214CB0CA0669EFAE020C26A275D8@CMEXMB1.ad.emulex.com>

From: Sathya Perla <Sathya.Perla@Emulex.Com>
Date: Thu, 10 Oct 2013 10:53:41 +0000

>> -----Original Message-----
>> From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org] On Behalf
>> 
>> From: Vasundhara Volam <vasundhara.volam@emulex.com>
>> 
>> It is a required field for all TX_CREATE cmd versions > 0.
>> Signed-off-by: Vasundhara Volam <vasundhara.volam@emulex.com>
>> Signed-off-by: Sathya Perla <sathya.perla@emulex.com>
> 
> David, Could you pls queue this patch for the stable tree.

I have a better idea, can you guys actually submit patches to the
proper destination?

If it's a bug fix, send it to 'net'.  If it's not in 'net' I'm
not submitting it to -stable.

^ permalink raw reply

* Re: [PATCH 1/1] net: fix cipso packet validation when !NETLABEL
From: Paul Moore @ 2013-10-11 19:02 UTC (permalink / raw)
  To: Seif Mazareeb
  Cc: davem@davemloft.net, netdev@vger.kernel.org,
	thomas.petazzoni@free-electrons.com, Dmitri Epshtein
In-Reply-To: <0DB595A2CB707F458400BE9663B6A72269C0047793@SC-VEXCH2.marvell.com>

On Friday, October 11, 2013 10:58:31 AM Seif Mazareeb wrote:
> When CONFIG_NETLABEL is disabled, the cipso_v4_validate() function could
> loop forever in the main loop if opt[opt_iter +1] == 0, this will causing a
> kernel crash in an SMP system, since the CPU executing this function will
> stall /not respond to IPIs.
> 
> This problem can be reproduced by running the IP Stack Integrity Checker
> (http://isic.sourceforge.net) using the following command on a Linux machine
> connected to DUT:
> 
> "icmpsic -s rand -d <DUT IP address> -r 123456"
> wait (1-2 min)
> 
> Signed-off-by: Seif Mazareeb <seif@marvell.com>
> ---
>  include/net/cipso_ipv4.h | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h
> index a7a683e..047f1f6 100644
> --- a/include/net/cipso_ipv4.h
> +++ b/include/net/cipso_ipv4.h
> @@ -306,6 +306,10 @@ static inline int cipso_v4_validate(const struct
> sk_buff *skb, err_offset = opt_iter + 1;
>                         goto out;
>                 }
> +
> +               if (opt[opt_iter + 1] == 0)
> +                       break;
> +
>                 opt_iter += opt[opt_iter + 1];
>         }

Thanks for finding and reporting this bug.  Unfortunately, I don't think the 
supplied patch is the best way to solve this.  Since a length of zero is not 
valid for any known CIPSO tag types (at least that I am aware of), we should 
treat a zero length tag as an error, similar to how we treat tags with length 
values that stretch beyond the option itself.

I'm thinking something like this:

static inline int cipso_v4_validate(const struct sk_buff *skb,
                                    unsigned char **option)
{
        unsigned char *opt = *option;
        unsigned char err_offset = 0;
        u8 opt_len = opt[1];
        u8 opt_iter;
        u8 tag_len;

        if (opt_len < 8) {
                err_offset = 1;
                goto out;
        }

        if (get_unaligned_be32(&opt[2]) == 0) {
                err_offset = 2;
                goto out;
        }

        for (opt_iter = 6; opt_iter < opt_len;) {
                tag_len = opt[opt_iter + 1];
                if ((tag_len == 0) || (tag_len > (opt_len - opt_iter))) {
                        err_offset = opt_iter + 1;
                        goto out;
                }
                opt_iter += tag_len;
        }

out:
        *option = opt + err_offset;
        return err_offset;

}

If you want to fixup your patch that would be appreciated, if not, please let 
me know so I can submit the fix.

Thanks,
-Paul

-- 
paul moore
www.paul-moore.com

^ permalink raw reply

* Re: [PATCH RFC 0/2] xfrm: Remove ancient sleeping code
From: David Miller @ 2013-10-11 19:01 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev
In-Reply-To: <20131010063301.GO7660@secunet.com>

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Thu, 10 Oct 2013 08:33:01 +0200

> Does anyone still rely on the ancient sleeping when the SA is in
> acquire state? It has been disabled by default for more than five years,
> but can cause indefinite task hangs if enabled and the needed state
> does not get resolved.
> 
> We now queue packets to the policy if the states are not yet resolved
> if we are in a code path that can not sleep. We could do this even in
> the case we can sleep. As a bonus, we can remove the FLOWI_FLAG_CAN_SLEEP
> flag because the only thing this flag does, is to notify xfrm that we are
> in a codepath that can sleep.
> 
> The two RFC patches to remove the sleeping code are in reply to this
> mail. I'd add this to the ipsec-next tree if there are no objections.

The sleep path has the slight benefit that the TCP retransmit timers
for the initial SYN packet will not be started until the IPSEC rule
is resolved and the SYN actually goes out.

With the packet queue, if the IPSEC resolution is slow then we'll have
spurious SYN retransmits.

It makes no sense for TCP to keep queueing up SYNs if they will just
all get stuck in the packet queue.  The first one is enough.

On the other hand we do want TCP to timeout, we do want the user to
be able to "Ctrl-C" (ie. send a SIGINT) during a connect, etc.

^ permalink raw reply

* Re: [PATCH] net: sh_eth: Correct fix for RX packet errors on R8A7740
From: David Miller @ 2013-10-11 18:58 UTC (permalink / raw)
  To: horms+renesas; +Cc: netdev, linux-sh, magnus.damm, sergei.shtylyov, nh-ky
In-Reply-To: <1381384276-8077-2-git-send-email-horms+renesas@verge.net.au>

From: Simon Horman <horms+renesas@verge.net.au>
Date: Thu, 10 Oct 2013 14:51:16 +0900

> Nguyen Hong Ky posted a patch to correct RX packet errors on R8A7740 which
> was applied as 2c6221e4a5aab417 ("net: sh_eth: Fix RX packets errors on
> R8A7740"). Unfortunately sh_eth.c contains many similar instances
> of struct sh_eth_cpu_data and the patch was misapplied, updating
> sh7734_data instead of r8a7740_data.
> 
> This patch corrects this problem by:
> 1. Reverting the change to sh7734_data and;
> 2. Applying the change to r8a7740_data.
> 
> Signed-off-by: Simon Horman <horms+renesas@verge.net.au>

Applied, thanks Simon.

^ permalink raw reply

* Re: [PATCH 2/3] net: bpf jit: x86: optimize choose_load_func error path
From: David Miller @ 2013-10-11 18:56 UTC (permalink / raw)
  To: murzin.v; +Cc: netdev, av1474, kaffeemonster, edumazet, mingo, tglx
In-Reply-To: <1381249910-17338-2-git-send-email-murzin.v@gmail.com>

From: Vladimir Murzin <murzin.v@gmail.com>
Date: Tue,  8 Oct 2013 20:31:49 +0400

> Macro CHOOSE_LOAD_FUNC returns handler for "any offset" if checks for K
> were not passed. At the same time handlers for "any offset" cases make
> the same checks against r_addr at run-time, that will always lead to
> bpf_error.
> 
> Run-time checks are still necessary for indirect load operations, but
> error paths for absolute and mesh loads are worth optimizing during bpf
> compile time.
> 
> Signed-off-by: Vladimir Murzin <murzin.v@gmail.com>
> 
> Cc: Jan Seiffert <kaffeemonster@googlemail.com>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: "H. Peter Anvin" <hpa@zytor.com>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> 
> ---
>  arch/x86/net/bpf_jit_comp.c |   15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 79c216a..28ac17f 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -123,7 +123,7 @@ static inline void bpf_flush_icache(void *start, void *end)
>  }
>  
>  #define CHOOSE_LOAD_FUNC(K, func) \
> -	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
> +	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : NULL) : func##_positive_offset)
>  
>  /* Helper to find the offset of pkt_type in sk_buff
>   * We want to make sure its still a 3bit field starting at a byte boundary.
> @@ -611,7 +611,13 @@ void bpf_jit_compile(struct sk_filter *fp)
>  			}
>  			case BPF_S_LD_W_ABS:
>  				func = CHOOSE_LOAD_FUNC(K, sk_load_word);
> -common_load:			seen |= SEEN_DATAREF;
> +common_load:
> +				if (!func) {
> +					CLEAR_A();
> +					EMIT_JMP(cleanup_addr - addrs[i]);
> +					break;
> +				}
> +				seen |= SEEN_DATAREF;
>  				t_offset = func - (image + addrs[i]);
>  				EMIT1_off32(0xbe, K); /* mov imm32,%esi */
>  				EMIT1_off32(0xe8, t_offset); /* call */
> @@ -625,10 +631,7 @@ common_load:			seen |= SEEN_DATAREF;
>  			case BPF_S_LDX_B_MSH:
>  				func = CHOOSE_LOAD_FUNC(K, sk_load_byte_msh);
>  				seen |= SEEN_DATAREF | SEEN_XREG;
> -				t_offset = func - (image + addrs[i]);
> -				EMIT1_off32(0xbe, K);	/* mov imm32,%esi */
> -				EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */
> -				break;
> +				goto common_load;

This second hunk will set SEEN_DATAREF even if common_load takes the
!func path, that is not the intention at all here.

There's a reason why these two code blocks aren't shared.

^ permalink raw reply

* Re: [PATCH net] vti: get rid of nf mark rule in prerouting
From: David Miller @ 2013-10-11 18:53 UTC (permalink / raw)
  To: christophe.gouault; +Cc: netdev, amwang, saurabh
In-Reply-To: <1381245682-15523-1-git-send-email-christophe.gouault@6wind.com>

From: Christophe Gouault <christophe.gouault@6wind.com>
Date: Tue,  8 Oct 2013 17:21:22 +0200

> This patch fixes and improves the use of vti interfaces (while
> lightly changing the way of configuring them).
 ...
> Signed-off-by: Christophe Gouault <christophe.gouault@6wind.com>
> ---
> This is both a fix and an enhancement patch. However, there are 2 ways
> of fixing the inbound processing bug:
> - either keep the current configuration model (ikey + netfilter rule)
>   and change the tunnel lookup method. This patch would then be reverted
>   by the enhancement (this sounds counterproductive).
> - or directly change the configuration model (okey, no netfilter rule) and keep
>   the current tunnel lookup method.

Ok, applied and queued up for -stable, thanks.

^ permalink raw reply

* [PATCH 2/2] ixgbe: enable l2 forwarding acceleration for macvlans
From: Neil Horman @ 2013-10-11 18:43 UTC (permalink / raw)
  To: netdev; +Cc: john.fastabend, Andy Gospodarek, David Miller, Neil Horman
In-Reply-To: <1381517037-26007-1-git-send-email-nhorman@tuxdriver.com>

Now that l2 acceleration ops are in place from the prior patch, enable ixgbe to
take advantage of these operations.  Allow it to allocate queues for a macvlan
so that when we transmit a frame, we can do the switching in hardware inside the
ixgbe card, rather than in software.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: john.fastabend@gmail.com
CC: Andy Gospodarek <andy@greyhouse.net>
CC: "David S. Miller" <davem@davemloft.net>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h         |  33 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   4 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_l2a.h     |  54 ++++
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c     |  15 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    | 388 ++++++++++++++++++-----
 5 files changed, 400 insertions(+), 94 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_l2a.h

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 0ac6b11..e924efa 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -219,6 +219,17 @@ enum ixgbe_ring_state_t {
 	__IXGBE_RX_FCOE,
 };
 
+struct ixgbe_fwd_adapter {
+	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
+	struct net_device *netdev;
+	struct ixgbe_adapter *real_adapter;
+	unsigned int tx_base_queue;
+	unsigned int rx_base_queue;
+	struct net_device_stats net_stats;
+	int pool;
+	bool online;
+};
+
 #define check_for_tx_hang(ring) \
 	test_bit(__IXGBE_TX_DETECT_HANG, &(ring)->state)
 #define set_check_for_tx_hang(ring) \
@@ -236,6 +247,7 @@ struct ixgbe_ring {
 	struct ixgbe_q_vector *q_vector; /* backpointer to host q_vector */
 	struct net_device *netdev;	/* netdev ring belongs to */
 	struct device *dev;		/* device for DMA mapping */
+	struct ixgbe_fwd_adapter *l2_accel_priv;
 	void *desc;			/* descriptor ring memory */
 	union {
 		struct ixgbe_tx_buffer *tx_buffer_info;
@@ -244,6 +256,7 @@ struct ixgbe_ring {
 	unsigned long last_rx_timestamp;
 	unsigned long state;
 	u8 __iomem *tail;
+	struct net_device *vmdq_netdev;
 	dma_addr_t dma;			/* phys. address of descriptor ring */
 	unsigned int size;		/* length in bytes */
 
@@ -288,11 +301,15 @@ enum ixgbe_ring_f_enum {
 };
 
 #define IXGBE_MAX_RSS_INDICES  16
-#define IXGBE_MAX_VMDQ_INDICES 64
+#define IXGBE_MAX_VMDQ_INDICES 32
 #define IXGBE_MAX_FDIR_INDICES 63	/* based on q_vector limit */
 #define IXGBE_MAX_FCOE_INDICES  8
 #define MAX_RX_QUEUES (IXGBE_MAX_FDIR_INDICES + 1)
 #define MAX_TX_QUEUES (IXGBE_MAX_FDIR_INDICES + 1)
+#define IXGBE_MAX_L2A_QUEUES 4
+#define IXGBE_MAX_L2A_QUEUES 4
+#define IXGBE_BAD_L2A_QUEUE 3
+
 struct ixgbe_ring_feature {
 	u16 limit;	/* upper limit on feature indices */
 	u16 indices;	/* current value of indices */
@@ -738,6 +755,7 @@ struct ixgbe_adapter {
 #endif /*CONFIG_DEBUG_FS*/
 
 	u8 default_up;
+	unsigned long fwd_bitmask; /* Bitmask indicating in use pools */
 };
 
 struct ixgbe_fdir_filter {
@@ -879,9 +897,14 @@ static inline void ixgbe_dbg_adapter_exit(struct ixgbe_adapter *adapter) {}
 static inline void ixgbe_dbg_init(void) {}
 static inline void ixgbe_dbg_exit(void) {}
 #endif /* CONFIG_DEBUG_FS */
+static inline struct net_device *netdev_ring(const struct ixgbe_ring *ring)
+{
+	return ring->vmdq_netdev ? ring->vmdq_netdev : ring->netdev;
+}
+
 static inline struct netdev_queue *txring_txq(const struct ixgbe_ring *ring)
 {
-	return netdev_get_tx_queue(ring->netdev, ring->queue_index);
+	return netdev_get_tx_queue(netdev_ring(ring), ring->queue_index);
 }
 
 extern void ixgbe_ptp_init(struct ixgbe_adapter *adapter);
@@ -915,4 +938,10 @@ extern void ixgbe_ptp_check_pps_event(struct ixgbe_adapter *adapter, u32 eicr);
 void ixgbe_sriov_reinit(struct ixgbe_adapter *adapter);
 #endif
 
+int ixgbe_get_settings(struct net_device *dev, struct ethtool_cmd *ecmd);
+int ixgbe_write_uc_addr_list(struct net_device *netdev);
+netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
+				  struct ixgbe_adapter *adapter,
+				  struct ixgbe_ring *tx_ring);
+void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring);
 #endif /* _IXGBE_H_ */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index e8649ab..277af14 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -150,8 +150,8 @@ static const char ixgbe_gstrings_test[][ETH_GSTRING_LEN] = {
 };
 #define IXGBE_TEST_LEN sizeof(ixgbe_gstrings_test) / ETH_GSTRING_LEN
 
-static int ixgbe_get_settings(struct net_device *netdev,
-                              struct ethtool_cmd *ecmd)
+int ixgbe_get_settings(struct net_device *netdev,
+		       struct ethtool_cmd *ecmd)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_l2a.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_l2a.h
new file mode 100644
index 0000000..2f36584
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_l2a.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+
+  Intel 10 Gigabit PCI Express Linux driver
+  Copyright(c) 1999 - 2013 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms and conditions of the GNU General Public License,
+  version 2, as published by the Free Software Foundation.
+
+  This program is distributed in the hope it will be useful, but WITHOUT
+  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+  more details.
+
+  You should have received a copy of the GNU General Public License along with
+  this program; if not, write to the Free Software Foundation, Inc.,
+  51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+
+  The full GNU General Public License is included in this distribution in
+  the file called "COPYING".
+
+  Contact Information:
+  e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+  Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+
+*******************************************************************************/
+#include "ixgbe.h"
+
+
+static inline void ixgbe_irq_enable_queues(struct ixgbe_adapter *adapter,
+					   u64 qmask)
+{
+	u32 mask;
+	struct ixgbe_hw *hw = &adapter->hw;
+
+	switch (hw->mac.type) {
+	case ixgbe_mac_82598EB:
+		mask = (IXGBE_EIMS_RTX_QUEUE & qmask);
+		IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask);
+		break;
+	case ixgbe_mac_82599EB:
+	case ixgbe_mac_X540:
+		mask = (qmask & 0xFFFFFFFF);
+		if (mask)
+			IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
+		mask = (qmask >> 32);
+		if (mask)
+			IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
+		break;
+	default:
+		break;
+	}
+	/* skip the flush */
+}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
index 90b4e10..e2dd635 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -500,7 +500,8 @@ static bool ixgbe_set_sriov_queues(struct ixgbe_adapter *adapter)
 #endif
 
 	/* only proceed if SR-IOV is enabled */
-	if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED))
+	if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) &&
+	    !(adapter->flags & IXGBE_FLAG_VMDQ_ENABLED))
 		return false;
 
 	/* Add starting offset to total pool count */
@@ -852,7 +853,11 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
 
 		/* apply Tx specific ring traits */
 		ring->count = adapter->tx_ring_count;
-		ring->queue_index = txr_idx;
+		if (adapter->num_rx_pools > 1)
+			ring->queue_index =
+				txr_idx % adapter->num_rx_queues_per_pool;
+		else
+			ring->queue_index = txr_idx;
 
 		/* assign ring to adapter */
 		adapter->tx_ring[txr_idx] = ring;
@@ -895,7 +900,11 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
 #endif /* IXGBE_FCOE */
 		/* apply Rx specific ring traits */
 		ring->count = adapter->rx_ring_count;
-		ring->queue_index = rxr_idx;
+		if (adapter->num_rx_pools > 1)
+			ring->queue_index =
+				rxr_idx % adapter->num_rx_queues_per_pool;
+		else
+			ring->queue_index = rxr_idx;
 
 		/* assign ring to adapter */
 		adapter->rx_ring[rxr_idx] = ring;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0ade0cd..582d30b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -52,6 +52,7 @@
 #include "ixgbe_common.h"
 #include "ixgbe_dcb_82599.h"
 #include "ixgbe_sriov.h"
+#include "ixgbe_l2a.h"
 
 char ixgbe_driver_name[] = "ixgbe";
 static const char ixgbe_driver_string[] =
@@ -118,6 +119,8 @@ static DEFINE_PCI_DEVICE_TABLE(ixgbe_pci_tbl) = {
 };
 MODULE_DEVICE_TABLE(pci, ixgbe_pci_tbl);
 
+netdev_tx_t ixgbe_fwd_xmit_frame(struct sk_buff *skb, void *priv);
+
 #ifdef CONFIG_IXGBE_DCA
 static int ixgbe_notify_dca(struct notifier_block *, unsigned long event,
 			    void *p);
@@ -872,7 +875,8 @@ static u64 ixgbe_get_tx_completed(struct ixgbe_ring *ring)
 
 static u64 ixgbe_get_tx_pending(struct ixgbe_ring *ring)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(ring->netdev);
+	struct net_device *dev = ring->netdev;
+	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	u32 head = IXGBE_READ_REG(hw, IXGBE_TDH(ring->reg_idx));
@@ -1055,7 +1059,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 			tx_ring->next_to_use, i,
 			tx_ring->tx_buffer_info[i].time_stamp, jiffies);
 
-		netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
+		netif_stop_subqueue(netdev_ring(tx_ring), tx_ring->queue_index);
 
 		e_info(probe,
 		       "tx hang %d detected on queue %d, resetting adapter\n",
@@ -1072,16 +1076,16 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 				  total_packets, total_bytes);
 
 #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
-	if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) &&
+	if (unlikely(total_packets && netif_carrier_ok(netdev_ring(tx_ring)) &&
 		     (ixgbe_desc_unused(tx_ring) >= TX_WAKE_THRESHOLD))) {
 		/* Make sure that anybody stopping the queue after this
 		 * sees the new next_to_clean.
 		 */
 		smp_mb();
-		if (__netif_subqueue_stopped(tx_ring->netdev,
+		if (__netif_subqueue_stopped(netdev_ring(tx_ring),
 					     tx_ring->queue_index)
 		    && !test_bit(__IXGBE_DOWN, &adapter->state)) {
-			netif_wake_subqueue(tx_ring->netdev,
+			netif_wake_subqueue(netdev_ring(tx_ring),
 					    tx_ring->queue_index);
 			++tx_ring->tx_stats.restart_queue;
 		}
@@ -1226,7 +1230,7 @@ static inline void ixgbe_rx_hash(struct ixgbe_ring *ring,
 				 union ixgbe_adv_rx_desc *rx_desc,
 				 struct sk_buff *skb)
 {
-	if (ring->netdev->features & NETIF_F_RXHASH)
+	if (netdev_ring(ring)->features & NETIF_F_RXHASH)
 		skb->rxhash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
 }
 
@@ -1260,10 +1264,12 @@ static inline void ixgbe_rx_checksum(struct ixgbe_ring *ring,
 				     union ixgbe_adv_rx_desc *rx_desc,
 				     struct sk_buff *skb)
 {
+	struct net_device *dev = netdev_ring(ring);
+
 	skb_checksum_none_assert(skb);
 
 	/* Rx csum disabled */
-	if (!(ring->netdev->features & NETIF_F_RXCSUM))
+	if (!(dev->features & NETIF_F_RXCSUM))
 		return;
 
 	/* if IP and error */
@@ -1559,7 +1565,7 @@ static void ixgbe_process_skb_fields(struct ixgbe_ring *rx_ring,
 				     union ixgbe_adv_rx_desc *rx_desc,
 				     struct sk_buff *skb)
 {
-	struct net_device *dev = rx_ring->netdev;
+	struct net_device *dev = netdev_ring(rx_ring);
 
 	ixgbe_update_rsc_stats(rx_ring, skb);
 
@@ -1739,7 +1745,7 @@ static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring,
 				  union ixgbe_adv_rx_desc *rx_desc,
 				  struct sk_buff *skb)
 {
-	struct net_device *netdev = rx_ring->netdev;
+	struct net_device *netdev = netdev_ring(rx_ring);
 
 	/* verify that the packet does not have any known errors */
 	if (unlikely(ixgbe_test_staterr(rx_desc,
@@ -1905,7 +1911,7 @@ static struct sk_buff *ixgbe_fetch_rx_buffer(struct ixgbe_ring *rx_ring,
 #endif
 
 		/* allocate a skb to store the frags */
-		skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
+		skb = netdev_alloc_skb_ip_align(netdev_ring(rx_ring),
 						IXGBE_RX_HDR_SIZE);
 		if (unlikely(!skb)) {
 			rx_ring->rx_stats.alloc_rx_buff_failed++;
@@ -1986,6 +1992,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 	struct ixgbe_adapter *adapter = q_vector->adapter;
 	int ddp_bytes;
 	unsigned int mss = 0;
+	struct net_device *netdev = netdev_ring(rx_ring);
 #endif /* IXGBE_FCOE */
 	u16 cleaned_count = ixgbe_desc_unused(rx_ring);
 
@@ -1993,6 +2000,10 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		union ixgbe_adv_rx_desc *rx_desc;
 		struct sk_buff *skb;
 
+		if (rx_ring->l2_accel_priv) {
+			printk(KERN_CRIT "RECEIVING ON AN ACCELERATED QUEUE\n");
+		}
+
 		/* return some buffers to hardware, one at a time is too slow */
 		if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
 			ixgbe_alloc_rx_buffers(rx_ring, cleaned_count);
@@ -2041,7 +2052,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 			/* include DDPed FCoE data */
 			if (ddp_bytes > 0) {
 				if (!mss) {
-					mss = rx_ring->netdev->mtu -
+					mss = netdev->mtu -
 						sizeof(struct fcoe_hdr) -
 						sizeof(struct fc_frame_header) -
 						sizeof(struct fcoe_crc_eof);
@@ -2455,58 +2466,6 @@ static void ixgbe_check_lsc(struct ixgbe_adapter *adapter)
 	}
 }
 
-static inline void ixgbe_irq_enable_queues(struct ixgbe_adapter *adapter,
-					   u64 qmask)
-{
-	u32 mask;
-	struct ixgbe_hw *hw = &adapter->hw;
-
-	switch (hw->mac.type) {
-	case ixgbe_mac_82598EB:
-		mask = (IXGBE_EIMS_RTX_QUEUE & qmask);
-		IXGBE_WRITE_REG(hw, IXGBE_EIMS, mask);
-		break;
-	case ixgbe_mac_82599EB:
-	case ixgbe_mac_X540:
-		mask = (qmask & 0xFFFFFFFF);
-		if (mask)
-			IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(0), mask);
-		mask = (qmask >> 32);
-		if (mask)
-			IXGBE_WRITE_REG(hw, IXGBE_EIMS_EX(1), mask);
-		break;
-	default:
-		break;
-	}
-	/* skip the flush */
-}
-
-static inline void ixgbe_irq_disable_queues(struct ixgbe_adapter *adapter,
-					    u64 qmask)
-{
-	u32 mask;
-	struct ixgbe_hw *hw = &adapter->hw;
-
-	switch (hw->mac.type) {
-	case ixgbe_mac_82598EB:
-		mask = (IXGBE_EIMS_RTX_QUEUE & qmask);
-		IXGBE_WRITE_REG(hw, IXGBE_EIMC, mask);
-		break;
-	case ixgbe_mac_82599EB:
-	case ixgbe_mac_X540:
-		mask = (qmask & 0xFFFFFFFF);
-		if (mask)
-			IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(0), mask);
-		mask = (qmask >> 32);
-		if (mask)
-			IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(1), mask);
-		break;
-	default:
-		break;
-	}
-	/* skip the flush */
-}
-
 /**
  * ixgbe_irq_enable - Enable default interrupt generation settings
  * @adapter: board private structure
@@ -2946,6 +2905,7 @@ static void ixgbe_configure_msi_and_legacy(struct ixgbe_adapter *adapter)
 void ixgbe_configure_tx_ring(struct ixgbe_adapter *adapter,
 			     struct ixgbe_ring *ring)
 {
+	struct net_device *netdev = netdev_ring(ring);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u64 tdba = ring->dma;
 	int wait_loop = 10;
@@ -3005,7 +2965,7 @@ void ixgbe_configure_tx_ring(struct ixgbe_adapter *adapter,
 		struct ixgbe_q_vector *q_vector = ring->q_vector;
 
 		if (q_vector)
-			netif_set_xps_queue(adapter->netdev,
+			netif_set_xps_queue(netdev,
 					    &q_vector->affinity_mask,
 					    ring->queue_index);
 	}
@@ -3395,7 +3355,7 @@ static void ixgbe_setup_psrtype(struct ixgbe_adapter *adapter)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
 	int rss_i = adapter->ring_feature[RING_F_RSS].indices;
-	int p;
+	u16 pool;
 
 	/* PSRTYPE must be initialized in non 82598 adapters */
 	u32 psrtype = IXGBE_PSRTYPE_TCPHDR |
@@ -3412,9 +3372,8 @@ static void ixgbe_setup_psrtype(struct ixgbe_adapter *adapter)
 	else if (rss_i > 1)
 		psrtype |= 1 << 29;
 
-	for (p = 0; p < adapter->num_rx_pools; p++)
-		IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(VMDQ_P(p)),
-				psrtype);
+	for_each_set_bit(pool, &adapter->fwd_bitmask, 32)
+		IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(VMDQ_P(pool)), psrtype);
 }
 
 static void ixgbe_configure_virtualization(struct ixgbe_adapter *adapter)
@@ -3683,6 +3642,8 @@ static void ixgbe_vlan_strip_disable(struct ixgbe_adapter *adapter)
 	case ixgbe_mac_82599EB:
 	case ixgbe_mac_X540:
 		for (i = 0; i < adapter->num_rx_queues; i++) {
+			if (adapter->rx_ring[i]->vmdq_netdev)
+				continue;
 			j = adapter->rx_ring[i]->reg_idx;
 			vlnctrl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(j));
 			vlnctrl &= ~IXGBE_RXDCTL_VME;
@@ -3713,6 +3674,8 @@ static void ixgbe_vlan_strip_enable(struct ixgbe_adapter *adapter)
 	case ixgbe_mac_82599EB:
 	case ixgbe_mac_X540:
 		for (i = 0; i < adapter->num_rx_queues; i++) {
+			if (adapter->rx_ring[i]->vmdq_netdev)
+				continue;
 			j = adapter->rx_ring[i]->reg_idx;
 			vlnctrl = IXGBE_READ_REG(hw, IXGBE_RXDCTL(j));
 			vlnctrl |= IXGBE_RXDCTL_VME;
@@ -3743,15 +3706,16 @@ static void ixgbe_restore_vlan(struct ixgbe_adapter *adapter)
  *                0 on no addresses written
  *                X on writing X addresses to the RAR table
  **/
-static int ixgbe_write_uc_addr_list(struct net_device *netdev)
+int ixgbe_write_uc_addr_list(struct net_device *netdev)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	unsigned int rar_entries = hw->mac.num_rar_entries - 1;
 	int count = 0;
 
-	/* In SR-IOV mode significantly less RAR entries are available */
-	if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED)
+	/* In SR-IOV/VMDQ modes significantly less RAR entries are available */
+	if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED ||
+	    adapter->flags & IXGBE_FLAG_VMDQ_ENABLED)
 		rar_entries = IXGBE_MAX_PF_MACVLANS - 1;
 
 	/* return ENOMEM indicating insufficient memory for addresses */
@@ -3772,6 +3736,7 @@ static int ixgbe_write_uc_addr_list(struct net_device *netdev)
 			count++;
 		}
 	}
+
 	/* write the addresses in reverse order to avoid write combining */
 	for (; rar_entries > 0 ; rar_entries--)
 		hw->mac.ops.clear_rar(hw, rar_entries);
@@ -4133,6 +4098,7 @@ static void ixgbe_configure(struct ixgbe_adapter *adapter)
 	ixgbe_configure_virtualization(adapter);
 
 	ixgbe_set_rx_mode(adapter->netdev);
+
 	ixgbe_restore_vlan(adapter);
 
 	switch (hw->mac.type) {
@@ -4459,7 +4425,7 @@ void ixgbe_reset(struct ixgbe_adapter *adapter)
  * ixgbe_clean_rx_ring - Free Rx Buffers per Queue
  * @rx_ring: ring to free buffers from
  **/
-static void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
+void ixgbe_clean_rx_ring(struct ixgbe_ring *rx_ring)
 {
 	struct device *dev = rx_ring->dev;
 	unsigned long size;
@@ -4838,6 +4804,8 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter)
 		return -EIO;
 	}
 
+	/* PF holds first pool slot */
+	set_bit(0, &adapter->fwd_bitmask);
 	set_bit(__IXGBE_DOWN, &adapter->state);
 
 	return 0;
@@ -5143,7 +5111,7 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu)
 static int ixgbe_open(struct net_device *netdev)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
-	int err;
+	int err, queues;
 
 	/* disallow open during test */
 	if (test_bit(__IXGBE_TESTING, &adapter->state))
@@ -5168,16 +5136,22 @@ static int ixgbe_open(struct net_device *netdev)
 		goto err_req_irq;
 
 	/* Notify the stack of the actual queue counts. */
-	err = netif_set_real_num_tx_queues(netdev,
-					   adapter->num_rx_pools > 1 ? 1 :
-					   adapter->num_tx_queues);
+	if (adapter->num_rx_pools > 1 &&
+	    adapter->num_tx_queues > IXGBE_MAX_L2A_QUEUES)
+		queues = IXGBE_MAX_L2A_QUEUES;
+	else
+		queues = adapter->num_tx_queues;
+
+	err = netif_set_real_num_tx_queues(netdev, queues);
 	if (err)
 		goto err_set_queues;
 
-
-	err = netif_set_real_num_rx_queues(netdev,
-					   adapter->num_rx_pools > 1 ? 1 :
-					   adapter->num_rx_queues);
+	if (adapter->num_rx_pools > 1 &&
+	    adapter->num_rx_queues > IXGBE_MAX_L2A_QUEUES)
+		queues = IXGBE_MAX_L2A_QUEUES;
+	else
+		queues = adapter->num_rx_queues;
+	err = netif_set_real_num_rx_queues(netdev, queues);
 	if (err)
 		goto err_set_queues;
 
@@ -5215,7 +5189,6 @@ static int ixgbe_close(struct net_device *netdev)
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 
 	ixgbe_ptp_stop(adapter);
-
 	ixgbe_down(adapter);
 	ixgbe_free_irq(adapter);
 
@@ -6576,7 +6549,7 @@ static void ixgbe_atr(struct ixgbe_ring *ring,
 
 static int __ixgbe_maybe_stop_tx(struct ixgbe_ring *tx_ring, u16 size)
 {
-	netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
+	netif_stop_subqueue(netdev_ring(tx_ring), tx_ring->queue_index);
 	/* Herbert's original patch had:
 	 *  smp_mb__after_netif_stop_queue();
 	 * but since that doesn't exist yet, just open code it. */
@@ -6588,7 +6561,7 @@ static int __ixgbe_maybe_stop_tx(struct ixgbe_ring *tx_ring, u16 size)
 		return -EBUSY;
 
 	/* A reprieve! - use start_queue because it doesn't call schedule */
-	netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
+	netif_start_subqueue(netdev_ring(tx_ring), tx_ring->queue_index);
 	++tx_ring->tx_stats.restart_queue;
 	return 0;
 }
@@ -6639,6 +6612,9 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 			  struct ixgbe_ring *tx_ring)
 {
 	struct ixgbe_tx_buffer *first;
+#ifdef IXGBE_FCOE
+	struct net_device *dev;
+#endif
 	int tso;
 	u32 tx_flags = 0;
 	unsigned short f;
@@ -6730,9 +6706,10 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 	first->protocol = protocol;
 
 #ifdef IXGBE_FCOE
+	dev = netdev_ring(tx_ring);
 	/* setup tx offload for FCoE */
 	if ((protocol == __constant_htons(ETH_P_FCOE)) &&
-	    (tx_ring->netdev->features & (NETIF_F_FSO | NETIF_F_FCOE_CRC))) {
+	    (dev->features & (NETIF_F_FSO | NETIF_F_FCOE_CRC))) {
 		tso = ixgbe_fso(tx_ring, first, &hdr_len);
 		if (tso < 0)
 			goto out_drop;
@@ -6767,8 +6744,9 @@ out_drop:
 	return NETDEV_TX_OK;
 }
 
-static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
-				    struct net_device *netdev)
+static netdev_tx_t __ixgbe_xmit_frame(struct sk_buff *skb,
+				      struct net_device *netdev,
+				      struct ixgbe_ring *ring)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	struct ixgbe_ring *tx_ring;
@@ -6784,10 +6762,17 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
 		skb_set_tail_pointer(skb, 17);
 	}
 
-	tx_ring = adapter->tx_ring[skb->queue_mapping];
+	tx_ring = ring ? ring : adapter->tx_ring[skb->queue_mapping];
+
 	return ixgbe_xmit_frame_ring(skb, adapter, tx_ring);
 }
 
+static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
+				    struct net_device*netdev)
+{
+	return __ixgbe_xmit_frame(skb, netdev, NULL);
+}
+
 /**
  * ixgbe_set_mac - Change the Ethernet Address of the NIC
  * @netdev: network interface device structure
@@ -7057,6 +7042,7 @@ int ixgbe_setup_tc(struct net_device *dev, u8 tc)
 	 */
 	if (netif_running(dev))
 		ixgbe_close(dev);
+
 	ixgbe_clear_interrupt_scheme(adapter);
 
 #ifdef CONFIG_IXGBE_DCB
@@ -7305,6 +7291,231 @@ static int ixgbe_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode);
 }
 
+static void ixgbe_irq_disable_queues(struct ixgbe_adapter *adapter, u64 qmask)
+{
+	u32 mask;
+	struct ixgbe_hw *hw = &adapter->hw;
+
+	switch (hw->mac.type) {
+	case ixgbe_mac_82598EB:
+		mask = (IXGBE_EIMS_RTX_QUEUE & qmask);
+		IXGBE_WRITE_REG(hw, IXGBE_EIMC, mask);
+		break;
+	case ixgbe_mac_82599EB:
+	case ixgbe_mac_X540:
+		mask = (qmask & 0xFFFFFFFF);
+		if (mask)
+			IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(0), mask);
+		mask = (qmask >> 32);
+		if (mask)
+			IXGBE_WRITE_REG(hw, IXGBE_EIMC_EX(1), mask);
+		break;
+	default:
+		break;
+	}
+}
+
+static void ixgbe_add_mac_filter(struct ixgbe_adapter *adapter,
+				 u8 *addr, u16 pool)
+{
+	struct ixgbe_hw *hw = &adapter->hw;
+	unsigned int entry;
+
+	entry = hw->mac.num_rar_entries - pool;
+	hw->mac.ops.set_rar(hw, entry, addr, VMDQ_P(pool), IXGBE_RAH_AV);
+}
+
+static void ixgbe_fwd_psrtype(struct ixgbe_fwd_adapter *vadapter)
+{
+	struct ixgbe_adapter *adapter = vadapter->real_adapter;
+	int rss_i = vadapter->netdev->real_num_rx_queues;
+	struct ixgbe_hw *hw = &adapter->hw;
+	u16 pool = vadapter->pool;
+	u32 psrtype = IXGBE_PSRTYPE_TCPHDR |
+		      IXGBE_PSRTYPE_UDPHDR |
+		      IXGBE_PSRTYPE_IPV4HDR |
+		      IXGBE_PSRTYPE_L2HDR |
+		      IXGBE_PSRTYPE_IPV6HDR;
+
+	if (hw->mac.type == ixgbe_mac_82598EB)
+		return;
+
+	if (rss_i > 3)
+		psrtype |= 2 << 29;
+	else if (rss_i > 1)
+		psrtype |= 1 << 29;
+
+	IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(VMDQ_P(pool)), psrtype);
+}
+
+static void ixgbe_enable_fwd_ring(struct ixgbe_adapter *adapter,
+				  struct ixgbe_ring *rx_ring,
+				  struct ixgbe_fwd_adapter *accel)
+{
+	rx_ring->l2_accel_priv = accel;
+	ixgbe_configure_rx_ring(adapter, rx_ring);
+}
+
+
+static void ixgbe_disable_fwd_ring(struct ixgbe_fwd_adapter *vadapter,
+				   struct ixgbe_ring *rx_ring)
+{
+	struct ixgbe_adapter *adapter = vadapter->real_adapter;
+	int index = rx_ring->queue_index + vadapter->rx_base_queue;
+
+	/* shutdown specific queue receive and wait for dma to settle */
+	ixgbe_disable_rx_queue(adapter, rx_ring);
+	usleep_range(10000, 20000);
+	ixgbe_irq_disable_queues(adapter, ((u64)1 << index));
+	ixgbe_clean_rx_ring(rx_ring);
+	rx_ring->l2_accel_priv = NULL;
+}
+
+int ixgbe_fwd_ring_up(struct net_device *vdev, struct ixgbe_fwd_adapter *accel)
+{
+	struct ixgbe_adapter *adapter = accel->real_adapter;
+	unsigned int rxbase = accel->pool * adapter->num_rx_queues_per_pool;
+	unsigned int txbase = accel->pool * adapter->num_rx_queues_per_pool;
+	int err, i;
+
+
+	accel->rx_base_queue = rxbase;
+	accel->tx_base_queue = txbase;
+
+	for (i = 0; i < vdev->num_rx_queues; i++)
+		ixgbe_disable_fwd_ring(accel, adapter->rx_ring[rxbase + i]);
+
+	for (i = 0; i < vdev->num_rx_queues; i++) {
+		adapter->rx_ring[rxbase + i]->vmdq_netdev = vdev;
+		ixgbe_enable_fwd_ring(adapter, adapter->rx_ring[rxbase + i], accel);
+	}
+
+	for (i = 0; i < vdev->num_tx_queues; i++)
+		adapter->tx_ring[txbase + i]->vmdq_netdev = vdev;
+
+	if (is_valid_ether_addr(vdev->dev_addr))
+		ixgbe_add_mac_filter(adapter, vdev->dev_addr, accel->pool);
+
+	err = netif_set_real_num_tx_queues(vdev, vdev->num_tx_queues);
+	if (err)
+		goto err_set_queues;
+	err = netif_set_real_num_rx_queues(vdev, vdev->num_rx_queues);
+	if (err)
+		goto err_set_queues;
+
+	ixgbe_fwd_psrtype(accel);
+	netif_tx_start_all_queues(vdev);
+	return 0;
+err_set_queues:
+	for (i = 0; i < vdev->num_rx_queues; i++)
+		ixgbe_disable_fwd_ring(accel, adapter->rx_ring[rxbase + i]);
+	return err;
+}
+
+int ixgbe_fwd_ring_down(struct net_device *vdev, struct ixgbe_fwd_adapter *accel)
+{
+	struct ixgbe_adapter *adapter = accel->real_adapter;
+	unsigned int rxbase = accel->rx_base_queue;
+	int i;
+
+	netif_tx_stop_all_queues(vdev);
+
+	for (i = 0; i < vdev->num_rx_queues; i++)
+		ixgbe_disable_fwd_ring(accel, adapter->rx_ring[rxbase + i]);
+
+	return 0;
+}
+
+/* Reserve a free forwarding pool on @pdev for @vdev; ERR_PTR() on failure */
+static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(pdev);
+	struct ixgbe_fwd_adapter *fwd_adapter;
+	int pool, vmdq_pool, base_queue, err;
+
+	/* Check for hardware restriction on number of rx/tx queues */
+	if (vdev->num_rx_queues != vdev->num_tx_queues ||
+	    vdev->num_tx_queues > IXGBE_MAX_L2A_QUEUES ||
+	    vdev->num_tx_queues == IXGBE_BAD_L2A_QUEUE) {
+		netdev_info(pdev, "%s: Supports RX/TX Queue counts 1,2, and 4\n",
+		       pdev->name);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (adapter->num_rx_pools > IXGBE_MAX_VMDQ_INDICES)
+		return ERR_PTR(-EBUSY);
+
+	fwd_adapter = kcalloc(1, sizeof(*fwd_adapter), GFP_KERNEL);
+	if (!fwd_adapter)
+		return ERR_PTR(-ENOMEM);
+
+	pool = find_first_zero_bit(&adapter->fwd_bitmask, 32);
+	adapter->num_rx_pools++;
+	set_bit(pool, &adapter->fwd_bitmask);
+
+	/* Enable VMDq flag so device will be set in VM mode */
+	adapter->flags |= IXGBE_FLAG_VMDQ_ENABLED | IXGBE_FLAG_SRIOV_ENABLED;
+	adapter->ring_feature[RING_F_VMDQ].limit = adapter->num_rx_pools;
+	adapter->ring_feature[RING_F_VMDQ].offset = 0;
+	adapter->ring_feature[RING_F_RSS].limit = IXGBE_MAX_L2A_QUEUES;
+
+	/* Force reinit of ring allocation with VMDQ enabled */
+	ixgbe_setup_tc(pdev, netdev_get_num_tc(pdev));
+
+	/* Configure VSI adapter structure */
+	vmdq_pool = VMDQ_P(pool);
+	base_queue = vmdq_pool * adapter->num_rx_queues_per_pool;
+	netdev_dbg(pdev, "pool %i:%i queues %i:%i VSI bitmask %lx\n",
+		   pool, adapter->num_rx_pools,
+		   base_queue, base_queue + adapter->num_rx_queues_per_pool,
+		   adapter->fwd_bitmask);
+	fwd_adapter->pool = pool;
+	fwd_adapter->netdev = vdev;
+	fwd_adapter->real_adapter = adapter;
+	fwd_adapter->rx_base_queue = base_queue;
+	fwd_adapter->tx_base_queue = base_queue;
+
+	err = ixgbe_fwd_ring_up(vdev, fwd_adapter);
+	if (!err)
+		return fwd_adapter;
+	/* Ring bring-up failed: hand the pool slot back before bailing out */
+	clear_bit(pool, &adapter->fwd_bitmask);
+	adapter->num_rx_pools--;
+	kfree(fwd_adapter);
+	return ERR_PTR(err);
+}
+
+/* Release the pool that ixgbe_fwd_add() handed out as @priv and free it */
+static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
+{
+	struct ixgbe_fwd_adapter *fwd_adapter = priv;
+	struct ixgbe_adapter *adapter = fwd_adapter->real_adapter;
+
+	clear_bit(fwd_adapter->pool, &adapter->fwd_bitmask);
+	adapter->num_rx_pools--;
+	ixgbe_fwd_ring_down(fwd_adapter->netdev, fwd_adapter);
+	netdev_dbg(pdev, "pool %i:%i queues %i:%i VSI bitmask %lx\n",
+		   fwd_adapter->pool, adapter->num_rx_pools,
+		   fwd_adapter->rx_base_queue,
+		   fwd_adapter->rx_base_queue + adapter->num_rx_queues_per_pool,
+		   adapter->fwd_bitmask);
+	kfree(fwd_adapter);
+}
+
+static netdev_tx_t ixgbe_fwd_xmit (struct sk_buff *skb,
+				   struct net_device *dev,
+				   void *priv)
+{
+	struct ixgbe_fwd_adapter *fwd_adapter = priv;
+	unsigned int queue;
+	struct ixgbe_ring *tx_ring;
+
+	queue = skb->queue_mapping + fwd_adapter->tx_base_queue;
+	tx_ring = fwd_adapter->real_adapter->tx_ring[queue];
+
+	return __ixgbe_xmit_frame(skb, dev, tx_ring);
+}
+
 static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
@@ -7349,6 +7560,9 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_fdb_add		= ixgbe_ndo_fdb_add,
 	.ndo_bridge_setlink	= ixgbe_ndo_bridge_setlink,
 	.ndo_bridge_getlink	= ixgbe_ndo_bridge_getlink,
+	.ndo_dfwd_add_station	= ixgbe_fwd_add,
+	.ndo_dfwd_del_station	= ixgbe_fwd_del,
+	.ndo_dfwd_start_xmit	= ixgbe_fwd_xmit,
 };
 
 /**
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH 1/2] net: Add layer 2 hardware acceleration operations for macvlan devices
From: Neil Horman @ 2013-10-11 18:43 UTC (permalink / raw)
  To: netdev; +Cc: john.fastabend, Andy Gospodarek, David Miller, Neil Horman
In-Reply-To: <1381517037-26007-1-git-send-email-nhorman@tuxdriver.com>

Add an operations structure that allows a network interface to export the fact
that it supports packet forwarding in hardware between physical interfaces and
other mac layer devices assigned to it (such as macvlans).  This operations
structure can be used by virtual mac devices to bypass software switching so
that forwarding can be done in hardware more efficiently.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: john.fastabend@gmail.com
CC: Andy Gospodarek <andy@greyhouse.net>
CC: "David S. Miller" <davem@davemloft.net>
---
 drivers/net/macvlan.c           | 31 +++++++++++++++++++++++++++++++
 include/linux/if_macvlan.h      |  1 +
 include/linux/netdev_features.h |  2 ++
 include/linux/netdevice.h       | 11 ++++++++++-
 include/linux/skbuff.h          |  4 ++--
 net/core/dev.c                  | 18 +++++++++++++-----
 net/core/ethtool.c              |  1 +
 net/sched/sch_generic.c         |  2 +-
 8 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 9bf46bd..c5a2718 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -297,7 +297,16 @@ netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
 	int ret;
 	const struct macvlan_dev *vlan = netdev_priv(dev);
 
+	if (vlan->fwd_priv) {
+		skb->dev = vlan->lowerdev;
+		ret = dev_hard_start_xmit(skb, skb->dev, NULL, vlan->fwd_priv);
+					  
+		if (likely(ret == NETDEV_TX_OK))
+			goto update_stats;
+	}
+
 	ret = macvlan_queue_xmit(skb, dev);
+update_stats:
 	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
 		struct macvlan_pcpu_stats *pcpu_stats;
 
@@ -347,6 +356,18 @@ static int macvlan_open(struct net_device *dev)
 		goto hash_add;
 	}
 
+	if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) {
+		vlan->fwd_priv = lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);
+		/*
+		 * If we get a NULL pointer back, or if we get an error
+		 * then we should just fall through to the non accelerated path
+		 */
+		if (IS_ERR_OR_NULL(vlan->fwd_priv))
+			vlan->fwd_priv = NULL;
+		else
+			return 0;
+	}
+
 	err = -EBUSY;
 	if (macvlan_addr_busy(vlan->port, dev->dev_addr))
 		goto out;
@@ -367,6 +388,10 @@ hash_add:
 del_unicast:
 	dev_uc_del(lowerdev, dev->dev_addr);
 out:
+	if (vlan->fwd_priv) {
+		lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->fwd_priv);
+		vlan->fwd_priv = NULL;
+	}	
 	return err;
 }
 
@@ -391,6 +416,11 @@ static int macvlan_stop(struct net_device *dev)
 
 hash_del:
 	macvlan_hash_del(vlan, !dev->dismantle);
+	if (vlan->fwd_priv) {
+		lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->fwd_priv);
+		vlan->fwd_priv = NULL;
+	}
+
 	return 0;
 }
 
@@ -801,6 +831,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 		if (err < 0)
 			return err;
 	}
+
 	port = macvlan_port_get_rtnl(lowerdev);
 
 	/* Only 1 macvlan device can be created in passthru mode */
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index ddd33fd..c270285 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -61,6 +61,7 @@ struct macvlan_dev {
 	struct hlist_node	hlist;
 	struct macvlan_port	*port;
 	struct net_device	*lowerdev;
+	void			*fwd_priv;
 	struct macvlan_pcpu_stats __percpu *pcpu_stats;
 
 	DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index a2a89a5..9d1ee76 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -60,6 +60,7 @@ enum {
 	NETIF_F_HW_VLAN_STAG_TX_BIT,	/* Transmit VLAN STAG HW acceleration */
 	NETIF_F_HW_VLAN_STAG_RX_BIT,	/* Receive VLAN STAG HW acceleration */
 	NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
+	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
 
 	/*
 	 * Add your fresh new feature above and remember to update
@@ -112,6 +113,7 @@ enum {
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
+#define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
 
 /* Features valid for ethtool to change */
 /* = all defined minus driver/device-class-related */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3de49ac..0249179 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1097,6 +1097,13 @@ struct net_device_ops {
 	void			(*ndo_del_vxlan_port)(struct  net_device *dev,
 						      sa_family_t sa_family,
 						      __be16 port);
+
+	void*			(*ndo_dfwd_add_station)(struct net_device *pdev,
+							struct net_device *vdev);
+	void			(*ndo_dfwd_del_station)(struct net_device *pdev, void *priv);
+
+	netdev_tx_t		(*ndo_dfwd_start_xmit) (struct sk_buff *skb,
+							struct net_device *dev, void *priv);
 };
 
 /*
@@ -1183,6 +1190,7 @@ struct net_device {
 	/* Management operations */
 	const struct net_device_ops *netdev_ops;
 	const struct ethtool_ops *ethtool_ops;
+	const struct forwarding_accel_ops *fwd_ops;
 
 	/* Hardware header description */
 	const struct header_ops *header_ops;
@@ -2383,7 +2391,8 @@ extern int		dev_get_phys_port_id(struct net_device *dev,
 					     struct netdev_phys_port_id *ppid);
 extern int		dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev,
-					    struct netdev_queue *txq);
+					    struct netdev_queue *txq,
+					    void *accel_priv);
 extern int		dev_forward_skb(struct net_device *dev,
 					struct sk_buff *skb);
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2ddb48d..1710fdb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -426,9 +426,9 @@ struct sk_buff {
 	char			cb[48] __aligned(8);
 
 	unsigned long		_skb_refdst;
-#ifdef CONFIG_XFRM
+
 	struct	sec_path	*sp;
-#endif
+
 	unsigned int		len,
 				data_len;
 	__u16			mac_len,
diff --git a/net/core/dev.c b/net/core/dev.c
index 5c713f2..ecad8c2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2536,7 +2536,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb,
 }
 
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
-			struct netdev_queue *txq)
+			struct netdev_queue *txq, void *accel_priv)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	int rc = NETDEV_TX_OK;
@@ -2602,9 +2602,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			dev_queue_xmit_nit(skb, dev);
 
 		skb_len = skb->len;
-		rc = ops->ndo_start_xmit(skb, dev);
+		if (accel_priv)
+			rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv);
+		else
+			rc = ops->ndo_start_xmit(skb, dev);
+
 		trace_net_dev_xmit(skb, rc, dev, skb_len);
-		if (rc == NETDEV_TX_OK)
+		if (rc == NETDEV_TX_OK && txq)
 			txq_trans_update(txq);
 		return rc;
 	}
@@ -2620,7 +2624,10 @@ gso:
 			dev_queue_xmit_nit(nskb, dev);
 
 		skb_len = nskb->len;
-		rc = ops->ndo_start_xmit(nskb, dev);
+		if (accel_priv)
+			rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv);
+		else
+			rc = ops->ndo_start_xmit(nskb, dev);
 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			if (rc & ~NETDEV_TX_MASK)
@@ -2645,6 +2652,7 @@ out_kfree_skb:
 out:
 	return rc;
 }
+EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
 
 static void qdisc_pkt_len_init(struct sk_buff *skb)
 {
@@ -2852,7 +2860,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 
 			if (!netif_xmit_stopped(txq)) {
 				__this_cpu_inc(xmit_recursion);
-				rc = dev_hard_start_xmit(skb, dev, txq);
+				rc = dev_hard_start_xmit(skb, dev, txq, NULL);
 				__this_cpu_dec(xmit_recursion);
 				if (dev_xmit_complete(rc)) {
 					HARD_TX_UNLOCK(dev, txq);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 78e9d92..9f0c599b 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -94,6 +94,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_LOOPBACK_BIT] =         "loopback",
 	[NETIF_F_RXFCS_BIT] =            "rx-fcs",
 	[NETIF_F_RXALL_BIT] =            "rx-all",
+	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
 };
 
 static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index e7121d2..8c44b1b 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -126,7 +126,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 	HARD_TX_LOCK(dev, txq, smp_processor_id());
 	if (!netif_xmit_frozen_or_stopped(txq))
-		ret = dev_hard_start_xmit(skb, dev, txq);
+		ret = dev_hard_start_xmit(skb, dev, txq, NULL);
 
 	HARD_TX_UNLOCK(dev, txq);
 
-- 
1.8.3.1

^ permalink raw reply related

* [RFC PATCH 0/2 v3] net: alternate proposal for using macvlans with forwarding acceleration
From: Neil Horman @ 2013-10-11 18:43 UTC (permalink / raw)
  To: netdev; +Cc: john.fastabend, Andy Gospodarek, David Miller
In-Reply-To: <1380140209-24587-1-git-send-email-nhorman@tuxdriver.com>

Hey all-
     here's the next, updated version of the vsi/macvlan integration that we've
been discussing.

Change notes:

* Moved the feature flag to netdev_features.h.  No ethtool option for disabling
it yet, but it's there now, and seems to fit fairly well.  I was actually
thinking about your comment John, regarding the clumsiness in allowing sw and hw
accel vlans on the same lowerdev, and it just occurred to me that we could use
the same flag on the macvlan device directly - i.e. if we found that a lowerdev
supported acceleration, then call ndo_dfwd_add_station, and, if successful, set
the same feature flag in the macvlan device.  Then we could use ethtool to
control the enabling/disabling of acceleration at the macvlan device directly.
Thoughts?

* Moved the acceleration net device methods back into net_device_ops.  Looks
pretty good to me there.

* Restored the use of a separate xmit routine so we weren't subject to the
lowerdev's queue disciplines.  I integrated its use with dev_hard_start_xmit, so
we could share the use of the linearization code, etc.  Let me know what you
think.

Best
Neil

^ permalink raw reply

* Re: [PATCH] wimax: Use WARN(1,...) rather than printk followed by WARN_ON(1)
From: David Miller @ 2013-10-11 18:38 UTC (permalink / raw)
  To: djduanjiong; +Cc: inaky.perez-gonzalez, linux-wimax, wimax, netdev, duanj.fnst
In-Reply-To: <1381192238-6486-1-git-send-email-duanj.fnst@cn.fujitsu.com>

From: djduanjiong@gmail.com
Date: Mon,  7 Oct 2013 17:30:38 -0700

> -	list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node) {
> -		printk(KERN_ERR "BUG: %s wimax_dev %p ifindex %d not cleared\n",
> +	list_for_each_entry(wimax_dev, &wimax_id_table, id_table_node)
> +		WARN(1, KERN_ERR "BUG: %s wimax_dev %p ifindex %d not cleared\n",
>  		       __func__, wimax_dev, wimax_dev->net_dev->ifindex);


You are changing the column where the opening parenthesis appears by
changing the printk() into a WARN(), therefore you have to adjust the
indentation of the next line such that the arguments start at the first
column after the opening parenthesis.

^ permalink raw reply

* Re: [PATCH net] netem: update backlog after drop
From: Eric Dumazet @ 2013-10-11 18:38 UTC (permalink / raw)
  To: David Miller; +Cc: stephen, netdev
In-Reply-To: <20131011.143018.1801127237009991334.davem@davemloft.net>

On Fri, 2013-10-11 at 14:30 -0400, David Miller wrote:
> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Sun, 6 Oct 2013 15:15:33 -0700
> 
> > When packet is dropped from rb-tree netem the backlog statistic should
> > also be updated.
> > 
> > Reported-by: Сергеев Сергей <adron@yapic.net>
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > 
> > ---
> > Should be reviewed by Eric (he added the rb-tree stuff), and added to stable
> > as well.
> 
> Eric please review this patch, thanks.

It somehow escaped from my radar ;)

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [PATCH net] netem: update backlog after drop
From: David Miller @ 2013-10-11 18:30 UTC (permalink / raw)
  To: stephen; +Cc: eric.dumazet, netdev
In-Reply-To: <20131006151533.52988624@nehalam.linuxnetplumber.net>

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Sun, 6 Oct 2013 15:15:33 -0700

> When packet is dropped from rb-tree netem the backlog statistic should
> also be updated.
> 
> Reported-by: Сергеев Сергей <adron@yapic.net>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> 
> ---
> Should be reviewed by Eric (he added the rb-tree stuff), and added to stable
> as well.

Eric please review this patch, thanks.

^ permalink raw reply

* Re: IPv6 kernel warning
From: dormando @ 2013-10-11 18:15 UTC (permalink / raw)
  To: Yuchung Cheng
  Cc: Michele Baldessari, Russell King - ARM Linux, netdev,
	Neal Cardwell, Nandita Dukkipati
In-Reply-To: <alpine.DEB.2.02.1310091147340.5669@dtop>



On Wed, 9 Oct 2013, dormando wrote:

> > > >> >>
> > > >> >
> > > >> > Should I apply this and see if the warning stops?
> > Hi Dormando,
> >
> > Could you try this patch to make sure it fixes the warning (with
> > sysctl net.ipv4.early_retrans=3)?
>
> It's now running on one machine, with early_retrans=3. Will have to give
> it 24 hours to confirm.
>

Almost 48 hours, early_retrans=3, no warnings! (or crashes...)

Good catch :)

^ permalink raw reply

* Re: [PATCH net-next] openvswitch: fix vport-netdev unregister
From: Jesse Gross @ 2013-10-11 18:11 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Pravin Shelar, David S. Miller, Jiri Pirko, dev@openvswitch.org,
	netdev
In-Reply-To: <CAMEtUuzf5UbZ_87DU+vSMbKzxLQgmuEfrf3NXQSz7vSF8LQTzw@mail.gmail.com>

On Thu, Oct 10, 2013 at 9:48 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> On Thu, Oct 10, 2013 at 8:56 PM, Jesse Gross <jesse@nicira.com> wrote:
>> However, the check dev->reg_state in netdev_destroy() looks racy to
>> me, as it could already be in NETREG_UNREGISTERED even if we already
>> processed this device.
>
> you mean that netdev_destroy() will see reg_state == netreg_unregistered,
> while dp_device_event() didn't see reg_state == netreg_unregistering yet?
> or dp_device_event() saw it, proceeded to do unlink and
> netdev_destroy() ran in parallel?
> well, that's why reg_state == netreg_unregistering check in netdev_destroy()
> is done with rtnl_lock() held.
> reg_state cannot go into netreg_unregistered state skipping
> netreg_unregistering and notifier.
> therefore I don't think it's racy.
>
> In ovs_dp_notify_wq() you're checking for both unregistering and
> unregistered and that makes
> sense, since workq can run after unregistering notifier called and
> netdev_run_todo()
> already changed the state to unregistered.
> But here it's not the case.

ovs_dp_notify_wq() calls ovs_dp_detach_port(), which indirectly calls
netdev_destroy() so it seems like it actually is the same case to me.

^ permalink raw reply

* [PATCH 1/1] net: fix cipso packet validation when !NETLABEL
From: Seif Mazareeb @ 2013-10-11 17:58 UTC (permalink / raw)
  To: davem@davemloft.net, paul@paul-moore.com, netdev@vger.kernel.org
  Cc: thomas.petazzoni@free-electrons.com, Dmitri Epshtein

When CONFIG_NETLABEL is disabled, the cipso_v4_validate() function could loop
forever in the main loop if opt[opt_iter + 1] == 0. This will cause a kernel
crash on an SMP system, since the CPU executing this function will
stall and not respond to IPIs.

This problem can be reproduced by running the IP Stack Integrity Checker
(http://isic.sourceforge.net) using the following command on a Linux machine
connected to DUT:

"icmpsic -s rand -d <DUT IP address> -r 123456"
wait (1-2 min)

Signed-off-by: Seif Mazareeb <seif@marvell.com>
---
 include/net/cipso_ipv4.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h
index a7a683e..047f1f6 100644
--- a/include/net/cipso_ipv4.h
+++ b/include/net/cipso_ipv4.h
@@ -306,6 +306,10 @@ static inline int cipso_v4_validate(const struct sk_buff *skb,
                        err_offset = opt_iter + 1;
                        goto out;
                }
+
+               if (opt[opt_iter + 1] == 0)
+                       break;
+
                opt_iter += opt[opt_iter + 1];
        }

--
1.8.1.2

^ permalink raw reply related

* Re: [PATCH 1/1] net: fix cipso packet validation when !NETLABEL
From: David Miller @ 2013-10-11 17:53 UTC (permalink / raw)
  To: seif; +Cc: paul, netdev, thomas.petazzoni, dima
In-Reply-To: <0DB595A2CB707F458400BE9663B6A72269C004777F@SC-VEXCH2.marvell.com>


Sorry, no HTML encoded email is allowed on the list.

Turn off all encodings and send plain ASCII text to this mailing
list.

Thank you.

^ permalink raw reply

* Re: Peak TCP performance
From: Rick Jones @ 2013-10-11 17:46 UTC (permalink / raw)
  To: Kyle Hubert, Eric Dumazet; +Cc: netdev
In-Reply-To: <CAJoZ4U2CRaF+kvgXhbq3DNkMdYbouoKdU-xAUAkuhD5LbC7X8Q@mail.gmail.com>

On 10/10/2013 09:21 PM, Kyle Hubert wrote:
> On Thu, Oct 10, 2013 at 11:44 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>> Also, my copy of ethtool does not recognize tx-nocache-copy. However,
>>> I do have control over the net device. Is there something there I can
>>> set, or is tx-nocache-copy also a new feature? I'll start digging.
>>>
>>
>> nocache-copy was added in 3.0, but I do find its not a gain for current
>> cpus.
>>
>> You could get a fresh copy of ethtool sources :
>>
>> git clone git://git.kernel.org/pub/scm/network/ethtool/ethtool.git
>> cd ethtool
>> ./autogen.sh  ...
>
> That did the trick. Thanks for the help! Is there somewhere I can read
> up on this feature? A lot of the netdev features are opaque to me.
> Also, I can set NETIF_F_NOCACHE_COPY in the netdev->features to set
> this by default, yes?
>
> This at least mirrors the performance improvement that I see when
> forwarding, however I still see reserved CPU time. Is there a way to
> push it even farther?

Thought I would point out that unless you take concrete steps to make it
behave otherwise, netperf will constantly present the same set of
cache-clean buffers to the transport.  The size of those buffers will be
determined by some heuristics and will depend on the socket buffer size
at the time the data socket is created, which itself will depend on
whether or not you have used a test-specific -s option.  And the 
test-specific -m option will come into play.

If I am recalling correctly, the number of buffers will be one more than:

initialSO_SNDBUF / send_size

though you can control that with global -W option.

happy benchmarking,

rick jones

^ permalink raw reply

* Re: [PATCHv2 1/2 ] {xfrm, sctp} Stick to software crc32 even if hardware is capable of that
From: Vlad Yasevich @ 2013-10-11 17:12 UTC (permalink / raw)
  To: Fan Du, Neil Horman, steffen.klassert, davem; +Cc: netdev
In-Reply-To: <5258055F.5060703@gmail.com>

On 10/11/2013 10:04 AM, Vlad Yasevich wrote:
> On 10/11/2013 03:05 AM, Fan Du wrote:
>>
>>
>> On 2013年10月10日 21:11, Neil Horman wrote:
>>> On Thu, Oct 10, 2013 at 01:51:36PM +0800, Fan Du wrote:
>>>> igb/ixgbe have hardware sctp checksum support, when this feature is
>>>> enabled
>>>> and also IPsec is armed to protect sctp traffic, ugly things
>>>> happened as
>>>> xfrm_output checks CHECKSUM_PARTIAL to do check sum operation(sum
>>>> every thing
>>>> up and pack the 16bits result in the checksum field). The result is
>>>> fail
>>>> establishment of sctp communication.
>>>>
>>> Shouldn't this be fixed in the xfrm code then?  E.g. check the device
>>> features
>>> for SCTP checksum offloading and and skip the checksum during xfrm
>>> output if its
>>> available?
>>>
>>> Or am I missing something?
>>> Neil
>>>
>>>
>>
>>
>>  From 014276de0877f11d46e1704114a7d91f19221a63 Mon Sep 17 00:00:00 2001
>> From: Fan Du <fan.du@windriver.com>
>> Date: Fri, 11 Oct 2013 14:24:33 +0800
>> Subject: [PATCH 1/2] {xfrm, sctp} Stick to software crc32 even if
>> hardware is
>>   capable of that
>>
>> igb/ixgbe have hardware sctp checksum support, when this feature is
>> enabled
>> and also IPsec is armed to protect sctp traffic, ugly things happened as
>> xfrm_output checks CHECKSUM_PARTIAL to do check sum operation(sum every
>> thing
>> up and pack the 16bits result in the checksum field). The result is fail
>> establishment of sctp communication.
>>
>> Signed-off-by: Fan Du <fan.du@windriver.com>
>> ---
>> v2:
>>    Leave ip_summed as CHECKSUM_PARTIAL as before, the second patch will
>> fix this.
>>
>> ---
>>   net/sctp/output.c |   14 +++++++++++++-
>>   1 file changed, 13 insertions(+), 1 deletion(-)
>>
>> diff --git a/net/sctp/output.c b/net/sctp/output.c
>> index 0ac3a65..6de6402 100644
>> --- a/net/sctp/output.c
>> +++ b/net/sctp/output.c
>> @@ -372,6 +372,16 @@ static void sctp_packet_set_owner_w(struct sk_buff
>> *skb, struct sock *sk)
>>       atomic_inc(&sk->sk_wmem_alloc);
>>   }
>>
>> +static int is_xfrm_armed(struct dst_entry *dst)
>> +{
>> +#ifdef CONFIG_XFRM
>> +    /* If dst->xfrm is valid, this skb needs to be transformed */
>> +    return dst->xfrm != NULL;
>> +#else
>> +    return 0;
>> +#endif
>> +}
>> +
>
> I would really prefer to have an accessor function to dst->xfrm, but
> I see that everyone codes it inside the #ifdef.  Gack.
>
>>   /* All packets are sent to the network through this function from
>>    * sctp_outq_tail().
>>    *
>> @@ -536,7 +546,9 @@ int sctp_packet_transmit(struct sctp_packet *packet)
>>        * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
>>        */
>>       if (!sctp_checksum_disable) {
>> -        if (!(dst->dev->features & NETIF_F_SCTP_CSUM)) {
>> +        if ((!(dst->dev->features & NETIF_F_SCTP_CSUM)) ||
>> +            is_xfrm_armed(dst)) {
>> +
>>               __u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len);
>>
>>               /* 3) Put the resultant value into the checksum field in
>> the
>
> Acked-by: Vlad Yasevich <vyasevich@gmail.com>

This patch doesn't seem to apply to net.git.


-vlad

^ permalink raw reply

* SIT Tunnel: Generate an ICMPV6 message and send them back to the original IPV6 node
From: Oussama Ghorbel @ 2013-10-11 16:36 UTC (permalink / raw)
  To: netdev, linux-kernel

In RFC 4213, section 3.4, page 11, it says:
If sufficient data bytes from the offending packet are available, the
encapsulator MAY extract the encapsulated IPv6 packet and use it to
generate an ICMPv6 message directed back to the originating IPv6  node.

Currently, linux does not do that.
More info on the RFC: http://tools.ietf.org/html/rfc4213
Issue described also in  https://bugzilla.kernel.org/show_bug.cgi?id=49761

I'm considering implementing this feature.
Would it be worthwhile to do so?

Regards,
Oussama

^ permalink raw reply

* [PATCH net-next] netfilter: xt_socket: use sock_gen_put()
From: Eric Dumazet @ 2013-10-11 16:03 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Pablo Neira Ayuso, netfilter-devel

From: Eric Dumazet <edumazet@google.com>

TCP listener refactoring, part 7 :

Use sock_gen_put() instead of xt_socket_put_sk() for future
SYN_RECV support.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_socket.c |   13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 3dd0e37..1ba6793 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -35,15 +35,6 @@
 #include <net/netfilter/nf_conntrack.h>
 #endif
 
-static void
-xt_socket_put_sk(struct sock *sk)
-{
-	if (sk->sk_state == TCP_TIME_WAIT)
-		inet_twsk_put(inet_twsk(sk));
-	else
-		sock_put(sk);
-}
-
 static int
 extract_icmp4_fields(const struct sk_buff *skb,
 		    u8 *protocol,
@@ -216,7 +207,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 					inet_twsk(sk)->tw_transparent));
 
 		if (sk != skb->sk)
-			xt_socket_put_sk(sk);
+			sock_gen_put(sk);
 
 		if (wildcard || !transparent)
 			sk = NULL;
@@ -381,7 +372,7 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
 					inet_twsk(sk)->tw_transparent));
 
 		if (sk != skb->sk)
-			xt_socket_put_sk(sk);
+			sock_gen_put(sk);
 
 		if (wildcard || !transparent)
 			sk = NULL;

^ permalink raw reply related

* [PATCH net-next] inet_diag: use sock_gen_put()
From: Eric Dumazet @ 2013-10-11 15:54 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

From: Eric Dumazet <edumazet@google.com>

TCP listener refactoring, part 6 :

Use sock_gen_put() from inet_diag_dump_one_icsk() for future
SYN_RECV support.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/inet_diag.c |    9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 41e1c3e..56a964a 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -336,12 +336,9 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s
 		err = 0;
 
 out:
-	if (sk) {
-		if (sk->sk_state == TCP_TIME_WAIT)
-			inet_twsk_put((struct inet_timewait_sock *)sk);
-		else
-			sock_put(sk);
-	}
+	if (sk)
+		sock_gen_put(sk);
+
 out_nosk:
 	return err;
 }

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox