Netdev List
 help / color / mirror / Atom feed
* [net-2.6 PATCH 3/5] ixgbe: Bump driver version number
From: Jeff Kirsher @ 2009-09-30 22:07 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Peter P Waskiewicz Jr, Jeff Kirsher
In-Reply-To: <20090930220705.27479.62694.stgit@localhost.localdomain>

From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>

A number of changes have gone in since the last version bump.  Bump
it to reflect the changes.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe_main.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index fe52736..c198183 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -49,7 +49,7 @@ char ixgbe_driver_name[] = "ixgbe";
 static const char ixgbe_driver_string[] =
                               "Intel(R) 10 Gigabit PCI Express Network Driver";
 
-#define DRV_VERSION "2.0.37-k2"
+#define DRV_VERSION "2.0.44-k2"
 const char ixgbe_driver_version[] = DRV_VERSION;
 static char ixgbe_copyright[] = "Copyright (c) 1999-2009 Intel Corporation.";
 


^ permalink raw reply related

* [net-2.6 PATCH 2/5] ixgbe: Fix backplane flow control autoneg
From: Jeff Kirsher @ 2009-09-30 22:07 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Peter P Waskiewicz Jr, Jeff Kirsher
In-Reply-To: <20090930220705.27479.62694.stgit@localhost.localdomain>

From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>

Backplane flow control autonegotiation is currently broken for
ixgbe devices.  This patch fixes the flow control issues
with clause 37 autoneg.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe_82598.c  |    2 
 drivers/net/ixgbe/ixgbe_common.c |  228 ++++++++++++++++++++++++++++++--------
 drivers/net/ixgbe/ixgbe_type.h   |    9 ++
 3 files changed, 187 insertions(+), 52 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_82598.c b/drivers/net/ixgbe/ixgbe_82598.c
index 56b12f3..e2d5343 100644
--- a/drivers/net/ixgbe/ixgbe_82598.c
+++ b/drivers/net/ixgbe/ixgbe_82598.c
@@ -425,7 +425,7 @@ static s32 ixgbe_fc_enable_82598(struct ixgbe_hw *hw, s32 packetbuf_num)
 #endif /* CONFIG_DCB */
 	default:
 		hw_dbg(hw, "Flow control param set incorrectly\n");
-		ret_val = -IXGBE_ERR_CONFIG;
+		ret_val = IXGBE_ERR_CONFIG;
 		goto out;
 		break;
 	}
diff --git a/drivers/net/ixgbe/ixgbe_common.c b/drivers/net/ixgbe/ixgbe_common.c
index 6621e17..143b0fc 100644
--- a/drivers/net/ixgbe/ixgbe_common.c
+++ b/drivers/net/ixgbe/ixgbe_common.c
@@ -1663,7 +1663,7 @@ s32 ixgbe_fc_enable_generic(struct ixgbe_hw *hw, s32 packetbuf_num)
 #endif /* CONFIG_DCB */
 	default:
 		hw_dbg(hw, "Flow control param set incorrectly\n");
-		ret_val = -IXGBE_ERR_CONFIG;
+		ret_val = IXGBE_ERR_CONFIG;
 		goto out;
 		break;
 	}
@@ -1734,75 +1734,140 @@ s32 ixgbe_fc_autoneg(struct ixgbe_hw *hw)
 	s32 ret_val = 0;
 	ixgbe_link_speed speed;
 	u32 pcs_anadv_reg, pcs_lpab_reg, linkstat;
+	u32 links2, anlp1_reg, autoc_reg, links;
 	bool link_up;
 
 	/*
 	 * AN should have completed when the cable was plugged in.
 	 * Look for reasons to bail out.  Bail out if:
 	 * - FC autoneg is disabled, or if
-	 * - we don't have multispeed fiber, or if
-	 * - we're not running at 1G, or if
-	 * - link is not up, or if
-	 * - link is up but AN did not complete, or if
-	 * - link is up and AN completed but timed out
+	 * - link is not up.
 	 *
-	 * Since we're being called from an LSC, link is already know to be up.
+	 * Since we're being called from an LSC, link is already known to be up.
 	 * So use link_up_wait_to_complete=false.
 	 */
 	hw->mac.ops.check_link(hw, &speed, &link_up, false);
-	linkstat = IXGBE_READ_REG(hw, IXGBE_PCS1GLSTA);
-
-	if (hw->fc.disable_fc_autoneg ||
-	    !hw->phy.multispeed_fiber ||
-	    (speed != IXGBE_LINK_SPEED_1GB_FULL) ||
-	    !link_up ||
-	    ((linkstat & IXGBE_PCS1GLSTA_AN_COMPLETE) == 0) ||
-	    ((linkstat & IXGBE_PCS1GLSTA_AN_TIMED_OUT) == 1)) {
+
+	if (hw->fc.disable_fc_autoneg || (!link_up)) {
 		hw->fc.fc_was_autonegged = false;
 		hw->fc.current_mode = hw->fc.requested_mode;
-		hw_dbg(hw, "Autoneg FC was skipped.\n");
 		goto out;
 	}
 
 	/*
+	 * On backplane, bail out if
+	 * - backplane autoneg was not completed, or if
+	 * - link partner is not AN enabled
+	 */
+	if (hw->phy.media_type == ixgbe_media_type_backplane) {
+		links = IXGBE_READ_REG(hw, IXGBE_LINKS);
+		links2 = IXGBE_READ_REG(hw, IXGBE_LINKS2);
+		if (((links & IXGBE_LINKS_KX_AN_COMP) == 0) ||
+		    ((links2 & IXGBE_LINKS2_AN_SUPPORTED) == 0)) {
+			hw->fc.fc_was_autonegged = false;
+			hw->fc.current_mode = hw->fc.requested_mode;
+			goto out;
+		}
+	}
+
+	/*
+	 * On multispeed fiber at 1g, bail out if
+	 * - link is up but AN did not complete, or if
+	 * - link is up and AN completed but timed out
+	 */
+	if (hw->phy.multispeed_fiber && (speed == IXGBE_LINK_SPEED_1GB_FULL)) {
+		linkstat = IXGBE_READ_REG(hw, IXGBE_PCS1GLSTA);
+		if (((linkstat & IXGBE_PCS1GLSTA_AN_COMPLETE) == 0) ||
+		    ((linkstat & IXGBE_PCS1GLSTA_AN_TIMED_OUT) == 1)) {
+			hw->fc.fc_was_autonegged = false;
+			hw->fc.current_mode = hw->fc.requested_mode;
+			goto out;
+		}
+	}
+
+	/*
 	 * Read the AN advertisement and LP ability registers and resolve
 	 * local flow control settings accordingly
 	 */
-	pcs_anadv_reg = IXGBE_READ_REG(hw, IXGBE_PCS1GANA);
-	pcs_lpab_reg = IXGBE_READ_REG(hw, IXGBE_PCS1GANLP);
-	if ((pcs_anadv_reg & IXGBE_PCS1GANA_SYM_PAUSE) &&
-		(pcs_lpab_reg & IXGBE_PCS1GANA_SYM_PAUSE)) {
+	if ((speed == IXGBE_LINK_SPEED_1GB_FULL) &&
+	    (hw->phy.media_type != ixgbe_media_type_backplane)) {
+		pcs_anadv_reg = IXGBE_READ_REG(hw, IXGBE_PCS1GANA);
+		pcs_lpab_reg = IXGBE_READ_REG(hw, IXGBE_PCS1GANLP);
+		if ((pcs_anadv_reg & IXGBE_PCS1GANA_SYM_PAUSE) &&
+		    (pcs_lpab_reg & IXGBE_PCS1GANA_SYM_PAUSE)) {
+			/*
+			 * Now we need to check if the user selected Rx ONLY
+			 * of pause frames.  In this case, we had to advertise
+			 * FULL flow control because we could not advertise RX
+			 * ONLY. Hence, we must now check to see if we need to
+			 * turn OFF the TRANSMISSION of PAUSE frames.
+			 */
+			if (hw->fc.requested_mode == ixgbe_fc_full) {
+				hw->fc.current_mode = ixgbe_fc_full;
+				hw_dbg(hw, "Flow Control = FULL.\n");
+			} else {
+				hw->fc.current_mode = ixgbe_fc_rx_pause;
+				hw_dbg(hw, "Flow Control=RX PAUSE only\n");
+			}
+		} else if (!(pcs_anadv_reg & IXGBE_PCS1GANA_SYM_PAUSE) &&
+			   (pcs_anadv_reg & IXGBE_PCS1GANA_ASM_PAUSE) &&
+			   (pcs_lpab_reg & IXGBE_PCS1GANA_SYM_PAUSE) &&
+			   (pcs_lpab_reg & IXGBE_PCS1GANA_ASM_PAUSE)) {
+			hw->fc.current_mode = ixgbe_fc_tx_pause;
+			hw_dbg(hw, "Flow Control = TX PAUSE frames only.\n");
+		} else if ((pcs_anadv_reg & IXGBE_PCS1GANA_SYM_PAUSE) &&
+			   (pcs_anadv_reg & IXGBE_PCS1GANA_ASM_PAUSE) &&
+			   !(pcs_lpab_reg & IXGBE_PCS1GANA_SYM_PAUSE) &&
+			   (pcs_lpab_reg & IXGBE_PCS1GANA_ASM_PAUSE)) {
+			hw->fc.current_mode = ixgbe_fc_rx_pause;
+			hw_dbg(hw, "Flow Control = RX PAUSE frames only.\n");
+		} else {
+			hw->fc.current_mode = ixgbe_fc_none;
+			hw_dbg(hw, "Flow Control = NONE.\n");
+		}
+	}
+
+	if (hw->phy.media_type == ixgbe_media_type_backplane) {
 		/*
-		 * Now we need to check if the user selected Rx ONLY
-		 * of pause frames.  In this case, we had to advertise
-		 * FULL flow control because we could not advertise RX
-		 * ONLY. Hence, we must now check to see if we need to
-		 * turn OFF the TRANSMISSION of PAUSE frames.
+		 * Read the 10g AN autoc and LP ability registers and resolve
+		 * local flow control settings accordingly
 		 */
-		if (hw->fc.requested_mode == ixgbe_fc_full) {
-			hw->fc.current_mode = ixgbe_fc_full;
-			hw_dbg(hw, "Flow Control = FULL.\n");
-		} else {
+		autoc_reg = IXGBE_READ_REG(hw, IXGBE_AUTOC);
+		anlp1_reg = IXGBE_READ_REG(hw, IXGBE_ANLP1);
+
+		if ((autoc_reg & IXGBE_AUTOC_SYM_PAUSE) &&
+		    (anlp1_reg & IXGBE_ANLP1_SYM_PAUSE)) {
+			/*
+			 * Now we need to check if the user selected Rx ONLY
+			 * of pause frames.  In this case, we had to advertise
+			 * FULL flow control because we could not advertise RX
+			 * ONLY. Hence, we must now check to see if we need to
+			 * turn OFF the TRANSMISSION of PAUSE frames.
+			 */
+			if (hw->fc.requested_mode == ixgbe_fc_full) {
+				hw->fc.current_mode = ixgbe_fc_full;
+				hw_dbg(hw, "Flow Control = FULL.\n");
+			} else {
+				hw->fc.current_mode = ixgbe_fc_rx_pause;
+				hw_dbg(hw, "Flow Control=RX PAUSE only\n");
+			}
+		} else if (!(autoc_reg & IXGBE_AUTOC_SYM_PAUSE) &&
+			   (autoc_reg & IXGBE_AUTOC_ASM_PAUSE) &&
+			   (anlp1_reg & IXGBE_ANLP1_SYM_PAUSE) &&
+			   (anlp1_reg & IXGBE_ANLP1_ASM_PAUSE)) {
+			hw->fc.current_mode = ixgbe_fc_tx_pause;
+			hw_dbg(hw, "Flow Control = TX PAUSE frames only.\n");
+		} else if ((autoc_reg & IXGBE_AUTOC_SYM_PAUSE) &&
+			   (autoc_reg & IXGBE_AUTOC_ASM_PAUSE) &&
+			   !(anlp1_reg & IXGBE_ANLP1_SYM_PAUSE) &&
+			   (anlp1_reg & IXGBE_ANLP1_ASM_PAUSE)) {
 			hw->fc.current_mode = ixgbe_fc_rx_pause;
 			hw_dbg(hw, "Flow Control = RX PAUSE frames only.\n");
+		} else {
+			hw->fc.current_mode = ixgbe_fc_none;
+			hw_dbg(hw, "Flow Control = NONE.\n");
 		}
-	} else if (!(pcs_anadv_reg & IXGBE_PCS1GANA_SYM_PAUSE) &&
-		   (pcs_anadv_reg & IXGBE_PCS1GANA_ASM_PAUSE) &&
-		   (pcs_lpab_reg & IXGBE_PCS1GANA_SYM_PAUSE) &&
-		   (pcs_lpab_reg & IXGBE_PCS1GANA_ASM_PAUSE)) {
-		hw->fc.current_mode = ixgbe_fc_tx_pause;
-		hw_dbg(hw, "Flow Control = TX PAUSE frames only.\n");
-	} else if ((pcs_anadv_reg & IXGBE_PCS1GANA_SYM_PAUSE) &&
-		   (pcs_anadv_reg & IXGBE_PCS1GANA_ASM_PAUSE) &&
-		   !(pcs_lpab_reg & IXGBE_PCS1GANA_SYM_PAUSE) &&
-		   (pcs_lpab_reg & IXGBE_PCS1GANA_ASM_PAUSE)) {
-		hw->fc.current_mode = ixgbe_fc_rx_pause;
-		hw_dbg(hw, "Flow Control = RX PAUSE frames only.\n");
-	} else {
-		hw->fc.current_mode = ixgbe_fc_none;
-		hw_dbg(hw, "Flow Control = NONE.\n");
 	}
-
 	/* Record that current_mode is the result of a successful autoneg */
 	hw->fc.fc_was_autonegged = true;
 
@@ -1919,7 +1984,7 @@ static s32 ixgbe_setup_fc(struct ixgbe_hw *hw, s32 packetbuf_num)
 #endif /* CONFIG_DCB */
 	default:
 		hw_dbg(hw, "Flow control param set incorrectly\n");
-		ret_val = -IXGBE_ERR_CONFIG;
+		ret_val = IXGBE_ERR_CONFIG;
 		goto out;
 		break;
 	}
@@ -1927,9 +1992,6 @@ static s32 ixgbe_setup_fc(struct ixgbe_hw *hw, s32 packetbuf_num)
 	IXGBE_WRITE_REG(hw, IXGBE_PCS1GANA, reg);
 	reg = IXGBE_READ_REG(hw, IXGBE_PCS1GLCTL);
 
-	/* Enable and restart autoneg to inform the link partner */
-	reg |= IXGBE_PCS1GLCTL_AN_ENABLE | IXGBE_PCS1GLCTL_AN_RESTART;
-
 	/* Disable AN timeout */
 	if (hw->fc.strict_ieee)
 		reg &= ~IXGBE_PCS1GLCTL_AN_1G_TIMEOUT_EN;
@@ -1937,6 +1999,70 @@ static s32 ixgbe_setup_fc(struct ixgbe_hw *hw, s32 packetbuf_num)
 	IXGBE_WRITE_REG(hw, IXGBE_PCS1GLCTL, reg);
 	hw_dbg(hw, "Set up FC; PCS1GLCTL = 0x%08X\n", reg);
 
+	/*
+	 * Set up the 10G flow control advertisement registers so the HW
+	 * can do fc autoneg once the cable is plugged in.  If we end up
+	 * using 1g instead, this is harmless.
+	 */
+	reg = IXGBE_READ_REG(hw, IXGBE_AUTOC);
+
+	/*
+	 * The possible values of fc.requested_mode are:
+	 * 0: Flow control is completely disabled
+	 * 1: Rx flow control is enabled (we can receive pause frames,
+	 *    but not send pause frames).
+	 * 2: Tx flow control is enabled (we can send pause frames but
+	 *    we do not support receiving pause frames).
+	 * 3: Both Rx and Tx flow control (symmetric) are enabled.
+	 * other: Invalid.
+	 */
+	switch (hw->fc.requested_mode) {
+	case ixgbe_fc_none:
+		/* Flow control completely disabled by software override. */
+		reg &= ~(IXGBE_AUTOC_SYM_PAUSE | IXGBE_AUTOC_ASM_PAUSE);
+		break;
+	case ixgbe_fc_rx_pause:
+		/*
+		 * Rx Flow control is enabled and Tx Flow control is
+		 * disabled by software override. Since there really
+		 * isn't a way to advertise that we are capable of RX
+		 * Pause ONLY, we will advertise that we support both
+		 * symmetric and asymmetric Rx PAUSE.  Later, we will
+		 * disable the adapter's ability to send PAUSE frames.
+		 */
+		reg |= (IXGBE_AUTOC_SYM_PAUSE | IXGBE_AUTOC_ASM_PAUSE);
+		break;
+	case ixgbe_fc_tx_pause:
+		/*
+		 * Tx Flow control is enabled, and Rx Flow control is
+		 * disabled by software override.
+		 */
+		reg |= (IXGBE_AUTOC_ASM_PAUSE);
+		reg &= ~(IXGBE_AUTOC_SYM_PAUSE);
+		break;
+	case ixgbe_fc_full:
+		/* Flow control (both Rx and Tx) is enabled by SW override. */
+		reg |= (IXGBE_AUTOC_SYM_PAUSE | IXGBE_AUTOC_ASM_PAUSE);
+		break;
+#ifdef CONFIG_DCB
+	case ixgbe_fc_pfc:
+		goto out;
+		break;
+#endif /* CONFIG_DCB */
+	default:
+		hw_dbg(hw, "Flow control param set incorrectly\n");
+		ret_val = IXGBE_ERR_CONFIG;
+		goto out;
+		break;
+	}
+	/*
+	 * AUTOC restart handles negotiation of 1G and 10G. There is
+	 * no need to set the PCS1GCTL register.
+	 */
+	reg |= IXGBE_AUTOC_AN_RESTART;
+	IXGBE_WRITE_REG(hw, IXGBE_AUTOC, reg);
+	hw_dbg(hw, "Set up FC; IXGBE_AUTOC = 0x%08X\n", reg);
+
 out:
 	return ret_val;
 }
@@ -2000,7 +2126,7 @@ s32 ixgbe_acquire_swfw_sync(struct ixgbe_hw *hw, u16 mask)
 
 	while (timeout) {
 		if (ixgbe_get_eeprom_semaphore(hw))
-			return -IXGBE_ERR_SWFW_SYNC;
+			return IXGBE_ERR_SWFW_SYNC;
 
 		gssr = IXGBE_READ_REG(hw, IXGBE_GSSR);
 		if (!(gssr & (fwmask | swmask)))
@@ -2017,7 +2143,7 @@ s32 ixgbe_acquire_swfw_sync(struct ixgbe_hw *hw, u16 mask)
 
 	if (!timeout) {
 		hw_dbg(hw, "Driver can't access resource, GSSR timeout.\n");
-		return -IXGBE_ERR_SWFW_SYNC;
+		return IXGBE_ERR_SWFW_SYNC;
 	}
 
 	gssr |= swmask;
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index 8761d78..7c93e92 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -1336,6 +1336,8 @@
 #define IXGBE_AUTOC_KX4_SUPP    0x80000000
 #define IXGBE_AUTOC_KX_SUPP     0x40000000
 #define IXGBE_AUTOC_PAUSE       0x30000000
+#define IXGBE_AUTOC_ASM_PAUSE   0x20000000
+#define IXGBE_AUTOC_SYM_PAUSE   0x10000000
 #define IXGBE_AUTOC_RF          0x08000000
 #define IXGBE_AUTOC_PD_TMR      0x06000000
 #define IXGBE_AUTOC_AN_RX_LOOSE 0x01000000
@@ -1404,6 +1406,8 @@
 #define IXGBE_LINK_UP_TIME      90 /* 9.0 Seconds */
 #define IXGBE_AUTO_NEG_TIME     45 /* 4.5 Seconds */
 
+#define IXGBE_LINKS2_AN_SUPPORTED   0x00000040
+
 /* PCS1GLSTA Bit Masks */
 #define IXGBE_PCS1GLSTA_LINK_OK         1
 #define IXGBE_PCS1GLSTA_SYNK_OK         0x10
@@ -1424,6 +1428,11 @@
 #define IXGBE_PCS1GLCTL_AN_ENABLE       0x10000
 #define IXGBE_PCS1GLCTL_AN_RESTART      0x20000
 
+/* ANLP1 Bit Masks */
+#define IXGBE_ANLP1_PAUSE               0x0C00
+#define IXGBE_ANLP1_SYM_PAUSE           0x0400
+#define IXGBE_ANLP1_ASM_PAUSE           0x0800
+
 /* SW Semaphore Register bitmasks */
 #define IXGBE_SWSM_SMBI 0x00000001 /* Driver Semaphore bit */
 #define IXGBE_SWSM_SWESMBI 0x00000002 /* FW Semaphore bit */


^ permalink raw reply related

* [net-2.6 PATCH 1/5] ixgbe: Fix disabling of relaxed ordering with Tx DCA
From: Jeff Kirsher @ 2009-09-30 22:07 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, Peter P Waskiewicz Jr, Jeff Kirsher

From: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>

82599 has a different register offset for the Tx DCA control registers.
We disable relaxed ordering of the descriptor writebacks for Tx head
writeback, but this was not being done properly for 82599, since the
82598 register offset was used for it.  However, this shouldn't be a
visible issue, since ixgbe doesn't use Tx head writeback.  This patch
just makes sure we're not doing blind writes to offsets we don't expect.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe_main.c |   23 ++++++++++++++++++++---
 1 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index c407bd9..fe52736 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -1885,12 +1885,29 @@ static void ixgbe_configure_tx(struct ixgbe_adapter *adapter)
 		IXGBE_WRITE_REG(hw, IXGBE_TDT(j), 0);
 		adapter->tx_ring[i].head = IXGBE_TDH(j);
 		adapter->tx_ring[i].tail = IXGBE_TDT(j);
-		/* Disable Tx Head Writeback RO bit, since this hoses
+		/*
+		 * Disable Tx Head Writeback RO bit, since this hoses
 		 * bookkeeping if things aren't delivered in order.
 		 */
-		txctrl = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(j));
+		switch (hw->mac.type) {
+		case ixgbe_mac_82598EB:
+			txctrl = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(j));
+			break;
+		case ixgbe_mac_82599EB:
+		default:
+			txctrl = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(j));
+			break;
+		}
 		txctrl &= ~IXGBE_DCA_TXCTRL_TX_WB_RO_EN;
-		IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(j), txctrl);
+		switch (hw->mac.type) {
+		case ixgbe_mac_82598EB:
+			IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(j), txctrl);
+			break;
+		case ixgbe_mac_82599EB:
+		default:
+			IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(j), txctrl);
+			break;
+		}
 	}
 	if (hw->mac.type == ixgbe_mac_82599EB) {
 		/* We enable 8 traffic classes, DCB only */


^ permalink raw reply related

* Re: r8169 chips on some Intel D945GSEJT boards fail to work after PXE boot
From: Francois Romieu @ 2009-09-30 22:07 UTC (permalink / raw)
  To: Simon Farnsworth; +Cc: netdev
In-Reply-To: <4ABB5435.6040609@onelan.com>

Simon Farnsworth <simon.farnsworth@onelan.com> :
[...]
> We've tried this, and we've tried 2GB and 1GB modules; the failure to
> boot sticks with the board, not with the memory module. On my most
> recent attempt, the failing board isn't showing a correctable error
> status, so I've not yet tried your patch, on the assumption that it just
> clears the error status.
> 
> Is my assumption wrong? If not, is there anything else I can do that
> would help you diagnose this?

Try this against 2.6.31 or latest -rc.


diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 50c6a3c..74488a6 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -115,7 +115,9 @@ enum mac_version {
 	RTL_GIGA_MAC_VER_22 = 0x16, // 8168C
 	RTL_GIGA_MAC_VER_23 = 0x17, // 8168CP
 	RTL_GIGA_MAC_VER_24 = 0x18, // 8168CP
-	RTL_GIGA_MAC_VER_25 = 0x19  // 8168D
+	RTL_GIGA_MAC_VER_25 = 0x19, // 8168D
+	RTL_GIGA_MAC_VER_26 = 0x1a, // 8168D
+	RTL_GIGA_MAC_VER_27 = 0x1b  // 8168DP
 };
 
 #define _R(NAME,MAC,MASK) \
@@ -150,7 +152,9 @@ static const struct {
 	_R("RTL8168c/8111c",	RTL_GIGA_MAC_VER_22, 0xff7e1880), // PCI-E
 	_R("RTL8168cp/8111cp",	RTL_GIGA_MAC_VER_23, 0xff7e1880), // PCI-E
 	_R("RTL8168cp/8111cp",	RTL_GIGA_MAC_VER_24, 0xff7e1880), // PCI-E
-	_R("RTL8168d/8111d",	RTL_GIGA_MAC_VER_25, 0xff7e1880)  // PCI-E
+	_R("RTL8168d/8111d",	RTL_GIGA_MAC_VER_25, 0xff7e1880), // PCI-E
+	_R("RTL8168d/8111d",	RTL_GIGA_MAC_VER_26, 0xff7e1880), // PCI-E
+	_R("RTL8168dp/8111dp",	RTL_GIGA_MAC_VER_27, 0xff7e1880)  // PCI-E
 };
 #undef _R
 
@@ -253,6 +257,13 @@ enum rtl8168_8101_registers {
 	DBG_REG			= 0xd1,
 #define	FIX_NAK_1			(1 << 4)
 #define	FIX_NAK_2			(1 << 3)
+	EFUSEAR			= 0xdc,
+#define	EFUSEAR_FLAG			0x80000000
+#define	EFUSEAR_WRITE_CMD		0x80000000
+#define	EFUSEAR_READ_CMD		0x00000000
+#define	EFUSEAR_REG_MASK		0x03ff
+#define	EFUSEAR_REG_SHIFT		8
+#define	EFUSEAR_DATA_MASK		0xff
 };
 
 enum rtl_register_content {
@@ -568,6 +579,14 @@ static void mdio_patch(void __iomem *ioaddr, int reg_addr, int value)
 	mdio_write(ioaddr, reg_addr, mdio_read(ioaddr, reg_addr) | value);
 }
 
+static void mdio_plus_minus(void __iomem *ioaddr, int reg_addr, int p, int m)
+{
+	int val;
+
+	val = mdio_read(ioaddr, reg_addr);
+	mdio_write(ioaddr, reg_addr, (val | p) & ~m);
+}
+
 static void rtl_mdio_write(struct net_device *dev, int phy_id, int location,
 			   int val)
 {
@@ -651,6 +670,24 @@ static u32 rtl_csi_read(void __iomem *ioaddr, int addr)
 	return value;
 }
 
+static u8 rtl8168d_efuse_read(void __iomem *ioaddr, int reg_addr)
+{
+	u8 value = 0xff;
+	unsigned int i;
+
+	RTL_W32(EFUSEAR, (reg_addr & EFUSEAR_REG_MASK) << EFUSEAR_REG_SHIFT);
+
+	for (i = 0; i < 300; i++) {
+		if (RTL_R32(EFUSEAR) & EFUSEAR_FLAG) {
+			value = RTL_R32(EFUSEAR) & EFUSEAR_DATA_MASK;
+			break;
+		}
+		udelay(100);
+	}
+
+	return value;
+}
+
 static void rtl8169_irq_mask_and_ack(void __iomem *ioaddr)
 {
 	RTL_W16(IntrMask, 0x0000);
@@ -1243,7 +1280,10 @@ static void rtl8169_get_mac_version(struct rtl8169_private *tp,
 		int mac_version;
 	} mac_info[] = {
 		/* 8168D family. */
-		{ 0x7c800000, 0x28000000,	RTL_GIGA_MAC_VER_25 },
+		{ 0x7cf00000, 0x28300000,	RTL_GIGA_MAC_VER_26 },
+		{ 0x7cf00000, 0x28100000,	RTL_GIGA_MAC_VER_25 },
+		{ 0x7c800000, 0x28800000,	RTL_GIGA_MAC_VER_27 },
+		{ 0x7c800000, 0x28000000,	RTL_GIGA_MAC_VER_26 },
 
 		/* 8168C family. */
 		{ 0x7cf00000, 0x3ca00000,	RTL_GIGA_MAC_VER_24 },
@@ -1648,74 +1688,903 @@ static void rtl8168c_4_hw_phy_config(void __iomem *ioaddr)
 	rtl8168c_3_hw_phy_config(ioaddr);
 }
 
-static void rtl8168d_hw_phy_config(void __iomem *ioaddr)
+static void rtl8168d_1_hw_phy_config(void __iomem *ioaddr)
 {
 	struct phy_reg phy_reg_init_0[] = {
 		{ 0x1f, 0x0001 },
-		{ 0x09, 0x2770 },
-		{ 0x08, 0x04d0 },
-		{ 0x0b, 0xad15 },
-		{ 0x0c, 0x5bf0 },
-		{ 0x1c, 0xf101 },
+		{ 0x06, 0x4064 },
+		{ 0x07, 0x2863 },
+		{ 0x08, 0x059c },
+		{ 0x09, 0x26b4 },
+		{ 0x0a, 0x6a19 },
+		{ 0x0b, 0xdcc8 },
+		{ 0x10, 0xf06d },
+		{ 0x14, 0x7f68 },
+		{ 0x18, 0x7fd9 },
+		{ 0x1c, 0xf0ff },
+		{ 0x1d, 0x3d9c },
 		{ 0x1f, 0x0003 },
-		{ 0x14, 0x94d7 },
-		{ 0x12, 0xf4d6 },
-		{ 0x09, 0xca0f },
-		{ 0x1f, 0x0002 },
-		{ 0x0b, 0x0b10 },
-		{ 0x0c, 0xd1f7 },
-		{ 0x1f, 0x0002 },
-		{ 0x06, 0x5461 },
+		{ 0x12, 0xf49f },
+		{ 0x13, 0x070b },
+		{ 0x1a, 0x05ad },
+		{ 0x14, 0x94c0 }
+	};
+	struct phy_reg phy_reg_init_1[] = {
 		{ 0x1f, 0x0002 },
-		{ 0x05, 0x6662 },
+		{ 0x06, 0x5561 },
+		{ 0x1f, 0x0005 },
+		{ 0x05, 0x8332 },
+		{ 0x06, 0x5561 }
+	};
+	struct phy_reg phy_reg_init_2[] = {
+		{ 0x1f, 0x0005 },
+		{ 0x05, 0xffc2 },
+		{ 0x1f, 0x0005 },
+		{ 0x05, 0x8000 },
+		{ 0x06, 0xf8f9 },
+		{ 0x06, 0xfaef },
+		{ 0x06, 0x59ee },
+		{ 0x06, 0xf8ea },
+		{ 0x06, 0x00ee },
+		{ 0x06, 0xf8eb },
+		{ 0x06, 0x00e0 },
+		{ 0x06, 0xf87c },
+		{ 0x06, 0xe1f8 },
+		{ 0x06, 0x7d59 },
+		{ 0x06, 0x0fef },
+		{ 0x06, 0x0139 },
+		{ 0x06, 0x029e },
+		{ 0x06, 0x06ef },
+		{ 0x06, 0x1039 },
+		{ 0x06, 0x089f },
+		{ 0x06, 0x2aee },
+		{ 0x06, 0xf8ea },
+		{ 0x06, 0x00ee },
+		{ 0x06, 0xf8eb },
+		{ 0x06, 0x01e0 },
+		{ 0x06, 0xf87c },
+		{ 0x06, 0xe1f8 },
+		{ 0x06, 0x7d58 },
+		{ 0x06, 0x409e },
+		{ 0x06, 0x0f39 },
+		{ 0x06, 0x46aa },
+		{ 0x06, 0x0bbf },
+		{ 0x06, 0x8290 },
+		{ 0x06, 0xd682 },
+		{ 0x06, 0x9802 },
+		{ 0x06, 0x014f },
+		{ 0x06, 0xae09 },
+		{ 0x06, 0xbf82 },
+		{ 0x06, 0x98d6 },
+		{ 0x06, 0x82a0 },
+		{ 0x06, 0x0201 },
+		{ 0x06, 0x4fef },
+		{ 0x06, 0x95fe },
+		{ 0x06, 0xfdfc },
+		{ 0x06, 0x05f8 },
+		{ 0x06, 0xf9fa },
+		{ 0x06, 0xeef8 },
+		{ 0x06, 0xea00 },
+		{ 0x06, 0xeef8 },
+		{ 0x06, 0xeb00 },
+		{ 0x06, 0xe2f8 },
+		{ 0x06, 0x7ce3 },
+		{ 0x06, 0xf87d },
+		{ 0x06, 0xa511 },
+		{ 0x06, 0x1112 },
+		{ 0x06, 0xd240 },
+		{ 0x06, 0xd644 },
+		{ 0x06, 0x4402 },
+		{ 0x06, 0x8217 },
+		{ 0x06, 0xd2a0 },
+		{ 0x06, 0xd6aa },
+		{ 0x06, 0xaa02 },
+		{ 0x06, 0x8217 },
+		{ 0x06, 0xae0f },
+		{ 0x06, 0xa544 },
+		{ 0x06, 0x4402 },
+		{ 0x06, 0xae4d },
+		{ 0x06, 0xa5aa },
+		{ 0x06, 0xaa02 },
+		{ 0x06, 0xae47 },
+		{ 0x06, 0xaf82 },
+		{ 0x06, 0x13ee },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x00ee },
+		{ 0x06, 0x834d },
+		{ 0x06, 0x0fee },
+		{ 0x06, 0x834c },
+		{ 0x06, 0x0fee },
+		{ 0x06, 0x834f },
+		{ 0x06, 0x00ee },
+		{ 0x06, 0x8351 },
+		{ 0x06, 0x00ee },
+		{ 0x06, 0x834a },
+		{ 0x06, 0xffee },
+		{ 0x06, 0x834b },
+		{ 0x06, 0xffe0 },
+		{ 0x06, 0x8330 },
+		{ 0x06, 0xe183 },
+		{ 0x06, 0x3158 },
+		{ 0x06, 0xfee4 },
+		{ 0x06, 0xf88a },
+		{ 0x06, 0xe5f8 },
+		{ 0x06, 0x8be0 },
+		{ 0x06, 0x8332 },
+		{ 0x06, 0xe183 },
+		{ 0x06, 0x3359 },
+		{ 0x06, 0x0fe2 },
+		{ 0x06, 0x834d },
+		{ 0x06, 0x0c24 },
+		{ 0x06, 0x5af0 },
+		{ 0x06, 0x1e12 },
+		{ 0x06, 0xe4f8 },
+		{ 0x06, 0x8ce5 },
+		{ 0x06, 0xf88d },
+		{ 0x06, 0xaf82 },
+		{ 0x06, 0x13e0 },
+		{ 0x06, 0x834f },
+		{ 0x06, 0x10e4 },
+		{ 0x06, 0x834f },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4e78 },
+		{ 0x06, 0x009f },
+		{ 0x06, 0x0ae0 },
+		{ 0x06, 0x834f },
+		{ 0x06, 0xa010 },
+		{ 0x06, 0xa5ee },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x01e0 },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x7805 },
+		{ 0x06, 0x9e9a },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4e78 },
+		{ 0x06, 0x049e },
+		{ 0x06, 0x10e0 },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x7803 },
+		{ 0x06, 0x9e0f },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4e78 },
+		{ 0x06, 0x019e },
+		{ 0x06, 0x05ae },
+		{ 0x06, 0x0caf },
+		{ 0x06, 0x81f8 },
+		{ 0x06, 0xaf81 },
+		{ 0x06, 0xa3af },
+		{ 0x06, 0x81dc },
+		{ 0x06, 0xaf82 },
+		{ 0x06, 0x13ee },
+		{ 0x06, 0x8348 },
+		{ 0x06, 0x00ee },
+		{ 0x06, 0x8349 },
+		{ 0x06, 0x00e0 },
+		{ 0x06, 0x8351 },
+		{ 0x06, 0x10e4 },
+		{ 0x06, 0x8351 },
+		{ 0x06, 0x5801 },
+		{ 0x06, 0x9fea },
+		{ 0x06, 0xd000 },
+		{ 0x06, 0xd180 },
+		{ 0x06, 0x1f66 },
+		{ 0x06, 0xe2f8 },
+		{ 0x06, 0xeae3 },
+		{ 0x06, 0xf8eb },
+		{ 0x06, 0x5af8 },
+		{ 0x06, 0x1e20 },
+		{ 0x06, 0xe6f8 },
+		{ 0x06, 0xeae5 },
+		{ 0x06, 0xf8eb },
+		{ 0x06, 0xd302 },
+		{ 0x06, 0xb3fe },
+		{ 0x06, 0xe2f8 },
+		{ 0x06, 0x7cef },
+		{ 0x06, 0x325b },
+		{ 0x06, 0x80e3 },
+		{ 0x06, 0xf87d },
+		{ 0x06, 0x9e03 },
+		{ 0x06, 0x7dff },
+		{ 0x06, 0xff0d },
+		{ 0x06, 0x581c },
+		{ 0x06, 0x551a },
+		{ 0x06, 0x6511 },
+		{ 0x06, 0xa190 },
+		{ 0x06, 0xd3e2 },
+		{ 0x06, 0x8348 },
+		{ 0x06, 0xe383 },
+		{ 0x06, 0x491b },
+		{ 0x06, 0x56ab },
+		{ 0x06, 0x08ef },
+		{ 0x06, 0x56e6 },
+		{ 0x06, 0x8348 },
+		{ 0x06, 0xe783 },
+		{ 0x06, 0x4910 },
+		{ 0x06, 0xd180 },
+		{ 0x06, 0x1f66 },
+		{ 0x06, 0xa004 },
+		{ 0x06, 0xb9e2 },
+		{ 0x06, 0x8348 },
+		{ 0x06, 0xe383 },
+		{ 0x06, 0x49ef },
+		{ 0x06, 0x65e2 },
+		{ 0x06, 0x834a },
+		{ 0x06, 0xe383 },
+		{ 0x06, 0x4b1b },
+		{ 0x06, 0x56aa },
+		{ 0x06, 0x0eef },
+		{ 0x06, 0x56e6 },
+		{ 0x06, 0x834a },
+		{ 0x06, 0xe783 },
+		{ 0x06, 0x4be2 },
+		{ 0x06, 0x834d },
+		{ 0x06, 0xe683 },
+		{ 0x06, 0x4ce0 },
+		{ 0x06, 0x834d },
+		{ 0x06, 0xa000 },
+		{ 0x06, 0x0caf },
+		{ 0x06, 0x81dc },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4d10 },
+		{ 0x06, 0xe483 },
+		{ 0x06, 0x4dae },
+		{ 0x06, 0x0480 },
+		{ 0x06, 0xe483 },
+		{ 0x06, 0x4de0 },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x7803 },
+		{ 0x06, 0x9e0b },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4e78 },
+		{ 0x06, 0x049e },
+		{ 0x06, 0x04ee },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x02e0 },
+		{ 0x06, 0x8332 },
+		{ 0x06, 0xe183 },
+		{ 0x06, 0x3359 },
+		{ 0x06, 0x0fe2 },
+		{ 0x06, 0x834d },
+		{ 0x06, 0x0c24 },
+		{ 0x06, 0x5af0 },
+		{ 0x06, 0x1e12 },
+		{ 0x06, 0xe4f8 },
+		{ 0x06, 0x8ce5 },
+		{ 0x06, 0xf88d },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x30e1 },
+		{ 0x06, 0x8331 },
+		{ 0x06, 0x6801 },
+		{ 0x06, 0xe4f8 },
+		{ 0x06, 0x8ae5 },
+		{ 0x06, 0xf88b },
+		{ 0x06, 0xae37 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4e03 },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4ce1 },
+		{ 0x06, 0x834d },
+		{ 0x06, 0x1b01 },
+		{ 0x06, 0x9e04 },
+		{ 0x06, 0xaaa1 },
+		{ 0x06, 0xaea8 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4e04 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4f00 },
+		{ 0x06, 0xaeab },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4f78 },
+		{ 0x06, 0x039f },
+		{ 0x06, 0x14ee },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x05d2 },
+		{ 0x06, 0x40d6 },
+		{ 0x06, 0x5554 },
+		{ 0x06, 0x0282 },
+		{ 0x06, 0x17d2 },
+		{ 0x06, 0xa0d6 },
+		{ 0x06, 0xba00 },
+		{ 0x06, 0x0282 },
+		{ 0x06, 0x17fe },
+		{ 0x06, 0xfdfc },
+		{ 0x06, 0x05f8 },
+		{ 0x06, 0xe0f8 },
+		{ 0x06, 0x60e1 },
+		{ 0x06, 0xf861 },
+		{ 0x06, 0x6802 },
+		{ 0x06, 0xe4f8 },
+		{ 0x06, 0x60e5 },
+		{ 0x06, 0xf861 },
+		{ 0x06, 0xe0f8 },
+		{ 0x06, 0x48e1 },
+		{ 0x06, 0xf849 },
+		{ 0x06, 0x580f },
+		{ 0x06, 0x1e02 },
+		{ 0x06, 0xe4f8 },
+		{ 0x06, 0x48e5 },
+		{ 0x06, 0xf849 },
+		{ 0x06, 0xd000 },
+		{ 0x06, 0x0282 },
+		{ 0x06, 0x5bbf },
+		{ 0x06, 0x8350 },
+		{ 0x06, 0xef46 },
+		{ 0x06, 0xdc19 },
+		{ 0x06, 0xddd0 },
+		{ 0x06, 0x0102 },
+		{ 0x06, 0x825b },
+		{ 0x06, 0x0282 },
+		{ 0x06, 0x77e0 },
+		{ 0x06, 0xf860 },
+		{ 0x06, 0xe1f8 },
+		{ 0x06, 0x6158 },
+		{ 0x06, 0xfde4 },
+		{ 0x06, 0xf860 },
+		{ 0x06, 0xe5f8 },
+		{ 0x06, 0x61fc },
+		{ 0x06, 0x04f9 },
+		{ 0x06, 0xfafb },
+		{ 0x06, 0xc6bf },
+		{ 0x06, 0xf840 },
+		{ 0x06, 0xbe83 },
+		{ 0x06, 0x50a0 },
+		{ 0x06, 0x0101 },
+		{ 0x06, 0x071b },
+		{ 0x06, 0x89cf },
+		{ 0x06, 0xd208 },
+		{ 0x06, 0xebdb },
+		{ 0x06, 0x19b2 },
+		{ 0x06, 0xfbff },
+		{ 0x06, 0xfefd },
+		{ 0x06, 0x04f8 },
+		{ 0x06, 0xe0f8 },
+		{ 0x06, 0x48e1 },
+		{ 0x06, 0xf849 },
+		{ 0x06, 0x6808 },
+		{ 0x06, 0xe4f8 },
+		{ 0x06, 0x48e5 },
+		{ 0x06, 0xf849 },
+		{ 0x06, 0x58f7 },
+		{ 0x06, 0xe4f8 },
+		{ 0x06, 0x48e5 },
+		{ 0x06, 0xf849 },
+		{ 0x06, 0xfc04 },
+		{ 0x06, 0x4d20 },
+		{ 0x06, 0x0002 },
+		{ 0x06, 0x4e22 },
+		{ 0x06, 0x0002 },
+		{ 0x06, 0x4ddf },
+		{ 0x06, 0xff01 },
+		{ 0x06, 0x4edd },
+		{ 0x06, 0xff01 },
+		{ 0x05, 0x83d4 },
+		{ 0x06, 0x8000 },
+		{ 0x05, 0x83d8 },
+		{ 0x06, 0x8051 },
+		{ 0x02, 0x6010 },
+		{ 0x03, 0xdc00 },
+		{ 0x05, 0xfff6 },
+		{ 0x06, 0x00fc },
 		{ 0x1f, 0x0000 },
-		{ 0x14, 0x0060 },
+
 		{ 0x1f, 0x0000 },
-		{ 0x0d, 0xf8a0 },
+		{ 0x0d, 0xf880 },
+		{ 0x1f, 0x0000 }
+	};
+
+	rtl_phy_write(ioaddr, phy_reg_init_0, ARRAY_SIZE(phy_reg_init_0));
+
+	mdio_write(ioaddr, 0x1f, 0x0002);
+	mdio_plus_minus(ioaddr, 0x0b, 0x0010, 0x00ef);
+	mdio_plus_minus(ioaddr, 0x0c, 0xa200, 0x5d00);
+
+	rtl_phy_write(ioaddr, phy_reg_init_1, ARRAY_SIZE(phy_reg_init_1));
+
+	if (rtl8168d_efuse_read(ioaddr, 0x01) == 0xb1) {
+		struct phy_reg phy_reg_init[] = {
+			{ 0x1f, 0x0002 },
+			{ 0x05, 0x669a },
+			{ 0x1f, 0x0005 },
+			{ 0x05, 0x8330 },
+			{ 0x06, 0x669a },
+			{ 0x1f, 0x0002 }
+		};
+		int val;
+
+		rtl_phy_write(ioaddr, phy_reg_init, ARRAY_SIZE(phy_reg_init));
+
+		val = mdio_read(ioaddr, 0x0d);
+
+		if ((val & 0x00ff) != 0x006c) {
+			u32 set[] = {
+				0x0065, 0x0066, 0x0067, 0x0068,
+				0x0069, 0x006a, 0x006b, 0x006c
+			};
+			int i;
+
+			mdio_write(ioaddr, 0x1f, 0x0002);
+
+			val &= 0xff00;
+			for (i = 0; i < ARRAY_SIZE(set); i++)
+				mdio_write(ioaddr, 0x0d, val | set[i]);
+		}
+	} else {
+		struct phy_reg phy_reg_init[] = {
+			{ 0x1f, 0x0002 },
+			{ 0x05, 0x6662 },
+			{ 0x1f, 0x0005 },
+			{ 0x05, 0x8330 },
+			{ 0x06, 0x6662 }
+		};
+
+		rtl_phy_write(ioaddr, phy_reg_init, ARRAY_SIZE(phy_reg_init));
+	}
+
+	mdio_write(ioaddr, 0x1f, 0x0002);
+	mdio_patch(ioaddr, 0x0d, 0x0300);
+	mdio_patch(ioaddr, 0x0f, 0x0010);
+
+	mdio_write(ioaddr, 0x1f, 0x0002);
+	mdio_plus_minus(ioaddr, 0x02, 0x0100, 0x0600);
+	mdio_plus_minus(ioaddr, 0x03, 0x0000, 0xe000);
+
+	rtl_phy_write(ioaddr, phy_reg_init_2, ARRAY_SIZE(phy_reg_init_2));
+}
+
+static void rtl8168d_2_hw_phy_config(void __iomem *ioaddr)
+{
+	struct phy_reg phy_reg_init_0[] = {
+		{ 0x1f, 0x0001 },
+		{ 0x06, 0x4064 },
+		{ 0x07, 0x2863 },
+		{ 0x08, 0x059c },
+		{ 0x09, 0x26b4 },
+		{ 0x0a, 0x6a19 },
+		{ 0x0b, 0xdcc8 },
+		{ 0x10, 0xf06d },
+		{ 0x14, 0x7f68 },
+		{ 0x18, 0x7fd9 },
+		{ 0x1c, 0xf0ff },
+		{ 0x1d, 0x3d9c },
+		{ 0x1f, 0x0003 },
+		{ 0x12, 0xf49f },
+		{ 0x13, 0x070b },
+		{ 0x1a, 0x05ad },
+		{ 0x14, 0x94c0 },
+
+		{ 0x1f, 0x0002 },
+		{ 0x06, 0x5561 },
 		{ 0x1f, 0x0005 },
-		{ 0x05, 0xffc2 }
+		{ 0x05, 0x8332 },
+		{ 0x06, 0x5561 }
+	};
+	struct phy_reg phy_reg_init_1[] = {
+		{ 0x1f, 0x0005 },
+		{ 0x05, 0xffc2 },
+		{ 0x1f, 0x0005 },
+		{ 0x05, 0x8000 },
+		{ 0x06, 0xf8f9 },
+		{ 0x06, 0xfaee },
+		{ 0x06, 0xf8ea },
+		{ 0x06, 0x00ee },
+		{ 0x06, 0xf8eb },
+		{ 0x06, 0x00e2 },
+		{ 0x06, 0xf87c },
+		{ 0x06, 0xe3f8 },
+		{ 0x06, 0x7da5 },
+		{ 0x06, 0x1111 },
+		{ 0x06, 0x12d2 },
+		{ 0x06, 0x40d6 },
+		{ 0x06, 0x4444 },
+		{ 0x06, 0x0281 },
+		{ 0x06, 0xc6d2 },
+		{ 0x06, 0xa0d6 },
+		{ 0x06, 0xaaaa },
+		{ 0x06, 0x0281 },
+		{ 0x06, 0xc6ae },
+		{ 0x06, 0x0fa5 },
+		{ 0x06, 0x4444 },
+		{ 0x06, 0x02ae },
+		{ 0x06, 0x4da5 },
+		{ 0x06, 0xaaaa },
+		{ 0x06, 0x02ae },
+		{ 0x06, 0x47af },
+		{ 0x06, 0x81c2 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4e00 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4d0f },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4c0f },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4f00 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x5100 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4aff },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4bff },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x30e1 },
+		{ 0x06, 0x8331 },
+		{ 0x06, 0x58fe },
+		{ 0x06, 0xe4f8 },
+		{ 0x06, 0x8ae5 },
+		{ 0x06, 0xf88b },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x32e1 },
+		{ 0x06, 0x8333 },
+		{ 0x06, 0x590f },
+		{ 0x06, 0xe283 },
+		{ 0x06, 0x4d0c },
+		{ 0x06, 0x245a },
+		{ 0x06, 0xf01e },
+		{ 0x06, 0x12e4 },
+		{ 0x06, 0xf88c },
+		{ 0x06, 0xe5f8 },
+		{ 0x06, 0x8daf },
+		{ 0x06, 0x81c2 },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4f10 },
+		{ 0x06, 0xe483 },
+		{ 0x06, 0x4fe0 },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x7800 },
+		{ 0x06, 0x9f0a },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4fa0 },
+		{ 0x06, 0x10a5 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4e01 },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4e78 },
+		{ 0x06, 0x059e },
+		{ 0x06, 0x9ae0 },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x7804 },
+		{ 0x06, 0x9e10 },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4e78 },
+		{ 0x06, 0x039e },
+		{ 0x06, 0x0fe0 },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x7801 },
+		{ 0x06, 0x9e05 },
+		{ 0x06, 0xae0c },
+		{ 0x06, 0xaf81 },
+		{ 0x06, 0xa7af },
+		{ 0x06, 0x8152 },
+		{ 0x06, 0xaf81 },
+		{ 0x06, 0x8baf },
+		{ 0x06, 0x81c2 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4800 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4900 },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x5110 },
+		{ 0x06, 0xe483 },
+		{ 0x06, 0x5158 },
+		{ 0x06, 0x019f },
+		{ 0x06, 0xead0 },
+		{ 0x06, 0x00d1 },
+		{ 0x06, 0x801f },
+		{ 0x06, 0x66e2 },
+		{ 0x06, 0xf8ea },
+		{ 0x06, 0xe3f8 },
+		{ 0x06, 0xeb5a },
+		{ 0x06, 0xf81e },
+		{ 0x06, 0x20e6 },
+		{ 0x06, 0xf8ea },
+		{ 0x06, 0xe5f8 },
+		{ 0x06, 0xebd3 },
+		{ 0x06, 0x02b3 },
+		{ 0x06, 0xfee2 },
+		{ 0x06, 0xf87c },
+		{ 0x06, 0xef32 },
+		{ 0x06, 0x5b80 },
+		{ 0x06, 0xe3f8 },
+		{ 0x06, 0x7d9e },
+		{ 0x06, 0x037d },
+		{ 0x06, 0xffff },
+		{ 0x06, 0x0d58 },
+		{ 0x06, 0x1c55 },
+		{ 0x06, 0x1a65 },
+		{ 0x06, 0x11a1 },
+		{ 0x06, 0x90d3 },
+		{ 0x06, 0xe283 },
+		{ 0x06, 0x48e3 },
+		{ 0x06, 0x8349 },
+		{ 0x06, 0x1b56 },
+		{ 0x06, 0xab08 },
+		{ 0x06, 0xef56 },
+		{ 0x06, 0xe683 },
+		{ 0x06, 0x48e7 },
+		{ 0x06, 0x8349 },
+		{ 0x06, 0x10d1 },
+		{ 0x06, 0x801f },
+		{ 0x06, 0x66a0 },
+		{ 0x06, 0x04b9 },
+		{ 0x06, 0xe283 },
+		{ 0x06, 0x48e3 },
+		{ 0x06, 0x8349 },
+		{ 0x06, 0xef65 },
+		{ 0x06, 0xe283 },
+		{ 0x06, 0x4ae3 },
+		{ 0x06, 0x834b },
+		{ 0x06, 0x1b56 },
+		{ 0x06, 0xaa0e },
+		{ 0x06, 0xef56 },
+		{ 0x06, 0xe683 },
+		{ 0x06, 0x4ae7 },
+		{ 0x06, 0x834b },
+		{ 0x06, 0xe283 },
+		{ 0x06, 0x4de6 },
+		{ 0x06, 0x834c },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4da0 },
+		{ 0x06, 0x000c },
+		{ 0x06, 0xaf81 },
+		{ 0x06, 0x8be0 },
+		{ 0x06, 0x834d },
+		{ 0x06, 0x10e4 },
+		{ 0x06, 0x834d },
+		{ 0x06, 0xae04 },
+		{ 0x06, 0x80e4 },
+		{ 0x06, 0x834d },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x4e78 },
+		{ 0x06, 0x039e },
+		{ 0x06, 0x0be0 },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x7804 },
+		{ 0x06, 0x9e04 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4e02 },
+		{ 0x06, 0xe083 },
+		{ 0x06, 0x32e1 },
+		{ 0x06, 0x8333 },
+		{ 0x06, 0x590f },
+		{ 0x06, 0xe283 },
+		{ 0x06, 0x4d0c },
+		{ 0x06, 0x245a },
+		{ 0x06, 0xf01e },
+		{ 0x06, 0x12e4 },
+		{ 0x06, 0xf88c },
+		{ 0x06, 0xe5f8 },
+		{ 0x06, 0x8de0 },
+		{ 0x06, 0x8330 },
+		{ 0x06, 0xe183 },
+		{ 0x06, 0x3168 },
+		{ 0x06, 0x01e4 },
+		{ 0x06, 0xf88a },
+		{ 0x06, 0xe5f8 },
+		{ 0x06, 0x8bae },
+		{ 0x06, 0x37ee },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x03e0 },
+		{ 0x06, 0x834c },
+		{ 0x06, 0xe183 },
+		{ 0x06, 0x4d1b },
+		{ 0x06, 0x019e },
+		{ 0x06, 0x04aa },
+		{ 0x06, 0xa1ae },
+		{ 0x06, 0xa8ee },
+		{ 0x06, 0x834e },
+		{ 0x06, 0x04ee },
+		{ 0x06, 0x834f },
+		{ 0x06, 0x00ae },
+		{ 0x06, 0xabe0 },
+		{ 0x06, 0x834f },
+		{ 0x06, 0x7803 },
+		{ 0x06, 0x9f14 },
+		{ 0x06, 0xee83 },
+		{ 0x06, 0x4e05 },
+		{ 0x06, 0xd240 },
+		{ 0x06, 0xd655 },
+		{ 0x06, 0x5402 },
+		{ 0x06, 0x81c6 },
+		{ 0x06, 0xd2a0 },
+		{ 0x06, 0xd6ba },
+		{ 0x06, 0x0002 },
+		{ 0x06, 0x81c6 },
+		{ 0x06, 0xfefd },
+		{ 0x06, 0xfc05 },
+		{ 0x06, 0xf8e0 },
+		{ 0x06, 0xf860 },
+		{ 0x06, 0xe1f8 },
+		{ 0x06, 0x6168 },
+		{ 0x06, 0x02e4 },
+		{ 0x06, 0xf860 },
+		{ 0x06, 0xe5f8 },
+		{ 0x06, 0x61e0 },
+		{ 0x06, 0xf848 },
+		{ 0x06, 0xe1f8 },
+		{ 0x06, 0x4958 },
+		{ 0x06, 0x0f1e },
+		{ 0x06, 0x02e4 },
+		{ 0x06, 0xf848 },
+		{ 0x06, 0xe5f8 },
+		{ 0x06, 0x49d0 },
+		{ 0x06, 0x0002 },
+		{ 0x06, 0x820a },
+		{ 0x06, 0xbf83 },
+		{ 0x06, 0x50ef },
+		{ 0x06, 0x46dc },
+		{ 0x06, 0x19dd },
+		{ 0x06, 0xd001 },
+		{ 0x06, 0x0282 },
+		{ 0x06, 0x0a02 },
+		{ 0x06, 0x8226 },
+		{ 0x06, 0xe0f8 },
+		{ 0x06, 0x60e1 },
+		{ 0x06, 0xf861 },
+		{ 0x06, 0x58fd },
+		{ 0x06, 0xe4f8 },
+		{ 0x06, 0x60e5 },
+		{ 0x06, 0xf861 },
+		{ 0x06, 0xfc04 },
+		{ 0x06, 0xf9fa },
+		{ 0x06, 0xfbc6 },
+		{ 0x06, 0xbff8 },
+		{ 0x06, 0x40be },
+		{ 0x06, 0x8350 },
+		{ 0x06, 0xa001 },
+		{ 0x06, 0x0107 },
+		{ 0x06, 0x1b89 },
+		{ 0x06, 0xcfd2 },
+		{ 0x06, 0x08eb },
+		{ 0x06, 0xdb19 },
+		{ 0x06, 0xb2fb },
+		{ 0x06, 0xfffe },
+		{ 0x06, 0xfd04 },
+		{ 0x06, 0xf8e0 },
+		{ 0x06, 0xf848 },
+		{ 0x06, 0xe1f8 },
+		{ 0x06, 0x4968 },
+		{ 0x06, 0x08e4 },
+		{ 0x06, 0xf848 },
+		{ 0x06, 0xe5f8 },
+		{ 0x06, 0x4958 },
+		{ 0x06, 0xf7e4 },
+		{ 0x06, 0xf848 },
+		{ 0x06, 0xe5f8 },
+		{ 0x06, 0x49fc },
+		{ 0x06, 0x044d },
+		{ 0x06, 0x2000 },
+		{ 0x06, 0x024e },
+		{ 0x06, 0x2200 },
+		{ 0x06, 0x024d },
+		{ 0x06, 0xdfff },
+		{ 0x06, 0x014e },
+		{ 0x06, 0xddff },
+		{ 0x06, 0x0100 },
+		{ 0x05, 0x83d8 },
+		{ 0x06, 0x8000 },
+		{ 0x03, 0xdc00 },
+		{ 0x05, 0xfff6 },
+		{ 0x06, 0x00fc },
+		{ 0x1f, 0x0000 },
+
+		{ 0x1f, 0x0000 },
+		{ 0x0d, 0xf880 },
+		{ 0x1f, 0x0000 }
 	};
 
 	rtl_phy_write(ioaddr, phy_reg_init_0, ARRAY_SIZE(phy_reg_init_0));
 
-	if (mdio_read(ioaddr, 0x06) == 0xc400) {
-		struct phy_reg phy_reg_init_1[] = {
+	if (rtl8168d_efuse_read(ioaddr, 0x01) == 0xb1) {
+		struct phy_reg phy_reg_init[] = {
+			{ 0x1f, 0x0002 },
+			{ 0x05, 0x669a },
 			{ 0x1f, 0x0005 },
-			{ 0x01, 0x0300 },
-			{ 0x1f, 0x0000 },
-			{ 0x11, 0x401c },
-			{ 0x16, 0x4100 },
+			{ 0x05, 0x8330 },
+			{ 0x06, 0x669a },
+
+			{ 0x1f, 0x0002 }
+		};
+		int val;
+
+		rtl_phy_write(ioaddr, phy_reg_init, ARRAY_SIZE(phy_reg_init));
+
+		val = mdio_read(ioaddr, 0x0d);
+		if ((val & 0x00ff) != 0x006c) {
+			u32 set[] = {
+				0x0065, 0x0066, 0x0067, 0x0068,
+				0x0069, 0x006a, 0x006b, 0x006c
+			};
+			int i;
+
+			mdio_write(ioaddr, 0x1f, 0x0002);
+
+			val &= 0xff00;
+			for (i = 0; i < ARRAY_SIZE(set); i++)
+				mdio_write(ioaddr, 0x0d, val | set[i]);
+		}
+	} else {
+		struct phy_reg phy_reg_init[] = {
+			{ 0x1f, 0x0002 },
+			{ 0x05, 0x2642 },
 			{ 0x1f, 0x0005 },
-			{ 0x07, 0x0010 },
-			{ 0x05, 0x83dc },
-			{ 0x06, 0x087d },
-			{ 0x05, 0x8300 },
-			{ 0x06, 0x0101 },
-			{ 0x06, 0x05f8 },
-			{ 0x06, 0xf9fa },
-			{ 0x06, 0xfbef },
-			{ 0x06, 0x79e2 },
-			{ 0x06, 0x835f },
-			{ 0x06, 0xe0f8 },
-			{ 0x06, 0x9ae1 },
-			{ 0x06, 0xf89b },
-			{ 0x06, 0xef31 },
-			{ 0x06, 0x3b65 },
-			{ 0x06, 0xaa07 },
-			{ 0x06, 0x81e4 },
-			{ 0x06, 0xf89a },
-			{ 0x06, 0xe5f8 },
-			{ 0x06, 0x9baf },
-			{ 0x06, 0x06ae },
-			{ 0x05, 0x83dc },
-			{ 0x06, 0x8300 },
+			{ 0x05, 0x8330 },
+			{ 0x06, 0x2642 }
 		};
 
-		rtl_phy_write(ioaddr, phy_reg_init_1,
-			      ARRAY_SIZE(phy_reg_init_1));
+		rtl_phy_write(ioaddr, phy_reg_init, ARRAY_SIZE(phy_reg_init));
 	}
 
-	mdio_write(ioaddr, 0x1f, 0x0000);
+	mdio_write(ioaddr, 0x1f, 0x0002);
+	mdio_plus_minus(ioaddr, 0x02, 0x0100, 0x0600);
+	mdio_plus_minus(ioaddr, 0x03, 0x0000, 0xe000);
+
+	mdio_write(ioaddr, 0x1f, 0x0001);
+	mdio_write(ioaddr, 0x17, 0x0cc0);
+
+	mdio_write(ioaddr, 0x1f, 0x0002);
+	mdio_patch(ioaddr, 0x0f, 0x0017);
+
+	rtl_phy_write(ioaddr, phy_reg_init_1, ARRAY_SIZE(phy_reg_init_1));
+}
+
+static void rtl8168d_3_hw_phy_config(void __iomem *ioaddr)
+{
+	struct phy_reg phy_reg_init[] = {
+		{ 0x1f, 0x0002 },
+		{ 0x10, 0x0008 },
+		{ 0x0d, 0x006c },
+
+		{ 0x1f, 0x0000 },
+		{ 0x0d, 0xf880 },
+
+		{ 0x1f, 0x0001 },
+		{ 0x17, 0x0cc0 },
+
+		{ 0x1f, 0x0001 },
+		{ 0x0b, 0xa4d8 },
+		{ 0x09, 0x281c },
+		{ 0x07, 0x2883 },
+		{ 0x0a, 0x6b35 },
+		{ 0x1d, 0x3da4 },
+		{ 0x1c, 0xeffd },
+		{ 0x14, 0x7f52 },
+		{ 0x18, 0x7fc6 },
+		{ 0x08, 0x0601 },
+		{ 0x06, 0x4063 },
+		{ 0x10, 0xf074 },
+		{ 0x1f, 0x0003 },
+		{ 0x13, 0x0789 },
+		{ 0x12, 0xf4bd },
+		{ 0x1a, 0x04fd },
+		{ 0x14, 0x84b0 },
+		{ 0x1f, 0x0000 },
+		{ 0x00, 0x9200 },
+
+		{ 0x1f, 0x0005 },
+		{ 0x01, 0x0340 },
+		{ 0x1f, 0x0001 },
+		{ 0x04, 0x4000 },
+		{ 0x03, 0x1d21 },
+		{ 0x02, 0x0c32 },
+		{ 0x01, 0x0200 },
+		{ 0x00, 0x5554 },
+		{ 0x04, 0x4800 },
+		{ 0x04, 0x4000 },
+		{ 0x04, 0xf000 },
+		{ 0x03, 0xdf01 },
+		{ 0x02, 0xdf20 },
+		{ 0x01, 0x101a },
+		{ 0x00, 0xa0ff },
+		{ 0x04, 0xf800 },
+		{ 0x04, 0xf000 },
+		{ 0x1f, 0x0000 },
+
+		{ 0x1f, 0x0007 },
+		{ 0x1e, 0x0023 },
+		{ 0x16, 0x0000 },
+		{ 0x1f, 0x0000 }
+	};
+
+	rtl_phy_write(ioaddr, phy_reg_init, ARRAY_SIZE(phy_reg_init));
 }
 
 static void rtl8102e_hw_phy_config(void __iomem *ioaddr)
@@ -1792,7 +2661,13 @@ static void rtl_hw_phy_config(struct net_device *dev)
 		rtl8168cp_2_hw_phy_config(ioaddr);
 		break;
 	case RTL_GIGA_MAC_VER_25:
-		rtl8168d_hw_phy_config(ioaddr);
+		rtl8168d_1_hw_phy_config(ioaddr);
+		break;
+	case RTL_GIGA_MAC_VER_26:
+		rtl8168d_2_hw_phy_config(ioaddr);
+		break;
+	case RTL_GIGA_MAC_VER_27:
+		rtl8168d_3_hw_phy_config(ioaddr);
 		break;
 
 	default:
@@ -2200,6 +3075,11 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	tp->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
 	if (!tp->pcie_cap && netif_msg_probe(tp))
 		dev_info(&pdev->dev, "no PCI Express capability\n");
+	else {
+		pci_write_config_word(pdev, tp->pcie_cap + PCI_EXP_DEVSTA,
+				      PCI_EXP_DEVSTA_CED | PCI_EXP_DEVSTA_NFED |
+				      PCI_EXP_DEVSTA_FED | PCI_EXP_DEVSTA_URD);
+	}
 
 	RTL_W16(IntrMask, 0x0000);
 
@@ -2863,6 +3743,8 @@ static void rtl_hw_start_8168(struct net_device *dev)
 	break;
 
 	case RTL_GIGA_MAC_VER_25:
+	case RTL_GIGA_MAC_VER_26:
+	case RTL_GIGA_MAC_VER_27:
 		rtl_hw_start_8168d(ioaddr, pdev);
 	break;
 
-- 
Ueimor

^ permalink raw reply related

* Re: N_PPP_SYNC ldisc BUG: sleeping function called from invalid context
From: Tilman Schmidt @ 2009-09-30 22:00 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Alan Cox, linux-kernel, netdev, Alan Cox
In-Reply-To: <4AC3BF62.9020706@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 557 bytes --]

Jarek Poplawski schrieb:
> Tilman Schmidt wrote, On 09/30/2009 08:55 PM:
> ...
>> Why then does mutex_lock() complain?
> 
> 
> Maybe it doesn't matter here, but this: 
> 
>> INFO: lockdep is turned off.
> suggests there was some lockdep issue/warning earlier.

That's just because some <deleted> decided that tainted kernels don't deserve lockdep.

-- 
Tilman Schmidt                    E-Mail: tilman@imap.cc
Bonn, Germany
Diese Nachricht besteht zu 100% aus wiederverwerteten Bits.
Ungeöffnet mindestens haltbar bis: (siehe Rückseite)


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 254 bytes --]

^ permalink raw reply

* [PATCH 2.6.32-rc1] net: VMware virtual Ethernet NIC driver: vmxnet3
From: Shreyas Bhatewara @ 2009-09-30 21:34 UTC (permalink / raw)
  To: linux-kernel, netdev, Stephen Hemminger, David S. Miller, Jeff 
  Cc: Anthony Liguori, Chris Wright, Greg Kroah-Hartman, Andrew Morton,
	virtualization, pv-drivers


Ethernet NIC driver for VMware's vmxnet3

From: Shreyas Bhatewara <sbhatewara@vmware.com>

This patch adds driver support for VMware's virtual Ethernet NIC: vmxnet3
Guests running on VMware hypervisors supporting vmxnet3 device will thus have 
access to improved network functionalities and performance.

Signed-off-by: Shreyas Bhatewara <sbhatewara@vmware.com>

---

VMware virtual Ethernet NIC Driver : vmxnet3 - v2

Changelog (v2-v1)
- Rebased the patch to v2.6.32-rc1
- Changed all uint32_t types to u32 and friends
- Removed duplicate max queue size from upt1_defs.h
- Replaced #defines by enum
- uniform spacing between datatype and membername in structures
- removed some noisy printks, eliminated some BUG_ONs
- corrected arguments of kcalloc
- used pci_request_selected_regions, pci_enable_device_mem
- eliminated not-so-useful wrapper functions, used eth_op_ functions instead
- used strlcpy
- used get_sset_counts instead of get_stats_count
- used net_device_stats from struct net_device


Please review the patch and provide your feedback.

Thanking you
->Shreyas

---

diff --git a/MAINTAINERS b/MAINTAINERS
index c450f3a..f65398a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5608,6 +5608,13 @@ S:	Maintained
  F:	drivers/vlynq/vlynq.c
  F:	include/linux/vlynq.h

+VMWARE VMXNET3 ETHERNET DRIVER
+M:     Shreyas Bhatewara <sbhatewara@vmware.com>
+M:     VMware PV-Drivers <pv-drivers@vmware.com>
+L:     netdev@vger.kernel.org
+S:     Maintained
+F:     drivers/net/vmxnet3/
+
  VOLTAGE AND CURRENT REGULATOR FRAMEWORK
  M:	Liam Girdwood <lrg@slimlogic.co.uk>
  M:	Mark Brown <broonie@opensource.wolfsonmicro.com>
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 2bea67c..3b131d2 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -3223,4 +3223,12 @@ config VIRTIO_NET
  	  This is the virtual network driver for virtio.  It can be used with
            lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.

+config VMXNET3
+       tristate "VMware VMXNET3 ethernet driver"
+       depends on PCI && X86
+       help
+         This driver supports VMware's vmxnet3 virtual ethernet NIC.
+         To compile this driver as a module, choose M here: the
+         module will be called vmxnet3.
+
  endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index ae8cd30..49d28a3 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_TEHUTI) += tehuti.o
  obj-$(CONFIG_ENIC) += enic/
  obj-$(CONFIG_JME) += jme.o
  obj-$(CONFIG_BE2NET) += benet/
+obj-$(CONFIG_VMXNET3) += vmxnet3/

  gianfar_driver-objs := gianfar.o \
  		gianfar_ethtool.o \
diff --git a/drivers/net/vmxnet3/Makefile b/drivers/net/vmxnet3/Makefile
new file mode 100644
index 0000000..880f509
--- /dev/null
+++ b/drivers/net/vmxnet3/Makefile
@@ -0,0 +1,35 @@
+################################################################################
+#
+# Linux driver for VMware's vmxnet3 ethernet NIC.
+#
+# Copyright (C) 2007-2009, VMware, Inc. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; version 2 of the License and no later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+# NON INFRINGEMENT.  See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# The full GNU General Public License is included in this distribution in
+# the file called "COPYING".
+#
+# Maintained by: Shreyas Bhatewara <pv-drivers@vmware.com>
+#
+#
+################################################################################
+
+#
+# Makefile for the VMware vmxnet3 ethernet NIC driver
+#
+
+obj-$(CONFIG_VMXNET3) += vmxnet3.o
+
+vmxnet3-objs := vmxnet3_drv.o vmxnet3_ethtool.o
diff --git a/drivers/net/vmxnet3/upt1_defs.h b/drivers/net/vmxnet3/upt1_defs.h
new file mode 100644
index 0000000..6a49d04
--- /dev/null
+++ b/drivers/net/vmxnet3/upt1_defs.h
@@ -0,0 +1,99 @@
+/*
+ * Linux driver for VMware's vmxnet3 ethernet NIC.
+ *
+ * Copyright (C) 2008-2009, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Maintained by: Shreyas Bhatewara <pv-drivers@vmware.com>
+ *
+ * Definitions for Uniform Pass Through, a framework developed by VMware
+ * along with its IHV partners to implement the fast path features of
+ * vmxnet3 in silicon.
+ */
+
+#ifndef _UPT1_DEFS_H
+#define _UPT1_DEFS_H
+
+struct UPT1_TxStats {
+	u64			TSOPktsTxOK;  /* TSO pkts post-segmentation */
+	u64			TSOBytesTxOK;
+	u64			ucastPktsTxOK;
+	u64			ucastBytesTxOK;
+	u64			mcastPktsTxOK;
+	u64			mcastBytesTxOK;
+	u64			bcastPktsTxOK;
+	u64			bcastBytesTxOK;
+	u64			pktsTxError;
+	u64			pktsTxDiscard;
+};
+
+struct UPT1_RxStats {
+	u64			LROPktsRxOK;    /* LRO pkts */
+	u64			LROBytesRxOK;   /* bytes from LRO pkts */
+	/* the following counters are for pkts from the wire, i.e., pre-LRO */
+	u64			ucastPktsRxOK;
+	u64			ucastBytesRxOK;
+	u64			mcastPktsRxOK;
+	u64			mcastBytesRxOK;
+	u64			bcastPktsRxOK;
+	u64			bcastBytesRxOK;
+	u64			pktsRxOutOfBuf;
+	u64			pktsRxError;
+};
+
+/* interrupt moderation level */
+enum {
+	UPT1_IML_NONE		= 0, /* no interrupt moderation */
+	UPT1_IML_HIGHEST	= 7, /* least intr generated */
+	UPT1_IML_ADAPTIVE	= 8, /* adaptive intr moderation */
+};
+/* values for UPT1_RSSConf.hashFunc */
+enum {
+	UPT1_RSS_HASH_TYPE_NONE      = 0x0,
+	UPT1_RSS_HASH_TYPE_IPV4      = 0x01,
+	UPT1_RSS_HASH_TYPE_TCP_IPV4  = 0x02,
+	UPT1_RSS_HASH_TYPE_IPV6      = 0x04,
+	UPT1_RSS_HASH_TYPE_TCP_IPV6  = 0x08,
+};
+
+enum {
+	UPT1_RSS_HASH_FUNC_NONE      = 0x0,
+	UPT1_RSS_HASH_FUNC_TOEPLITZ  = 0x01,
+};
+
+#define UPT1_RSS_MAX_KEY_SIZE        40
+#define UPT1_RSS_MAX_IND_TABLE_SIZE  128
+
+struct UPT1_RSSConf {
+	u16			hashType;
+	u16			hashFunc;
+	u16			hashKeySize;
+	u16			indTableSize;
+	u8			hashKey[UPT1_RSS_MAX_KEY_SIZE];
+	u8			indTable[UPT1_RSS_MAX_IND_TABLE_SIZE];
+};
+
+/* features */
+enum {
+	UPT1_F_RXCSUM		= 0x0001,   /* rx csum verification */
+	UPT1_F_RSS		= 0x0002,
+	UPT1_F_RXVLAN		= 0x0004,   /* VLAN tag stripping */
+	UPT1_F_LRO		= 0x0008,
+};
+#endif
diff --git a/drivers/net/vmxnet3/vmxnet3_defs.h b/drivers/net/vmxnet3/vmxnet3_defs.h
new file mode 100644
index 0000000..dc8ee44
--- /dev/null
+++ b/drivers/net/vmxnet3/vmxnet3_defs.h
@@ -0,0 +1,535 @@
+/*
+ * Linux driver for VMware's vmxnet3 ethernet NIC.
+ *
+ * Copyright (C) 2008-2009, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Maintained by: Shreyas Bhatewara <pv-drivers@vmware.com>
+ *
+ */
+
+#ifndef _VMXNET3_DEFS_H_
+#define _VMXNET3_DEFS_H_
+
+#include "upt1_defs.h"
+
+/* all registers are 32 bit wide */
+/* BAR 1 */
+enum {
+	VMXNET3_REG_VRRS	= 0x0,	/* Vmxnet3 Revision Report Selection */
+	VMXNET3_REG_UVRS	= 0x8,	/* UPT Version Report Selection */
+	VMXNET3_REG_DSAL	= 0x10,	/* Driver Shared Address Low */
+	VMXNET3_REG_DSAH	= 0x18,	/* Driver Shared Address High */
+	VMXNET3_REG_CMD		= 0x20,	/* Command */
+	VMXNET3_REG_MACL	= 0x28,	/* MAC Address Low */
+	VMXNET3_REG_MACH	= 0x30,	/* MAC Address High */
+	VMXNET3_REG_ICR		= 0x38,	/* Interrupt Cause Register */
+	VMXNET3_REG_ECR		= 0x40	/* Event Cause Register */
+};
+
+/* BAR 0 */
+enum {
+	VMXNET3_REG_IMR		= 0x0,	 /* Interrupt Mask Register */
+	VMXNET3_REG_TXPROD	= 0x600, /* Tx Producer Index */
+	VMXNET3_REG_RXPROD	= 0x800, /* Rx Producer Index for ring 1 */
+	VMXNET3_REG_RXPROD2	= 0xA00	 /* Rx Producer Index for ring 2 */
+};
+
+#define VMXNET3_PT_REG_SIZE     4096	/* BAR 0 */
+#define VMXNET3_VD_REG_SIZE     4096	/* BAR 1 */
+
+#define VMXNET3_REG_ALIGN       8	/* All registers are 8-byte aligned. */
+#define VMXNET3_REG_ALIGN_MASK  0x7
+
+/* I/O Mapped access to registers */
+#define VMXNET3_IO_TYPE_PT              0
+#define VMXNET3_IO_TYPE_VD              1
+#define VMXNET3_IO_ADDR(type, reg)      (((type) << 24) | ((reg) & 0xFFFFFF))
+#define VMXNET3_IO_TYPE(addr)           ((addr) >> 24)
+#define VMXNET3_IO_REG(addr)            ((addr) & 0xFFFFFF)
+
+enum {
+	VMXNET3_CMD_FIRST_SET = 0xCAFE0000,
+	VMXNET3_CMD_ACTIVATE_DEV = VMXNET3_CMD_FIRST_SET,
+	VMXNET3_CMD_QUIESCE_DEV,
+	VMXNET3_CMD_RESET_DEV,
+	VMXNET3_CMD_UPDATE_RX_MODE,
+	VMXNET3_CMD_UPDATE_MAC_FILTERS,
+	VMXNET3_CMD_UPDATE_VLAN_FILTERS,
+	VMXNET3_CMD_UPDATE_RSSIDT,
+	VMXNET3_CMD_UPDATE_IML,
+	VMXNET3_CMD_UPDATE_PMCFG,
+	VMXNET3_CMD_UPDATE_FEATURE,
+	VMXNET3_CMD_LOAD_PLUGIN,
+
+	VMXNET3_CMD_FIRST_GET = 0xF00D0000,
+	VMXNET3_CMD_GET_QUEUE_STATUS = VMXNET3_CMD_FIRST_GET,
+	VMXNET3_CMD_GET_STATS,
+	VMXNET3_CMD_GET_LINK,
+	VMXNET3_CMD_GET_PERM_MAC_LO,
+	VMXNET3_CMD_GET_PERM_MAC_HI,
+	VMXNET3_CMD_GET_DID_LO,
+	VMXNET3_CMD_GET_DID_HI,
+	VMXNET3_CMD_GET_DEV_EXTRA_INFO,
+	VMXNET3_CMD_GET_CONF_INTR
+};
+
+struct Vmxnet3_TxDesc {
+	u64		addr;
+
+	u32		len:14;
+	u32		gen:1;      /* generation bit */
+	u32		rsvd:1;
+	u32		dtype:1;    /* descriptor type */
+	u32		ext1:1;
+	u32		msscof:14;  /* MSS, checksum offset, flags */
+
+	u32		hlen:10;    /* header len */
+	u32		om:2;       /* offload mode */
+	u32		eop:1;      /* End Of Packet */
+	u32		cq:1;       /* completion request */
+	u32		ext2:1;
+	u32		ti:1;       /* VLAN Tag Insertion */
+	u32		tci:16;     /* Tag to Insert */
+};
+
+/* TxDesc.OM values */
+#define VMXNET3_OM_NONE		0
+#define VMXNET3_OM_CSUM		2
+#define VMXNET3_OM_TSO		3
+
+/* fields in TxDesc we access w/o using bit fields */
+#define VMXNET3_TXD_EOP_SHIFT	12
+#define VMXNET3_TXD_CQ_SHIFT	13
+#define VMXNET3_TXD_GEN_SHIFT	14
+
+#define VMXNET3_TXD_CQ		(1 << VMXNET3_TXD_CQ_SHIFT)
+#define VMXNET3_TXD_EOP		(1 << VMXNET3_TXD_EOP_SHIFT)
+#define VMXNET3_TXD_GEN		(1 << VMXNET3_TXD_GEN_SHIFT)
+
+#define VMXNET3_HDR_COPY_SIZE   128
+
+
+struct Vmxnet3_TxDataDesc {
+	u8		data[VMXNET3_HDR_COPY_SIZE];
+};
+
+
+struct Vmxnet3_TxCompDesc {
+	u32		txdIdx:12;    /* Index of the EOP TxDesc */
+	u32		ext1:20;
+
+	u32		ext2;
+	u32		ext3;
+
+	u32		rsvd:24;
+	u32		type:7;       /* completion type */
+	u32		gen:1;        /* generation bit */
+};
+
+
+struct Vmxnet3_RxDesc {
+	u64		addr;
+
+	u32		len:14;
+	u32		btype:1;      /* Buffer Type */
+	u32		dtype:1;      /* Descriptor type */
+	u32		rsvd:15;
+	u32		gen:1;        /* Generation bit */
+
+	u32		ext1;
+};
+
+/* values of RXD.BTYPE */
+#define VMXNET3_RXD_BTYPE_HEAD   0    /* head only */
+#define VMXNET3_RXD_BTYPE_BODY   1    /* body only */
+
+/* fields in RxDesc we access w/o using bit fields */
+#define VMXNET3_RXD_BTYPE_SHIFT  14
+#define VMXNET3_RXD_GEN_SHIFT    31
+
+
+struct Vmxnet3_RxCompDesc {
+	u32		rxdIdx:12;    /* Index of the RxDesc */
+	u32		ext1:2;
+	u32		eop:1;        /* End of Packet */
+	u32		sop:1;        /* Start of Packet */
+	u32		rqID:10;      /* rx queue/ring ID */
+	u32		rssType:4;    /* RSS hash type used */
+	u32		cnc:1;        /* Checksum Not Calculated */
+	u32		ext2:1;
+
+	u32		rssHash;      /* RSS hash value */
+
+	u32		len:14;       /* data length */
+	u32		err:1;        /* Error */
+	u32		ts:1;         /* Tag is stripped */
+	u32		tci:16;       /* Tag stripped */
+
+	u32		csum:16;
+	u32		tuc:1;        /* TCP/UDP Checksum Correct */
+	u32		udp:1;        /* UDP packet */
+	u32		tcp:1;        /* TCP packet */
+	u32		ipc:1;        /* IP Checksum Correct */
+	u32		v6:1;         /* IPv6 */
+	u32		v4:1;         /* IPv4 */
+	u32		frg:1;        /* IP Fragment */
+	u32		fcs:1;        /* Frame CRC correct */
+	u32		type:7;       /* completion type */
+	u32		gen:1;        /* generation bit */
+};
+
+/* fields in RxCompDesc we access via Vmxnet3_GenericDesc.dword[3] */
+#define VMXNET3_RCD_TUC_SHIFT	16
+#define VMXNET3_RCD_IPC_SHIFT	19
+
+/* fields in RxCompDesc we access via Vmxnet3_GenericDesc.qword[1] */
+#define VMXNET3_RCD_TYPE_SHIFT	56
+#define VMXNET3_RCD_GEN_SHIFT	63
+
+/* csum OK for TCP/UDP pkts over IP */
+#define VMXNET3_RCD_CSUM_OK (1 << VMXNET3_RCD_TUC_SHIFT | \
+			     1 << VMXNET3_RCD_IPC_SHIFT)
+
+/* value of RxCompDesc.rssType */
+enum {
+	VMXNET3_RCD_RSS_TYPE_NONE     = 0,
+	VMXNET3_RCD_RSS_TYPE_IPV4     = 1,
+	VMXNET3_RCD_RSS_TYPE_TCPIPV4  = 2,
+	VMXNET3_RCD_RSS_TYPE_IPV6     = 3,
+	VMXNET3_RCD_RSS_TYPE_TCPIPV6  = 4,
+};
+
+
+/* a union for accessing all cmd/completion descriptors */
+union Vmxnet3_GenericDesc {
+	u64				qword[2];
+	u32				dword[4];
+	u16				word[8];
+	struct Vmxnet3_TxDesc		txd;
+	struct Vmxnet3_RxDesc		rxd;
+	struct Vmxnet3_TxCompDesc	tcd;
+	struct Vmxnet3_RxCompDesc	rcd;
+};
+
+#define VMXNET3_INIT_GEN       1
+
+/* Max size of a single tx buffer */
+#define VMXNET3_MAX_TX_BUF_SIZE  (1 << 14)
+
+/* # of tx desc needed for a tx buffer size */
+#define VMXNET3_TXD_NEEDED(size) (((size) + VMXNET3_MAX_TX_BUF_SIZE - 1) / \
+				  VMXNET3_MAX_TX_BUF_SIZE)
+
+/* max # of tx descs for a non-tso pkt */
+#define VMXNET3_MAX_TXD_PER_PKT 16
+
+/* Max size of a single rx buffer */
+#define VMXNET3_MAX_RX_BUF_SIZE  ((1 << 14) - 1)
+/* Minimum size of a type 0 buffer */
+#define VMXNET3_MIN_T0_BUF_SIZE  128
+#define VMXNET3_MAX_CSUM_OFFSET  1024
+
+/* Ring base address alignment */
+#define VMXNET3_RING_BA_ALIGN   512
+#define VMXNET3_RING_BA_MASK    (VMXNET3_RING_BA_ALIGN - 1)
+
+/* Ring size must be a multiple of 32 */
+#define VMXNET3_RING_SIZE_ALIGN 32
+#define VMXNET3_RING_SIZE_MASK  (VMXNET3_RING_SIZE_ALIGN - 1)
+
+/* Max ring size */
+#define VMXNET3_TX_RING_MAX_SIZE   4096
+#define VMXNET3_TC_RING_MAX_SIZE   4096
+#define VMXNET3_RX_RING_MAX_SIZE   4096
+#define VMXNET3_RC_RING_MAX_SIZE   8192
+
+/* a list of reasons for queue stop */
+
+enum {
+ VMXNET3_ERR_NOEOP        = 0x80000000,  /* cannot find the EOP desc of a pkt */
+ VMXNET3_ERR_TXD_REUSE    = 0x80000001,  /* reuse TxDesc before tx completion */
+ VMXNET3_ERR_BIG_PKT      = 0x80000002,  /* too many TxDesc for a pkt */
+ VMXNET3_ERR_DESC_NOT_SPT = 0x80000003,  /* descriptor type not supported */
+ VMXNET3_ERR_SMALL_BUF    = 0x80000004,  /* type 0 buffer too small */
+ VMXNET3_ERR_STRESS       = 0x80000005,  /* stress option firing in vmkernel */
+ VMXNET3_ERR_SWITCH       = 0x80000006,  /* mode switch failure */
+ VMXNET3_ERR_TXD_INVALID  = 0x80000007,  /* invalid TxDesc */
+};
+
+/* completion descriptor types */
+#define VMXNET3_CDTYPE_TXCOMP      0    /* Tx Completion Descriptor */
+#define VMXNET3_CDTYPE_RXCOMP      3    /* Rx Completion Descriptor */
+
+enum {
+	VMXNET3_GOS_BITS_UNK    = 0,   /* unknown */
+	VMXNET3_GOS_BITS_32     = 1,
+	VMXNET3_GOS_BITS_64     = 2,
+};
+
+#define VMXNET3_GOS_TYPE_LINUX	1
+
+
+struct Vmxnet3_GOSInfo {
+	u32				gosBits:2;	/* 32-bit or 64-bit? */
+	u32				gosType:4;   /* which guest */
+	u32				gosVer:16;   /* gos version */
+	u32				gosMisc:10;  /* other info about gos */
+};
+
+
+struct Vmxnet3_DriverInfo {
+	u32				version;
+	struct Vmxnet3_GOSInfo		gos;
+	u32				vmxnet3RevSpt;
+	u32				uptVerSpt;
+};
+
+
+#define VMXNET3_REV1_MAGIC  0xbabefee1
+
+/*
+ * QueueDescPA must be 128 bytes aligned. It points to an array of
+ * Vmxnet3_TxQueueDesc followed by an array of Vmxnet3_RxQueueDesc.
+ * The number of Vmxnet3_TxQueueDesc/Vmxnet3_RxQueueDesc are specified by
+ * Vmxnet3_MiscConf.numTxQueues/numRxQueues, respectively.
+ */
+#define VMXNET3_QUEUE_DESC_ALIGN  128
+
+
+struct Vmxnet3_MiscConf {
+	struct Vmxnet3_DriverInfo driverInfo;
+	u64		uptFeatures;
+	u64		ddPA;         /* driver data PA */
+	u64		queueDescPA;  /* queue descriptor table PA */
+	u32		ddLen;        /* driver data len */
+	u32		queueDescLen; /* queue desc. table len in bytes */
+	u32		mtu;
+	u16		maxNumRxSG;
+	u8		numTxQueues;
+	u8		numRxQueues;
+	u32		reserved[4];
+};
+
+
+struct Vmxnet3_TxQueueConf {
+	u64		txRingBasePA;
+	u64		dataRingBasePA;
+	u64		compRingBasePA;
+	u64		ddPA;         /* driver data */
+	u64		reserved;
+	u32		txRingSize;   /* # of tx desc */
+	u32		dataRingSize; /* # of data desc */
+	u32		compRingSize; /* # of comp desc */
+	u32		ddLen;        /* size of driver data */
+	u8		intrIdx;
+	u8		_pad[7];
+};
+
+
+struct Vmxnet3_RxQueueConf {
+	u64		rxRingBasePA[2];
+	u64		compRingBasePA;
+	u64		ddPA;            /* driver data */
+	u64		reserved;
+	u32		rxRingSize[2];   /* # of rx desc */
+	u32		compRingSize;    /* # of rx comp desc */
+	u32		ddLen;           /* size of driver data */
+	u8		intrIdx;
+	u8		_pad[7];
+};
+
+
+enum vmxnet3_intr_mask_mode {
+	VMXNET3_IMM_AUTO   = 0,
+	VMXNET3_IMM_ACTIVE = 1,
+	VMXNET3_IMM_LAZY   = 2
+};
+
+enum vmxnet3_intr_type {
+	VMXNET3_IT_AUTO = 0,
+	VMXNET3_IT_INTX = 1,
+	VMXNET3_IT_MSI  = 2,
+	VMXNET3_IT_MSIX = 3
+};
+
+#define VMXNET3_MAX_TX_QUEUES  8
+#define VMXNET3_MAX_RX_QUEUES  16
+/* one additional interrupt for events */
+#define VMXNET3_MAX_INTRS      25
+
+
+struct Vmxnet3_IntrConf {
+	bool		autoMask;
+	u8		numIntrs;      /* # of interrupts */
+	u8		eventIntrIdx;
+	u8		modLevels[VMXNET3_MAX_INTRS];	/* moderation level for
+							 * each intr */
+	u32		reserved[3];
+};
+
+/* one bit per VLAN ID, the size is in the units of u32	*/
+#define VMXNET3_VFT_SIZE  (4096 / (sizeof(u32) * 8))
+
+
+struct Vmxnet3_QueueStatus {
+	bool		stopped;
+	u8		_pad[3];
+	u32		error;
+};
+
+
+struct Vmxnet3_TxQueueCtrl {
+	u32		txNumDeferred;
+	u32		txThreshold;
+	u64		reserved;
+};
+
+
+struct Vmxnet3_RxQueueCtrl {
+	bool		updateRxProd;
+	u8		_pad[7];
+	u64		reserved;
+};
+
+enum {
+	VMXNET3_RXM_UCAST     = 0x01,  /* unicast only */
+	VMXNET3_RXM_MCAST     = 0x02,  /* multicast passing the filters */
+	VMXNET3_RXM_BCAST     = 0x04,  /* broadcast only */
+	VMXNET3_RXM_ALL_MULTI = 0x08,  /* all multicast */
+	VMXNET3_RXM_PROMISC   = 0x10  /* promiscuous */
+};
+
+struct Vmxnet3_RxFilterConf {
+	u32		rxMode;       /* VMXNET3_RXM_xxx */
+	u16		mfTableLen;   /* size of the multicast filter table */
+	u16		_pad1;
+	u64		mfTablePA;    /* PA of the multicast filters table */
+	u32		vfTable[VMXNET3_VFT_SIZE]; /* vlan filter */
+};
+
+
+#define VMXNET3_PM_MAX_FILTERS        6
+#define VMXNET3_PM_MAX_PATTERN_SIZE   128
+#define VMXNET3_PM_MAX_MASK_SIZE      (VMXNET3_PM_MAX_PATTERN_SIZE / 8)
+
+#define VMXNET3_PM_WAKEUP_MAGIC       0x01  /* wake up on magic pkts */
+#define VMXNET3_PM_WAKEUP_FILTER      0x02  /* wake up on pkts matching
+					     * filters */
+
+
+struct Vmxnet3_PM_PktFilter {
+	u8		maskSize;
+	u8		patternSize;
+	u8		mask[VMXNET3_PM_MAX_MASK_SIZE];
+	u8		pattern[VMXNET3_PM_MAX_PATTERN_SIZE];
+	u8		pad[6];
+};
+
+
+struct Vmxnet3_PMConf {
+	u16		wakeUpEvents;  /* VMXNET3_PM_WAKEUP_xxx */
+	u8		numFilters;
+	u8		pad[5];
+	struct Vmxnet3_PM_PktFilter filters[VMXNET3_PM_MAX_FILTERS];
+};
+
+
+struct Vmxnet3_VariableLenConfDesc {
+	u32		confVer;
+	u32		confLen;
+	u64		confPA;
+};
+
+
+struct Vmxnet3_TxQueueDesc {
+	struct Vmxnet3_TxQueueCtrl		ctrl;
+	struct Vmxnet3_TxQueueConf		conf;
+
+	/* Driver read after a GET command */
+	struct Vmxnet3_QueueStatus		status;
+	struct UPT1_TxStats			stats;
+	u8					_pad[88]; /* 128 aligned */
+};
+
+
+struct Vmxnet3_RxQueueDesc {
+	struct Vmxnet3_RxQueueCtrl		ctrl;
+	struct Vmxnet3_RxQueueConf		conf;
+	/* Driver read after a GET command */
+	struct Vmxnet3_QueueStatus		status;
+	struct UPT1_RxStats			stats;
+	u8				      __pad[88]; /* 128 aligned */
+};
+
+
+struct Vmxnet3_DSDevRead {
+	/* read-only region for device, read by dev in response to a SET cmd */
+	struct Vmxnet3_MiscConf			misc;
+	struct Vmxnet3_IntrConf			intrConf;
+	struct Vmxnet3_RxFilterConf		rxFilterConf;
+	struct Vmxnet3_VariableLenConfDesc	rssConfDesc;
+	struct Vmxnet3_VariableLenConfDesc	pmConfDesc;
+	struct Vmxnet3_VariableLenConfDesc	pluginConfDesc;
+};
+
+/* All structures in DriverShared are padded to multiples of 8 bytes */
+struct Vmxnet3_DriverShared {
+	u32				magic;
+	/* make devRead start at 64bit boundaries */
+	u32					pad;
+	struct Vmxnet3_DSDevRead		devRead;
+	u32					ecr;
+	u32					reserved[5];
+};
+
+
+#define VMXNET3_ECR_RQERR       (1 << 0)
+#define VMXNET3_ECR_TQERR       (1 << 1)
+#define VMXNET3_ECR_LINK        (1 << 2)
+#define VMXNET3_ECR_DIC         (1 << 3)
+#define VMXNET3_ECR_DEBUG       (1 << 4)
+
+/* flip the gen bit of a ring */
+#define VMXNET3_FLIP_RING_GEN(gen) ((gen) = (gen) ^ 0x1)
+
+/* only use this if moving the idx won't affect the gen bit */
+#define VMXNET3_INC_RING_IDX_ONLY(idx, ring_size) \
+	do {\
+		(idx)++;\
+		if (unlikely((idx) == (ring_size))) {\
+			(idx) = 0;\
+		} \
+	} while (0)
+
+#define VMXNET3_SET_VFTABLE_ENTRY(vfTable, vid) \
+	(vfTable[vid >> 5] |= (1 << (vid & 31)))
+#define VMXNET3_CLEAR_VFTABLE_ENTRY(vfTable, vid) \
+	(vfTable[vid >> 5] &= ~(1 << (vid & 31)))
+
+#define VMXNET3_VFTABLE_ENTRY_IS_SET(vfTable, vid) \
+	((vfTable[vid >> 5] & (1 << (vid & 31))) != 0)
+
+#define VMXNET3_MAX_MTU     9000
+#define VMXNET3_MIN_MTU     60
+
+#define VMXNET3_LINK_UP         (10000 << 16 | 1)    /* 10 Gbps, up */
+#define VMXNET3_LINK_DOWN       0
+
+#endif /* _VMXNET3_DEFS_H_ */
diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
new file mode 100644
index 0000000..b47fd41
--- /dev/null
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -0,0 +1,2561 @@
+/*
+ * Linux driver for VMware's vmxnet3 ethernet NIC.
+ *
+ * Copyright (C) 2008-2009, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Maintained by: Shreyas Bhatewara <pv-drivers@vmware.com>
+ *
+ */
+
+#include "vmxnet3_int.h"
+
+char vmxnet3_driver_name[] = "vmxnet3";
+#define VMXNET3_DRIVER_DESC "VMware vmxnet3 virtual NIC driver"
+
+
+/*
+ * PCI Device ID Table
+ * Last entry must be all 0s
+ */
+static const struct pci_device_id vmxnet3_pciid_table[] = {
+	{PCI_VDEVICE(VMWARE, PCI_DEVICE_ID_VMWARE_VMXNET3)},
+	{0}
+};
+
+MODULE_DEVICE_TABLE(pci, vmxnet3_pciid_table);
+
+static int disable_lro;
+static atomic_t devices_found;
+
+
+/*
+ *    Enable/Disable the given intr
+ */
+static void
+vmxnet3_enable_intr(struct vmxnet3_adapter *adapter, unsigned intr_idx)
+{
+	VMXNET3_WRITE_BAR0_REG(adapter, VMXNET3_REG_IMR + intr_idx * 8, 0);
+}
+
+
+static void
+vmxnet3_disable_intr(struct vmxnet3_adapter *adapter, unsigned intr_idx)
+{
+	VMXNET3_WRITE_BAR0_REG(adapter, VMXNET3_REG_IMR + intr_idx * 8, 1);
+}
+
+
+/*
+ *    Enable/Disable all intrs used by the device
+ */
+static void
+vmxnet3_enable_all_intrs(struct vmxnet3_adapter *adapter)
+{
+	int i;
+
+	for (i = 0; i < adapter->intr.num_intrs; i++)
+		vmxnet3_enable_intr(adapter, i);
+}
+
+
+static void
+vmxnet3_disable_all_intrs(struct vmxnet3_adapter *adapter)
+{
+	int i;
+
+	for (i = 0; i < adapter->intr.num_intrs; i++)
+		vmxnet3_disable_intr(adapter, i);
+}
+
+
+static void
+vmxnet3_ack_events(struct vmxnet3_adapter *adapter, u32 events)
+{
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_ECR, events);
+}
+
+
+static bool
+vmxnet3_tq_stopped(struct vmxnet3_tx_queue *tq, struct vmxnet3_adapter *adapter)
+{
+	return netif_queue_stopped(adapter->netdev);
+}
+
+
+static void
+vmxnet3_tq_start(struct vmxnet3_tx_queue *tq, struct vmxnet3_adapter *adapter)
+{
+	tq->stopped = false;
+	netif_start_queue(adapter->netdev);
+}
+
+
+static void
+vmxnet3_tq_wake(struct vmxnet3_tx_queue *tq, struct vmxnet3_adapter *adapter)
+{
+	tq->stopped = false;
+	netif_wake_queue(adapter->netdev);
+}
+
+
+static void
+vmxnet3_tq_stop(struct vmxnet3_tx_queue *tq, struct vmxnet3_adapter *adapter)
+{
+	tq->stopped = true;
+	tq->num_stop++;
+	netif_stop_queue(adapter->netdev);
+}
+
+
+/*
+ * Check the link state. This may start or stop the tx queue.
+ */
+static void
+vmxnet3_check_link(struct vmxnet3_adapter *adapter)
+{
+	u32 ret;
+
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_LINK);
+	ret = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD);
+	adapter->link_speed = ret >> 16;
+	if (ret & 1) { /* Link is up. */
+		printk(KERN_INFO "%s: NIC Link is Up %d Mbps\n",
+		       adapter->netdev->name, adapter->link_speed);
+		if (!netif_carrier_ok(adapter->netdev))
+			netif_carrier_on(adapter->netdev);
+
+		vmxnet3_tq_start(&adapter->tx_queue, adapter);
+	} else {
+		printk(KERN_INFO "%s: NIC Link is Down\n",
+		       adapter->netdev->name);
+		if (netif_carrier_ok(adapter->netdev))
+			netif_carrier_off(adapter->netdev);
+
+		vmxnet3_tq_stop(&adapter->tx_queue, adapter);
+	}
+}
+
+
+static void
+vmxnet3_process_events(struct vmxnet3_adapter *adapter)
+{
+	u32 events = adapter->shared->ecr;
+	if (!events)
+		return;
+
+	vmxnet3_ack_events(adapter, events);
+
+	/* Check if link state has changed */
+	if (events & VMXNET3_ECR_LINK)
+		vmxnet3_check_link(adapter);
+
+	/* Check if there is an error on xmit/recv queues */
+	if (events & (VMXNET3_ECR_TQERR | VMXNET3_ECR_RQERR)) {
+		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+				       VMXNET3_CMD_GET_QUEUE_STATUS);
+
+		if (adapter->tqd_start->status.stopped) {
+			printk(KERN_ERR "%s: tq error 0x%x\n",
+			       adapter->netdev->name,
+			       adapter->tqd_start->status.error);
+		}
+		if (adapter->rqd_start->status.stopped) {
+			printk(KERN_ERR "%s: rq error 0x%x\n",
+			       adapter->netdev->name,
+			       adapter->rqd_start->status.error);
+		}
+
+		schedule_work(&adapter->work);
+	}
+}
+
+
+static void
+vmxnet3_unmap_tx_buf(struct vmxnet3_tx_buf_info *tbi,
+		     struct pci_dev *pdev)
+{
+	if (tbi->map_type == VMXNET3_MAP_SINGLE)
+		pci_unmap_single(pdev, tbi->dma_addr, tbi->len,
+				 PCI_DMA_TODEVICE);
+	else if (tbi->map_type == VMXNET3_MAP_PAGE)
+		pci_unmap_page(pdev, tbi->dma_addr, tbi->len,
+			       PCI_DMA_TODEVICE);
+	else
+		BUG_ON(tbi->map_type != VMXNET3_MAP_NONE);
+
+	tbi->map_type = VMXNET3_MAP_NONE; /* to help debugging */
+}
+
+
+static int
+vmxnet3_unmap_pkt(u32 eop_idx, struct vmxnet3_tx_queue *tq,
+		  struct pci_dev *pdev,	struct vmxnet3_adapter *adapter)
+{
+	struct sk_buff *skb;
+	int entries = 0;
+
+	/* no out of order completion */
+	BUG_ON(tq->buf_info[eop_idx].sop_idx != tq->tx_ring.next2comp);
+	BUG_ON(tq->tx_ring.base[eop_idx].txd.eop != 1);
+
+	skb = tq->buf_info[eop_idx].skb;
+	BUG_ON(skb == NULL);
+	tq->buf_info[eop_idx].skb = NULL;
+
+	VMXNET3_INC_RING_IDX_ONLY(eop_idx, tq->tx_ring.size);
+
+	while (tq->tx_ring.next2comp != eop_idx) {
+		vmxnet3_unmap_tx_buf(tq->buf_info + tq->tx_ring.next2comp,
+				     pdev);
+
+		/* update next2comp w/o tx_lock. Since we are marking more,
+		 * instead of less, tx ring entries avail, the worst case is
+		 * that the tx routine incorrectly re-queues a pkt due to
+		 * insufficient tx ring entries.
+		 */
+		vmxnet3_cmd_ring_adv_next2comp(&tq->tx_ring);
+		entries++;
+	}
+
+	dev_kfree_skb_any(skb);
+	return entries;
+}
+
+
+static int
+vmxnet3_tq_tx_complete(struct vmxnet3_tx_queue *tq,
+			struct vmxnet3_adapter *adapter)
+{
+	int completed = 0;
+	union Vmxnet3_GenericDesc *gdesc;
+
+	gdesc = tq->comp_ring.base + tq->comp_ring.next2proc;
+	while (gdesc->tcd.gen == tq->comp_ring.gen) {
+		completed += vmxnet3_unmap_pkt(gdesc->tcd.txdIdx, tq,
+					       adapter->pdev, adapter);
+
+		vmxnet3_comp_ring_adv_next2proc(&tq->comp_ring);
+		gdesc = tq->comp_ring.base + tq->comp_ring.next2proc;
+	}
+
+	if (completed) {
+		spin_lock(&tq->tx_lock);
+		if (unlikely(vmxnet3_tq_stopped(tq, adapter) &&
+			     vmxnet3_cmd_ring_desc_avail(&tq->tx_ring) >
+			     VMXNET3_WAKE_QUEUE_THRESHOLD(tq) &&
+			     netif_carrier_ok(adapter->netdev))) {
+			vmxnet3_tq_wake(tq, adapter);
+		}
+		spin_unlock(&tq->tx_lock);
+	}
+	return completed;
+}
+
+
+static void
+vmxnet3_tq_cleanup(struct vmxnet3_tx_queue *tq,
+		   struct vmxnet3_adapter *adapter)
+{
+	int i;
+
+	while (tq->tx_ring.next2comp != tq->tx_ring.next2fill) {
+		struct vmxnet3_tx_buf_info *tbi;
+		union Vmxnet3_GenericDesc *gdesc;
+
+		tbi = tq->buf_info + tq->tx_ring.next2comp;
+		gdesc = tq->tx_ring.base + tq->tx_ring.next2comp;
+
+		vmxnet3_unmap_tx_buf(tbi, adapter->pdev);
+		if (tbi->skb) {
+			dev_kfree_skb_any(tbi->skb);
+			tbi->skb = NULL;
+		}
+		vmxnet3_cmd_ring_adv_next2comp(&tq->tx_ring);
+	}
+
+	/* sanity check, verify all buffers are indeed unmapped and freed */
+	for (i = 0; i < tq->tx_ring.size; i++) {
+		BUG_ON(tq->buf_info[i].skb != NULL ||
+		       tq->buf_info[i].map_type != VMXNET3_MAP_NONE);
+	}
+
+	tq->tx_ring.gen = VMXNET3_INIT_GEN;
+	tq->tx_ring.next2fill = tq->tx_ring.next2comp = 0;
+
+	tq->comp_ring.gen = VMXNET3_INIT_GEN;
+	tq->comp_ring.next2proc = 0;
+}
+
+
+void
+vmxnet3_tq_destroy(struct vmxnet3_tx_queue *tq,
+		   struct vmxnet3_adapter *adapter)
+{
+	if (tq->tx_ring.base) {
+		pci_free_consistent(adapter->pdev, tq->tx_ring.size *
+				    sizeof(struct Vmxnet3_TxDesc),
+				    tq->tx_ring.base, tq->tx_ring.basePA);
+		tq->tx_ring.base = NULL;
+	}
+	if (tq->data_ring.base) {
+		pci_free_consistent(adapter->pdev, tq->data_ring.size *
+				    sizeof(struct Vmxnet3_TxDataDesc),
+				    tq->data_ring.base, tq->data_ring.basePA);
+		tq->data_ring.base = NULL;
+	}
+	if (tq->comp_ring.base) {
+		pci_free_consistent(adapter->pdev, tq->comp_ring.size *
+				    sizeof(struct Vmxnet3_TxCompDesc),
+				    tq->comp_ring.base, tq->comp_ring.basePA);
+		tq->comp_ring.base = NULL;
+	}
+	kfree(tq->buf_info);
+	tq->buf_info = NULL;
+}
+
+
+static void
+vmxnet3_tq_init(struct vmxnet3_tx_queue *tq,
+		struct vmxnet3_adapter *adapter)
+{
+	int i;
+
+	/* reset the tx ring contents to 0 and reset the tx ring states */
+	memset(tq->tx_ring.base, 0, tq->tx_ring.size *
+	       sizeof(struct Vmxnet3_TxDesc));
+	tq->tx_ring.next2fill = tq->tx_ring.next2comp = 0;
+	tq->tx_ring.gen = VMXNET3_INIT_GEN;
+
+	memset(tq->data_ring.base, 0, tq->data_ring.size *
+	       sizeof(struct Vmxnet3_TxDataDesc));
+
+	/* reset the tx comp ring contents to 0 and reset comp ring states */
+	memset(tq->comp_ring.base, 0, tq->comp_ring.size *
+	       sizeof(struct Vmxnet3_TxCompDesc));
+	tq->comp_ring.next2proc = 0;
+	tq->comp_ring.gen = VMXNET3_INIT_GEN;
+
+	/* reset the bookkeeping data */
+	memset(tq->buf_info, 0, sizeof(tq->buf_info[0]) * tq->tx_ring.size);
+	for (i = 0; i < tq->tx_ring.size; i++)
+		tq->buf_info[i].map_type = VMXNET3_MAP_NONE;
+
+	/* stats are not reset */
+}
+
+
+static int
+vmxnet3_tq_create(struct vmxnet3_tx_queue *tq,
+		  struct vmxnet3_adapter *adapter)
+{
+	BUG_ON(tq->tx_ring.base || tq->data_ring.base ||
+	       tq->comp_ring.base || tq->buf_info);
+
+	tq->tx_ring.base = pci_alloc_consistent(adapter->pdev, tq->tx_ring.size
+			   * sizeof(struct Vmxnet3_TxDesc),
+			   &tq->tx_ring.basePA);
+	if (!tq->tx_ring.base) {
+		printk(KERN_ERR "%s: failed to allocate tx ring\n",
+		       adapter->netdev->name);
+		goto err;
+	}
+
+	tq->data_ring.base = pci_alloc_consistent(adapter->pdev,
+			     tq->data_ring.size *
+			     sizeof(struct Vmxnet3_TxDataDesc),
+			     &tq->data_ring.basePA);
+	if (!tq->data_ring.base) {
+		printk(KERN_ERR "%s: failed to allocate data ring\n",
+		       adapter->netdev->name);
+		goto err;
+	}
+
+	tq->comp_ring.base = pci_alloc_consistent(adapter->pdev,
+			     tq->comp_ring.size *
+			     sizeof(struct Vmxnet3_TxCompDesc),
+			     &tq->comp_ring.basePA);
+	if (!tq->comp_ring.base) {
+		printk(KERN_ERR "%s: failed to allocate tx comp ring\n",
+		       adapter->netdev->name);
+		goto err;
+	}
+
+	tq->buf_info = kcalloc(tq->tx_ring.size, sizeof(tq->buf_info[0]),
+			       GFP_KERNEL);
+	if (!tq->buf_info) {
+		printk(KERN_ERR "%s: failed to allocate tx bufinfo\n",
+		       adapter->netdev->name);
+		goto err;
+	}
+
+	return 0;
+
+err:
+	vmxnet3_tq_destroy(tq, adapter);
+	return -ENOMEM;
+}
+
+
+/*
+ *    starting from ring->next2fill, allocate rx buffers for the given ring
+ *    of the rx queue and update the rx desc. stop after @num_to_alloc buffers
+ *    are allocated or allocation fails
+ */
+
+static int
+vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
+			int num_to_alloc, struct vmxnet3_adapter *adapter)
+{
+	int num_allocated = 0;
+	struct vmxnet3_rx_buf_info *rbi_base = rq->buf_info[ring_idx];
+	struct vmxnet3_cmd_ring *ring = &rq->rx_ring[ring_idx];
+	u32 val;
+
+	while (num_allocated < num_to_alloc) {
+		struct vmxnet3_rx_buf_info *rbi;
+		union Vmxnet3_GenericDesc *gd;
+
+		rbi = rbi_base + ring->next2fill;
+		gd = ring->base + ring->next2fill;
+
+		if (rbi->buf_type == VMXNET3_RX_BUF_SKB) {
+			if (rbi->skb == NULL) {
+				rbi->skb = dev_alloc_skb(rbi->len +
+							 NET_IP_ALIGN);
+				if (unlikely(rbi->skb == NULL)) {
+					rq->stats.rx_buf_alloc_failure++;
+					break;
+				}
+				rbi->skb->dev = adapter->netdev;
+
+				skb_reserve(rbi->skb, NET_IP_ALIGN);
+				rbi->dma_addr = pci_map_single(adapter->pdev,
+						rbi->skb->data, rbi->len,
+						PCI_DMA_FROMDEVICE);
+			} else {
+				/* rx buffer skipped by the device */
+			}
+			val = VMXNET3_RXD_BTYPE_HEAD << VMXNET3_RXD_BTYPE_SHIFT;
+		} else {
+			BUG_ON(rbi->buf_type != VMXNET3_RX_BUF_PAGE ||
+			       rbi->len  != PAGE_SIZE);
+
+			if (rbi->page == NULL) {
+				rbi->page = alloc_page(GFP_ATOMIC);
+				if (unlikely(rbi->page == NULL)) {
+					rq->stats.rx_buf_alloc_failure++;
+					break;
+				}
+				rbi->dma_addr = pci_map_page(adapter->pdev,
+						rbi->page, 0, PAGE_SIZE,
+						PCI_DMA_FROMDEVICE);
+			} else {
+				/* rx buffers skipped by the device */
+			}
+			val = VMXNET3_RXD_BTYPE_BODY << VMXNET3_RXD_BTYPE_SHIFT;
+		}
+
+		BUG_ON(rbi->dma_addr == 0);
+		gd->rxd.addr = rbi->dma_addr;
+		gd->dword[2] = (ring->gen << VMXNET3_RXD_GEN_SHIFT) | val |
+				rbi->len;
+
+		num_allocated++;
+		vmxnet3_cmd_ring_adv_next2fill(ring);
+	}
+	rq->uncommitted[ring_idx] += num_allocated;
+
+	dprintk(KERN_ERR "alloc_rx_buf: %d allocated, next2fill %u, next2comp "
+		"%u, uncommited %u\n", num_allocated, ring->next2fill,
+		ring->next2comp, rq->uncommitted[ring_idx]);
+
+	/* so that the device can distinguish a full ring and an empty ring */
+	BUG_ON(num_allocated != 0 && ring->next2fill == ring->next2comp);
+
+	return num_allocated;
+}
+
+
+static void
+vmxnet3_append_frag(struct sk_buff *skb, struct Vmxnet3_RxCompDesc *rcd,
+		    struct vmxnet3_rx_buf_info *rbi)
+{
+	struct skb_frag_struct *frag = skb_shinfo(skb)->frags +
+		skb_shinfo(skb)->nr_frags;
+
+	BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS);
+
+	frag->page = rbi->page;
+	frag->page_offset = 0;
+	frag->size = rcd->len;
+	skb->data_len += frag->size;
+	skb_shinfo(skb)->nr_frags++;
+}
+
+
+static void
+vmxnet3_map_pkt(struct sk_buff *skb, struct vmxnet3_tx_ctx *ctx,
+		struct vmxnet3_tx_queue *tq, struct pci_dev *pdev,
+		struct vmxnet3_adapter *adapter)
+{
+	u32 dw2, len;
+	unsigned long buf_offset;
+	int i;
+	union Vmxnet3_GenericDesc *gdesc;
+	struct vmxnet3_tx_buf_info *tbi = NULL;
+
+	BUG_ON(ctx->copy_size > skb_headlen(skb));
+
+	/* use the previous gen bit for the SOP desc */
+	dw2 = (tq->tx_ring.gen ^ 0x1) << VMXNET3_TXD_GEN_SHIFT;
+
+	ctx->sop_txd = tq->tx_ring.base + tq->tx_ring.next2fill;
+	gdesc = ctx->sop_txd; /* both loops below can be skipped */
+
+	/* no need to map the buffer if headers are copied */
+	if (ctx->copy_size) {
+		ctx->sop_txd->txd.addr = tq->data_ring.basePA +
+					tq->tx_ring.next2fill *
+					sizeof(struct Vmxnet3_TxDataDesc);
+		ctx->sop_txd->dword[2] = dw2 | ctx->copy_size;
+		ctx->sop_txd->dword[3] = 0;
+
+		tbi = tq->buf_info + tq->tx_ring.next2fill;
+		tbi->map_type = VMXNET3_MAP_NONE;
+
+		dprintk(KERN_ERR "txd[%u]: 0x%Lx 0x%x 0x%x\n",
+			tq->tx_ring.next2fill, ctx->sop_txd->txd.addr,
+			ctx->sop_txd->dword[2], ctx->sop_txd->dword[3]);
+		vmxnet3_cmd_ring_adv_next2fill(&tq->tx_ring);
+
+		/* use the right gen for non-SOP desc */
+		dw2 = tq->tx_ring.gen << VMXNET3_TXD_GEN_SHIFT;
+	}
+
+	/* linear part can use multiple tx desc if it's big */
+	len = skb_headlen(skb) - ctx->copy_size;
+	buf_offset = ctx->copy_size;
+	while (len) {
+		u32 buf_size;
+
+		buf_size = len > VMXNET3_MAX_TX_BUF_SIZE ?
+			   VMXNET3_MAX_TX_BUF_SIZE : len;
+
+		tbi = tq->buf_info + tq->tx_ring.next2fill;
+		tbi->map_type = VMXNET3_MAP_SINGLE;
+		tbi->dma_addr = pci_map_single(adapter->pdev,
+				skb->data + buf_offset, buf_size,
+				PCI_DMA_TODEVICE);
+
+		tbi->len = buf_size; /* this automatically converts 2^14 to 0 */
+
+		gdesc = tq->tx_ring.base + tq->tx_ring.next2fill;
+		BUG_ON(gdesc->txd.gen == tq->tx_ring.gen);
+
+		gdesc->txd.addr = tbi->dma_addr;
+		gdesc->dword[2] = dw2 | buf_size;
+		gdesc->dword[3] = 0;
+
+		dprintk(KERN_ERR "txd[%u]: 0x%Lx 0x%x 0x%x\n",
+			tq->tx_ring.next2fill, gdesc->txd.addr,
+			gdesc->dword[2], gdesc->dword[3]);
+		vmxnet3_cmd_ring_adv_next2fill(&tq->tx_ring);
+		dw2 = tq->tx_ring.gen << VMXNET3_TXD_GEN_SHIFT;
+
+		len -= buf_size;
+		buf_offset += buf_size;
+	}
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i];
+
+		tbi = tq->buf_info + tq->tx_ring.next2fill;
+		tbi->map_type = VMXNET3_MAP_PAGE;
+		tbi->dma_addr = pci_map_page(adapter->pdev, frag->page,
+					     frag->page_offset, frag->size,
+					     PCI_DMA_TODEVICE);
+
+		tbi->len = frag->size;
+
+		gdesc = tq->tx_ring.base + tq->tx_ring.next2fill;
+		BUG_ON(gdesc->txd.gen == tq->tx_ring.gen);
+
+		gdesc->txd.addr = tbi->dma_addr;
+		gdesc->dword[2] = dw2 | frag->size;
+		gdesc->dword[3] = 0;
+
+		dprintk(KERN_ERR "txd[%u]: 0x%llu %u %u\n",
+			tq->tx_ring.next2fill, gdesc->txd.addr,
+			gdesc->dword[2], gdesc->dword[3]);
+		vmxnet3_cmd_ring_adv_next2fill(&tq->tx_ring);
+		dw2 = tq->tx_ring.gen << VMXNET3_TXD_GEN_SHIFT;
+	}
+
+	ctx->eop_txd = gdesc;
+
+	/* set the last buf_info for the pkt */
+	tbi->skb = skb;
+	tbi->sop_idx = ctx->sop_txd - tq->tx_ring.base;
+}
+
+
+/*
+ *    parse and copy relevant protocol headers:
+ *      For a tso pkt, relevant headers are L2/3/4 including options
+ *      For a pkt requesting csum offloading, they are L2/3 and may include L4
+ *      if it's a TCP/UDP pkt
+ *
+ * Returns:
+ *    -1:  error happens during parsing
+ *     0:  protocol headers parsed, but too big to be copied
+ *     1:  protocol headers parsed and copied
+ *
+ * Other effects:
+ *    1. related *ctx fields are updated.
+ *    2. ctx->copy_size is # of bytes copied
+ *    3. the portion copied is guaranteed to be in the linear part
+ *
+ */
+static int
+vmxnet3_parse_and_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
+			   struct vmxnet3_tx_ctx *ctx,
+			   struct vmxnet3_adapter *adapter)
+{
+	struct Vmxnet3_TxDataDesc *tdd;
+
+	if (ctx->mss) {
+		ctx->eth_ip_hdr_size = skb_transport_offset(skb);
+		ctx->l4_hdr_size = ((struct tcphdr *)
+				   skb_transport_header(skb))->doff * 4;
+		ctx->copy_size = ctx->eth_ip_hdr_size + ctx->l4_hdr_size;
+	} else {
+		unsigned int pull_size;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			ctx->eth_ip_hdr_size = skb_transport_offset(skb);
+
+			if (ctx->ipv4) {
+				struct iphdr *iph = (struct iphdr *)
+						    skb_network_header(skb);
+				if (iph->protocol == IPPROTO_TCP) {
+					pull_size = ctx->eth_ip_hdr_size +
+						    sizeof(struct tcphdr);
+
+					if (unlikely(!pskb_may_pull(skb,
+								pull_size))) {
+						goto err;
+					}
+					ctx->l4_hdr_size = ((struct tcphdr *)
+					   skb_transport_header(skb))->doff * 4;
+				} else if (iph->protocol == IPPROTO_UDP) {
+					ctx->l4_hdr_size =
+							sizeof(struct udphdr);
+				} else {
+					ctx->l4_hdr_size = 0;
+				}
+			} else {
+				/* for simplicity, don't copy L4 headers */
+				ctx->l4_hdr_size = 0;
+			}
+			ctx->copy_size = ctx->eth_ip_hdr_size +
+					 ctx->l4_hdr_size;
+		} else {
+			ctx->eth_ip_hdr_size = 0;
+			ctx->l4_hdr_size = 0;
+			/* copy as much as allowed */
+			ctx->copy_size = min((unsigned int)VMXNET3_HDR_COPY_SIZE
+					     , skb_headlen(skb));
+		}
+
+		/* make sure headers are accessible directly */
+		if (unlikely(!pskb_may_pull(skb, ctx->copy_size)))
+			goto err;
+	}
+
+	if (unlikely(ctx->copy_size > VMXNET3_HDR_COPY_SIZE)) {
+		tq->stats.oversized_hdr++;
+		ctx->copy_size = 0;
+		return 0;
+	}
+
+	tdd = tq->data_ring.base + tq->tx_ring.next2fill;
+
+	memcpy(tdd->data, skb->data, ctx->copy_size);
+	dprintk(KERN_ERR "copy %u bytes to dataRing[%u]\n",
+		ctx->copy_size, tq->tx_ring.next2fill);
+	return 1;
+
+err:
+	return -1;
+}
+
+
+static void
+vmxnet3_prepare_tso(struct sk_buff *skb,
+		    struct vmxnet3_tx_ctx *ctx)
+{
+	struct tcphdr *tcph = (struct tcphdr *)skb_transport_header(skb);
+	if (ctx->ipv4) {
+		struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
+		iph->check = 0;
+		tcph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, 0,
+						 IPPROTO_TCP, 0);
+	} else {
+		struct ipv6hdr *iph = (struct ipv6hdr *)skb_network_header(skb);
+		tcph->check = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, 0,
+					       IPPROTO_TCP, 0);
+	}
+}
+
+
+/*
+ * Transmits a pkt thru a given tq
+ * Returns:
+ *    NETDEV_TX_OK:      descriptors are setup successfully
+ *    NETDEV_TX_OK:      error occurred, the pkt is dropped
+ *    NETDEV_TX_BUSY:    tx ring is full, queue is stopped
+ *
+ * Side-effects:
+ *    1. tx ring may be changed
+ *    2. tq stats may be updated accordingly
+ *    3. shared->txNumDeferred may be updated
+ */
+
+static int
+vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq,
+		struct vmxnet3_adapter *adapter, struct net_device *netdev)
+{
+	int ret;
+	u32 count;
+	unsigned long flags;
+	struct vmxnet3_tx_ctx ctx;
+	union Vmxnet3_GenericDesc *gdesc;
+
+	/* conservatively estimate # of descriptors to use */
+	count = VMXNET3_TXD_NEEDED(skb_headlen(skb)) +
+		skb_shinfo(skb)->nr_frags + 1;
+
+	ctx.ipv4 = (skb->protocol == __constant_ntohs(ETH_P_IP));
+
+	ctx.mss = skb_shinfo(skb)->gso_size;
+	if (ctx.mss) {
+		if (skb_header_cloned(skb)) {
+			if (unlikely(pskb_expand_head(skb, 0, 0,
+						      GFP_ATOMIC) != 0)) {
+				tq->stats.drop_tso++;
+				goto drop_pkt;
+			}
+			tq->stats.copy_skb_header++;
+		}
+		vmxnet3_prepare_tso(skb, &ctx);
+	} else {
+		if (unlikely(count > VMXNET3_MAX_TXD_PER_PKT)) {
+
+			/* non-tso pkts must not use more than
+			 * VMXNET3_MAX_TXD_PER_PKT entries
+			 */
+			if (skb_linearize(skb) != 0) {
+				tq->stats.drop_too_many_frags++;
+				goto drop_pkt;
+			}
+			tq->stats.linearized++;
+
+			/* recalculate the # of descriptors to use */
+			count = VMXNET3_TXD_NEEDED(skb_headlen(skb)) + 1;
+		}
+	}
+
+	ret = vmxnet3_parse_and_copy_hdr(skb, tq, &ctx, adapter);
+	if (ret >= 0) {
+		BUG_ON(ret <= 0 && ctx.copy_size != 0);
+		/* hdrs parsed, check against other limits */
+		if (ctx.mss) {
+			if (unlikely(ctx.eth_ip_hdr_size + ctx.l4_hdr_size >
+				     VMXNET3_MAX_TX_BUF_SIZE)) {
+				goto hdr_too_big;
+			}
+		} else {
+			if (skb->ip_summed == CHECKSUM_PARTIAL) {
+				if (unlikely(ctx.eth_ip_hdr_size +
+					     skb->csum_offset >
+					     VMXNET3_MAX_CSUM_OFFSET)) {
+					goto hdr_too_big;
+				}
+			}
+		}
+	} else {
+		tq->stats.drop_hdr_inspect_err++;
+		goto drop_pkt;
+	}
+
+	spin_lock_irqsave(&tq->tx_lock, flags);
+
+	if (count > vmxnet3_cmd_ring_desc_avail(&tq->tx_ring)) {
+		tq->stats.tx_ring_full++;
+		dprintk(KERN_ERR "tx queue stopped on %s, next2comp %u"
+			" next2fill %u\n", adapter->netdev->name,
+			tq->tx_ring.next2comp, tq->tx_ring.next2fill);
+
+		vmxnet3_tq_stop(tq, adapter);
+		spin_unlock_irqrestore(&tq->tx_lock, flags);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* fill tx descs related to addr & len */
+	vmxnet3_map_pkt(skb, &ctx, tq, adapter->pdev, adapter);
+
+	/* setup the EOP desc */
+	ctx.eop_txd->dword[3] = VMXNET3_TXD_CQ | VMXNET3_TXD_EOP;
+
+	/* setup the SOP desc */
+	gdesc = ctx.sop_txd;
+	if (ctx.mss) {
+		gdesc->txd.hlen = ctx.eth_ip_hdr_size + ctx.l4_hdr_size;
+		gdesc->txd.om = VMXNET3_OM_TSO;
+		gdesc->txd.msscof = ctx.mss;
+		tq->shared->txNumDeferred += (skb->len - gdesc->txd.hlen +
+					     ctx.mss - 1) / ctx.mss;
+	} else {
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			gdesc->txd.hlen = ctx.eth_ip_hdr_size;
+			gdesc->txd.om = VMXNET3_OM_CSUM;
+			gdesc->txd.msscof = ctx.eth_ip_hdr_size +
+					    skb->csum_offset;
+		} else {
+			gdesc->txd.om = 0;
+			gdesc->txd.msscof = 0;
+		}
+		tq->shared->txNumDeferred++;
+	}
+
+	if (vlan_tx_tag_present(skb)) {
+		gdesc->txd.ti = 1;
+		gdesc->txd.tci = vlan_tx_tag_get(skb);
+	}
+
+	wmb();
+
+	/* finally flips the GEN bit of the SOP desc */
+	gdesc->dword[2] ^= VMXNET3_TXD_GEN;
+	dprintk(KERN_ERR "txd[%u]: SOP 0x%Lx 0x%x 0x%x\n",
+		(u32)((union Vmxnet3_GenericDesc *)ctx.sop_txd -
+		tq->tx_ring.base), gdesc->txd.addr, gdesc->dword[2],
+		gdesc->dword[3]);
+
+	spin_unlock_irqrestore(&tq->tx_lock, flags);
+
+	if (tq->shared->txNumDeferred >= tq->shared->txThreshold) {
+		tq->shared->txNumDeferred = 0;
+		VMXNET3_WRITE_BAR0_REG(adapter, VMXNET3_REG_TXPROD,
+				       tq->tx_ring.next2fill);
+	}
+	netdev->trans_start = jiffies;
+
+	return NETDEV_TX_OK;
+
+hdr_too_big:
+	tq->stats.drop_oversized_hdr++;
+drop_pkt:
+	tq->stats.drop_total++;
+	dev_kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
+
+static netdev_tx_t
+vmxnet3_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	struct vmxnet3_tx_queue *tq = &adapter->tx_queue;
+
+	return vmxnet3_tq_xmit(skb, tq, adapter, netdev);
+}
+
+
+static void
+vmxnet3_rx_csum(struct vmxnet3_adapter *adapter,
+		struct sk_buff *skb,
+		union Vmxnet3_GenericDesc *gdesc)
+{
+	if (!gdesc->rcd.cnc && adapter->rxcsum) {
+		/* typical case: TCP/UDP over IP and both csums are correct */
+		if ((gdesc->dword[3] & VMXNET3_RCD_CSUM_OK) ==
+							VMXNET3_RCD_CSUM_OK) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			BUG_ON(!(gdesc->rcd.tcp || gdesc->rcd.udp));
+			BUG_ON(!(gdesc->rcd.v4  || gdesc->rcd.v6));
+			BUG_ON(gdesc->rcd.frg);
+		} else {
+			if (gdesc->rcd.csum) {
+				skb->csum = htons(gdesc->rcd.csum);
+				skb->ip_summed = CHECKSUM_PARTIAL;
+			} else {
+				skb->ip_summed = CHECKSUM_NONE;
+			}
+		}
+	} else {
+		skb->ip_summed = CHECKSUM_NONE;
+	}
+}
+
+
+static void
+vmxnet3_rx_error(struct vmxnet3_rx_queue *rq, struct Vmxnet3_RxCompDesc *rcd,
+		 struct vmxnet3_rx_ctx *ctx,  struct vmxnet3_adapter *adapter)
+{
+	rq->stats.drop_err++;
+	if (!rcd->fcs)
+		rq->stats.drop_fcs++;
+
+	rq->stats.drop_total++;
+
+	/*
+	 * We do not unmap and chain the rx buffer to the skb.
+	 * We basically pretend this buffer is not used and will be recycled
+	 * by vmxnet3_rq_alloc_rx_buf()
+	 */
+
+	/*
+	 * ctx->skb may be NULL if this is the first and the only one
+	 * desc for the pkt
+	 */
+	if (ctx->skb)
+		dev_kfree_skb_irq(ctx->skb);
+
+	ctx->skb = NULL;
+}
+
+
+static int
+vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
+		       struct vmxnet3_adapter *adapter, int quota)
+{
+	static u32 rxprod_reg[2] = {VMXNET3_REG_RXPROD, VMXNET3_REG_RXPROD2};
+	u32 num_rxd = 0;
+	struct Vmxnet3_RxCompDesc *rcd;
+	struct vmxnet3_rx_ctx *ctx = &rq->rx_ctx;
+
+	rcd = &rq->comp_ring.base[rq->comp_ring.next2proc].rcd;
+	while (rcd->gen == rq->comp_ring.gen) {
+		struct vmxnet3_rx_buf_info *rbi;
+		struct sk_buff *skb;
+		int num_to_alloc;
+		struct Vmxnet3_RxDesc *rxd;
+		u32 idx, ring_idx;
+
+		if (num_rxd >= quota) {
+			/* we may stop even before we see the EOP desc of
+			 * the current pkt
+			 */
+			break;
+		}
+		num_rxd++;
+
+		idx = rcd->rxdIdx;
+		ring_idx = rcd->rqID == rq->qid ? 0 : 1;
+
+		rxd = &rq->rx_ring[ring_idx].base[idx].rxd;
+		rbi = rq->buf_info[ring_idx] + idx;
+
+		BUG_ON(rxd->addr != rbi->dma_addr || rxd->len != rbi->len);
+
+		if (unlikely(rcd->eop && rcd->err)) {
+			vmxnet3_rx_error(rq, rcd, ctx, adapter);
+			goto rcd_done;
+		}
+
+		if (rcd->sop) { /* first buf of the pkt */
+			BUG_ON(rxd->btype != VMXNET3_RXD_BTYPE_HEAD ||
+			       rcd->rqID != rq->qid);
+
+			BUG_ON(rbi->buf_type != VMXNET3_RX_BUF_SKB);
+			BUG_ON(ctx->skb != NULL || rbi->skb == NULL);
+
+			if (unlikely(rcd->len == 0)) {
+				/* Pretend the rx buffer is skipped. */
+				BUG_ON(!(rcd->sop && rcd->eop));
+				dprintk(KERN_ERR "rxRing[%u][%u] 0 length\n",
+					ring_idx, idx);
+				goto rcd_done;
+			}
+
+			ctx->skb = rbi->skb;
+			rbi->skb = NULL;
+
+			pci_unmap_single(adapter->pdev, rbi->dma_addr, rbi->len,
+					 PCI_DMA_FROMDEVICE);
+
+			skb_put(ctx->skb, rcd->len);
+		} else {
+			BUG_ON(ctx->skb == NULL);
+			/* non SOP buffer must be type 1 in most cases */
+			if (rbi->buf_type == VMXNET3_RX_BUF_PAGE) {
+				BUG_ON(rxd->btype != VMXNET3_RXD_BTYPE_BODY);
+
+				if (rcd->len) {
+					pci_unmap_page(adapter->pdev,
+						       rbi->dma_addr, rbi->len,
+						       PCI_DMA_FROMDEVICE);
+
+					vmxnet3_append_frag(ctx->skb, rcd, rbi);
+					rbi->page = NULL;
+				}
+			} else {
+				/*
+				 * The only time a non-SOP buffer is type 0 is
+				 * when it's EOP and error flag is raised, which
+				 * has already been handled.
+				 */
+				BUG_ON(true);
+			}
+		}
+
+		skb = ctx->skb;
+		if (rcd->eop) {
+			skb->len += skb->data_len;
+			skb->truesize += skb->data_len;
+
+			vmxnet3_rx_csum(adapter, skb,
+					(union Vmxnet3_GenericDesc *)rcd);
+			skb->protocol = eth_type_trans(skb, adapter->netdev);
+
+			if (unlikely(adapter->vlan_grp && rcd->ts)) {
+				vlan_hwaccel_receive_skb(skb,
+						adapter->vlan_grp, rcd->tci);
+			} else {
+				netif_receive_skb(skb);
+			}
+
+			adapter->netdev->last_rx = jiffies;
+			ctx->skb = NULL;
+		}
+
+rcd_done:
+		/* device may skip some rx descs */
+		rq->rx_ring[ring_idx].next2comp = idx;
+		VMXNET3_INC_RING_IDX_ONLY(rq->rx_ring[ring_idx].next2comp,
+					  rq->rx_ring[ring_idx].size);
+
+		/* refill rx buffers frequently to avoid starving the h/w */
+		num_to_alloc = vmxnet3_cmd_ring_desc_avail(rq->rx_ring +
+							   ring_idx);
+		if (unlikely(num_to_alloc > VMXNET3_RX_ALLOC_THRESHOLD(rq,
+							ring_idx, adapter))) {
+			vmxnet3_rq_alloc_rx_buf(rq, ring_idx, num_to_alloc,
+						adapter);
+
+			/* if needed, update the register */
+			if (unlikely(rq->shared->updateRxProd)) {
+				VMXNET3_WRITE_BAR0_REG(adapter,
+					rxprod_reg[ring_idx] + rq->qid * 8,
+					rq->rx_ring[ring_idx].next2fill);
+				rq->uncommitted[ring_idx] = 0;
+			}
+		}
+
+		vmxnet3_comp_ring_adv_next2proc(&rq->comp_ring);
+		rcd = &rq->comp_ring.base[rq->comp_ring.next2proc].rcd;
+	}
+
+	return num_rxd;
+}
+
+
+static void
+vmxnet3_rq_cleanup(struct vmxnet3_rx_queue *rq,
+		   struct vmxnet3_adapter *adapter)
+{
+	u32 i, ring_idx;
+	struct Vmxnet3_RxDesc *rxd;
+
+	for (ring_idx = 0; ring_idx < 2; ring_idx++) {
+		for (i = 0; i < rq->rx_ring[ring_idx].size; i++) {
+			rxd = &rq->rx_ring[ring_idx].base[i].rxd;
+
+			if (rxd->btype == VMXNET3_RXD_BTYPE_HEAD &&
+					rq->buf_info[ring_idx][i].skb) {
+				pci_unmap_single(adapter->pdev, rxd->addr,
+						 rxd->len, PCI_DMA_FROMDEVICE);
+				dev_kfree_skb(rq->buf_info[ring_idx][i].skb);
+				rq->buf_info[ring_idx][i].skb = NULL;
+			} else if (rxd->btype == VMXNET3_RXD_BTYPE_BODY &&
+					rq->buf_info[ring_idx][i].page) {
+				pci_unmap_page(adapter->pdev, rxd->addr,
+					       rxd->len, PCI_DMA_FROMDEVICE);
+				put_page(rq->buf_info[ring_idx][i].page);
+				rq->buf_info[ring_idx][i].page = NULL;
+			}
+		}
+
+		rq->rx_ring[ring_idx].gen = VMXNET3_INIT_GEN;
+		rq->rx_ring[ring_idx].next2fill =
+					rq->rx_ring[ring_idx].next2comp = 0;
+		rq->uncommitted[ring_idx] = 0;
+	}
+
+	rq->comp_ring.gen = VMXNET3_INIT_GEN;
+	rq->comp_ring.next2proc = 0;
+}
+
+
+void vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq,
+			struct vmxnet3_adapter *adapter)
+{
+	int i;
+	int j;
+
+	/* all rx buffers must have already been freed */
+	for (i = 0; i < 2; i++) {
+		if (rq->buf_info[i]) {
+			for (j = 0; j < rq->rx_ring[i].size; j++)
+				BUG_ON(rq->buf_info[i][j].page != NULL);
+		}
+	}
+
+
+	kfree(rq->buf_info[0]);
+
+	for (i = 0; i < 2; i++) {
+		if (rq->rx_ring[i].base) {
+			pci_free_consistent(adapter->pdev, rq->rx_ring[i].size
+					    * sizeof(struct Vmxnet3_RxDesc),
+					    rq->rx_ring[i].base,
+					    rq->rx_ring[i].basePA);
+			rq->rx_ring[i].base = NULL;
+		}
+		rq->buf_info[i] = NULL;
+	}
+
+	if (rq->comp_ring.base) {
+		pci_free_consistent(adapter->pdev, rq->comp_ring.size *
+				    sizeof(struct Vmxnet3_RxCompDesc),
+				    rq->comp_ring.base, rq->comp_ring.basePA);
+		rq->comp_ring.base = NULL;
+	}
+}
+
+
+static int
+vmxnet3_rq_init(struct vmxnet3_rx_queue *rq,
+		struct vmxnet3_adapter  *adapter)
+{
+	int i;
+
+	/* initialize buf_info */
+	for (i = 0; i < rq->rx_ring[0].size; i++) {
+
+		/* 1st buf for a pkt is skbuff */
+		if (i % adapter->rx_buf_per_pkt == 0) {
+			rq->buf_info[0][i].buf_type = VMXNET3_RX_BUF_SKB;
+			rq->buf_info[0][i].len = adapter->skb_buf_size;
+		} else { /* subsequent bufs for a pkt are frags */
+			rq->buf_info[0][i].buf_type = VMXNET3_RX_BUF_PAGE;
+			rq->buf_info[0][i].len = PAGE_SIZE;
+		}
+	}
+	for (i = 0; i < rq->rx_ring[1].size; i++) {
+		rq->buf_info[1][i].buf_type = VMXNET3_RX_BUF_PAGE;
+		rq->buf_info[1][i].len = PAGE_SIZE;
+	}
+
+	/* reset internal state and allocate buffers for both rings */
+	for (i = 0; i < 2; i++) {
+		rq->rx_ring[i].next2fill = rq->rx_ring[i].next2comp = 0;
+		rq->uncommitted[i] = 0;
+
+		memset(rq->rx_ring[i].base, 0, rq->rx_ring[i].size *
+		       sizeof(struct Vmxnet3_RxDesc));
+		rq->rx_ring[i].gen = VMXNET3_INIT_GEN;
+	}
+	if (vmxnet3_rq_alloc_rx_buf(rq, 0, rq->rx_ring[0].size - 1,
+				    adapter) == 0) {
+		/* at least has 1 rx buffer for the 1st ring */
+		return -ENOMEM;
+	}
+	vmxnet3_rq_alloc_rx_buf(rq, 1, rq->rx_ring[1].size - 1, adapter);
+
+	/* reset the comp ring */
+	rq->comp_ring.next2proc = 0;
+	memset(rq->comp_ring.base, 0, rq->comp_ring.size *
+	       sizeof(struct Vmxnet3_RxCompDesc));
+	rq->comp_ring.gen = VMXNET3_INIT_GEN;
+
+	/* reset rxctx */
+	rq->rx_ctx.skb = NULL;
+
+	/* stats are not reset */
+	return 0;
+}
+
+
+/*
+ * Allocate the memory backing an rx queue: descriptor memory for both rx
+ * rings and the completion ring (sizes taken from the ring structures,
+ * which the caller must have filled in), plus one zeroed buf_info array
+ * shared by the two rings.
+ *
+ * Returns 0 on success.  On any allocation failure everything allocated
+ * so far is released through vmxnet3_rq_destroy() and -ENOMEM is returned.
+ */
+static int
+vmxnet3_rq_create(struct vmxnet3_rx_queue *rq,
+		  struct vmxnet3_adapter *adapter)
+{
+	int i;
+	size_t sz;
+	struct vmxnet3_rx_buf_info *bi;
+
+	for (i = 0; i < 2; i++) {
+
+		sz = rq->rx_ring[i].size * sizeof(struct Vmxnet3_RxDesc);
+		rq->rx_ring[i].base =
+			pci_alloc_consistent(adapter->pdev, sz,
+					     &rq->rx_ring[i].basePA);
+		if (!rq->rx_ring[i].base) {
+			printk(KERN_ERR "%s: failed to allocate rx ring %d\n",
+			       adapter->netdev->name, i);
+			goto err;
+		}
+	}
+
+	sz = rq->comp_ring.size * sizeof(struct Vmxnet3_RxCompDesc);
+	rq->comp_ring.base = pci_alloc_consistent(adapter->pdev, sz,
+						  &rq->comp_ring.basePA);
+	if (!rq->comp_ring.base) {
+		printk(KERN_ERR "%s: failed to allocate rx comp ring\n",
+		       adapter->netdev->name);
+		goto err;
+	}
+
+	/*
+	 * One zeroed array holds the buf_info of both rings.  kcalloc()
+	 * zeroes the memory and checks the count * size multiplication
+	 * for overflow.
+	 */
+	bi = kcalloc(rq->rx_ring[0].size + rq->rx_ring[1].size,
+		     sizeof(*bi), GFP_KERNEL);
+	if (!bi) {
+		printk(KERN_ERR "%s: failed to allocate rx bufinfo\n",
+		       adapter->netdev->name);
+		goto err;
+	}
+	/* ring 1's entries start right after ring 0's */
+	rq->buf_info[0] = bi;
+	rq->buf_info[1] = bi + rq->rx_ring[0].size;
+
+	return 0;
+
+err:
+	vmxnet3_rq_destroy(rq, adapter);
+	return -ENOMEM;
+}
+
+
+/*
+ * One polling pass: process device events if shared->ecr is non-zero,
+ * then run tx completion and rx completion.
+ *
+ * @budget is passed on to vmxnet3_rq_rx_complete() only.  The values
+ * returned by the tx and rx completion routines are stored in *txd_done
+ * and *rxd_done.
+ */
+static void
+vmxnet3_do_poll(struct vmxnet3_adapter *adapter, int budget, int *txd_done,
+		int *rxd_done)
+{
+	if (unlikely(adapter->shared->ecr))
+		vmxnet3_process_events(adapter);
+
+	*txd_done = vmxnet3_tq_tx_complete(&adapter->tx_queue, adapter);
+	*rxd_done = vmxnet3_rq_rx_complete(&adapter->rx_queue, adapter,
+					   budget);
+}
+
+
+/*
+ * NAPI poll callback.  Runs one polling pass and returns the rx count
+ * reported by it.  When that count is below @budget, NAPI is completed
+ * and interrupt 0 is enabled again.
+ */
+static int
+vmxnet3_poll(struct napi_struct *napi, int budget)
+{
+	struct vmxnet3_adapter *adapter;
+	int tx_cleaned;
+	int rx_cleaned;
+
+	adapter = container_of(napi, struct vmxnet3_adapter, napi);
+	vmxnet3_do_poll(adapter, budget, &tx_cleaned, &rx_cleaned);
+
+	/* budget used up: neither complete NAPI nor enable the interrupt */
+	if (rx_cleaned >= budget)
+		return rx_cleaned;
+
+	napi_complete(napi);
+	vmxnet3_enable_intr(adapter, 0);
+
+	return rx_cleaned;
+}
+
+
+/*
+ * Interrupt handler for vmxnet3.  @dev_id is the net_device that was
+ * handed to request_irq().
+ */
+static irqreturn_t
+vmxnet3_intr(int irq, void *dev_id)
+{
+	struct net_device *netdev = dev_id;
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	u32 cause;
+
+	/* for INTx, an ICR value of zero means the interrupt is not ours */
+	if (unlikely(adapter->intr.type == VMXNET3_IT_INTX)) {
+		cause = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_ICR);
+		if (unlikely(!cause))
+			return IRQ_NONE;
+	}
+
+	/* in active mask mode the driver disables the interrupt itself */
+	if (adapter->intr.mask_mode == VMXNET3_IMM_ACTIVE)
+		vmxnet3_disable_intr(adapter, 0);
+
+	/* the actual work is done from the NAPI poll callback */
+	napi_schedule(&adapter->napi);
+
+	return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+
+
+/*
+ * netpoll callback: invoke the interrupt handler directly, with the
+ * device's irq line disabled around the call.
+ */
+static void
+vmxnet3_netpoll(struct net_device *netdev)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	int irq;
+
+	/* MSI-X uses the vector of entry 0, everything else the pci irq */
+	irq = (adapter->intr.type == VMXNET3_IT_MSIX) ?
+		adapter->intr.msix_entries[0].vector : adapter->pdev->irq;
+
+	disable_irq(irq);
+	vmxnet3_intr(irq, netdev);
+	enable_irq(irq);
+}
+#endif
+
+/*
+ * Register vmxnet3_intr() for the interrupt type stored in
+ * adapter->intr.type and, on success, initialise the driver's interrupt
+ * settings (moderation levels and the interrupt index of every source).
+ *
+ * Returns 0 on success or the error code returned by request_irq().
+ */
+static int
+vmxnet3_request_irqs(struct vmxnet3_adapter *adapter)
+{
+	int err;
+	int i;
+
+	if (adapter->intr.type == VMXNET3_IT_MSIX) {
+		/* we only use 1 MSI-X vector */
+		err = request_irq(adapter->intr.msix_entries[0].vector,
+				  vmxnet3_intr, 0, adapter->netdev->name,
+				  adapter->netdev);
+	} else if (adapter->intr.type == VMXNET3_IT_MSI) {
+		err = request_irq(adapter->pdev->irq, vmxnet3_intr, 0,
+				  adapter->netdev->name, adapter->netdev);
+	} else {
+		/* only this branch requests the line as shared */
+		err = request_irq(adapter->pdev->irq, vmxnet3_intr,
+				  IRQF_SHARED, adapter->netdev->name,
+				  adapter->netdev);
+	}
+
+	if (err) {
+		printk(KERN_ERR "Failed to request irq %s (intr type:%d), "
+		       "error:%d\n", adapter->netdev->name,
+		       adapter->intr.type, err);
+		return err;
+	}
+
+	/* init our intr settings */
+	for (i = 0; i < adapter->intr.num_intrs; i++)
+		adapter->intr.mod_levels[i] = UPT1_IML_ADAPTIVE;
+
+	/* next setup intr index for all intr sources */
+	adapter->tx_queue.comp_ring.intr_idx = 0;
+	adapter->rx_queue.comp_ring.intr_idx = 0;
+	adapter->intr.event_intr_idx = 0;
+
+	printk(KERN_INFO "%s: intr type %u, mode %u, %u vectors "
+	       "allocated\n", adapter->netdev->name,
+	       adapter->intr.type, adapter->intr.mask_mode,
+	       adapter->intr.num_intrs);
+
+	return 0;
+}
+
+
+/*
+ * Free the irq(s) of the adapter.  The interrupt type must already be
+ * resolved (not AUTO) and num_intrs must be positive.
+ */
+static void
+vmxnet3_free_irqs(struct vmxnet3_adapter *adapter)
+{
+	int i;
+
+	BUG_ON(adapter->intr.type == VMXNET3_IT_AUTO ||
+	       adapter->intr.num_intrs <= 0);
+
+	switch (adapter->intr.type) {
+	case VMXNET3_IT_MSIX:
+		/* one free_irq() per vector counted in num_intrs */
+		for (i = 0; i < adapter->intr.num_intrs; i++)
+			free_irq(adapter->intr.msix_entries[i].vector,
+				 adapter->netdev);
+		break;
+	case VMXNET3_IT_MSI:
+	case VMXNET3_IT_INTX:
+		/* MSI and INTx both use the irq stored in the pci device */
+		free_irq(adapter->pdev->irq, adapter->netdev);
+		break;
+	default:
+		BUG_ON(true);
+	}
+}
+
+
+/*
+ * ndo_vlan_rx_register callback.
+ *
+ * With a non-NULL @grp: remember the group, turn on UPT1_F_RXVLAN in the
+ * shared area, clear the vlan filter table, set entry 0 and push both
+ * changes to the device.  If the netdev does not advertise
+ * NETIF_F_HW_VLAN_RX, only an error is logged.
+ *
+ * With a NULL @grp: forget the group and, if UPT1_F_RXVLAN is currently
+ * set, clear the vlan filter table and the feature bit and push both
+ * changes to the device.
+ */
+static void
+vmxnet3_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	struct Vmxnet3_DriverShared *shared = adapter->shared;
+	struct Vmxnet3_DSDevRead *devRead = &shared->devRead;
+	u32 *vfTable = adapter->shared->devRead.rxFilterConf.vfTable;
+	int i;
+
+	if (!grp) {
+		/* remove vlan rx stripping. */
+		adapter->vlan_grp = NULL;
+
+		if (!(devRead->misc.uptFeatures & UPT1_F_RXVLAN))
+			return;
+
+		/* clear entire vfTable; this also disables VLAN rx filtering */
+		for (i = 0; i < VMXNET3_VFT_SIZE; i++)
+			vfTable[i] = 0;
+
+		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+				       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
+
+		/* update FEATURES to device */
+		devRead->misc.uptFeatures &= ~UPT1_F_RXVLAN;
+		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+				       VMXNET3_CMD_UPDATE_FEATURE);
+		return;
+	}
+
+	/* add vlan rx stripping. */
+	if (!(adapter->netdev->features & NETIF_F_HW_VLAN_RX)) {
+		printk(KERN_ERR "%s: vlan_rx_register when device has "
+		       "no NETIF_F_HW_VLAN_RX\n", netdev->name);
+		return;
+	}
+
+	adapter->vlan_grp = grp;
+
+	/* update FEATURES to device */
+	devRead->misc.uptFeatures |= UPT1_F_RXVLAN;
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_UPDATE_FEATURE);
+
+	/*
+	 *  Clear entire vfTable; then enable untagged pkts.
+	 *  Note: setting one entry in vfTable to non-zero turns
+	 *  on VLAN rx filtering.
+	 */
+	for (i = 0; i < VMXNET3_VFT_SIZE; i++)
+		vfTable[i] = 0;
+
+	VMXNET3_SET_VFTABLE_ENTRY(vfTable, 0);
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
+}
+
+
+/*
+ * Rebuild the vlan filter table in the shared area from the vlan group
+ * remembered in the adapter.  Does nothing when no group is registered.
+ * No device command is issued here.
+ */
+static void
+vmxnet3_restore_vlan(struct vmxnet3_adapter *adapter)
+{
+	u32 *vfTable;
+	bool any_vlan = false;
+	u16 vid;
+
+	if (!adapter->vlan_grp)
+		return;
+
+	vfTable = adapter->shared->devRead.rxFilterConf.vfTable;
+
+	/* set one table entry per vid that has a device in the group */
+	for (vid = 0; vid < VLAN_GROUP_ARRAY_LEN; vid++) {
+		if (!vlan_group_get_device(adapter->vlan_grp, vid))
+			continue;
+
+		VMXNET3_SET_VFTABLE_ENTRY(vfTable, vid);
+		any_vlan = true;
+	}
+
+	if (any_vlan) {
+		/* continue to allow untagged pkts */
+		VMXNET3_SET_VFTABLE_ENTRY(vfTable, 0);
+	}
+}
+
+
+/*
+ * ndo_vlan_rx_add_vid callback: set the filter table entry for @vid and
+ * tell the device to pick up the changed vlan filters.
+ */
+static void
+vmxnet3_vlan_rx_add_vid(struct net_device *netdev, u16 vid)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	struct Vmxnet3_RxFilterConf *rxConf;
+	u32 *vfTable;
+
+	rxConf = &adapter->shared->devRead.rxFilterConf;
+	vfTable = rxConf->vfTable;
+
+	VMXNET3_SET_VFTABLE_ENTRY(vfTable, vid);
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
+}
+
+
+/*
+ * ndo_vlan_rx_kill_vid callback: clear the filter table entry for @vid
+ * and tell the device to pick up the changed vlan filters.
+ */
+static void
+vmxnet3_vlan_rx_kill_vid(struct net_device *netdev, u16 vid)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	struct Vmxnet3_RxFilterConf *rxConf;
+	u32 *vfTable;
+
+	rxConf = &adapter->shared->devRead.rxFilterConf;
+	vfTable = rxConf->vfTable;
+
+	VMXNET3_CLEAR_VFTABLE_ENTRY(vfTable, vid);
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_UPDATE_VLAN_FILTERS);
+}
+
+
+/*
+ * Copy the multicast list of @netdev into one freshly allocated,
+ * contiguous buffer of mc_count * ETH_ALEN bytes.
+ *
+ * Returns the buffer (to be released with kfree() by the caller), or NULL
+ * when the list does not fit into a u16 length or the allocation fails.
+ */
+static u8 *
+vmxnet3_copy_mc(struct net_device *netdev)
+{
+	struct dev_mc_list *entry;
+	u32 bytes = netdev->mc_count * ETH_ALEN;
+	u8 *table;
+	u8 *dst;
+	int n;
+
+	/* struct Vmxnet3_RxFilterConf.mfTableLen is u16. */
+	if (bytes > 0xffff)
+		return NULL;
+
+	/* We may be called with BH disabled */
+	table = kmalloc(bytes, GFP_ATOMIC);
+	if (!table)
+		return NULL;
+
+	entry = netdev->mc_list;
+	dst = table;
+	for (n = 0; n < netdev->mc_count; n++) {
+		/* the list must hold at least mc_count entries */
+		BUG_ON(!entry);
+		memcpy(dst, entry->dmi_addr, ETH_ALEN);
+		dst += ETH_ALEN;
+		entry = entry->next;
+	}
+
+	return table;
+}
+
+
+/*
+ * ndo_set_multicast_list callback: derive the rx mode from the netdev
+ * flags, publish the multicast table (if any) through the shared area
+ * and notify the device.
+ *
+ * If the multicast list cannot be copied, the device is switched to
+ * ALL_MULTI instead.
+ */
+static void
+vmxnet3_set_mc(struct net_device *netdev)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	struct Vmxnet3_RxFilterConf *rxConf =
+					&adapter->shared->devRead.rxFilterConf;
+	u8 *new_table = NULL;
+	u32 new_mode = VMXNET3_RXM_UCAST;
+
+	if (netdev->flags & IFF_PROMISC)
+		new_mode |= VMXNET3_RXM_PROMISC;
+
+	if (netdev->flags & IFF_BROADCAST)
+		new_mode |= VMXNET3_RXM_BCAST;
+
+	if (netdev->flags & IFF_ALLMULTI) {
+		new_mode |= VMXNET3_RXM_ALL_MULTI;
+	} else if (netdev->mc_count > 0) {
+		new_table = vmxnet3_copy_mc(netdev);
+		if (new_table) {
+			new_mode |= VMXNET3_RXM_MCAST;
+			rxConf->mfTableLen = netdev->mc_count * ETH_ALEN;
+			rxConf->mfTablePA = virt_to_phys(new_table);
+		} else {
+			printk(KERN_INFO "%s: failed to copy mcast list"
+			       ", setting ALL_MULTI\n", netdev->name);
+			new_mode |= VMXNET3_RXM_ALL_MULTI;
+		}
+	}
+
+	/* no multicast table in use: make sure none is advertised */
+	if (!(new_mode & VMXNET3_RXM_MCAST)) {
+		rxConf->mfTableLen = 0;
+		rxConf->mfTablePA = 0;
+	}
+
+	/* the rx mode command is only issued when the mode changed */
+	if (new_mode != rxConf->rxMode) {
+		rxConf->rxMode = new_mode;
+		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+				       VMXNET3_CMD_UPDATE_RX_MODE);
+	}
+
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_UPDATE_MAC_FILTERS);
+
+	/*
+	 * NOTE(review): the table is freed right after the command write
+	 * while mfTablePA still points at it; presumably the device has
+	 * consumed the table by the time the write returns - confirm.
+	 */
+	kfree(new_table);
+}
+
+
+/*
+ *   Set up driver_shared based on settings in adapter.
+ *
+ *   The shared area is zeroed first and then filled in section by section:
+ *   driver info, feature flags, the single tx queue, the single rx queue,
+ *   interrupt configuration and finally the rx filter (vlan table only;
+ *   rxMode is left at 0).  The tx ring base must already be allocated.
+ */
+
+static void
+vmxnet3_setup_driver_shared(struct vmxnet3_adapter *adapter)
+{
+	struct Vmxnet3_DriverShared *shared = adapter->shared;
+	struct Vmxnet3_DSDevRead *devRead = &shared->devRead;
+	struct Vmxnet3_TxQueueConf *tqc;
+	struct Vmxnet3_RxQueueConf *rqc;
+	int i;
+
+	memset(shared, 0, sizeof(*shared));
+
+	/* driver settings */
+	shared->magic = VMXNET3_REV1_MAGIC;
+	devRead->misc.driverInfo.version = VMXNET3_DRIVER_VERSION_NUM;
+	/* 32 vs 64 bit guest is derived from the pointer size */
+	devRead->misc.driverInfo.gos.gosBits = (sizeof(void *) == 4 ?
+				VMXNET3_GOS_BITS_32 : VMXNET3_GOS_BITS_64);
+	devRead->misc.driverInfo.gos.gosType = VMXNET3_GOS_TYPE_LINUX;
+	devRead->misc.driverInfo.vmxnet3RevSpt = 1;
+	devRead->misc.driverInfo.uptVerSpt = 1;
+
+	/* physical address and size of the adapter structure itself */
+	devRead->misc.ddPA = virt_to_phys(adapter);
+	devRead->misc.ddLen = sizeof(struct vmxnet3_adapter);
+
+	/* set up feature flags */
+	if (adapter->rxcsum)
+		devRead->misc.uptFeatures |= UPT1_F_RXCSUM;
+
+	if (adapter->lro) {
+		devRead->misc.uptFeatures |= UPT1_F_LRO;
+		devRead->misc.maxNumRxSG = 1 + MAX_SKB_FRAGS;
+	}
+	/* RXVLAN is only requested while a vlan group is registered */
+	if ((adapter->netdev->features & NETIF_F_HW_VLAN_RX)
+			&& adapter->vlan_grp) {
+		devRead->misc.uptFeatures |= UPT1_F_RXVLAN;
+	}
+
+	devRead->misc.mtu = adapter->netdev->mtu;
+	/* one tx queue descriptor followed by one rx queue descriptor */
+	devRead->misc.queueDescPA = adapter->queue_desc_pa;
+	devRead->misc.queueDescLen = sizeof(struct Vmxnet3_TxQueueDesc) +
+				     sizeof(struct Vmxnet3_RxQueueDesc);
+
+	/* tx queue settings */
+	BUG_ON(adapter->tx_queue.tx_ring.base == NULL);
+
+	devRead->misc.numTxQueues = 1;
+	tqc = &adapter->tqd_start->conf;
+	tqc->txRingBasePA   = adapter->tx_queue.tx_ring.basePA;
+	tqc->dataRingBasePA = adapter->tx_queue.data_ring.basePA;
+	tqc->compRingBasePA = adapter->tx_queue.comp_ring.basePA;
+	tqc->ddPA           = virt_to_phys(adapter->tx_queue.buf_info);
+	tqc->txRingSize     = adapter->tx_queue.tx_ring.size;
+	tqc->dataRingSize   = adapter->tx_queue.data_ring.size;
+	tqc->compRingSize   = adapter->tx_queue.comp_ring.size;
+	tqc->ddLen          = sizeof(struct vmxnet3_tx_buf_info) *
+			      tqc->txRingSize;
+	tqc->intrIdx        = adapter->tx_queue.comp_ring.intr_idx;
+
+	/* rx queue settings */
+	devRead->misc.numRxQueues = 1;
+	rqc = &adapter->rqd_start->conf;
+	rqc->rxRingBasePA[0] = adapter->rx_queue.rx_ring[0].basePA;
+	rqc->rxRingBasePA[1] = adapter->rx_queue.rx_ring[1].basePA;
+	rqc->compRingBasePA  = adapter->rx_queue.comp_ring.basePA;
+	/*
+	 * NOTE(review): this passes the buf_info member itself rather than
+	 * buf_info[0] - confirm that is the intended address.
+	 */
+	rqc->ddPA            = virt_to_phys(adapter->rx_queue.buf_info);
+	rqc->rxRingSize[0]   = adapter->rx_queue.rx_ring[0].size;
+	rqc->rxRingSize[1]   = adapter->rx_queue.rx_ring[1].size;
+	rqc->compRingSize    = adapter->rx_queue.comp_ring.size;
+	rqc->ddLen           = sizeof(struct vmxnet3_rx_buf_info) *
+			       (rqc->rxRingSize[0] + rqc->rxRingSize[1]);
+	rqc->intrIdx         = adapter->rx_queue.comp_ring.intr_idx;
+
+	/* intr settings */
+	devRead->intrConf.autoMask = adapter->intr.mask_mode ==
+				     VMXNET3_IMM_AUTO;
+	devRead->intrConf.numIntrs = adapter->intr.num_intrs;
+	for (i = 0; i < adapter->intr.num_intrs; i++)
+		devRead->intrConf.modLevels[i] = adapter->intr.mod_levels[i];
+
+	devRead->intrConf.eventIntrIdx = adapter->intr.event_intr_idx;
+
+	/* rx filter settings */
+	devRead->rxFilterConf.rxMode = 0;
+	vmxnet3_restore_vlan(adapter);
+	/* the rest are already zeroed */
+}
+
+
+/*
+ * Bring the device into operation: initialise the tx and rx queues,
+ * request the irq, fill in the shared area, hand its address to the
+ * device and issue the ACTIVATE_DEV command.  On success the rx producer
+ * registers are written, the rx filters are applied, the link is checked,
+ * NAPI and interrupts are enabled and the QUIESCED state bit is cleared.
+ *
+ * Returns 0 on success.  On failure a negative error code is returned
+ * after undoing what had been set up (irq and rx buffers).
+ */
+int
+vmxnet3_activate_dev(struct vmxnet3_adapter *adapter)
+{
+	int err;
+	u32 ret;
+
+	dprintk(KERN_ERR "%s: skb_buf_size %d, rx_buf_per_pkt %d, ring sizes"
+		" %u %u %u\n", adapter->netdev->name, adapter->skb_buf_size,
+		adapter->rx_buf_per_pkt, adapter->tx_queue.tx_ring.size,
+		adapter->rx_queue.rx_ring[0].size,
+		adapter->rx_queue.rx_ring[1].size);
+
+	vmxnet3_tq_init(&adapter->tx_queue, adapter);
+	err = vmxnet3_rq_init(&adapter->rx_queue, adapter);
+	if (err) {
+		printk(KERN_ERR "Failed to init rx queue for %s: error %d\n",
+		       adapter->netdev->name, err);
+		goto rq_err;
+	}
+
+	err = vmxnet3_request_irqs(adapter);
+	if (err) {
+		printk(KERN_ERR "Failed to setup irq for %s: error %d\n",
+		       adapter->netdev->name, err);
+		goto irq_err;
+	}
+
+	vmxnet3_setup_driver_shared(adapter);
+
+	/* tell the device where the shared area lives (low/high halves) */
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_DSAL,
+			       VMXNET3_GET_ADDR_LO(adapter->shared_pa));
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_DSAH,
+			       VMXNET3_GET_ADDR_HI(adapter->shared_pa));
+
+	/* the command result is read back from the same register */
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_ACTIVATE_DEV);
+	ret = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD);
+
+	if (ret != 0) {
+		printk(KERN_ERR "Failed to activate dev %s: error %u\n",
+		       adapter->netdev->name, ret);
+		err = -EINVAL;
+		goto activate_err;
+	}
+	/* publish the fill position of both rx rings to the device */
+	VMXNET3_WRITE_BAR0_REG(adapter, VMXNET3_REG_RXPROD,
+			       adapter->rx_queue.rx_ring[0].next2fill);
+	VMXNET3_WRITE_BAR0_REG(adapter, VMXNET3_REG_RXPROD2,
+			       adapter->rx_queue.rx_ring[1].next2fill);
+
+	/* Apply the rx filter settings last. */
+	vmxnet3_set_mc(adapter->netdev);
+
+	/*
+	 * Check link state when first activating device. It will start the
+	 * tx queue if the link is up.
+	 */
+	vmxnet3_check_link(adapter);
+
+	napi_enable(&adapter->napi);
+	vmxnet3_enable_all_intrs(adapter);
+	clear_bit(VMXNET3_STATE_BIT_QUIESCED, &adapter->state);
+	return 0;
+
+activate_err:
+	/* withdraw the shared area address again and drop the irq */
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_DSAL, 0);
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_DSAH, 0);
+	vmxnet3_free_irqs(adapter);
+irq_err:
+rq_err:
+	/* free up buffers we allocated */
+	vmxnet3_rq_cleanup(&adapter->rx_queue, adapter);
+	return err;
+}
+
+
+/* Issue the RESET_DEV command to the device. */
+void
+vmxnet3_reset_dev(struct vmxnet3_adapter *adapter)
+{
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_RESET_DEV);
+}
+
+
+/*
+ * Stop the device: issue the QUIESCE_DEV command, disable interrupts and
+ * NAPI, stop transmission, mark the carrier off, clean up both queues and
+ * free the irq(s).
+ *
+ * The QUIESCED state bit makes this a no-op when the device is already
+ * quiesced.  Always returns 0.
+ */
+int
+vmxnet3_quiesce_dev(struct vmxnet3_adapter *adapter)
+{
+	/* bit already set: somebody quiesced the device before us */
+	if (test_and_set_bit(VMXNET3_STATE_BIT_QUIESCED, &adapter->state))
+		return 0;
+
+
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_QUIESCE_DEV);
+	vmxnet3_disable_all_intrs(adapter);
+
+	napi_disable(&adapter->napi);
+	netif_tx_disable(adapter->netdev);
+	adapter->link_speed = 0;
+	netif_carrier_off(adapter->netdev);
+
+	/* release the buffers still held by the queues, then the irq(s) */
+	vmxnet3_tq_cleanup(&adapter->tx_queue, adapter);
+	vmxnet3_rq_cleanup(&adapter->rx_queue, adapter);
+	vmxnet3_free_irqs(adapter);
+	return 0;
+}
+
+
+/*
+ * Program a 6-byte MAC address into the device: bytes 0-3 are written to
+ * MACL as one u32 in host byte order, bytes 4-5 to MACH with mac[4] in
+ * the low byte.
+ */
+static void
+vmxnet3_write_mac_addr(struct vmxnet3_adapter *adapter, u8 *mac)
+{
+	u32 tmp;
+
+	/*
+	 * @mac is a byte pointer with no alignment guarantee (callers pass
+	 * sockaddr.sa_data), so copy instead of dereferencing it as a u32.
+	 */
+	memcpy(&tmp, mac, sizeof(tmp));
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_MACL, tmp);
+
+	tmp = (mac[5] << 8) | mac[4];
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_MACH, tmp);
+}
+
+
+/*
+ * ndo_set_mac_address callback.  @p points to a struct sockaddr carrying
+ * the new address in sa_data.
+ *
+ * Returns 0 on success, -EADDRNOTAVAIL when the address is not a valid
+ * unicast ethernet address (nothing is changed in that case).
+ */
+static int
+vmxnet3_set_mac_addr(struct net_device *netdev, void *p)
+{
+	struct sockaddr *addr = p;
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+
+	/* refuse all-zero and multicast/broadcast addresses */
+	if (!is_valid_ether_addr((const u8 *)addr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len);
+	vmxnet3_write_mac_addr(adapter, addr->sa_data);
+
+	return 0;
+}
+
+
+/* ==================== initialization and cleanup routines ============ */
+
+/*
+ * Enable the pci device, choose the DMA mask, claim BAR 0 and BAR 1 and
+ * map both into adapter->hw_addr0 / adapter->hw_addr1.
+ *
+ * *dma64 is set to true when the 64-bit DMA masks were accepted and to
+ * false when the driver fell back to a 32-bit mask.
+ *
+ * Returns 0 on success or a negative error code; on failure everything
+ * acquired here has been released again.
+ */
+static int
+vmxnet3_alloc_pci_resources(struct vmxnet3_adapter *adapter, bool *dma64)
+{
+	int err;
+	/* resource_size_t: BAR addresses may be wider than unsigned long */
+	resource_size_t mmio_start, mmio_len;
+	struct pci_dev *pdev = adapter->pdev;
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		printk(KERN_ERR "Failed to enable adapter %s: error %d\n",
+		       pci_name(pdev), err);
+		return err;
+	}
+
+	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) {
+		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) != 0) {
+			printk(KERN_ERR "pci_set_consistent_dma_mask failed "
+			       "for adapter %s\n", pci_name(pdev));
+			err = -EIO;
+			goto err_set_mask;
+		}
+		*dma64 = true;
+	} else {
+		if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) != 0) {
+			printk(KERN_ERR "pci_set_dma_mask failed for adapter "
+			       "%s\n",	pci_name(pdev));
+			err = -EIO;
+			goto err_set_mask;
+		}
+		*dma64 = false;
+	}
+
+	/* (1 << 2) - 1 == 0x3: bits 0 and 1, i.e. BAR 0 and BAR 1 */
+	err = pci_request_selected_regions(pdev, (1 << 2) - 1,
+					   vmxnet3_driver_name);
+	if (err) {
+		printk(KERN_ERR "Failed to request region for adapter %s: "
+		       "error %d\n", pci_name(pdev), err);
+		goto err_set_mask;
+	}
+
+	pci_set_master(pdev);
+
+	mmio_start = pci_resource_start(pdev, 0);
+	mmio_len = pci_resource_len(pdev, 0);
+	adapter->hw_addr0 = ioremap(mmio_start, mmio_len);
+	if (!adapter->hw_addr0) {
+		printk(KERN_ERR "Failed to map bar0 for adapter %s\n",
+		       pci_name(pdev));
+		err = -EIO;
+		goto err_ioremap;
+	}
+
+	mmio_start = pci_resource_start(pdev, 1);
+	mmio_len = pci_resource_len(pdev, 1);
+	adapter->hw_addr1 = ioremap(mmio_start, mmio_len);
+	if (!adapter->hw_addr1) {
+		printk(KERN_ERR "Failed to map bar1 for adapter %s\n",
+		       pci_name(pdev));
+		err = -EIO;
+		goto err_bar1;
+	}
+	return 0;
+
+	/* unwind in reverse order of acquisition */
+err_bar1:
+	iounmap(adapter->hw_addr0);
+err_ioremap:
+	pci_release_selected_regions(pdev, (1 << 2) - 1);
+err_set_mask:
+	pci_disable_device(pdev);
+	return err;
+}
+
+
+/*
+ * Counterpart of vmxnet3_alloc_pci_resources(): unmap both BARs, release
+ * the two claimed regions and disable the pci device.
+ */
+static void
+vmxnet3_free_pci_resources(struct vmxnet3_adapter *adapter)
+{
+	struct pci_dev *pdev = adapter->pdev;
+
+	BUG_ON(!pdev);
+
+	iounmap(adapter->hw_addr0);
+	iounmap(adapter->hw_addr1);
+
+	/* same BAR mask (bits 0 and 1) that was used when requesting */
+	pci_release_selected_regions(pdev, (1 << 2) - 1);
+	pci_disable_device(pdev);
+}
+
+
+/*
+ * Derive skb_buf_size and rx_buf_per_pkt from the current mtu, then round
+ * the size of rx ring 0 up to a multiple of
+ * rx_buf_per_pkt * VMXNET3_RING_SIZE_ALIGN and cap it at the largest such
+ * multiple not exceeding VMXNET3_RX_RING_MAX_SIZE.
+ */
+static void
+vmxnet3_adjust_rx_ring_size(struct vmxnet3_adapter *adapter)
+{
+	size_t sz;
+
+	if (adapter->netdev->mtu <= VMXNET3_MAX_SKB_BUF_SIZE -
+				    VMXNET3_MAX_ETH_HDR_SIZE) {
+		/* a whole frame fits into one skb buffer */
+		adapter->skb_buf_size = adapter->netdev->mtu +
+					VMXNET3_MAX_ETH_HDR_SIZE;
+		if (adapter->skb_buf_size < VMXNET3_MIN_T0_BUF_SIZE)
+			adapter->skb_buf_size = VMXNET3_MIN_T0_BUF_SIZE;
+
+		adapter->rx_buf_per_pkt = 1;
+	} else {
+		/* one full-size skb buffer plus pages for the remainder */
+		adapter->skb_buf_size = VMXNET3_MAX_SKB_BUF_SIZE;
+		sz = adapter->netdev->mtu - VMXNET3_MAX_SKB_BUF_SIZE +
+					    VMXNET3_MAX_ETH_HDR_SIZE;
+		adapter->rx_buf_per_pkt = 1 + (sz + PAGE_SIZE - 1) / PAGE_SIZE;
+	}
+
+	/*
+	 * for simplicity, force the ring0 size to be a multiple of
+	 * rx_buf_per_pkt * VMXNET3_RING_SIZE_ALIGN
+	 */
+	sz = adapter->rx_buf_per_pkt * VMXNET3_RING_SIZE_ALIGN;
+	adapter->rx_queue.rx_ring[0].size =
+		(adapter->rx_queue.rx_ring[0].size + sz - 1) / sz * sz;
+	adapter->rx_queue.rx_ring[0].size =
+		min_t(u32, adapter->rx_queue.rx_ring[0].size,
+		      VMXNET3_RX_RING_MAX_SIZE / sz * sz);
+}
+
+
+/*
+ * Create the tx queue and the rx queue with the requested ring sizes.
+ * The size of rx ring 0 may be adjusted, see
+ * vmxnet3_adjust_rx_ring_size().
+ *
+ * Returns 0 on success or the error of the failing create call; a tx
+ * queue that was already created is destroyed again when the rx queue
+ * cannot be created.
+ */
+int
+vmxnet3_create_queues(struct vmxnet3_adapter *adapter, u32 tx_ring_size,
+		      u32 rx_ring_size, u32 rx_ring2_size)
+{
+	struct vmxnet3_rx_queue *rq = &adapter->rx_queue;
+	int err;
+
+	/* tx: the tx, data and completion rings all get the same size */
+	adapter->tx_queue.stopped = true;
+	adapter->tx_queue.shared = &adapter->tqd_start->ctrl;
+	adapter->tx_queue.tx_ring.size   = tx_ring_size;
+	adapter->tx_queue.data_ring.size = tx_ring_size;
+	adapter->tx_queue.comp_ring.size = tx_ring_size;
+	err = vmxnet3_tq_create(&adapter->tx_queue, adapter);
+	if (err)
+		return err;
+
+	/* rx: the completion ring covers both rx rings */
+	rq->rx_ring[0].size = rx_ring_size;
+	rq->rx_ring[1].size = rx_ring2_size;
+	vmxnet3_adjust_rx_ring_size(adapter);
+	rq->comp_ring.size = rq->rx_ring[0].size + rq->rx_ring[1].size;
+	rq->qid  = 0;
+	rq->qid2 = 1;
+	rq->shared = &adapter->rqd_start->ctrl;
+	err = vmxnet3_rq_create(rq, adapter);
+	if (!err)
+		return 0;
+
+	vmxnet3_tq_destroy(&adapter->tx_queue, adapter);
+	return err;
+}
+
+/*
+ * ndo_open callback: create the queues with the default ring sizes and
+ * activate the device.  Returns 0 on success or a negative error code,
+ * in which case the queues have been destroyed again.
+ */
+static int
+vmxnet3_open(struct net_device *netdev)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	int err;
+
+	spin_lock_init(&adapter->tx_queue.tx_lock);
+
+	err = vmxnet3_create_queues(adapter, VMXNET3_DEF_TX_RING_SIZE,
+				    VMXNET3_DEF_RX_RING_SIZE,
+				    VMXNET3_DEF_RX_RING_SIZE);
+	if (err)
+		return err;
+
+	err = vmxnet3_activate_dev(adapter);
+	if (!err)
+		return 0;
+
+	/* activation failed: give the queue memory back */
+	vmxnet3_rq_destroy(&adapter->rx_queue, adapter);
+	vmxnet3_tq_destroy(&adapter->tx_queue, adapter);
+
+	return err;
+}
+
+
+/*
+ * ndo_stop callback: quiesce the device and destroy both queues while
+ * holding the RESETTING state bit.  Always returns 0.
+ */
+static int
+vmxnet3_close(struct net_device *netdev)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+
+	/*
+	 * Reset_work may be in the middle of resetting the device, wait for
+	 * its completion.
+	 */
+	while (test_and_set_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state))
+		msleep(1);
+
+	vmxnet3_quiesce_dev(adapter);
+
+	vmxnet3_rq_destroy(&adapter->rx_queue, adapter);
+	vmxnet3_tq_destroy(&adapter->tx_queue, adapter);
+
+	/* done: allow reset_work and change_mtu to proceed again */
+	clear_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state);
+
+
+	return 0;
+}
+
+
+/*
+ * Close the net device from within the driver via dev_close().
+ * The caller must not hold the RESETTING state bit.
+ */
+void
+vmxnet3_force_close(struct vmxnet3_adapter *adapter)
+{
+	/*
+	 * we must clear VMXNET3_STATE_BIT_RESETTING, otherwise
+	 * vmxnet3_close() will deadlock.
+	 */
+	BUG_ON(test_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state));
+
+	/* we need to enable NAPI, otherwise dev_close will deadlock */
+	napi_enable(&adapter->napi);
+	dev_close(adapter->netdev);
+}
+
+
+/*
+ * ndo_change_mtu callback.
+ *
+ * Rejects (-EINVAL) an mtu outside [VMXNET3_MIN_MTU, VMXNET3_MAX_MTU] and
+ * an mtu above 1500 when jumbo frames are not enabled.  If the interface
+ * is running, the device is quiesced and reset, the rx queue is
+ * re-created for the new mtu and the device is activated again.
+ *
+ * Returns 0 on success.  If re-creating the rx queue or re-activating the
+ * device fails, the interface is force-closed and the error is returned.
+ */
+static int
+vmxnet3_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	int err = 0;
+
+	if (new_mtu < VMXNET3_MIN_MTU || new_mtu > VMXNET3_MAX_MTU)
+		return -EINVAL;
+
+	if (new_mtu > 1500 && !adapter->jumbo_frame)
+		return -EINVAL;
+
+	/* the new value is stored before the device is reconfigured */
+	netdev->mtu = new_mtu;
+
+	/*
+	 * Reset_work may be in the middle of resetting the device, wait for
+	 * its completion.
+	 */
+	while (test_and_set_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state))
+		msleep(1);
+
+	if (netif_running(netdev)) {
+		vmxnet3_quiesce_dev(adapter);
+		vmxnet3_reset_dev(adapter);
+
+		/* we need to re-create the rx queue based on the new mtu */
+		vmxnet3_rq_destroy(&adapter->rx_queue, adapter);
+		vmxnet3_adjust_rx_ring_size(adapter);
+		adapter->rx_queue.comp_ring.size  =
+					adapter->rx_queue.rx_ring[0].size +
+					adapter->rx_queue.rx_ring[1].size;
+		err = vmxnet3_rq_create(&adapter->rx_queue, adapter);
+		if (err) {
+			printk(KERN_ERR "%s: failed to re-create rx queue,"
+				" error %d. Closing it.\n", netdev->name, err);
+			goto out;
+		}
+
+		err = vmxnet3_activate_dev(adapter);
+		if (err) {
+			printk(KERN_ERR "%s: failed to re-activate, error %d. "
+				"Closing it\n", netdev->name, err);
+			goto out;
+		}
+	}
+
+out:
+	/* the bit must be dropped before force_close (it asserts on it) */
+	clear_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state);
+	if (err)
+		vmxnet3_force_close(adapter);
+
+	return err;
+}
+
+
+/*
+ * Fill in netdev->features / vlan_features and the matching adapter
+ * flags, and log the resulting feature list on one line.
+ *
+ * @dma64: whether 64-bit DMA is usable; adds NETIF_F_HIGHDMA.
+ */
+static void
+vmxnet3_declare_features(struct vmxnet3_adapter *adapter, bool dma64)
+{
+	struct net_device *netdev = adapter->netdev;
+
+	/* always on */
+	adapter->rxcsum = true;
+	adapter->jumbo_frame = true;
+
+	netdev->features = NETIF_F_SG | NETIF_F_HW_CSUM |
+			   NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX |
+			   NETIF_F_HW_VLAN_FILTER |
+			   NETIF_F_TSO | NETIF_F_TSO6;
+
+	printk(KERN_INFO "features: sg csum vlan jf tso tsoIPv6");
+
+	/* lro stays off only when disable_lro is set */
+	if (!disable_lro) {
+		adapter->lro = true;
+		printk(" lro");
+	}
+
+	if (dma64) {
+		netdev->features |= NETIF_F_HIGHDMA;
+		printk(" highDMA");
+	}
+
+	/* vlan devices on top get the same feature set */
+	netdev->vlan_features = netdev->features;
+	printk("\n");
+}
+
+
+/*
+ * Read the 6-byte MAC address from the device into @mac: bytes 0-3 come
+ * from MACL (stored in host byte order), byte 4 from the low byte of MACH
+ * and byte 5 from the next byte of MACH.
+ */
+static void
+vmxnet3_read_mac_addr(struct vmxnet3_adapter *adapter, u8 *mac)
+{
+	u32 tmp;
+
+	tmp = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_MACL);
+	/* @mac is a byte buffer with no alignment guarantee: copy into it */
+	memcpy(mac, &tmp, sizeof(tmp));
+
+	tmp = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_MACH);
+	mac[4] = tmp & 0xff;
+	mac[5] = (tmp >> 8) & 0xff;
+}
+
+
+/*
+ * Query the interrupt configuration from the device and settle on the
+ * interrupt type to use.
+ *
+ * When the device reports VMXNET3_IT_AUTO, MSI-X is tried first, then
+ * MSI.  In every other case - including AUTO with both attempts failing -
+ * the type ends up as INTx.  num_intrs is always 1.
+ */
+static void
+vmxnet3_alloc_intr_resources(struct vmxnet3_adapter *adapter)
+{
+	u32 cfg;
+
+	/* intr settings */
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_GET_CONF_INTR);
+	cfg = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD);
+	/* bits 0-1: interrupt type, bits 2-3: mask mode */
+	adapter->intr.type = cfg & 0x3;
+	adapter->intr.mask_mode = (cfg >> 2) & 0x3;
+
+	if (adapter->intr.type == VMXNET3_IT_AUTO) {
+		int err;
+
+		adapter->intr.msix_entries[0].entry = 0;
+		err = pci_enable_msix(adapter->pdev,
+				      adapter->intr.msix_entries,
+				      VMXNET3_LINUX_MAX_MSIX_VECT);
+		if (!err) {
+			adapter->intr.num_intrs = 1;
+			adapter->intr.type = VMXNET3_IT_MSIX;
+			return;
+		}
+
+		err = pci_enable_msi(adapter->pdev);
+		if (!err) {
+			adapter->intr.num_intrs = 1;
+			adapter->intr.type = VMXNET3_IT_MSI;
+			return;
+		}
+	}
+
+	/*
+	 * NOTE(review): a type of MSI-X or MSI reported explicitly by the
+	 * device also lands here and is overridden with INTx - confirm
+	 * that this is intended.
+	 */
+	adapter->intr.type = VMXNET3_IT_INTX;
+
+	/* INT-X related setting */
+	adapter->intr.num_intrs = 1;
+}
+
+
+/*
+ * Undo vmxnet3_alloc_intr_resources(): switch MSI-X or MSI off again.
+ * Nothing needs to be done for INTx; any other type is a bug.
+ */
+static void
+vmxnet3_free_intr_resources(struct vmxnet3_adapter *adapter)
+{
+	switch (adapter->intr.type) {
+	case VMXNET3_IT_MSIX:
+		pci_disable_msix(adapter->pdev);
+		break;
+	case VMXNET3_IT_MSI:
+		pci_disable_msi(adapter->pdev);
+		break;
+	default:
+		BUG_ON(adapter->intr.type != VMXNET3_IT_INTX);
+		break;
+	}
+}
+
+
+/*
+ * ndo_tx_timeout callback: count the event, log it and schedule the
+ * adapter's work item (set up with vmxnet3_reset_work at probe time).
+ */
+static void
+vmxnet3_tx_timeout(struct net_device *netdev)
+{
+	struct vmxnet3_adapter *adapter;
+
+	adapter = netdev_priv(netdev);
+	adapter->tx_timeout_count++;
+
+	printk(KERN_ERR "%s: tx hang\n", adapter->netdev->name);
+
+	/* the reset itself is deferred to the work item */
+	schedule_work(&adapter->work);
+}
+
+
+/*
+ * Work item handler: reset a running device by quiescing, resetting and
+ * re-activating it.  The RESETTING state bit serialises this against
+ * other paths that take the same bit.
+ */
+static void
+vmxnet3_reset_work(struct work_struct *data)
+{
+	struct vmxnet3_adapter *adapter;
+
+	adapter = container_of(data, struct vmxnet3_adapter, work);
+
+	/* if another thread is resetting the device, no need to proceed */
+	if (test_and_set_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state))
+		return;
+
+	/* if the device is closed, we must leave it alone */
+	if (netif_running(adapter->netdev)) {
+		printk(KERN_INFO "%s: resetting\n", adapter->netdev->name);
+		vmxnet3_quiesce_dev(adapter);
+		vmxnet3_reset_dev(adapter);
+		/* NOTE(review): the result of the activation is ignored */
+		vmxnet3_activate_dev(adapter);
+	} else {
+		printk(KERN_INFO "%s: already closed\n",
+		       adapter->netdev->name);
+	}
+
+	clear_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state);
+}
+
+
+/*
+ * PCI probe callback: allocate the net device and the memory shared with
+ * the device, set up the pci resources, negotiate the hardware and UPT
+ * revisions, declare features, pick the interrupt type, read the MAC
+ * address and register the net device.
+ *
+ * Returns 0 on success or a negative error code; on failure everything
+ * acquired so far is released in reverse order.
+ */
+static int __devinit
+vmxnet3_probe_device(struct pci_dev *pdev,
+		     const struct pci_device_id *id)
+{
+	static const struct net_device_ops vmxnet3_netdev_ops = {
+		.ndo_open = vmxnet3_open,
+		.ndo_stop = vmxnet3_close,
+		.ndo_start_xmit = vmxnet3_xmit_frame,
+		.ndo_set_mac_address = vmxnet3_set_mac_addr,
+		.ndo_change_mtu = vmxnet3_change_mtu,
+		.ndo_get_stats = vmxnet3_get_stats,
+		.ndo_tx_timeout = vmxnet3_tx_timeout,
+		.ndo_set_multicast_list = vmxnet3_set_mc,
+		.ndo_vlan_rx_register = vmxnet3_vlan_rx_register,
+		.ndo_vlan_rx_add_vid = vmxnet3_vlan_rx_add_vid,
+		.ndo_vlan_rx_kill_vid = vmxnet3_vlan_rx_kill_vid,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+		.ndo_poll_controller = vmxnet3_netpoll,
+#endif
+	};
+	int err;
+	bool dma64 = false; /* stupid gcc */
+	u32 ver;
+	struct net_device *netdev;
+	struct vmxnet3_adapter *adapter;
+	u8 mac[ETH_ALEN];
+
+	netdev = alloc_etherdev(sizeof(struct vmxnet3_adapter));
+	if (!netdev) {
+		printk(KERN_ERR "Failed to alloc ethernet device for adapter "
+			"%s\n",	pci_name(pdev));
+		return -ENOMEM;
+	}
+
+	pci_set_drvdata(pdev, netdev);
+	adapter = netdev_priv(netdev);
+	adapter->netdev = netdev;
+	adapter->pdev = pdev;
+
+	/* the area described by struct Vmxnet3_DriverShared */
+	adapter->shared = pci_alloc_consistent(adapter->pdev,
+			  sizeof(struct Vmxnet3_DriverShared),
+			  &adapter->shared_pa);
+	if (!adapter->shared) {
+		printk(KERN_ERR "Failed to allocate memory for %s\n",
+			pci_name(pdev));
+		err = -ENOMEM;
+		goto err_alloc_shared;
+	}
+
+	/* one tx queue descriptor immediately followed by one rx one */
+	adapter->tqd_start = pci_alloc_consistent(adapter->pdev,
+			     sizeof(struct Vmxnet3_TxQueueDesc) +
+			     sizeof(struct Vmxnet3_RxQueueDesc),
+			     &adapter->queue_desc_pa);
+
+	if (!adapter->tqd_start) {
+		printk(KERN_ERR "Failed to allocate memory for %s\n",
+			pci_name(pdev));
+		err = -ENOMEM;
+		goto err_alloc_queue_desc;
+	}
+	adapter->rqd_start = (struct Vmxnet3_RxQueueDesc *)(adapter->tqd_start
+							    + 1);
+
+	adapter->pm_conf = kmalloc(sizeof(struct Vmxnet3_PMConf), GFP_KERNEL);
+	if (adapter->pm_conf == NULL) {
+		printk(KERN_ERR "Failed to allocate memory for %s\n",
+			pci_name(pdev));
+		err = -ENOMEM;
+		goto err_alloc_pm;
+	}
+
+	err = vmxnet3_alloc_pci_resources(adapter, &dma64);
+	if (err < 0)
+		goto err_alloc_pci;
+
+	/* bit 0 of VRRS must be set; the driver then selects revision 1 */
+	ver = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_VRRS);
+	if (ver & 1) {
+		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_VRRS, 1);
+	} else {
+		printk(KERN_ERR "Incompatible h/w version (0x%x) for adapter"
+		       " %s\n",	ver, pci_name(pdev));
+		err = -EBUSY;
+		goto err_ver;
+	}
+
+	/* same handshake for the UPT version register */
+	ver = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_UVRS);
+	if (ver & 1) {
+		VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_UVRS, 1);
+	} else {
+		printk(KERN_ERR "Incompatible upt version (0x%x) for "
+		       "adapter %s\n", ver, pci_name(pdev));
+		err = -EBUSY;
+		goto err_ver;
+	}
+
+	vmxnet3_declare_features(adapter, dma64);
+
+	adapter->dev_number = atomic_read(&devices_found);
+	vmxnet3_alloc_intr_resources(adapter);
+
+	vmxnet3_read_mac_addr(adapter, mac);
+	memcpy(netdev->dev_addr,  mac, netdev->addr_len);
+
+	netdev->netdev_ops = &vmxnet3_netdev_ops;
+	netdev->watchdog_timeo = 5 * HZ;
+	vmxnet3_set_ethtool_ops(netdev);
+
+	INIT_WORK(&adapter->work, vmxnet3_reset_work);
+
+	netif_napi_add(netdev, &adapter->napi, vmxnet3_poll, 64);
+	SET_NETDEV_DEV(netdev, &pdev->dev);
+
+	/*
+	 * Mark the device quiesced before it becomes visible: once
+	 * register_netdev() returns, the device may be opened right away,
+	 * and vmxnet3_activate_dev() clears this bit.  Setting it after
+	 * registration could overwrite that and turn a later
+	 * vmxnet3_quiesce_dev() into a no-op.
+	 */
+	set_bit(VMXNET3_STATE_BIT_QUIESCED, &adapter->state);
+
+	err = register_netdev(netdev);
+
+	if (err) {
+		printk(KERN_ERR "Failed to register adapter %s\n",
+			pci_name(pdev));
+		goto err_register;
+	}
+
+	atomic_inc(&devices_found);
+	return 0;
+
+	/* unwind in reverse order of acquisition */
+err_register:
+	vmxnet3_free_intr_resources(adapter);
+err_ver:
+	vmxnet3_free_pci_resources(adapter);
+err_alloc_pci:
+	kfree(adapter->pm_conf);
+err_alloc_pm:
+	pci_free_consistent(adapter->pdev, sizeof(struct Vmxnet3_TxQueueDesc) +
+			    sizeof(struct Vmxnet3_RxQueueDesc),
+			    adapter->tqd_start, adapter->queue_desc_pa);
+err_alloc_queue_desc:
+	pci_free_consistent(adapter->pdev, sizeof(struct Vmxnet3_DriverShared),
+			    adapter->shared, adapter->shared_pa);
+err_alloc_shared:
+	pci_set_drvdata(pdev, NULL);
+	free_netdev(netdev);
+	return err;
+}
+
+
+/*
+ * PCI remove callback: unregister the net device and release the interrupt
+ * resources, the PCI resources, the PM configuration buffer and the two
+ * pci_alloc_consistent areas (queue descriptors and driver-shared data),
+ * then free the net_device itself.
+ */
+static void __devexit
+vmxnet3_remove_device(struct pci_dev *pdev)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+
+	/*
+	 * Wait for work that is already queued to finish.
+	 * NOTE(review): the netdev is still registered here, so adapter->work
+	 * could presumably be queued again after this flush -- confirm.
+	 */
+	flush_scheduled_work();
+
+	unregister_netdev(netdev);
+
+	/* same release order as the error path of the probe routine */
+	vmxnet3_free_intr_resources(adapter);
+	vmxnet3_free_pci_resources(adapter);
+	kfree(adapter->pm_conf);
+	pci_free_consistent(adapter->pdev, sizeof(struct Vmxnet3_TxQueueDesc) +
+			    sizeof(struct Vmxnet3_RxQueueDesc),
+			    adapter->tqd_start, adapter->queue_desc_pa);
+	pci_free_consistent(adapter->pdev, sizeof(struct Vmxnet3_DriverShared),
+			    adapter->shared, adapter->shared_pa);
+	free_netdev(netdev);
+}
+
+
+#ifdef CONFIG_PM
+
+/*
+ * PM suspend callback.
+ *
+ * Disables interrupts and detaches the net device, builds the wake-up
+ * filters selected in adapter->wol (unicast MAC match, ARP request for the
+ * first IPv4 address of the interface, magic packet) in adapter->pm_conf,
+ * publishes that configuration through the driver-shared area, and finally
+ * puts the PCI function into its suspend power state.
+ *
+ * Returns 0.  Nothing is done when the interface is not running.
+ */
+static int
+vmxnet3_suspend(struct device *device)
+{
+	struct pci_dev *pdev = to_pci_dev(device);
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	struct Vmxnet3_PMConf *pmConf;
+	struct ethhdr *ehdr;
+	struct arphdr *ahdr;
+	u8 *arpreq;
+	struct in_device *in_dev;
+	struct in_ifaddr *ifa;
+	int i = 0;	/* index of the next free wake-up filter */
+
+	if (!netif_running(netdev))
+		return 0;
+
+	vmxnet3_disable_all_intrs(adapter);
+	netif_device_detach(netdev);
+	netif_stop_queue(netdev);
+
+	/* Create wake-up filters. */
+	pmConf = adapter->pm_conf;
+	memset(pmConf, 0, sizeof(*pmConf));
+
+	if (adapter->wol & WAKE_UCAST) {
+		/* match the first ETH_ALEN bytes against our MAC address */
+		pmConf->filters[i].patternSize = ETH_ALEN;
+		pmConf->filters[i].maskSize = 1;
+		memcpy(pmConf->filters[i].pattern, netdev->dev_addr, ETH_ALEN);
+		pmConf->filters[i].mask[0] = 0x3F; /* LSB ETH_ALEN bits */
+
+		pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_FILTER;
+		i++;
+	}
+
+	if (adapter->wol & WAKE_ARP) {
+		in_dev = in_dev_get(netdev);
+		if (!in_dev)
+			goto skip_arp;
+
+		ifa = (struct in_ifaddr *)in_dev->ifa_list;
+		if (!ifa) {
+			/* no IPv4 address: drop the in_dev_get() reference */
+			in_dev_put(in_dev);
+			goto skip_arp;
+		}
+
+		pmConf->filters[i].patternSize =
+			ETH_HLEN +		/* Ethernet header */
+			sizeof(struct arphdr) +	/* ARP header */
+			2 * ETH_ALEN +		/* 2 Ethernet addresses */
+			2 * sizeof(u32);	/* 2 IPv4 addresses */
+		/* one mask bit per pattern byte, rounded up to whole bytes */
+		pmConf->filters[i].maskSize =
+			(pmConf->filters[i].patternSize - 1) / 8 + 1;
+
+		/* ETH_P_ARP in Ethernet header. */
+		ehdr = (struct ethhdr *)pmConf->filters[i].pattern;
+		ehdr->h_proto = htons(ETH_P_ARP);
+
+		/* ARPOP_REQUEST in ARP header. */
+		ahdr = (struct arphdr *)&pmConf->filters[i].pattern[ETH_HLEN];
+		ahdr->ar_op = htons(ARPOP_REQUEST);
+		arpreq = (u8 *)(ahdr + 1);
+
+		/* The Unicast IPv4 address in 'tip' field. */
+		arpreq += 2 * ETH_ALEN + sizeof(u32);
+		*(u32 *)arpreq = ifa->ifa_address;
+
+		/* The mask for the relevant bits. */
+		pmConf->filters[i].mask[0] = 0x00;
+		pmConf->filters[i].mask[1] = 0x30; /* ETH_P_ARP */
+		pmConf->filters[i].mask[2] = 0x30; /* ARPOP_REQUEST */
+		pmConf->filters[i].mask[3] = 0x00;
+		pmConf->filters[i].mask[4] = 0xC0; /* IPv4 TIP */
+		pmConf->filters[i].mask[5] = 0x03; /* IPv4 TIP */
+		in_dev_put(in_dev);
+
+		pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_FILTER;
+		i++;
+	}
+
+skip_arp:
+	if (adapter->wol & WAKE_MAGIC)
+		pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_MAGIC;
+
+	pmConf->numFilters = i;
+
+	/* publish the PM configuration and issue the update command */
+	adapter->shared->devRead.pmConfDesc.confVer = 1;
+	adapter->shared->devRead.pmConfDesc.confLen = sizeof(*pmConf);
+	adapter->shared->devRead.pmConfDesc.confPA = virt_to_phys(pmConf);
+
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_UPDATE_PMCFG);
+
+	pci_save_state(pdev);
+	pci_enable_wake(pdev, pci_choose_state(pdev, PMSG_SUSPEND),
+			adapter->wol);
+	pci_disable_device(pdev);
+	pci_set_power_state(pdev, pci_choose_state(pdev, PMSG_SUSPEND));
+
+	return 0;
+}
+
+
+/*
+ * PM resume callback: clear the wake-up filter configuration, bring the PCI
+ * function back to D0, issue VMXNET3_CMD_UPDATE_PMCFG with the now-empty
+ * configuration and re-enable interrupts.
+ *
+ * Returns 0 on success or when the interface is not running, otherwise the
+ * error returned by pci_enable_device_mem().
+ */
+static int
+vmxnet3_resume(struct device *device)
+{
+	int err;
+	struct pci_dev *pdev = to_pci_dev(device);
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	struct Vmxnet3_PMConf *pmConf;
+
+	if (!netif_running(netdev))
+		return 0;
+
+	/* Destroy wake-up filters. */
+	pmConf = adapter->pm_conf;
+	memset(pmConf, 0, sizeof(*pmConf));
+
+	adapter->shared->devRead.pmConfDesc.confVer = 1;
+	adapter->shared->devRead.pmConfDesc.confLen = sizeof(*pmConf);
+	adapter->shared->devRead.pmConfDesc.confPA = virt_to_phys(pmConf);
+
+	/*
+	 * NOTE(review): the netdev is re-attached before the PCI function is
+	 * powered up and enabled; if pci_enable_device_mem() fails below we
+	 * return with the netdev attached -- confirm this is intended.
+	 */
+	netif_device_attach(netdev);
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+	err = pci_enable_device_mem(pdev);
+	if (err != 0)
+		return err;
+
+	/* disarm wake-up now that we are back in D0 */
+	pci_enable_wake(pdev, PCI_D0, 0);
+
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+			       VMXNET3_CMD_UPDATE_PMCFG);
+	vmxnet3_enable_all_intrs(adapter);
+
+	return 0;
+}
+
+static struct dev_pm_ops vmxnet3_pm_ops = {
+	.suspend = vmxnet3_suspend,
+	.resume = vmxnet3_resume,
+};
+#endif
+
+static struct pci_driver vmxnet3_driver = {
+	.name		= vmxnet3_driver_name,
+	.id_table	= vmxnet3_pciid_table,
+	.probe		= vmxnet3_probe_device,
+	.remove		= __devexit_p(vmxnet3_remove_device),
+#ifdef CONFIG_PM
+	.driver.pm	= &vmxnet3_pm_ops,
+#endif
+};
+
+
+static int __init
+vmxnet3_init_module(void)
+{
+	printk(KERN_INFO "%s - version %s\n", VMXNET3_DRIVER_DESC,
+		VMXNET3_DRIVER_VERSION_REPORT);
+	return pci_register_driver(&vmxnet3_driver);
+}
+
+module_init(vmxnet3_init_module);
+
+
+static void
+vmxnet3_exit_module(void)
+{
+	pci_unregister_driver(&vmxnet3_driver);
+}
+
+module_exit(vmxnet3_exit_module);
+
+MODULE_AUTHOR("VMware, Inc.");
+MODULE_DESCRIPTION(VMXNET3_DRIVER_DESC);
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION(VMXNET3_DRIVER_VERSION_STRING);
+
+/* This parameter is used to control the Large Receive Offload feature
+ * of the NIC. When set to non-zero LRO is enabled.
+ */
+module_param(disable_lro, int, 0);
diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c
new file mode 100644
index 0000000..4d2b0fc
--- /dev/null
+++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c
@@ -0,0 +1,538 @@
+/*
+ * Linux driver for VMware's vmxnet3 ethernet NIC.
+ *
+ * Copyright (C) 2008-2009, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Maintained by: Shreyas Bhatewara <pv-drivers@vmware.com>
+ *
+ */
+
+
+#include "vmxnet3_int.h"
+
+struct vmxnet3_stat_desc {
+	char desc[ETH_GSTRING_LEN];
+	int  offset;
+};
+
+
+static u32
+vmxnet3_get_rx_csum(struct net_device *netdev)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	return adapter->rxcsum;
+}
+
+
+/*
+ * ethtool set_rx_csum handler: remember the requested rx checksum setting
+ * in adapter->rxcsum and, if the interface is running, mirror it into the
+ * UPT1_F_RXCSUM bit of the shared uptFeatures word and issue
+ * VMXNET3_CMD_UPDATE_FEATURE.  Does nothing when the setting is unchanged.
+ * Always returns 0.
+ */
+static int
+vmxnet3_set_rx_csum(struct net_device *netdev, u32 val)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+
+	if (adapter->rxcsum != val) {
+		adapter->rxcsum = val;
+		if (netif_running(netdev)) {
+			if (val)
+				adapter->shared->devRead.misc.uptFeatures |=
+								UPT1_F_RXCSUM;
+			else
+				adapter->shared->devRead.misc.uptFeatures &=
+								~UPT1_F_RXCSUM;
+
+			VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD,
+					       VMXNET3_CMD_UPDATE_FEATURE);
+		}
+	}
+	return 0;
+}
+
+
+/* per tq stats maintained by the device */
+static const struct vmxnet3_stat_desc
+vmxnet3_tq_dev_stats[] = {
+	/* description,         offset */
+	{ "TSO pkts tx",        offsetof(struct UPT1_TxStats, TSOPktsTxOK) },
+	{ "TSO bytes tx",       offsetof(struct UPT1_TxStats, TSOBytesTxOK) },
+	{ "ucast pkts tx",      offsetof(struct UPT1_TxStats, ucastPktsTxOK) },
+	{ "ucast bytes tx",     offsetof(struct UPT1_TxStats, ucastBytesTxOK) 
},
+	{ "mcast pkts tx",      offsetof(struct UPT1_TxStats, mcastPktsTxOK) },
+	{ "mcast bytes tx",     offsetof(struct UPT1_TxStats, mcastBytesTxOK) 
},
+	{ "bcast pkts tx",      offsetof(struct UPT1_TxStats, bcastPktsTxOK) },
+	{ "bcast bytes tx",     offsetof(struct UPT1_TxStats, bcastBytesTxOK) 
},
+	{ "pkts tx err",        offsetof(struct UPT1_TxStats, pktsTxError) },
+	{ "pkts tx discard",    offsetof(struct UPT1_TxStats, pktsTxDiscard) },
+};
+
+/* per tq stats maintained by the driver */
+static const struct vmxnet3_stat_desc
+vmxnet3_tq_driver_stats[] = {
+	/* description,         offset */
+	{"drv dropped tx total", offsetof(struct vmxnet3_tq_driver_stats,
+					drop_total) },
+	{ "   too many frags",  offsetof(struct vmxnet3_tq_driver_stats,
+					drop_too_many_frags) },
+	{ "   giant hdr",       offsetof(struct vmxnet3_tq_driver_stats,
+					drop_oversized_hdr) },
+	{ "   hdr err",         offsetof(struct vmxnet3_tq_driver_stats,
+					drop_hdr_inspect_err) },
+	{ "   tso",             offsetof(struct vmxnet3_tq_driver_stats,
+					drop_tso) },
+	{ "ring full",          offsetof(struct vmxnet3_tq_driver_stats,
+					tx_ring_full) },
+	{ "pkts linearized",    offsetof(struct vmxnet3_tq_driver_stats,
+					linearized) },
+	{ "hdr cloned",         offsetof(struct vmxnet3_tq_driver_stats,
+					copy_skb_header) },
+	{ "giant hdr",          offsetof(struct vmxnet3_tq_driver_stats,
+					oversized_hdr) },
+};
+
+/* per rq stats maintained by the device */
+static const struct vmxnet3_stat_desc
+vmxnet3_rq_dev_stats[] = {
+	{ "LRO pkts rx",        offsetof(struct UPT1_RxStats, LROPktsRxOK) },
+	{ "LRO byte rx",        offsetof(struct UPT1_RxStats, LROBytesRxOK) },
+	{ "ucast pkts rx",      offsetof(struct UPT1_RxStats, ucastPktsRxOK) },
+	{ "ucast bytes rx",     offsetof(struct UPT1_RxStats, ucastBytesRxOK) 
},
+	{ "mcast pkts rx",      offsetof(struct UPT1_RxStats, mcastPktsRxOK) },
+	{ "mcast bytes rx",     offsetof(struct UPT1_RxStats, mcastBytesRxOK) 
},
+	{ "bcast pkts rx",      offsetof(struct UPT1_RxStats, bcastPktsRxOK) },
+	{ "bcast bytes rx",     offsetof(struct UPT1_RxStats, bcastBytesRxOK) 
},
+	{ "pkts rx out of buf", offsetof(struct UPT1_RxStats, pktsRxOutOfBuf) 
},
+	{ "pkts rx err",        offsetof(struct UPT1_RxStats, pktsRxError) },
+};
+
+/* per rq stats maintained by the driver */
+static const struct vmxnet3_stat_desc
+vmxnet3_rq_driver_stats[] = {
+	/* description,         offset */
+	{ "drv dropped rx total", offsetof(struct vmxnet3_rq_driver_stats,
+					   drop_total) },
+	{ "   err",            offsetof(struct vmxnet3_rq_driver_stats,
+					drop_err) },
+	{ "   fcs",            offsetof(struct vmxnet3_rq_driver_stats,
+					drop_fcs) },
+	{ "rx buf alloc fail", offsetof(struct vmxnet3_rq_driver_stats,
+					rx_buf_alloc_failure) },
+};
+
+/* global stats maintained by the driver */
+static const struct vmxnet3_stat_desc
+vmxnet3_global_stats[] = {
+	/* description,         offset */
+	{ "tx timeout count",   offsetof(struct vmxnet3_adapter,
+					 tx_timeout_count) }
+};
+
+
+struct net_device_stats *
+vmxnet3_get_stats(struct net_device *netdev)
+{
+	struct vmxnet3_adapter *adapter;
+	struct vmxnet3_tq_driver_stats *drvTxStats;
+	struct vmxnet3_rq_driver_stats *drvRxStats;
+	struct UPT1_TxStats *devTxStats;
+	struct UPT1_RxStats *devRxStats;
+	struct net_device_stats *net_stats = &netdev->stats;
+
+	adapter = netdev_priv(netdev);
+
+	/* Collect the dev stats into the shared area */
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, 
VMXNET3_CMD_GET_STATS);
+
+	/* Assuming that we have a single queue device */
+	devTxStats = &adapter->tqd_start->stats;
+	devRxStats = &adapter->rqd_start->stats;
+
+	/* Get access to the driver stats per queue */
+	drvTxStats = &adapter->tx_queue.stats;
+	drvRxStats = &adapter->rx_queue.stats;
+
+	memset(net_stats, 0, sizeof(*net_stats));
+
+	net_stats->rx_packets = devRxStats->ucastPktsRxOK +
+				devRxStats->mcastPktsRxOK +
+				devRxStats->bcastPktsRxOK;
+
+	net_stats->tx_packets = devTxStats->ucastPktsTxOK +
+				devTxStats->mcastPktsTxOK +
+				devTxStats->bcastPktsTxOK;
+
+	net_stats->rx_bytes = devRxStats->ucastBytesRxOK +
+			      devRxStats->mcastBytesRxOK +
+			      devRxStats->bcastBytesRxOK;
+
+	net_stats->tx_bytes = devTxStats->ucastBytesTxOK +
+			      devTxStats->mcastBytesTxOK +
+			      devTxStats->bcastBytesTxOK;
+
+	net_stats->rx_errors = devRxStats->pktsRxError;
+	net_stats->tx_errors = devTxStats->pktsTxError;
+	net_stats->rx_dropped = drvRxStats->drop_total;
+	net_stats->tx_dropped = drvTxStats->drop_total;
+	net_stats->multicast =  devRxStats->mcastPktsRxOK;
+
+	return net_stats;
+}
+
+static int
+vmxnet3_get_sset_count(struct net_device *netdev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(vmxnet3_tq_dev_stats) +
+			ARRAY_SIZE(vmxnet3_tq_driver_stats) +
+			ARRAY_SIZE(vmxnet3_rq_dev_stats) +
+			ARRAY_SIZE(vmxnet3_rq_driver_stats) +
+			ARRAY_SIZE(vmxnet3_global_stats);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+
+static int
+vmxnet3_get_regs_len(struct net_device *netdev)
+{
+	return 20 * sizeof(u32);
+}
+
+
+/*
+ * ethtool get_drvinfo handler: fill in the driver name, the reported driver
+ * version, a firmware version of "N/A" and the PCI bus id of the adapter,
+ * plus the number of ethtool stats and the register dump length.
+ */
+static void
+vmxnet3_get_drvinfo(struct net_device *netdev,
+		    struct ethtool_drvinfo *drvinfo)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+
+	/*
+	 * strlcpy() always NUL-terminates within the given size, so no
+	 * explicit terminator is needed (the one that used to follow the
+	 * version copy was written into drvinfo->driver by mistake).
+	 */
+	strlcpy(drvinfo->driver, vmxnet3_driver_name, sizeof(drvinfo->driver));
+
+	strlcpy(drvinfo->version, VMXNET3_DRIVER_VERSION_REPORT,
+		sizeof(drvinfo->version));
+
+	strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
+
+	strlcpy(drvinfo->bus_info, pci_name(adapter->pdev),
+		ETHTOOL_BUSINFO_LEN);
+	drvinfo->n_stats = vmxnet3_get_sset_count(netdev, ETH_SS_STATS);
+	drvinfo->testinfo_len = 0;
+	drvinfo->eedump_len   = 0;
+	drvinfo->regdump_len  = vmxnet3_get_regs_len(netdev);
+}
+
+
+static void
+vmxnet3_get_strings(struct net_device *netdev, u32 stringset, u8 *buf)
+{
+	if (stringset == ETH_SS_STATS) {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_dev_stats); i++) {
+			memcpy(buf, vmxnet3_tq_dev_stats[i].desc,
+			       ETH_GSTRING_LEN);
+			buf += ETH_GSTRING_LEN;
+		}
+		for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_driver_stats); i++) {
+			memcpy(buf, vmxnet3_tq_driver_stats[i].desc,
+			       ETH_GSTRING_LEN);
+			buf += ETH_GSTRING_LEN;
+		}
+		for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_dev_stats); i++) {
+			memcpy(buf, vmxnet3_rq_dev_stats[i].desc,
+			       ETH_GSTRING_LEN);
+			buf += ETH_GSTRING_LEN;
+		}
+		for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_driver_stats); i++) {
+			memcpy(buf, vmxnet3_rq_driver_stats[i].desc,
+			       ETH_GSTRING_LEN);
+			buf += ETH_GSTRING_LEN;
+		}
+		for (i = 0; i < ARRAY_SIZE(vmxnet3_global_stats); i++) {
+			memcpy(buf, vmxnet3_global_stats[i].desc,
+				ETH_GSTRING_LEN);
+			buf += ETH_GSTRING_LEN;
+		}
+	}
+}
+
+
+static void
+vmxnet3_get_ethtool_stats(struct net_device *netdev,
+			  struct ethtool_stats *stats, u64  *buf)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	u8 *base;
+	int i;
+
+	VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, 
VMXNET3_CMD_GET_STATS);
+
+	/* this does assume each counter is 64-bit wide */
+
+	base = (u8 *)&adapter->tqd_start->stats;
+	for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_dev_stats); i++)
+		*buf++ = *(u64 *)(base + vmxnet3_tq_dev_stats[i].offset);
+
+	base = (u8 *)&adapter->tx_queue.stats;
+	for (i = 0; i < ARRAY_SIZE(vmxnet3_tq_driver_stats); i++)
+		*buf++ = *(u64 *)(base + vmxnet3_tq_driver_stats[i].offset);
+
+	base = (u8 *)&adapter->rqd_start->stats;
+	for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_dev_stats); i++)
+		*buf++ = *(u64 *)(base + vmxnet3_rq_dev_stats[i].offset);
+
+	base = (u8 *)&adapter->rx_queue.stats;
+	for (i = 0; i < ARRAY_SIZE(vmxnet3_rq_driver_stats); i++)
+		*buf++ = *(u64 *)(base + vmxnet3_rq_driver_stats[i].offset);
+
+	base = (u8 *)adapter;
+	for (i = 0; i < ARRAY_SIZE(vmxnet3_global_stats); i++)
+		*buf++ = *(u64 *)(base + vmxnet3_global_stats[i].offset);
+}
+
+
+static void
+vmxnet3_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void 
*p)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	u32 *buf = p;
+
+	memset(p, 0, vmxnet3_get_regs_len(netdev));
+
+	regs->version = 1;
+
+	/* Update vmxnet3_get_regs_len if we want to dump more registers */
+
+	/* make each ring use multiple of 16 bytes */
+	buf[0] = adapter->tx_queue.tx_ring.next2fill;
+	buf[1] = adapter->tx_queue.tx_ring.next2comp;
+	buf[2] = adapter->tx_queue.tx_ring.gen;
+	buf[3] = 0;
+
+	buf[4] = adapter->tx_queue.comp_ring.next2proc;
+	buf[5] = adapter->tx_queue.comp_ring.gen;
+	buf[6] = adapter->tx_queue.stopped;
+	buf[7] = 0;
+
+	buf[8] = adapter->rx_queue.rx_ring[0].next2fill;
+	buf[9] = adapter->rx_queue.rx_ring[0].next2comp;
+	buf[10] = adapter->rx_queue.rx_ring[0].gen;
+	buf[11] = 0;
+
+	buf[12] = adapter->rx_queue.rx_ring[1].next2fill;
+	buf[13] = adapter->rx_queue.rx_ring[1].next2comp;
+	buf[14] = adapter->rx_queue.rx_ring[1].gen;
+	buf[15] = 0;
+
+	buf[16] = adapter->rx_queue.comp_ring.next2proc;
+	buf[17] = adapter->rx_queue.comp_ring.gen;
+	buf[18] = 0;
+	buf[19] = 0;
+}
+
+
+static void
+vmxnet3_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+
+	wol->supported = WAKE_UCAST | WAKE_ARP | WAKE_MAGIC;
+	wol->wolopts = adapter->wol;
+}
+
+
+/*
+ * ethtool set_wol handler: store the requested wake-on-LAN options in
+ * adapter->wol and enable or disable device wakeup accordingly.
+ *
+ * Returns 0 on success, or -EOPNOTSUPP if any of WAKE_PHY, WAKE_MCAST,
+ * WAKE_BCAST or WAKE_MAGICSECURE is requested.
+ */
+static int
+vmxnet3_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+
+	if (wol->wolopts & (WAKE_PHY | WAKE_MCAST | WAKE_BCAST |
+			    WAKE_MAGICSECURE)) {
+		return -EOPNOTSUPP;
+	}
+
+	adapter->wol = wol->wolopts;
+
+	/* wakeup is enabled whenever at least one option bit is set */
+	device_set_wakeup_enable(&adapter->pdev->dev, adapter->wol);
+
+	return 0;
+}
+
+
+static int
+vmxnet3_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+
+	ecmd->supported = SUPPORTED_10000baseT_Full | SUPPORTED_1000baseT_Full 
|
+			  SUPPORTED_TP;
+	ecmd->advertising = ADVERTISED_TP;
+	ecmd->port = PORT_TP;
+	ecmd->transceiver = XCVR_INTERNAL;
+
+	if (adapter->link_speed) {
+		ecmd->speed = adapter->link_speed;
+		ecmd->duplex = DUPLEX_FULL;
+	} else {
+		ecmd->speed = -1;
+		ecmd->duplex = -1;
+	}
+	return 0;
+}
+
+
+static void
+vmxnet3_get_ringparam(struct net_device *netdev,
+		      struct ethtool_ringparam *param)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+
+	param->rx_max_pending = VMXNET3_RX_RING_MAX_SIZE;
+	param->tx_max_pending = VMXNET3_TX_RING_MAX_SIZE;
+	param->rx_mini_max_pending = 0;
+	param->rx_jumbo_max_pending = 0;
+
+	param->rx_pending = adapter->rx_queue.rx_ring[0].size;
+	param->tx_pending = adapter->tx_queue.tx_ring.size;
+	param->rx_mini_pending = 0;
+	param->rx_jumbo_pending = 0;
+}
+
+
+static int
+vmxnet3_set_ringparam(struct net_device *netdev,
+		      struct ethtool_ringparam *param)
+{
+	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
+	u32 new_tx_ring_size, new_rx_ring_size;
+	u32 sz;
+	int err = 0;
+
+	if (param->tx_pending == 0 || param->tx_pending >
+						VMXNET3_TX_RING_MAX_SIZE)
+		return -EINVAL;
+
+	if (param->rx_pending == 0 || param->rx_pending >
+						VMXNET3_RX_RING_MAX_SIZE)
+		return -EINVAL;
+
+
+	/* round it up to a multiple of VMXNET3_RING_SIZE_ALIGN */
+	new_tx_ring_size = (param->tx_pending + VMXNET3_RING_SIZE_MASK) &
+ 
~VMXNET3_RING_SIZE_MASK;
+	new_tx_ring_size = min_t(u32, new_tx_ring_size,
+				 VMXNET3_TX_RING_MAX_SIZE);
+	if (new_tx_ring_size > VMXNET3_TX_RING_MAX_SIZE || (new_tx_ring_size %
+						VMXNET3_RING_SIZE_ALIGN) != 0)
+		return -EINVAL;
+
+	/* ring0 has to be a multiple of
+	 * rx_buf_per_pkt * VMXNET3_RING_SIZE_ALIGN
+	 */
+	sz = adapter->rx_buf_per_pkt * VMXNET3_RING_SIZE_ALIGN;
+	new_rx_ring_size = (param->rx_pending + sz - 1) / sz * sz;
+	new_rx_ring_size = min_t(u32, new_rx_ring_size,
+				 VMXNET3_RX_RING_MAX_SIZE / sz * sz);
+	if (new_rx_ring_size > VMXNET3_RX_RING_MAX_SIZE || (new_rx_ring_size %
+							   sz) != 0)
+		return -EINVAL;
+
+	if (new_tx_ring_size == adapter->tx_queue.tx_ring.size &&
+			new_rx_ring_size == adapter->rx_queue.rx_ring[0].size) 
{
+		return 0;
+	}
+
+	/*
+	 * Reset_work may be in the middle of resetting the device, wait for 
its
+	 * completion.
+	 */
+	while (test_and_set_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state))
+		msleep(1);
+
+	if (netif_running(netdev)) {
+		vmxnet3_quiesce_dev(adapter);
+		vmxnet3_reset_dev(adapter);
+
+		/* recreate the rx queue and the tx queue based on the
+		 * new sizes */
+		vmxnet3_tq_destroy(&adapter->tx_queue, adapter);
+		vmxnet3_rq_destroy(&adapter->rx_queue, adapter);
+
+		err = vmxnet3_create_queues(adapter, new_tx_ring_size,
+			new_rx_ring_size, VMXNET3_DEF_RX_RING_SIZE);
+		if (err) {
+			/* failed, most likely because of OOM, try default
+			 * size */
+			printk(KERN_ERR "%s: failed to apply new sizes, try 
the"
+				" default ones\n", netdev->name);
+			err = vmxnet3_create_queues(adapter,
+						    VMXNET3_DEF_TX_RING_SIZE,
+						    VMXNET3_DEF_RX_RING_SIZE,
+						    VMXNET3_DEF_RX_RING_SIZE);
+			if (err) {
+				printk(KERN_ERR "%s: failed to create queues "
+					"with default sizes. Closing it\n",
+					netdev->name);
+				goto out;
+			}
+		}
+
+		err = vmxnet3_activate_dev(adapter);
+		if (err)
+			printk(KERN_ERR "%s: failed to re-activate, error %d."
+				" Closing it\n", netdev->name, err);
+	}
+
+out:
+	clear_bit(VMXNET3_STATE_BIT_RESETTING, &adapter->state);
+	if (err)
+		vmxnet3_force_close(adapter);
+
+	return err;
+}
+
+
+static struct ethtool_ops vmxnet3_ethtool_ops = {
+	.get_settings      = vmxnet3_get_settings,
+	.get_drvinfo       = vmxnet3_get_drvinfo,
+	.get_regs_len      = vmxnet3_get_regs_len,
+	.get_regs          = vmxnet3_get_regs,
+	.get_wol           = vmxnet3_get_wol,
+	.set_wol           = vmxnet3_set_wol,
+	.get_link          = ethtool_op_get_link,
+	.get_rx_csum       = vmxnet3_get_rx_csum,
+	.set_rx_csum       = vmxnet3_set_rx_csum,
+	.get_tx_csum       = ethtool_op_get_tx_csum,
+	.set_tx_csum       = ethtool_op_set_tx_hw_csum,
+	.get_sg            = ethtool_op_get_sg,
+	.set_sg            = ethtool_op_set_sg,
+	.get_tso           = ethtool_op_get_tso,
+	.set_tso           = ethtool_op_set_tso,
+	.get_strings       = vmxnet3_get_strings,
+	.get_sset_count	   = vmxnet3_get_sset_count,
+	.get_ethtool_stats = vmxnet3_get_ethtool_stats,
+	.get_ringparam     = vmxnet3_get_ringparam,
+	.set_ringparam     = vmxnet3_set_ringparam,
+};
+
+void vmxnet3_set_ethtool_ops(struct net_device *netdev)
+{
+	SET_ETHTOOL_OPS(netdev, &vmxnet3_ethtool_ops);
+}
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
new file mode 100644
index 0000000..cf4b08d
--- /dev/null
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -0,0 +1,389 @@
+/*
+ * Linux driver for VMware's vmxnet3 ethernet NIC.
+ *
+ * Copyright (C) 2008-2009, VMware, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License and no later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ *
+ * Maintained by: Shreyas Bhatewara <pv-drivers@vmware.com>
+ *
+ */
+
+#ifndef _VMXNET3_INT_H
+#define _VMXNET3_INT_H
+
+#include <linux/types.h>
+#include <linux/ethtool.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+#include <linux/ethtool.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/ioport.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/interrupt.h>
+#include <linux/workqueue.h>
+#include <linux/uaccess.h>
+#include <asm/dma.h>
+#include <asm/page.h>
+
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/etherdevice.h>
+#include <asm/checksum.h>
+#include <linux/if_vlan.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <linux/dst.h>
+
+#include "vmxnet3_defs.h"
+
+#ifdef DEBUG
+# define VMXNET3_DRIVER_VERSION_REPORT 
VMXNET3_DRIVER_VERSION_STRING"-NAPI(debug)"
+#else
+# define VMXNET3_DRIVER_VERSION_REPORT VMXNET3_DRIVER_VERSION_STRING"-NAPI"
+#endif
+
+
+/*
+ * Version numbers
+ */
+#define VMXNET3_DRIVER_VERSION_STRING   "1.0.4.0-k"
+
+/* a 32-bit int, each byte encodes a version number in VMXNET3_DRIVER_VERSION */
+#define VMXNET3_DRIVER_VERSION_NUM      0x01000400
+
+
+/*
+ * Capabilities
+ */
+
+enum {
+	VMNET_CAP_SG	        = 0x0001, /* Can do scatter-gather transmits. 
*/
+	VMNET_CAP_IP4_CSUM      = 0x0002, /* Can checksum only TCP/UDP over
+					   * IPv4 */
+	VMNET_CAP_HW_CSUM       = 0x0004, /* Can checksum all packets. */
+	VMNET_CAP_HIGH_DMA      = 0x0008, /* Can DMA to high memory. */
+	VMNET_CAP_TOE	        = 0x0010, /* Supports TCP/IP offload. */
+	VMNET_CAP_TSO	        = 0x0020, /* Supports TCP Segmentation
+					   * offload */
+	VMNET_CAP_SW_TSO        = 0x0040, /* Supports SW TCP Segmentation */
+	VMNET_CAP_VMXNET_APROM  = 0x0080, /* Vmxnet APROM support */
+	VMNET_CAP_HW_TX_VLAN    = 0x0100, /* Can we do VLAN tagging in HW */
+	VMNET_CAP_HW_RX_VLAN    = 0x0200, /* Can we do VLAN untagging in HW */
+	VMNET_CAP_SW_VLAN       = 0x0400, /* VLAN tagging/untagging in SW */
+	VMNET_CAP_WAKE_PCKT_RCV = 0x0800, /* Can wake on network packet recv? 
*/
+	VMNET_CAP_ENABLE_INT_INLINE = 0x1000,  /* Enable Interrupt Inline */
+	VMNET_CAP_ENABLE_HEADER_COPY = 0x2000,  /* copy header for vmkernel */
+	VMNET_CAP_TX_CHAIN      = 0x4000, /* Guest can use multiple tx entries
+					  * for a pkt */
+	VMNET_CAP_RX_CHAIN      = 0x8000, /* pkt can span multiple rx entries 
*/
+	VMNET_CAP_LPD           = 0x10000, /* large pkt delivery */
+	VMNET_CAP_BPF           = 0x20000, /* BPF Support in VMXNET Virtual 
HW*/
+	VMNET_CAP_SG_SPAN_PAGES = 0x40000, /* Scatter-gather can span 
multiple*/
+					   /* pages transmits */
+	VMNET_CAP_IP6_CSUM      = 0x80000, /* Can do IPv6 csum offload. */
+	VMNET_CAP_TSO6         = 0x100000, /* TSO seg. offload for IPv6 pkts. 
*/
+	VMNET_CAP_TSO256k      = 0x200000, /* Can do TSO seg offload for */
+					   /* pkts up to 256kB. */
+	VMNET_CAP_UPT          = 0x400000  /* Support UPT */
+};
+
+/*
+ * PCI vendor and device IDs.
+ */
+#define PCI_VENDOR_ID_VMWARE            0x15AD
+#define PCI_DEVICE_ID_VMWARE_VMXNET3    0x07B0
+#define MAX_ETHERNET_CARDS		10
+#define MAX_PCI_PASSTHRU_DEVICE		6
+
+struct vmxnet3_cmd_ring {
+	union Vmxnet3_GenericDesc *base;
+	u32		size;
+	u32		next2fill;
+	u32		next2comp;
+	u8		gen;
+	dma_addr_t	basePA;
+};
+
+static inline void
+vmxnet3_cmd_ring_adv_next2fill(struct vmxnet3_cmd_ring *ring)
+{
+	ring->next2fill++;
+	if (unlikely(ring->next2fill == ring->size)) {
+		ring->next2fill = 0;
+		VMXNET3_FLIP_RING_GEN(ring->gen);
+	}
+}
+
+static inline void
+vmxnet3_cmd_ring_adv_next2comp(struct vmxnet3_cmd_ring *ring)
+{
+	VMXNET3_INC_RING_IDX_ONLY(ring->next2comp, ring->size);
+}
+
+/*
+ * Return the number of descriptors that can still be filled in the ring.
+ *
+ * The distance from next2fill forward to next2comp is computed modulo the
+ * ring size (ring->size is added when next2comp has not wrapped past
+ * next2fill), minus one: an empty ring (next2comp == next2fill) therefore
+ * reports size - 1 available descriptors.
+ */
+static inline int
+vmxnet3_cmd_ring_desc_avail(struct vmxnet3_cmd_ring *ring)
+{
+	return (ring->next2comp > ring->next2fill ? 0 : ring->size) +
+		ring->next2comp - ring->next2fill - 1;
+}
+
+struct vmxnet3_comp_ring {
+	union Vmxnet3_GenericDesc *base;
+	u32               size;
+	u32               next2proc;
+	u8                gen;
+	u8                intr_idx;
+	dma_addr_t           basePA;
+};
+
+static inline void
+vmxnet3_comp_ring_adv_next2proc(struct vmxnet3_comp_ring *ring)
+{
+	ring->next2proc++;
+	if (unlikely(ring->next2proc == ring->size)) {
+		ring->next2proc = 0;
+		VMXNET3_FLIP_RING_GEN(ring->gen);
+	}
+}
+
+struct vmxnet3_tx_data_ring {
+	struct Vmxnet3_TxDataDesc *base;
+	u32              size;
+	dma_addr_t          basePA;
+};
+
+enum vmxnet3_buf_map_type {
+	VMXNET3_MAP_INVALID = 0,
+	VMXNET3_MAP_NONE,
+	VMXNET3_MAP_SINGLE,
+	VMXNET3_MAP_PAGE,
+};
+
+struct vmxnet3_tx_buf_info {
+	u32      map_type;
+	u16      len;
+	u16      sop_idx;
+	dma_addr_t  dma_addr;
+	struct sk_buff *skb;
+};
+
+struct vmxnet3_tq_driver_stats {
+	u64 drop_total;     /* # of pkts dropped by the driver, the
+				* counters below track droppings due to
+				* different reasons
+				*/
+	u64 drop_too_many_frags;
+	u64 drop_oversized_hdr;
+	u64 drop_hdr_inspect_err;
+	u64 drop_tso;
+
+	u64 tx_ring_full;
+	u64 linearized;         /* # of pkts linearized */
+	u64 copy_skb_header;    /* # of times we have to copy skb header */
+	u64 oversized_hdr;
+};
+
+struct vmxnet3_tx_ctx {
+	bool   ipv4;
+	u16 mss;
+	u32 eth_ip_hdr_size; /* only valid for pkts requesting tso or csum
+				 * offloading
+				 */
+	u32 l4_hdr_size;     /* only valid if mss != 0 */
+	u32 copy_size;       /* # of bytes copied into the data ring */
+	union Vmxnet3_GenericDesc *sop_txd;
+	union Vmxnet3_GenericDesc *eop_txd;
+};
+
+struct vmxnet3_tx_queue {
+	spinlock_t                      tx_lock;
+	struct vmxnet3_cmd_ring         tx_ring;
+	struct vmxnet3_tx_buf_info     *buf_info;
+	struct vmxnet3_tx_data_ring     data_ring;
+	struct vmxnet3_comp_ring        comp_ring;
+	struct Vmxnet3_TxQueueCtrl            *shared;
+	struct vmxnet3_tq_driver_stats  stats;
+	bool                            stopped;
+	int                             num_stop;  /* # of times the queue is
+						    * stopped */
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+enum vmxnet3_rx_buf_type {
+	VMXNET3_RX_BUF_NONE = 0,
+	VMXNET3_RX_BUF_SKB = 1,
+	VMXNET3_RX_BUF_PAGE = 2
+};
+
+struct vmxnet3_rx_buf_info {
+	enum vmxnet3_rx_buf_type buf_type;
+	u16     len;
+	union {
+		struct sk_buff *skb;
+		struct page    *page;
+	};
+	dma_addr_t dma_addr;
+};
+
+struct vmxnet3_rx_ctx {
+	struct sk_buff *skb;
+	u32 sop_idx;
+};
+
+struct vmxnet3_rq_driver_stats {
+	u64 drop_total;
+	u64 drop_err;
+	u64 drop_fcs;
+	u64 rx_buf_alloc_failure;
+};
+
+struct vmxnet3_rx_queue {
+	struct vmxnet3_cmd_ring   rx_ring[2];
+	struct vmxnet3_comp_ring  comp_ring;
+	struct vmxnet3_rx_ctx     rx_ctx;
+	u32 qid;            /* rqID in RCD for buffer from 1st ring */
+	u32 qid2;           /* rqID in RCD for buffer from 2nd ring */
+	u32 uncommitted[2]; /* # of buffers allocated since last RXPROD
+				* update */
+	struct vmxnet3_rx_buf_info     *buf_info[2];
+	struct Vmxnet3_RxQueueCtrl            *shared;
+	struct vmxnet3_rq_driver_stats  stats;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define VMXNET3_LINUX_MAX_MSIX_VECT     1
+
+struct vmxnet3_intr {
+	enum vmxnet3_intr_mask_mode  mask_mode;
+	enum vmxnet3_intr_type       type;	/* MSI-X, MSI, or INTx? */
+	u8  num_intrs;			/* # of intr vectors */
+	u8  event_intr_idx;		/* idx of the intr vector for event */
+	u8  mod_levels[VMXNET3_LINUX_MAX_MSIX_VECT]; /* moderation level */
+#ifdef CONFIG_PCI_MSI
+	struct msix_entry msix_entries[VMXNET3_LINUX_MAX_MSIX_VECT];
+#endif
+};
+
+#define VMXNET3_STATE_BIT_RESETTING   0
+#define VMXNET3_STATE_BIT_QUIESCED    1
+struct vmxnet3_adapter {
+	struct vmxnet3_tx_queue         tx_queue;
+	struct vmxnet3_rx_queue         rx_queue;
+	struct napi_struct              napi;
+	struct vlan_group              *vlan_grp;
+
+	struct vmxnet3_intr             intr;
+
+	struct Vmxnet3_DriverShared    *shared;
+	struct Vmxnet3_PMConf          *pm_conf;
+	struct Vmxnet3_TxQueueDesc     *tqd_start;     /* first tx queue desc */
+	struct Vmxnet3_RxQueueDesc     *rqd_start;     /* first rx queue desc */
+	struct net_device              *netdev;
+	struct pci_dev                 *pdev;
+
+	u8				*hw_addr0; /* for BAR 0 */
+	u8				*hw_addr1; /* for BAR 1 */
+
+	/* feature control */
+	bool				rxcsum;
+	bool				lro;
+	bool				jumbo_frame;
+
+	/* rx buffer related */
+	unsigned			skb_buf_size;
+	int		rx_buf_per_pkt;  /* only apply to the 1st ring */
+	dma_addr_t			shared_pa;
+	dma_addr_t queue_desc_pa;
+
+	/* Wake-on-LAN */
+	u32     wol;
+
+	/* Link speed */
+	u32     link_speed; /* in mbps */
+
+	u64     tx_timeout_count;
+	struct work_struct work;
+
+	unsigned long  state;    /* VMXNET3_STATE_BIT_xxx */
+
+	int dev_number;
+};
+
+#define VMXNET3_WRITE_BAR0_REG(adapter, reg, val)  \
+	writel((val), (adapter)->hw_addr0 + (reg))
+#define VMXNET3_READ_BAR0_REG(adapter, reg)        \
+	readl((adapter)->hw_addr0 + (reg))
+
+#define VMXNET3_WRITE_BAR1_REG(adapter, reg, val)  \
+	writel((val), (adapter)->hw_addr1 + (reg))
+#define VMXNET3_READ_BAR1_REG(adapter, reg)        \
+	readl((adapter)->hw_addr1 + (reg))
+
+#define VMXNET3_WAKE_QUEUE_THRESHOLD(tq)  (5)
+#define VMXNET3_RX_ALLOC_THRESHOLD(rq, ring_idx, adapter) \
+	((rq)->rx_ring[ring_idx].size >> 3)
+
+#define VMXNET3_GET_ADDR_LO(dma)   ((u32)(dma))
+#define VMXNET3_GET_ADDR_HI(dma)   ((u32)(((u64)(dma)) >> 32))
+
+/* must be a multiple of VMXNET3_RING_SIZE_ALIGN */
+#define VMXNET3_DEF_TX_RING_SIZE    512
+#define VMXNET3_DEF_RX_RING_SIZE    256
+
+#define VMXNET3_MAX_ETH_HDR_SIZE    22
+#define VMXNET3_MAX_SKB_BUF_SIZE    (3*1024)
+
+int
+vmxnet3_quiesce_dev(struct vmxnet3_adapter *adapter);
+
+int
+vmxnet3_activate_dev(struct vmxnet3_adapter *adapter);
+
+void
+vmxnet3_force_close(struct vmxnet3_adapter *adapter);
+
+void
+vmxnet3_reset_dev(struct vmxnet3_adapter *adapter);
+
+void
+vmxnet3_tq_destroy(struct vmxnet3_tx_queue *tq,
+		   struct vmxnet3_adapter *adapter);
+
+void
+vmxnet3_rq_destroy(struct vmxnet3_rx_queue *rq,
+		   struct vmxnet3_adapter *adapter);
+
+int
+vmxnet3_create_queues(struct vmxnet3_adapter *adapter,
+		      u32 tx_ring_size, u32 rx_ring_size, u32 rx_ring2_size);
+
+extern void vmxnet3_set_ethtool_ops(struct net_device *netdev);
+extern struct net_device_stats *vmxnet3_get_stats(struct net_device *netdev);
+
+extern char vmxnet3_driver_name[];
+#endif

^ permalink raw reply related

* Re: N_PPP_SYNC ldisc BUG: sleeping function called from invalid context
From: Jarek Poplawski @ 2009-09-30 21:12 UTC (permalink / raw)
  To: Tilman Schmidt; +Cc: Alan Cox, linux-kernel, netdev, Alan Cox
In-Reply-To: <4AC3A986.4080808@imap.cc>

Tilman Schmidt wrote, On 09/30/2009 08:55 PM:

> Alan Cox schrieb:
>>>  [<c026d39b>] tty_unthrottle+0x10/0x38
>>>  [<f8dcc31f>] ppp_sync_receive+0x168/0x170 [ppp_synctty]
>>>  [<f8fbb9ce>] handle_minor_recv+0x187/0x1cd [capi]
>>>  [<f8fbc19b>] capi_recv_message+0x1d9/0x24e [capi]
>> Really need to see the rest of the call trace to be sure
> 
> There wasn't more than what I posted. I had six of them, they looked all
> identical, and all of them ended after the kernel_thread_helper line. 
> 
>>> Turns out the ppp_sync_receive() function (drivers/net/ppp_synctty.c
>>> line 385ff.) has a comment in front stating:
>>>
>>> /*
>>>  * This can now be called from hard interrupt level as well
>>>  * as soft interrupt level or mainline.
>>>  */
>> Which is wrong. The flip_buffer_push -> rx processing path should never
>> be called from IRQ context and that was fixed for various drivers that
>> mis-set tty->low_latency, as well as in the PPP rework. The PPP case is
>> actually unrelated in many ways.
> 
> Might be worth correcting that text then before it misleads someone.
> 
>>> Opinions?
>> See how we got into that code direct from an IRQ path. The expectation of
>> the tty logic is that it gets processed from work queues either
>> specifically in driver or via tty_flip_buffer_push when tty->low_latency
>> = 0
> 
> I'm at a loss here. According to all the backtraces:
> 
> - ppp_sync_receive() was called, as the LD's receive_buf method,
>   via handle_recv_skb() [drivers/isdn/capi/capi.c line 504, inlined]
>   from handle_minor_recv() [drivers/isdn/capi/capi.c line 519]
> 
> - handle_minor_recv() was called from capi_recv_message()
>   [drivers/isdn/capi/capi.c line 656]
> 
> - capi_recv_message() was called, as the CAPI application's
>   recv_message method, from recv_handler()
>   [drivers/isdn/capi/kcapi.c line 268]
> 
> - recv_handler() is never called directly. It's only scheduled
>   via the work queue ap->recv_work from capi_ctr_handle_message()
>   [drivers/isdn/capi/kcapi.c line 349]
> 
> Even if we don't trust the backtraces, there's not much room for
> another activation path. So for all I know, the expectation of the
> tty logic should have been met. The call was indeed processed from
> a work queue.
> 
> Why then does mutex_lock() complain?
 

Hmm... capi_recv_message() calls handle_minor_recv() under
spin_lock_irqsave(), doesn't it?

Jarek P.

^ permalink raw reply

* kernel doc / docbook pdfdocs question
From: Doug Maxey @ 2009-09-30 19:59 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: netdev


Randy,

This may be slightly off topic for this list, but it does involve an
(as yet un-released) network driver. :)

Do you have any insight that could guide me toward a fix for an issue
seen with some header file constructs when trying to generate a pdf
docbook?

In my .tmpl file I do the "process my header file" construct:
...
!Ipath/to/myheader.h
...

In myheader.h an example decl that is giving me fits looks like:

struct foo {
	int bar;
	DECLARE_BITMAP(baz, LENGTH);
	int fotz;
};

It puts the bar and fotz decls in the output, but only outputs a
warning to stderr on the line with DECLARE_BITMAP, nothing about baz
(or DECLARE_BITMAP) is output into the .xml file.

My question is, would it be easier to pre-process the header, say with
gcc -EC via some rule in Documentation/DocBook/Makefile, or to try and
fix whatever gets called by scripts/basic/docbook?

I tried to find what the exact calling sequence was, but am getting
lost in what actually does the xml generation, and therefore how to
fix what is choking on the DECLARE_BITMAP.

Any tips will be very appreciated!

++doug



^ permalink raw reply

* [PATCH] Regression: e100_phy_init() isolates even selected PHY, causes 10 seconds boot delay
From: Bernhard Kaindl @ 2009-09-30 20:33 UTC (permalink / raw)
  To: David S. Miller, Bruce Allan; +Cc: Jeff Kirsher, netdev

[-- Attachment #1: Type: text/plain, Size: 6815 bytes --]

Dear David, Dear Bruce,

The current e100.c:e100_phy_init() electrically isolates all
the PHYs (even the selected PHY -- for a short time!) from the MII.

This happens only for a short duration before the isolation
of the selected PHY is reverted, but it's enough to cause a
major disturbance in the startup of our e100-based cards:

On a number of Embedded/Industry Pentium boards which are in use,
the result is that the initial DHCP negotiation takes more
than 10 seconds to complete with 2.6.30 and .31, while it's
done in a fraction of a second with 2.6.29 and earlier
(kernels tested with no delay range from 2.6.23 to 2.6.29)

That regression was introduced on March 31 by a patch
from Bruce which first appeared in 2.6.30-rc3:

http://marc.info/?l=linux-netdev&m=123766715429780&w=2
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=b55de80e49892002a1878013ab9aee1a30970be6

> From: Bruce Allan <bruce.w.allan@intel.com>
>
> This patch enables support for the new Intel 82552 adapter (new PHY paired
> with the existing MAC in the ICH7 chipset).  No new features are added to
> the driver, however there are minor changes due to updated registers and a
> few workarounds for hardware errata.

In the middle, the patch has two changes for e100_phy_init()

The first one looks like a code optimization and does not appear to match
the criteria described by Bruce in his submission, so I assume it made it into
the patch submission by accident:

> @@ -1276,16 +1294,12 @@ static int e100_phy_init(struct nic *nic)
> 	if (addr == 32)
> 		return -EAGAIN;
>
> -	/* Selected the phy and isolate the rest */
> -	for (addr = 0; addr < 32; addr++) {
> -		if (addr != nic->mii.phy_id) {
> -			mdio_write(netdev, addr, MII_BMCR, BMCR_ISOLATE);
> -		} else {
> -			bmcr = mdio_read(netdev, addr, MII_BMCR);
> -			mdio_write(netdev, addr, MII_BMCR,
> -				bmcr & ~BMCR_ISOLATE);
> -		}
> -	}
> +	/* Isolate all the PHY ids */
> +	for (addr = 0; addr < 32; addr++)
> +		mdio_write(netdev, addr, MII_BMCR, BMCR_ISOLATE);
> +	/* Select the discovered PHY */
> +	bmcr &= ~BMCR_ISOLATE;
> +	mdio_write(netdev, nic->mii.phy_id, MII_BMCR, bmcr);
>
> 	/* Get phy ID */
> 	id_lo = mdio_read(netdev, nic->mii.phy_id, MII_PHYSID1);

If this was meant as a workaround for a hardware erratum, it should
have been mentioned to ensure that no one undoes the change without
knowing.

Anyway:

What this does is that it removes the 2.6.23-2.6.29 PHY isolation loop
which ensured that *ONLY* PHY addresses which *do not* match the selected
PHY address are electrically isolated:

> -	/* Selected the phy and isolate the rest */
> -	for (addr = 0; addr < 32; addr++) {
> -             if (addr != nic->mii.phy_id) {
> -                     mdio_write(netdev, addr, MII_BMCR, BMCR_ISOLATE);

The 2.6.29 loop cleared the isolate bit of the discovered PHY in the
else clause of this if statement:

> -		} else {
> -			bmcr = mdio_read(netdev, addr, MII_BMCR);
> -			mdio_write(netdev, addr, MII_BMCR,
> -				bmcr & ~BMCR_ISOLATE);
> -		}

This loop is then replaced with electrical isolation of *_ALL_* PHYs:

> +	/* Isolate all the PHY ids */
> +	for (addr = 0; addr < 32; addr++)
> +		mdio_write(netdev, addr, MII_BMCR, BMCR_ISOLATE);

Which is reverted for the discovered PHY afterwards:

> +	/* Select the discovered PHY */
> +	bmcr &= ~BMCR_ISOLATE;
> +	mdio_write(netdev, nic->mii.phy_id, MII_BMCR, bmcr);

This change resulted in a delay of the "Link Up" message from the
e100 watchdog routine and in a number of DHCP packets getting lost
for a duration of about five seconds.

I suppose that this may have powered-down our PHYs for a short moment
or at least disturbed the connection which it has with the MII and/or
the outside world.

In any case:

Reverting solely this change alone fixed the 10 second boot delay!

For more information, I attached two driver debug logs of the commands
between firmware load and "Link Up" with and without isolation of the
selected PHY.

----------------------------------------------------------------------

But indeed, there is potential for a possibly valid optimization:

The old e100_phy_init() was reading the "bmcr" variable twice.
First, when the PHYs are probed:

/* Discover phy addr by searching addrs in order {1,0,2,..., 31} */
for (addr = 0; addr < 32; addr++) {
	nic->mii.phy_id = (addr == 0) ? 1 : (addr == 1) ? 0 : addr;
	bmcr = mdio_read(netdev, nic->mii.phy_id, MII_BMCR);
	stat = mdio_read(netdev, nic->mii.phy_id, MII_BMSR);
	stat = mdio_read(netdev, nic->mii.phy_id, MII_BMSR);
	if (!((bmcr == 0xFFFF) || ((stat == 0) && (bmcr == 0))))
		break;
-> When the selected PHY is found, the loop aborts here, so bmcr
     has the bmcr of the selected PHY
}

and second, when the old PHY setup loop was clearing the isolate bit:

> -		} else {
> -			bmcr = mdio_read(netdev, addr, MII_BMCR);
> -			mdio_write(netdev, addr, MII_BMCR,
> -				bmcr & ~BMCR_ISOLATE);
> -		}

Reading the MII_BMCR value twice was therefore not strictly necessary,
so I think this was the optimization which Bruce had in mind, and his
other motivation may have been to simplify the PHY setup loop.

So, what should be done is to only isolate the other PHYs and just clear
the isolate bit of the selected PHY, in the simplest possible way.

The patch which we are now using with 2.6.31 to fix this regression
does the following:

* Remove the loop which isolates *ALL* PHYs and then clears
   the isolate bit of the selected PHY.

* Keep the new code which just clears the isolate bit of the BMCR
   of the discovered PHY

* Afterwards, isolate the unused PHYs.

This essentially resembles what the 2.6.23-2.6.29 code did, only
clearing the isolate bit of the discovered PHY is explicitly done
first, while before, it was in the middle of the PHY isolation loop.

A plain revert is probably safest in case of real paranoia here.
Electrically, this should be the same as what we had until 2.6.30-rc3:

Tested by me and my colleagues:

--- linux-2.6.31/drivers/net/e100.c
+++ linux-2.6.31/drivers/net/e100.c
@@ -1294,13 +1294,15 @@ static int e100_phy_init(struct nic *nic
   	if (addr == 32)
   		return -EAGAIN;

-	/* Isolate all the PHY ids */
-	for (addr = 0; addr < 32; addr++)
-		mdio_write(netdev, addr, MII_BMCR, BMCR_ISOLATE);
   	/* Select the discovered PHY */
   	bmcr &= ~BMCR_ISOLATE;
   	mdio_write(netdev, nic->mii.phy_id, MII_BMCR, bmcr);

+	/* Electrically isolate only the unused PHYs */
+	for (addr = 0; addr < 32; addr++)
+		if (addr != nic->mii.phy_id)
+			mdio_write(netdev, addr, MII_BMCR, BMCR_ISOLATE);
+
   	/* Get phy ID */
   	id_lo = mdio_read(netdev, nic->mii.phy_id, MII_PHYSID1);
   	id_hi = mdio_read(netdev, nic->mii.phy_id, MII_PHYSID2);

Signed-off-by: Bernhard Kaindl <bernhard.kaindl@gmx.net>

Best Regards,
Bernhard Kaindl

[-- Attachment #2: e100_phy_init_without_isolation.txt --]
[-- Type: text/plain, Size: 1135 bytes --]

Without the optimization which isolates the PHY, the link goes up within a 1/10th of
a second after starting to load the firmware:

 7.588075 e100 0000:00:09.0 firmware: using built-in firmware e100/d101s_ucode.bin
 7.604133 e100_configure: [00-07]=16:08:00:01:00:00:26:07
 7.608117 e100_configure: [08-15]=01:00:2E:00:60:00:F2:48
 7.608117 e100_configure: [16-23]=00:40:FA:86:3F:05:00:00
 7.610785 e100_set_multicast_list: mc_count=0, flags=0x1002
 7.615390 e100_configure: [00-07]=16:08:00:01:00:00:26:07
 7.619380 e100_configure: [08-15]=01:00:2E:00:60:00:F2:48
 7.619380 e100_configure: [16-23]=00:40:FA:86:3F:05:00:00
 7.619380 e100_intr: stat_ack = 0x20
 7.622719 e100_watchdog: right now = -74502
 7.627045 mdio_ctrl: READ:addr=1, reg=4, data_in=0x0000, data_out=0x182405E1
 7.630994 mdio_ctrl: READ:addr=1, reg=0, data_in=0x0000, data_out=0x18203000
 7.634011 mdio_ctrl: READ:addr=1, reg=5, data_in=0x0000, data_out=0x182541E1
 7.639498 mdio_ctrl: READ:addr=1, reg=1, data_in=0x0000, data_out=0x1821782D
 7.643447 mdio_ctrl: READ:addr=1, reg=1, data_in=0x0000, data_out=0x1821782D
 7.646419 NIC Link is Up 100 Mbps Full Duplex


[-- Attachment #3: e100_phy_init_with_isolation.txt --]
[-- Type: text/plain, Size: 3422 bytes --]

With the isolation of the PHY, 'link up' takes more than 2 seconds:

 8.241194 e100 0000:00:09.0 firmware: using built-in firmware e100/d101s_ucode.bin
 8.259478 e100_configure: [00-07]=16:08:00:01:00:00:26:07
 8.263462 e100_configure: [08-15]=01:00:2E:00:60:00:F2:48
 8.263462 e100_configure: [16-23]=00:40:FA:86:3F:05:00:00
 8.266134 e100_set_multicast_list: mc_count=0, flags=0x1002
 8.270737 e100_configure: [00-07]=16:08:00:01:00:00:26:07
 8.274727 e100_configure: [08-15]=01:00:2E:00:60:00:F2:48
 8.274727 e100_configure: [16-23]=00:40:FA:86:3F:05:00:00
 8.274727 e100_intr: stat_ack = 0x20
 8.278066 e100_watchdog: right now = -74619
 8.282389 mdio_ctrl: READ:addr=1, reg=4, data_in=0x0000, data_out=0x182405E1
 8.286339 mdio_ctrl: READ:addr=1, reg=0, data_in=0x0000, data_out=0x18203000
 8.289356 mdio_ctrl: READ:addr=1, reg=5, data_in=0x0000, data_out=0x182501E1
 8.294842 mdio_ctrl: READ:addr=1, reg=1, data_in=0x0000, data_out=0x18217809
 8.298791 mdio_ctrl: READ:addr=1, reg=1, data_in=0x0000, data_out=0x18217809
 8.301810 mdio_ctrl: READ:addr=1, reg=1, data_in=0x0000, data_out=0x18217809
 8.307293 mdio_ctrl: READ:addr=1, reg=1, data_in=0x0000, data_out=0x18217809
 8.311244 mdio_ctrl: READ:addr=1, reg=1, data_in=0x0000, data_out=0x18217809
 8.314260 mdio_ctrl: READ:addr=1, reg=1, data_in=0x0000, data_out=0x18217809
 8.318211 e100_rx_indicate: status=0x0000
 8.322954 e100_tx_clean: cb[0]->status = 0xA000
 8.326946 e100_tx_clean: cb[1]->status = 0xA000
 8.329309 e100_tx_clean: cb[2]->status = 0xA000
 8.334034 e100_tx_clean: cb[3]->status = 0xA000
 8.338028 e100_tx_clean: cb[4]->status = 0xA000
...
 8.355248 e100_configure: [00-07]=16:08:00:01:00:00:26:07
 8.359238 e100_configure: [08-15]=01:00:2E:00:60:00:F2:48
 8.359238 e100_configure: [16-23]=00:40:FA:86:3F:05:00:00
 8.359238 e100_intr: stat_ack = 0x20
 8.362559 e100_rx_indicate: status=0x0000
 8.366550 e100_tx_clean: cb[5]->status = 0xA000
 8.369792 e100_tx_clean: cb[6]->status = 0xA000
 8.374548 e100_set_multicast_list: mc_count=1, flags=0x1003
 8.379152 e100_configure: [00-07]=16:08:00:01:00:00:26:07
 8.383143 e100_configure: [08-15]=01:00:2E:00:60:00:F2:48
 8.383143 e100_configure: [16-23]=00:40:FA:86:3F:05:00:00
 8.383143 e100_intr: stat_ack = 0x20
 8.386435 e100_rx_indicate: status=0x0000
 8.390425 e100_tx_clean: cb[7]->status = 0xA000
 8.393667 e100_tx_clean: cb[8]->status = 0xA000
 8.398399 e100_intr: stat_ack = 0x20
 8.403041 e100_rx_indicate: status=0x0000
 8.407033 e100_set_multicast_list: mc_count=1, flags=0x1003
 8.410971 e100_configure: [00-07]=16:08:00:01:00:00:26:07
 8.414961 e100_configure: [08-15]=01:00:2E:00:60:00:F2:48
 8.414961 e100_configure: [16-23]=00:40:FA:86:3F:05:00:00
 8.414961 e100_intr: stat_ack = 0x20
 8.418260 e100_rx_indicate: status=0x0000
 8.422249 e100_tx_clean: cb[9]->status = 0xA000
 8.425492 e100_tx_clean: cb[10]->status = 0xA000
 8.430746 e100_intr: stat_ack = 0x20
 8.435388 e100_rx_indicate: status=0x0000
10.567601 e100_watchdog: right now = -74046
10.571931 mdio_ctrl: READ:addr=1, reg=4, data_in=0x0000, data_out=0x182405E1
10.577416 mdio_ctrl: READ:addr=1, reg=0, data_in=0x0000, data_out=0x18203000
10.582911 mdio_ctrl: READ:addr=1, reg=5, data_in=0x0000, data_out=0x182541E1
10.586862 mdio_ctrl: READ:addr=1, reg=1, data_in=0x0000, data_out=0x18217829
10.589877 mdio_ctrl: READ:addr=1, reg=1, data_in=0x0000, data_out=0x1821782D
10.595317 NIC Link is Up 100 Mbps Full Duplex


^ permalink raw reply

* Re: N_PPP_SYNC ldisc BUG: sleeping function called from invalid context
From: Jarek Poplawski @ 2009-09-30 20:28 UTC (permalink / raw)
  To: Tilman Schmidt; +Cc: Alan Cox, linux-kernel, netdev, Alan Cox
In-Reply-To: <4AC3A986.4080808@imap.cc>

Tilman Schmidt wrote, On 09/30/2009 08:55 PM:

> Alan Cox schrieb:
>>>  [<c026d39b>] tty_unthrottle+0x10/0x38
>>>  [<f8dcc31f>] ppp_sync_receive+0x168/0x170 [ppp_synctty]
>>>  [<f8fbb9ce>] handle_minor_recv+0x187/0x1cd [capi]
>>>  [<f8fbc19b>] capi_recv_message+0x1d9/0x24e [capi]
>> Really need to see the rest of the call trace to be sure
> 
> There wasn't more than what I posted. I had six of them, they looked all
> identical, and all of them ended after the kernel_thread_helper line. 
...
> Why then does mutex_lock() complain?


Maybe it doesn't matter here, but this: 

> INFO: lockdep is turned off.
suggests there was some lockdep issue/warning earlier.


Jarek P.

^ permalink raw reply

* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
From: Gregory Haskins @ 2009-09-30 20:04 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm,
	linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze,
	alacrityvm-devel
In-Reply-To: <4ABF33B2.4000805@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 35235 bytes --]

Avi Kivity wrote:
> On 09/26/2009 12:32 AM, Gregory Haskins wrote:
>>>>
>>>> I realize in retrospect that my choice of words above implies vbus _is_
>>>> complete, but this is not what I was saying.  What I was trying to
>>>> convey is that vbus is _more_ complete.  Yes, in either case some kind
>>>> of glue needs to be written.  The difference is that vbus implements
>>>> more of the glue generally, and leaves less required to be customized
>>>> for each iteration.
>>>>
>>>>        
>>>
>>> No argument there.  Since you care about non-virt scenarios and virtio
>>> doesn't, naturally vbus is a better fit for them as the code stands.
>>>      
>> Thanks for finally starting to acknowledge there's a benefit, at least.
>>    
> 
> I think I've mentioned vbus' finer grained layers as helpful here,
> though I doubt the value of this.  Hypervisors are added rarely, while
> devices and drivers are added (and modified) much more often.  I don't
> buy the anything-to-anything promise.

The ease with which a new hypervisor should be able to integrate into the
stack is only one of vbus's many benefits.

> 
>> To be more precise, IMO virtio is designed to be a performance oriented
>> ring-based driver interface that supports all types of hypervisors (e.g.
>> shmem based kvm, and non-shmem based Xen).  vbus is designed to be a
>> high-performance generic shared-memory interconnect (for rings or
>> otherwise) framework for environments where linux is the underpinning
>> "host" (physical or virtual).  They are distinctly different, but
>> complementary (the former addresses the part of the front-end, and
>> latter addresses the back-end, and a different part of the front-end).
>>    
> 
> They're not truly complementary since they're incompatible.

No, that is incorrect.  Not to be rude, but for clarity:

  Complementary \Com`ple*men"ta*ry\, a.
     Serving to fill out or to complete; as, complementary
     numbers.
     [1913 Webster]

Citation: www.dict.org

IOW: Something being complementary has nothing to do with guest/host
binary compatibility.  virtio-pci and virtio-vbus are both equally
complementary to virtio since they fill in the bottom layer of the
virtio stack.

So yes, vbus is truly complementary to virtio afaict.

> A 2.6.27 guest, or Windows guest with the existing virtio drivers, won't work
> over vbus.

Binary compatibility with existing virtio drivers, while nice to have,
is not a specific requirement nor goal.  We will simply load an updated
KMP/MSI into those guests and they will work again.  As previously
discussed, this is how more or less any system works today.  It's like
we are removing an old adapter card and adding a new one to "uprev the
silicon".

>  Further, non-shmem virtio can't work over vbus.

Actually I misspoke earlier when I said virtio works over non-shmem.
Thinking about it some more, both virtio and vbus fundamentally require
shared-memory, since sharing their metadata concurrently on both sides
is their raison d'être.

The difference is that virtio utilizes a pre-translation/mapping (via
->add_buf) from the guest side.  OTOH, vbus uses a post translation
scheme (via memctx) from the host-side.  If anything, vbus is actually
more flexible because it doesn't assume the entire guest address space
is directly mappable.

In summary, your statement is incorrect (though it is my fault for
putting that idea in your head).

>  Since
> virtio is guest-oriented and host-agnostic, it can't ignore
> non-shared-memory hosts (even though it's unlikely virtio will be
> adopted there)

Well, to be fair no one said it has to ignore them.  Either virtio-vbus
transport is present and available to the virtio stack, or it isn't.  If
it's present, it may or may not publish objects for consumption.
Providing a virtio-vbus transport in no way limits or degrades the
existing capabilities of the virtio stack.  It only enhances them.

I digress.  The whole point is moot since I realized that the non-shmem
distinction isn't accurate anyway.  They both require shared-memory for
the metadata, and IIUC virtio requires the entire address space to be
mappable whereas vbus only assumes the metadata is.

> 
>> In addition, the kvm-connector used in AlacrityVM's design strives to
>> add value and improve performance via other mechanisms, such as dynamic
>>   allocation, interrupt coalescing (thus reducing exit-ratio, which is a
>> serious issue in KVM)
> 
> Do you have measurements of inter-interrupt coalescing rates (excluding
> intra-interrupt coalescing).

I actually do not have a rig setup to explicitly test inter-interrupt
rates at the moment.  Once things stabilize for me, I will try to
re-gather some numbers here.  Last time I looked, however, there were
some decent savings for inter as well.

Inter rates are interesting because they are what tends to ramp up with
IO load more than intra since guest interrupt mitigation techniques like
NAPI often quell intra-rates naturally.  This is especially true for
data-center, cloud, hpc-grid, etc, kind of workloads (vs vanilla
desktops, etc) that tend to have multiple IO ports (multi-homed nics,
disk-io, etc).  Those various ports tend to be workload-related to one
another (e.g. 3-tier web stack may use multi-homed network and disk-io
at the same time, triggered by one IO event).

An interesting thing here is that you don't even need a fancy
multi-homed setup to see the effects of my exit-ratio reduction work:
even single port configurations suffer from the phenomenon since many
devices have multiple signal-flows (e.g. network adapters tend to have
at least 3 flows: rx-ready, tx-complete, and control-events (link-state,
etc.)).  What's worse is that the flows often are indirectly related (for
instance, many host adapters will free tx skbs during rx operations, so
you tend to get bursts of tx-completes at the same time as rx-ready).  If
the flows map 1:1 with IDT, they will suffer the same problem.

In any case, here is an example run of a simple single-homed guest over
standard GigE.  What's interesting here is the .qnotify to .notify
ratio, as this is the interrupt-to-signal ratio.  In this case, it's
170047/151918, which comes out to about 11% savings in interrupt injections:

vbus-guest:/home/ghaskins # netperf -H dev
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
dev.laurelwood.net (192.168.1.10) port 0 AF_INET
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

1048576  16384  16384    10.01     940.77
vbus-guest:/home/ghaskins # cat /sys/kernel/debug/pci-to-vbus-bridge
  .events                        : 170048
  .qnotify                       : 151918
  .qinject                       : 0
  .notify                        : 170047
  .inject                        : 18238
  .bridgecalls                   : 18
  .buscalls                      : 12
vbus-guest:/home/ghaskins # cat /proc/interrupts
            CPU0
   0:         87   IO-APIC-edge      timer
   1:          6   IO-APIC-edge      i8042
   4:        733   IO-APIC-edge      serial
   6:          2   IO-APIC-edge      floppy
   7:          0   IO-APIC-edge      parport0
   8:          0   IO-APIC-edge      rtc0
   9:          0   IO-APIC-fasteoi   acpi
  10:          0   IO-APIC-fasteoi   virtio1
  12:         90   IO-APIC-edge      i8042
  14:       3041   IO-APIC-edge      ata_piix
  15:       1008   IO-APIC-edge      ata_piix
  24:     151933   PCI-MSI-edge      vbus
  25:          0   PCI-MSI-edge      virtio0-config
  26:        190   PCI-MSI-edge      virtio0-input
  27:         28   PCI-MSI-edge      virtio0-output
 NMI:          0   Non-maskable interrupts
 LOC:       9854   Local timer interrupts
 SPU:          0   Spurious interrupts
 CNT:          0   Performance counter interrupts
 PND:          0   Performance pending work
 RES:          0   Rescheduling interrupts
 CAL:          0   Function call interrupts
 TLB:          0   TLB shootdowns
 TRM:          0   Thermal event interrupts
 THR:          0   Threshold APIC interrupts
 MCE:          0   Machine check exceptions
 MCP:          1   Machine check polls
 ERR:          0
 MIS:          0

It's important to note here that we are actually looking at the interrupt
rate, not the exit rate (which is usually a multiple of the interrupt
rate, since you have to factor in as many as three exits per interrupt
(IPI, window, EOI)).  Therefore we saved about 18k interrupts in this 10
second burst, but we may have actually saved up to 54k exits in the
process. This is only over a 10 second window at GigE rates, so YMMV.
These numbers get even more dramatic on higher end hardware, but I
haven't had a chance to generate new numbers yet.

Looking at some external stats paints an even bleaker picture: "exits"
as reported by kvm_stat for virtio-pci based virtio-net tip the scales
at 65k/s vs 36k/s for vbus based venet.  And virtio is consuming ~30% of
my quad-core's cpu, vs 19% for venet during the test.  It's hard to know
which innovation or innovations may be responsible for the entire
reduction, but certainly the interrupt-to-signal ratio mentioned above
is probably helping.

The even worse news for 1:1 models is that the ratio of
exits-per-interrupt climbs with load (exactly when it hurts the most)
since that is when the probability that the vcpu will need all three
exits is the highest.

> 
>> and prioritizable/nestable signals.
>>    
> 
> That doesn't belong in a bus.

Everyone is of course entitled to an opinion, but the industry as a
whole would disagree with you.  Signal path routing (1:1, aggregated,
etc) is at the discretion of the bus designer.  Most buses actually do
_not_ support 1:1 with IDT (think USB, SCSI, IDE, etc).

PCI is somewhat of an outlier in that regard afaict.  It's actually a
nice feature of PCI when it's used within its design spec (HW).  For
SW/PV, 1:1 suffers from, among other issues, that "triple-exit scaling"
issue in the signal path I mentioned above.  This is one of the many
reasons I think PCI is not the best choice for PV.

> 
>> Today there is a large performance disparity between what a KVM guest
>> sees and what a native linux application sees on that same host.  Just
>> take a look at some of my graphs between "virtio", and "native", for
>> example:
>>
>> http://developer.novell.com/wiki/images/b/b7/31-rc4_throughput.png
>>    
> 
> That's a red herring.  The problem is not with virtio as an ABI, but
> with its implementation in userspace.  vhost-net should offer equivalent
> performance to vbus.

That's pure speculation.  I would advise you to reserve such statements
until after a proper bakeoff can be completed.  This is not to mention
that vhost-net does nothing to address our other goals, like scheduler
coordination and non-802.x fabrics.

> 
>> A dominant vbus design principle is to try to achieve the same IO
>> performance for all "linux applications" whether they be literally
>> userspace applications, or things like KVM vcpus or Ira's physical
>> boards.  It also aims to solve problems not previously expressible with
>> current technologies (even virtio), like nested real-time.
>>
>> And even though you repeatedly insist otherwise, the neat thing here is
>> that the two technologies mesh (at least under certain circumstances,
>> like when virtio is deployed on a shared-memory friendly linux backend
>> like KVM).  I hope that my stack diagram below depicts that clearly.
>>    
> 
> Right, when you ignore the points where they don't fit, it's a perfect
> mesh.

Where doesn't it fit?

> 
>>> But that's not a strong argument for vbus; instead of adding vbus you
>>> could make virtio more friendly to non-virt
>>>      
>> Actually, it _is_ a strong argument then because adding vbus is what
>> helps make virtio friendly to non-virt, at least for when performance
>> matters.
>>    
> 
> As vhost-net shows, you can do that without vbus

Citation please.  Afaict, the one use case that we looked at for vhost
outside of KVM failed to adapt properly, so I do not see how this is true.

> and without breaking compatibility.

Compatibility with what?  vhost hasn't even been officially deployed in
KVM environments afaict, never mind non-virt.  Therefore, how could it
possibly have compatibility constraints with something non-virt already?
 Citation please.

> 
> 
> 
>>> Right.  virtio assumes that it's in a virt scenario and that the guest
>>> architecture already has enumeration and hotplug mechanisms which it
>>> would prefer to use.  That happens to be the case for kvm/x86.
>>>      
>> No, virtio doesn't assume that.  Its stack provides the "virtio-bus"
>> abstraction and what it does assume is that it will be wired up to
>> something underneath. Kvm/x86 conveniently has pci, so the virtio-pci
>> adapter was created to reuse much of that facility.  For other things
>> like lguest and s390, something new had to be created underneath to make
>> up for the lack of pci-like support.
>>    
> 
> Right, I was wrong there.  But it does allow you to have a 1:1 mapping
> between native devices and virtio devices.

vbus allows you to have 1:1 if that is what you want, but we strive to
do better.

> 
> 
>>>> So to answer your question, the difference is that the part that has to
>>>> be customized in vbus should be a fraction of what needs to be
>>>> customized with vhost because it defines more of the stack.
>>>>        
>>> But if you want to use the native mechanisms, vbus doesn't have any
>>> added value.
>>>      
>> First of all, that's incorrect.  If you want to use the "native"
>> mechanisms (via the way the vbus-connector is implemented, for instance)
>> you at least still have the benefit that the backend design is more
>> broadly re-useable in more environments (like non-virt, for instance),
>> because vbus does a proper job of defining the requisite
>> layers/abstractions compared to vhost.  So it adds value even in that
>> situation.
>>    
> 
> Maybe.  If vhost-net isn't sufficient I'm sure there will be patches sent.

It isn't, and I've already done that.

> 
>> Second of all, with PV there is no such thing as "native".  It's
>> software so it can be whatever we want.  Sure, you could argue that the
>> guest may have built-in support for something like PCI protocol.

[1]

>> However, PCI protocol itself isn't suitable for high-performance PV out
>> of the can.  So you will therefore invariably require new software
>> layers on top anyway, even if part of the support is already included.
>>    
> 
> Of course there is such a thing as native, a pci-ready guest has tons of
> support built into it

I specifically mentioned that already ([1]).

You are also overstating its role, since the basic OS is what implements
the native support for bus-objects, hotswap, etc, _not_ PCI.  PCI just
rides underneath and feeds trivial events up, as do other bus-types
(usb, scsi, vbus, etc).  And once those events are fed, you still need a
PV layer to actually handle the bus interface in a high-performance
manner so it's not like you really have a "native" stack in either case.

> that doesn't need to be retrofitted.

No, that is incorrect.  You have to heavily modify the pci model with
layers on top to get any kind of performance out of it.  Otherwise, we
would just use realtek emulation, which is technically the native PCI
you are apparently so enamored with.

Not to mention there are things you just plain can't do in PCI today,
like dynamically assign signal-paths, priority, and coalescing, etc.

> Since
> practically everyone (including Xen) does their paravirt drivers atop
> pci, the claim that pci isn't suitable for high performance is incorrect.

Actually IIUC, I think Xen bridges to their own bus as well (and only
where they have to), just like vbus.  They don't use PCI natively.  PCI
is perfectly suited as a bridge transport for PV, as I think the Xen and
vbus examples have demonstrated.  It's the 1:1 device-model where PCI has
the most problems.

> 
> 
>> And lastly, why would you _need_ to use the so called "native"
>> mechanism?  The short answer is, "you don't".  Any given system (guest
>> or bare-metal) already has a wide range of buses (try running "tree
>> /sys/bus" in Linux).  More importantly, the concept of adding new buses
>> is widely supported in both the Windows and Linux driver model (and
>> probably any other guest-type that matters).  Therefore, despite claims
>> to the contrary, it's not hard or even unusual to add a new bus to the
>> mix.
>>    
> 
> The short answer is "compatibility".

There was a point in time where the same could be said for virtio-pci
based drivers vs realtek and e1000, so that argument is demonstrably
silly.  No one tried to make virtio work in a binary compatible way with
realtek emulation, yet we all survived the requirement for loading a
virtio driver to my knowledge.

The bottom line is: Binary device compatibility is not required in any
other system (as long as you follow sensible versioning/id rules), so
why is KVM considered special?

The fact is, it isn't special (at least not in this regard).  What _is_
required is "support" and we fully intend to support these proposed
components.  I assure you that at least the users that care about
maximum performance will not generally mind loading a driver.  Most of
them would have to anyway if they want to get beyond realtek emulation.

> 
> 
>> In summary, vbus is simply one more bus of many, purpose built to
>> support high-end IO in a virt-like model, giving controlled access to
>> the linux-host underneath it.  You can write a high-performance layer
>> below the OS bus-model (vbus), or above it (virtio-pci) but either way
>> you are modifying the stack to add these capabilities, so we might as
>> well try to get this right.
>>
>> With all due respect, you are making a big deal out of a minor issue.
>>    
> 
> It's not minor to me.

I am certainly in no position to tell you how to feel, but this
declaration would seem from my perspective to be more of a means to an
end than a legitimate concern.  Otherwise we would never have had virtio
support in the first place, since it was not "compatible" with previous
releases.

> 
>>>> And, as
>>>> alluded to in my diagram, both virtio-net and vhost (with some
>>>> modifications to fit into the vbus framework) are potentially
>>>> complementary, not competitors.
>>>>
>>>>        
>>> Only theoretically.  The existing installed base would have to be thrown
>>> away
>>>      
>> "Thrown away" is pure hyperbole.  The installed base, worst case, needs
>> to load a new driver for a missing device.
> 
> Yes, we all know how fun this is.

Making systems perform 5x faster _is_ fun, yes.  I love what I do for a
living.

>  Especially if the device changed is your boot disk.

If and when that becomes a priority concern, that would be a function
transparently supported in the BIOS shipped with the hypervisor, and
would thus be invisible to the user.

>  You may not care about the pain caused to users, but I do, so I will
> continue to insist on compatibility.

No, you are incorrect on two counts.

1) Of course I care about pain to users or I wouldn't be funded.  Right
now the pain from my perspective is caused to users in the
high-performance community who want to deploy KVM based solutions.  They
are unable to do so due to its performance disparity compared to
bare-metal, outside of pass-through hardware which is not widely
available in a lot of existing deployments.  I aim to fix that disparity
while reusing the existing hardware investment by writing smarter
software, and I assure you that these users won't mind loading a driver
in the guest to take advantage of it.

For the users that don't care about maximum performance, there is no
change (and thus zero pain) required.  They can use realtek or virtio if
they really want to.  Neither is going away to my knowledge, and let's
face it: 2.6Gb/s out of virtio to userspace isn't *that* bad.  But "good
enough" isn't good enough, and I won't rest till we get to native
performance.  Additionally, I want to support previously unavailable
modes of operations (e.g. real-time) and advanced fabrics (e.g. IB).

2) True pain to users is not caused by lack of binary compatibility.
It's caused by lack of support.  And it's a good thing or we would all be
emulating 8086 architecture forever...

..oh wait, I guess we kind of do that already ;).  But at least we can
slip in something more advanced once in a while (APIC vs PIC, USB vs
uart, iso9660 vs floppy, for instance) and update the guest stack
instead of insisting it must look like ISA forever for compatibility's sake.

> 
>>> or we'd need to support both.
>>>
>>>
>>>      
>> No matter what model we talk about, there's always going to be a "both"
>> since the userspace virtio models are probably not going to go away (nor
>> should they).
>>    
> 
> virtio allows you to have userspace-only, kernel-only, or
> start-with-userspace-and-move-to-kernel-later, all transparent to the
> guest.  In many cases we'll stick with userspace-only.

The user will not care where the model lives, per se.  Only that it is
supported, and it works well.

Likewise, I know from experience that the developer will not like
writing the same code twice, so the "runs in both" model is not
necessarily a great design trait either.

> 
>>> All this is after kvm has decoded that vbus is addressed.  It can't work
>>> without someone outside vbus deciding that.
>>>      
>> How the connector message is delivered is really not relevant.  Some
>> architectures will simply deliver the message point-to-point (like the
>> original hypercall design for KVM, or something like Ira's rig), and
>> some will need additional demuxing (like pci-bridge/pio based KVM).
>> It's an implementation detail of the connector.
>>
>> However, the real point here is that something needs to establish a
>> scoped namespace mechanism, add items to that namespace, and advertise
>> the presence of the items to the guest.  vbus has this facility built in
>> to its stack.  vhost doesn't, so it must come from elsewhere.
>>    
> 
> So we have: vbus needs a connector, vhost needs a connector.  vbus
> doesn't need userspace to program the addresses (but does need userspace
> to instantiate the devices and to program the bus address decode)

First of all, bus-decode is substantially easier than per-device decode
(you have to track all those per-device/per-signal fds somewhere,
integrate with hotswap, etc), and it's only done once per guest at
startup and left alone.  So it's already not apples to apples.

Second, while it's true that the general kvm-connector bus-decode needs
to be programmed,  that is a function of adapting to the environment
that _you_ created for me.  The original kvm-connector was discovered
via cpuid and hypercalls, and didn't need userspace at all to set it up.
 Therefore it would be entirely unfair of you to turn around and somehow
try to use that trait of the design against me since you yourself
imposed it.

As an additional data point, our other connectors have no such
bus-decode programming requirement.  Therefore, this is clearly
just a property of the KVM environment, not a function of the overall
vbus design.

> vhost needs userspace to instantiate the devices and program the addresses.
> 

Right.  And among other shortcomings it also requires a KVM-esque memory
model (which is not always going to work as we recently discussed), and
a redundant device-model to back it up in userspace, which is a
development and maintenance burden, and an external bus-model (filled by
pio-bus in KVM today).

>>>> In fact, it's actually a simpler design to unify things this way
>>>> because
>>>> you avoid splitting the device model up. Consider how painful the vhost
>>>> implementation would be if it didn't already have the userspace
>>>> virtio-net to fall-back on.  This is effectively what we face for new
>>>> devices going forward if that model is to persist.
>>>>
>>>>        
>>>
>>> It doesn't have just virtio-net, it has userspace-based hotplug
>>>      
>> vbus has hotplug too: mkdir and rmdir
>>    
> 
> Does that work from nonprivileged processes?

It will with the ioctl based control interface that I'll merge shortly.

>  Does it work on Windows?

This question doesn't make sense.  Hotswap control occurs on the host,
which is always Linux.

If you were asking about whether a windows guest will support hotswap:
the answer is "yes".  Our windows driver presents a unique PDO/FDO pair
for each logical device instance that is pushed out (just like the built
in usb, pci, scsi bus drivers that windows supports natively).

> 
>> As an added bonus, its device-model is modular.  A developer can write a
>> new device model, compile it, insmod it to the host kernel, hotplug it
>> to the running guest with mkdir/ln, and then come back out again
>> (hotunplug with rmdir, rmmod, etc).  They may do this all without taking
>> the guest down, and while eating QEMU based IO solutions for breakfast
>> performance wise.
>>
>> Afaict, qemu can't do either of those things.
>>    
> 
> We've seen that herring before,

Citation?

> and it's redder than ever.

This is more hyperbole.  I doubt that there would be many that would
argue that a modular architecture (that we get for free with LKM
support) is not desirable, even if it's never used dynamically with a
running guest.  OTOH, I actually use this dynamic feature all the time
as I test my components, so it's at least useful to me.

> 
> 
> 
>>> Refactor instead of duplicating.
>>>      
>> There is no duplicating.  vbus has no equivalent today as virtio doesn't
>> define these layers.
>>    
> 
> So define them if they're missing.

I just did.

> 
> 
>>>>
>>>>       
>>>>>    Use libraries (virtio-shmem.ko, libvhost.so).
>>>>>
>>>>>          
>>>> What do you suppose vbus is?  vbus-proxy.ko = virtio-shmem.ko, and you
>>>> don't need libvhost.so per se since you can just use standard kernel
>>>> interfaces (like configfs/sysfs).  I could create an .so going forward
>>>> for the new ioctl-based interface, I suppose.
>>>>
>>>>        
>>> Refactor instead of rewriting.
>>>      
>> There is no rewriting.  vbus has no equivalent today as virtio doesn't
>> define these layers.
>>
>> By your own admission, you said if you wanted that capability, use a
>> library.  What I think you are not understanding is vbus _is_ that
>> library.  So what is the problem, exactly?
>>    
> 
> It's not compatible.

No, that is incorrect.  What you are apparently not understanding is
that not only is vbus that library, but it's extensible.  So even if
compatibility is your goal (it doesn't need to be IMO) it can be
accommodated by how you interface to the library.

>  If you were truly worried about code duplication
> in virtio, you'd refactor it to remove the duplication,

My primary objective is creating an extensible, high-performance,
shared-memory interconnect for systems that utilize a Linux host as
their IO-hub.  It just so happens that virtio can sit nicely on top of
such a model because shmem-rings are a subclass of shmem.  As a result
of its design, vbus also helps to reduce code duplication in the stack
for new environments due to its extensible nature.

However, vbus also has goals beyond what virtio is providing today that
are of more concern, and part of that is designing a connector/bus that
eliminates the shortcomings in the current pci-based design.

> without affecting existing guests.

Already covered above.

> 
>>>>> For kvm/x86 pci definitely remains king.
>>>>>
>>>>>          
>>>> For full virtualization, sure.  I agree.  However, we are talking about
>>>> PV here.  For PV, PCI is not a requirement and is a technical dead-end
>>>> IMO.
>>>>
>>>> KVM seems to be the only virt solution that thinks otherwise (*), but I
>>>> believe that is primarily a condition of its maturity.  I aim to help
>>>> advance things here.
>>>>
>>>> (*) citation: xen has xenbus, lguest has lguest-bus, vmware has some
>>>> vmi-esque thing (I forget what it's called) to name a few.  Love 'em or
>>>> hate 'em, most other hypervisors do something along these lines.  I'd
>>>> like to try to create one for KVM, but to unify them all (at least for
>>>> the Linux-based host designs).
>>>>
>>>>        
>>> VMware are throwing VMI away (won't be supported in their new product,
>>> and they've sent a patch to rip it off from Linux);
>>>      
>> vmware only cares about x86 iiuc, so probably not a good example.
>>    
> 
> Well, you brought it up.  Between you and me, I only care about x86 too.

Fair enough.

> 
>>> Xen has to tunnel
>>> xenbus in pci for full virtualization (which is where Windows is, and
>>> where Linux will be too once people realize it's faster).  lguest is
>>> meant as an example hypervisor, not an attempt to take over the world.
>>>      
>> So pick any other hypervisor, and the situation is often similar.
>>    
> 
> The situation is often pci.

Even if that were true, which is debatable, do not confuse "convenient"
with "optimal".  If you don't care about maximum performance and
advanced features like QOS, sure go ahead and use PCI.  Why not.

> 
>>
>>> And right now you can have a guest using pci to access a mix of
>>> userspace-emulated devices, userspace-emulated-but-kernel-accelerated
>>> virtio devices, and real host devices.  All on one dead-end bus.  Try
>>> that with vbus.
>>>      
>> vbus is not interested in userspace devices.  The charter is to provide
>> facilities for utilizing the host linux kernel's IO capabilities in the
>> most efficient, yet safe, manner possible.  Those devices that fit
>> outside that charter can ride on legacy mechanisms if that suits them
>> best.
>>    
> 
> vbus isn't, but I am.  I would prefer not to have to expose
> implementation decisions (kernel vs userspace) to the guest (vbus vs pci).
> 
>>>> That won't cut it.  For one, creating an eventfd is only part of the
>>>> equation.  I.e. you need to have originate/terminate somewhere
>>>> interesting (and in-kernel, otherwise use tuntap).
>>>>
>>>>        
>>> vbus needs the same thing so it cancels out.
>>>      
>> No, it does not.  vbus just needs a relatively simple single message
>> pipe between the guest and host (think "hypercall tunnel", if you will).
>>    
> 
> That's ioeventfd.  So far so similar.

No, that is incorrect.  For one, vhost uses them on a per-signal path
basis, whereas vbus only has one channel for the entire guest->host.

Second, I do not use ioeventfd anymore because it has too many problems
with the surrounding technology.  However, that is a topic for a
different thread.


> 
>>   Per queue/device addressing is handled by the same conceptual namespace
>> as the one that would trigger eventfds in the model you mention.  And
>> that namespace is built in to the vbus stack, and objects are registered
>> automatically as they are created.
>>
>> Contrast that to vhost, which requires some other kernel interface to
>> exist, and to be managed manually for each object that is created.  Your
>> libvhostconfig would need to somehow know how to perform this
>> registration operation, and there would have to be something in the
>> kernel to receive it, presumably on a per platform basis.  Solving this
>> problem generally would probably end up looking eerily like vbus,
>> because that's what vbus does.
>>    
> 
> vbus devices aren't magically instantiated.  Userspace needs to
> instantiate them too.  Sure, there's less work on the host side since
> you're using vbus instead of the native interface, but more work on the
> guest side since you're using vbus instead of the native interface.


No, that is incorrect.  The amount of "work" that a guest does is
actually the same in both cases, since the guest OS performs the hotswap
handling natively for all bus types (at least for Linux and Windows).
You still need to have a PV layer to interface with those objects in
both cases, as well, so there is no such thing as "native interface" for
PV.  It's only a matter of where it occurs in the stack.

> 
> 
> 
>>> Well, let's see.  Can vbus today:
>>>
>>> - let userspace know which features are available (so it can decide if
>>> live migration is possible)
>>>      
>> yes, it's in sysfs.
>>
>>   
>>> - let userspace limit which features are exposed to the guest (so it can
>>> make live migration possible among hosts of different capabilities)
>>>      
>> yes, it's in sysfs.
>>    
> 
> Per-device?

Yes, see /sys/vbus/devices/$dev/ to get per-instance attributes

>  non-privileged-user capable?

The short answer is "not yet (I think)".  I need to write a patch to
properly set the mode attribute in sysfs, but I think this will be trivial.

> 
>>> - let userspace know which features were negotiated (so it can transfer
>>> them to the other host during live migration)
>>>      
>> no, but we can easily add ->save()/->restore() to the model going
>> forward, and the negotiated features are just a subcomponent of its
>> serialized stream.
>>
>>   
>>> - let userspace tell the kernel which features were negotiated (when
>>> live migration completes, to avoid requiring the guest to re-negotiate)
>>>      
>> that would be the function of the ->restore() deserializer.
>>
>>   
>>> - do all that from an unprivileged process
>>>      
>> yes, in the upcoming alacrityvm v0.3 with the ioctl based control plane.
>>    
> 
> Ah, so you have two control planes.

So what?  If anything, it goes to show how extensible the framework is
that a new plane could be added in 119 lines of code:

~/git/linux-2.6> stg show vbus-add-admin-ioctls.patch | diffstat
 Makefile       |    3 -
 config-ioctl.c |  117
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+), 1 deletion(-)

if and when having two control planes exceeds its utility, I will submit
a simple patch that removes the useless one.

> 
>> Bottom line: vbus isn't done, especially w.r.t. live-migration..but that
>> is not a valid argument against the idea if you believe in
>> release-early/release-often. kvm wasn't (isn't) done either when it was
>> proposed/merged.
>>
>>    
> 
> kvm didn't have an existing counterpart in Linux when it was
> proposed/merged.
> 

And likewise, neither does vbus.

Kind Regards,
-Greg










[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 267 bytes --]

^ permalink raw reply

* Re: [PATCH] ipvs: Add boundary check on ioctl arguments
From: Julian Anastasov @ 2009-09-30 19:41 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Hannes Eder, Wensong Zhang, netdev, linux-kernel, Simon Horman
In-Reply-To: <20090930171833.5ce0011d@infradead.org>


	Hello,

On Wed, 30 Sep 2009, Arjan van de Ven wrote:

> fair enough; updated patch below
> 
> >From 28ae217858e683c0c94c02219d46a9a9c87f61c6 Mon Sep 17 00:00:00 2001
> From: Arjan van de Ven <arjan@linux.intel.com>
> Date: Wed, 30 Sep 2009 13:05:51 +0200
> Subject: [PATCH] ipvs: Add boundary check on ioctl arguments
> 
> The ipvs code has a nifty system for doing the size of ioctl command copies;
> it defines an array with values into which it indexes the cmd to find the
> right length.
> 
> Unfortunately, the ipvs code forgot to check if the cmd was in the range
> that the array provides, allowing for an index outside of the array,
> which then gives a "garbage" result into the length, which then gets
> used for copying into a stack buffer.

	do_ip_vs_get_ctl and do_ip_vs_set_ctl are nf_sockopt_ops
handlers, so the range is checked by nf_sockopt_find() in Netfilter
code. get_arglen[] and set_arglen[] are minimum values for
the length and they can be 0. Later len can be checked
additionally and surely can exceed 128 (include/linux/ip_vs.h has
all user structures). Can you show the exact cmd and len
used, maybe there is an error in some command or maybe the
provided user structure is wrong?

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply

* Re: [PATCH] connector: Allow permission checking in the receiver callbacks
From: Evgeniy Polyakov @ 2009-09-30 19:29 UTC (permalink / raw)
  To: Lars Ellenberg; +Cc: Philipp Reisner, linux-kernel, netdev, Andrew Morton
In-Reply-To: <20090930132034.GE8032@barkeeper1-xen.linbit>

On Wed, Sep 30, 2009 at 03:20:35PM +0200, Lars Ellenberg (lars.ellenberg@linbit.com) wrote:
> Actually it is the basis for follow-up security fixes.
> 
> Without this, unprivileged user space is able to send arbitrary
> connector requests to kernel subsystems, which have no way to verify the
> privileges of the sender anymore, because that information, even though
> available at the netlink layer, has been dropped by the connector.

It is not. One can add some checks at receiving time which happens in
process context to get its credentials, but nothing in netlink itself
carries this info. Given that connector schedules a workqueue, this ability
is lost.

> Once this is applied, the various in-kernel receiving connector
> callbacks can (and need to) add cap_raised(nsb->eff_cap, cap) where
> appropriate. For example, you don't want some guest user to be able to
> trigger a dst_del_node callback by sending a crafted netlink message,
> right?
> 
> So it _is_ a (design-) bug fix.
> Or am I missing something?

This patchset is not a bugfix, just a cleanup, since none in patchset
uses netlink_skb_parms and currently I see no users which are affected
by this behaviour in the mainline branch (not counting staging tree).

But if proposed configuration changes for DM are on the way, then I
agree and they should force this patchset into the tree as a bugfix.

-- 
	Evgeniy Polyakov

^ permalink raw reply

* Re: N_PPP_SYNC ldisc BUG: sleeping function called from invalid context
From: Tilman Schmidt @ 2009-09-30 18:55 UTC (permalink / raw)
  To: Alan Cox; +Cc: linux-kernel, netdev, Alan Cox
In-Reply-To: <20090930174704.796b24b9@lxorguk.ukuu.org.uk>

[-- Attachment #1: Type: text/plain, Size: 2412 bytes --]

Alan Cox schrieb:
>>  [<c026d39b>] tty_unthrottle+0x10/0x38
>>  [<f8dcc31f>] ppp_sync_receive+0x168/0x170 [ppp_synctty]
>>  [<f8fbb9ce>] handle_minor_recv+0x187/0x1cd [capi]
>>  [<f8fbc19b>] capi_recv_message+0x1d9/0x24e [capi]
> 
> Really need to see the rest of the call trace to be sure

There wasn't more than what I posted. I had six of them, they looked all
identical, and all of them ended after the kernel_thread_helper line. 

>> Turns out the ppp_sync_receive() function (drivers/net/ppp_synctty.c
>> line 385ff.) has a comment in front stating:
>>
>> /*
>>  * This can now be called from hard interrupt level as well
>>  * as soft interrupt level or mainline.
>>  */
> 
> Which is wrong. The flip_buffer_push -> rx processing path should never
> be called from IRQ context and that was fixed for various drivers that
> mis-set tty->low_latency, as well as in the PPP rework. The PPP case is
> actually unrelated in many ways.

Might be worth correcting that text then before it misleads someone.

>> Opinions?
> 
> See how we got into that code direct from an IRQ path. The expectation of
> the tty logic is that it gets processed from work queues either
> specifically in driver or via tty_flip_buffer_push when tty->low_latency
> = 0

I'm at a loss here. According to all the backtraces:

- ppp_sync_receive() was called, as the LD's receive_buf method,
  via handle_recv_skb() [drivers/isdn/capi/capi.c line 504, inlined]
  from handle_minor_recv() [drivers/isdn/capi/capi.c line 519]

- handle_minor_recv() was called from capi_recv_message()
  [drivers/isdn/capi/capi.c line 656]

- capi_recv_message() was called, as the CAPI application's
  recv_message method, from recv_handler()
  [drivers/isdn/capi/kcapi.c line 268]

- recv_handler() is never called directly. It's only scheduled
  via the work queue ap->recv_work from capi_ctr_handle_message()
  [drivers/isdn/capi/kcapi.c line 349]

Even if we don't trust the backtraces, there's not much room for
another activation path. So for all I know, the expectation of the
tty logic should have been met. The call was indeed processed from
a work queue.

Why then does mutex_lock() complain?

-- 
Tilman Schmidt                    E-Mail: tilman@imap.cc
Bonn, Germany
Diese Nachricht besteht zu 100% aus wiederverwerteten Bits.
Ungeöffnet mindestens haltbar bis: (siehe Rückseite)


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 254 bytes --]

^ permalink raw reply

* Re: [PATCH] net: fix NOHZ: local_softirq_pending 08
From: John W. Linville @ 2009-09-30 18:47 UTC (permalink / raw)
  To: Oliver Hartkopp
  Cc: David Miller, Johannes Berg, Michael Buesch, Kalle Valo,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <4AC3A0F1.3060306-fJ+pQTUTwRTk1uMJSBkQmQ@public.gmane.org>

On Wed, Sep 30, 2009 at 08:18:25PM +0200, Oliver Hartkopp wrote:
> Socket buffers that are generated and received inside softirqs or from process
> context must not use netif_rx() that's intended to be used from irq context only.
> 
> This patch introduces a new helper function netif_rx_ti(skb) that tests for
> in_interrupt() before invoking netif_rx() or netif_rx_ni().
> 
> It fixes the ratelimited kernel warning
> 
>         NOHZ: local_softirq_pending 08
> 
> in the mac80211 and can subsystems.
> 
> Signed-off-by: Oliver Hartkopp <oliver-fJ+pQTUTwRTk1uMJSBkQmQ@public.gmane.org>

http://bugzilla.kernel.org/show_bug.cgi?id=14278

Acked-by: John W. Linville <linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>

-- 
John W. Linville		Someday the world will need a hero, and you
linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org			might be all we have.  Be ready.
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH] net: fix NOHZ: local_softirq_pending 08
From: Oliver Hartkopp @ 2009-09-30 18:18 UTC (permalink / raw)
  To: David Miller
  Cc: Johannes Berg, Michael Buesch, Kalle Valo, John W. Linville,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <4AC39A90.6060602-fJ+pQTUTwRTk1uMJSBkQmQ@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 505 bytes --]

Socket buffers that are generated and received inside softirqs or from process
context must not use netif_rx() that's intended to be used from irq context only.

This patch introduces a new helper function netif_rx_ti(skb) that tests for
in_interrupt() before invoking netif_rx() or netif_rx_ni().

It fixes the ratelimited kernel warning

        NOHZ: local_softirq_pending 08

in the mac80211 and can subsystems.

Signed-off-by: Oliver Hartkopp <oliver-fJ+pQTUTwRTk1uMJSBkQmQ@public.gmane.org>

---




[-- Attachment #2: net-NOHZ-local_softirq_pending-08.patch --]
[-- Type: text/x-patch, Size: 4031 bytes --]

diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c
index 80ac563..899f3d3 100644
--- a/drivers/net/can/vcan.c
+++ b/drivers/net/can/vcan.c
@@ -80,7 +80,7 @@ static void vcan_rx(struct sk_buff *skb, struct net_device *dev)
 	skb->dev       = dev;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-	netif_rx_ni(skb);
+	netif_rx_ti(skb);
 }
 
 static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94958c1..dc8dfb2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1509,6 +1509,19 @@ extern int		netdev_budget;
 extern void netdev_run_todo(void);
 
 /**
+ *	netif_rx_ti - test for irq context and post buffer to the network code
+ *	@skb: buffer to post
+ *
+ */
+static inline int netif_rx_ti(struct sk_buff *skb)
+{
+	if (in_interrupt())
+		return netif_rx(skb);
+	else
+		return netif_rx_ni(skb);
+}
+
+/**
  *	dev_put - release reference to device
  *	@dev: network device
  *
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 6068321..c21e7f4 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -199,8 +199,6 @@ static int can_create(struct net *net, struct socket *sock, int protocol)
  * @skb: pointer to socket buffer with CAN frame in data section
  * @loop: loopback for listeners on local CAN sockets (recommended default!)
  *
- * Due to the loopback this routine must not be called from hardirq context.
- *
  * Return:
  *  0 on success
  *  -ENETDOWN when the selected interface is down
@@ -280,7 +278,7 @@ int can_send(struct sk_buff *skb, int loop)
 	}
 
 	if (newskb)
-		netif_rx_ni(newskb);
+		netif_rx_ti(newskb);
 
 	/* update statistics */
 	can_stats.tx_frames++;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 5608f6c..bbcb4cb 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -606,7 +606,7 @@ static void ieee80211_send_layer2_update(struct sta_info *sta)
 	skb->dev = sta->sdata->dev;
 	skb->protocol = eth_type_trans(skb, sta->sdata->dev);
 	memset(skb->cb, 0, sizeof(skb->cb));
-	netif_rx(skb);
+	netif_rx_ti(skb);
 }
 
 static void sta_apply_parameters(struct ieee80211_local *local,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 797f539..1109f99 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -591,7 +591,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 				skb2 = skb_clone(skb, GFP_ATOMIC);
 				if (skb2) {
 					skb2->dev = prev_dev;
-					netif_rx(skb2);
+					netif_rx_ti(skb2);
 				}
 			}
 
@@ -600,7 +600,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
 	}
 	if (prev_dev) {
 		skb->dev = prev_dev;
-		netif_rx(skb);
+		netif_rx_ti(skb);
 		skb = NULL;
 	}
 	rcu_read_unlock();
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index c01588f..5bb7c04 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -309,7 +309,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 			skb2 = skb_clone(skb, GFP_ATOMIC);
 			if (skb2) {
 				skb2->dev = prev_dev;
-				netif_rx(skb2);
+				netif_rx_ti(skb2);
 			}
 		}
 
@@ -320,7 +320,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 
 	if (prev_dev) {
 		skb->dev = prev_dev;
-		netif_rx(skb);
+		netif_rx_ti(skb);
 	} else
 		dev_kfree_skb(skb);
 
@@ -1349,7 +1349,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
 			/* deliver to local stack */
 			skb->protocol = eth_type_trans(skb, dev);
 			memset(skb->cb, 0, sizeof(skb->cb));
-			netif_rx(skb);
+			netif_rx_ti(skb);
 		}
 	}
 
@@ -1943,7 +1943,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx)
 			skb2 = skb_clone(skb, GFP_ATOMIC);
 			if (skb2) {
 				skb2->dev = prev_dev;
-				netif_rx(skb2);
+				netif_rx_ti(skb2);
 			}
 		}
 
@@ -1954,7 +1954,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx)
 
 	if (prev_dev) {
 		skb->dev = prev_dev;
-		netif_rx(skb);
+		netif_rx_ti(skb);
 		skb = NULL;
 	} else
 		goto out_free_skb;

^ permalink raw reply related

* Re: mac80211: NOHZ: local_softirq_pending 08
From: Oliver Hartkopp @ 2009-09-30 17:51 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Michael Buesch, Kalle Valo, John W. Linville,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1254324077.3959.7.camel-YfaajirXv2244ywRPIzf9A@public.gmane.org>

Johannes Berg wrote:
> On Wed, 2009-09-30 at 17:10 +0200, Michael Buesch wrote:
>> On Wednesday 30 September 2009 16:54:26 Johannes Berg wrote:
>>> On Wed, 2009-09-30 at 17:47 +0300, Kalle Valo wrote:
>>>
>>>> I agree with Michael. The bug is real and I have verified that
>>>> Michael's patch fixes the issue. Better to apply the patch now, it's
>>>> trivial to change the implementation if/when the network stack has
>>>> support for this.
>>> FWIW, I think in mac80211 the in_interrupt() check can never return true
>>> since we postpone all RX to the tasklet. But the tasklet seems to be ok
>>> -- so should it really be in_interrupt()?
>> I think a tasklet is also in_interrupt(), because it's a softirq.
> 
> Ah, yes, indeed, in_interrupt() vs. in_irq().
> 

Oops!

I missed that in the previous patch I added for two occurrences in the CAN
sources.

I'm currently compiling the patch for netif_rx_ti() and will post it in a few
minutes (for CAN and mac80211) once it runs without problems.

Regards,
Oliver
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] tg3: Remove prev_vlan_tag from struct tx_ring_info
From: Matt Carlson @ 2009-09-30 17:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Matthew Carlson, Linux Netdev List, Michael Chan
In-Reply-To: <4AC36638.8070304@gmail.com>

On Wed, Sep 30, 2009 at 07:07:52AM -0700, Eric Dumazet wrote:
> prev_vlan_tag field is not used.
> 
> Patch saves 512*8 bytes per tx queue ring on 64bit arches.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Looks good to me.

Acked-by: Matthew Carlson <mcarlson@broadcom.com>

> ---
> 
> diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
> index 82b45d8..524691c 100644
> --- a/drivers/net/tg3.h
> +++ b/drivers/net/tg3.h
> @@ -2412,7 +2412,6 @@ struct ring_info {
>  
>  struct tx_ring_info {
>  	struct sk_buff                  *skb;
> -	u32                             prev_vlan_tag;
>  };
>  
>  struct tg3_config_info {
> 


^ permalink raw reply

* Re: N_PPP_SYNC ldisc BUG: sleeping function called from invalid context
From: Alan Cox @ 2009-09-30 16:47 UTC (permalink / raw)
  To: Tilman Schmidt; +Cc: linux-kernel, netdev, Alan Cox
In-Reply-To: <4AC37032.4030901@imap.cc>

>  [<c026d39b>] tty_unthrottle+0x10/0x38
>  [<f8dcc31f>] ppp_sync_receive+0x168/0x170 [ppp_synctty]
>  [<f8fbb9ce>] handle_minor_recv+0x187/0x1cd [capi]
>  [<f8fbc19b>] capi_recv_message+0x1d9/0x24e [capi]

Really need to see the rest of the call trace to be sure

> Turns out the ppp_sync_receive() function (drivers/net/ppp_synctty.c
> line 385ff.) has a comment in front stating:
> 
> /*
>  * This can now be called from hard interrupt level as well
>  * as soft interrupt level or mainline.
>  */

Which is wrong. The flip_buffer_push -> rx processing path should never
be called from IRQ context and that was fixed for various drivers that
mis-set tty->low_latency, as well as in the PPP rework. The PPP case is
actually unrelated in many ways.

> Opinions?

See how we got into that code direct from an IRQ path. The expectation of
the tty logic is that it gets processed from work queues either
specifically in driver or via tty_flip_buffer_push when tty->low_latency
= 0

^ permalink raw reply

* Re: [PATCH] ipvs: Add boundary check on ioctl arguments
From: Hannes Eder @ 2009-09-30 15:33 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Wensong Zhang, netdev, linux-kernel, Simon Horman
In-Reply-To: <20090930171833.5ce0011d@infradead.org>

On Wed, Sep 30, 2009 at 17:18, Arjan van de Ven <arjan@infradead.org> wrote:
> On Wed, 30 Sep 2009 15:38:12 +0200
> Hannes Eder <heder@google.com> wrote:
>>  > @@ -2353,17 +2357,25 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd,
>>  > void __user
>> *user, int *len)
>>  >  {
>>  >    unsigned char arg[128];
>>
>> can MAX_ARG_LEN be used here?
>
> I am not convinced... it is a different numerical value,
> so it could be an ABI change. Rather not do that in this
> type of patch...

For do_ip_vs_set_ctl MAX_ARG_LEN is used:

static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
	int ret;
	unsigned char arg[MAX_ARG_LEN];
...

I assume that will be fine for do_ip_vs_get_ctl as well.

-Hannes

^ permalink raw reply

* Re: mac80211: NOHZ: local_softirq_pending 08
From: Johannes Berg @ 2009-09-30 15:21 UTC (permalink / raw)
  To: Michael Buesch
  Cc: Kalle Valo, Oliver Hartkopp, John W. Linville, linux-wireless,
	netdev
In-Reply-To: <200909301710.31082.mb@bu3sch.de>

[-- Attachment #1: Type: text/plain, Size: 773 bytes --]

On Wed, 2009-09-30 at 17:10 +0200, Michael Buesch wrote:
> On Wednesday 30 September 2009 16:54:26 Johannes Berg wrote:
> > On Wed, 2009-09-30 at 17:47 +0300, Kalle Valo wrote:
> > 
> > > I agree with Michael. The bug is real and I have verified that
> > > Michael's patch fixes the issue. Better to apply the patch now, it's
> > > trivial to change the implementation if/when the network stack has
> > > support for this.
> > 
> > FWIW, I think in mac80211 the in_interrupt() check can never return true
> > since we postpone all RX to the tasklet. But the tasklet seems to be ok
> > -- so should it really be in_interrupt()?
> 
> I think a tasklet is also in_interrupt(), because it's a softirq.

Ah, yes, indeed, in_interrupt() vs. in_irq().

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH] ipvs: Add boundary check on ioctl arguments
From: Arjan van de Ven @ 2009-09-30 15:18 UTC (permalink / raw)
  To: Hannes Eder; +Cc: Wensong Zhang, netdev, linux-kernel, Simon Horman
In-Reply-To: <4AC35F44.60707@google.com>

On Wed, 30 Sep 2009 15:38:12 +0200
Hannes Eder <heder@google.com> wrote:
>  > @@ -2353,17 +2357,25 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd,
>  > void __user 
> *user, int *len)
>  >  {
>  >  	unsigned char arg[128];
> 
> can MAX_ARG_LEN be used here?

I am not convinced... it is a different numerical value,
so it could be an ABI change. Rather not do that in this
type of patch...

>  > +	copylen = get_arglen[GET_CMDID(cmd)];
>  > +	if (copylen > 128)
> 
> I think it's better to use 'copylen > sizeof(arg)' here.

fair enough; updated patch below

>From 28ae217858e683c0c94c02219d46a9a9c87f61c6 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 30 Sep 2009 13:05:51 +0200
Subject: [PATCH] ipvs: Add boundary check on ioctl arguments

The ipvs code has a nifty system for doing the size of ioctl command copies;
it defines an array with values into which it indexes the cmd to find the
right length.

Unfortunately, the ipvs code forgot to check if the cmd was in the range
that the array provides, allowing for an index outside of the array,
which then gives a "garbage" result into the length, which then gets
used for copying into a stack buffer.

Fix this by adding sanity checks on these as well as the copy size.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 net/netfilter/ipvs/ip_vs_ctl.c |   14 +++++++++++++-
 1 files changed, 13 insertions(+), 1 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index ac624e5..7adc876 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2077,6 +2077,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
+	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX + 1)
+		return -EINVAL;
+	if (len < 0 || len >  sizeof(arg))
+		return -EINVAL;
 	if (len != set_arglen[SET_CMDID(cmd)]) {
 		pr_err("set_ctl: len %u != %u\n",
 		       len, set_arglen[SET_CMDID(cmd)]);
@@ -2353,17 +2357,25 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
 	unsigned char arg[128];
 	int ret = 0;
+	unsigned int copylen;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
 
+	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX + 1)
+		return -EINVAL;
+
 	if (*len < get_arglen[GET_CMDID(cmd)]) {
 		pr_err("get_ctl: len %u < %u\n",
 		       *len, get_arglen[GET_CMDID(cmd)]);
 		return -EINVAL;
 	}
 
-	if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
+	copylen = get_arglen[GET_CMDID(cmd)];
+	if (copylen > sizeof(arg))
+		return -EINVAL;
+
+	if (copy_from_user(arg, user, copylen) != 0)
 		return -EFAULT;
 
 	if (mutex_lock_interruptible(&__ip_vs_mutex))
-- 
1.6.2.5



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply related

* Re: mac80211: NOHZ: local_softirq_pending 08
From: Michael Buesch @ 2009-09-30 15:10 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Kalle Valo, Oliver Hartkopp, John W. Linville,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1254322466.3959.5.camel-YfaajirXv2244ywRPIzf9A@public.gmane.org>

On Wednesday 30 September 2009 16:54:26 Johannes Berg wrote:
> On Wed, 2009-09-30 at 17:47 +0300, Kalle Valo wrote:
> 
> > I agree with Michael. The bug is real and I have verified that
> > Michael's patch fixes the issue. Better to apply the patch now, it's
> > trivial to change the implementation if/when the network stack has
> > support for this.
> 
> FWIW, I think in mac80211 the in_interrupt() check can never return true
> since we postpone all RX to the tasklet. But the tasklet seems to be ok
> -- so should it really be in_interrupt()?

I think a tasklet is also in_interrupt(), because it's a softirq.
in_interrupt() returns false in process context. The problem appeared when
the b43 driver started passing RX frames while being in process context (threaded IRQ).
It previously was in tasklet (= softirq) context.

-- 
Greetings, Michael.
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: mac80211: NOHZ: local_softirq_pending 08
From: Johannes Berg @ 2009-09-30 14:54 UTC (permalink / raw)
  To: Kalle Valo
  Cc: Michael Buesch, Oliver Hartkopp, John W. Linville, linux-wireless,
	netdev
In-Reply-To: <87ocosqykb.fsf@purkki.valot.fi>

[-- Attachment #1: Type: text/plain, Size: 489 bytes --]

On Wed, 2009-09-30 at 17:47 +0300, Kalle Valo wrote:

> I agree with Michael. The bug is real and I have verified that
> Michael's patch fixes the issue. Better to apply the patch now, it's
> trivial to change the implementation if/when the network stack has
> support for this.

FWIW, I think in mac80211 the in_interrupt() check can never return true
since we postpone all RX to the tasklet. But the tasklet seems to be ok
-- so should it really be in_interrupt()?

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* N_PPP_SYNC ldisc BUG: sleeping function called from invalid context
From: Tilman Schmidt @ 2009-09-30 14:50 UTC (permalink / raw)
  To: linux-kernel, netdev; +Cc: Alan Cox

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

While testing pppd with capiplugin on kernel 2.6.31 (plus my CAPI and
Gigaset patches) I came across this BUG message. (Please ignore the
Tainted flag, it's from the Nvidia driver and doesn't influence what's
following.)

BUG: sleeping function called from invalid context at kernel/mutex.c:280
in_atomic(): 1, irqs_disabled(): 1, pid: 10, name: events/0
INFO: lockdep is turned off.
irq event stamp: 2034
hardirqs last  enabled at (2033): [<c0347b07>]
_spin_unlock_irqrestore+0x3c/0x6c
hardirqs last disabled at (2034): [<c03475d0>] _spin_lock_irq+0x11/0x3f
softirqs last  enabled at (2008): [<c012f598>] __do_softirq+0x114/0x11c
softirqs last disabled at (1971): [<c010593a>] do_softirq+0x69/0xc7
Pid: 10, comm: events/0 Tainted: P           2.6.31-vanilla #1
Call Trace:
 [<c0124a3c>] __might_sleep+0x10e/0x116
 [<c0346bdd>] mutex_lock_nested+0x18/0x31
 [<c026d39b>] tty_unthrottle+0x10/0x38
 [<f8dcc31f>] ppp_sync_receive+0x168/0x170 [ppp_synctty]
 [<f8fbb9ce>] handle_minor_recv+0x187/0x1cd [capi]
 [<f8fbc19b>] capi_recv_message+0x1d9/0x24e [capi]
 [<fa5be461>] recv_handler+0x56/0x6f [kernelcapi]
 [<c0138cba>] worker_thread+0x14a/0x21d
 [<c0138c9e>] ? worker_thread+0x12e/0x21d
 [<fa5be40b>] ? recv_handler+0x0/0x6f [kernelcapi]
 [<c013b375>] ? autoremove_wake_function+0x0/0x30
 [<c013b201>] kthread+0x64/0x69
 [<c0138b70>] ? worker_thread+0x0/0x21d
 [<c013b19d>] ? kthread+0x0/0x69
 [<c0103f33>] kernel_thread_helper+0x7/0x10

Turns out the ppp_sync_receive() function (drivers/net/ppp_synctty.c
line 385ff.) has a comment in front stating:

/*
 * This can now be called from hard interrupt level as well
 * as soft interrupt level or mainline.
 */

but calls tty_unthrottle() which in turn calls mutex_lock() which of
course can sleep.

That tty_unthrottle() call was already removed once by

commit a6540f731d506d9e82444cf0020e716613d4c46c
Author: Alan Cox <alan@linux.intel.com>
Date:   Mon Jun 22 18:42:29 2009 +0100

    ppp: Fix throttling bugs

but re-added by

commit 4a21b8cb3550f19f838f7c48345fbbf6a0e8536b
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu Jul 16 09:14:23 2009 -0700

    Revert "ppp: Fix throttling bugs"

    This reverts commit a6540f731d506d9e82444cf0020e716613d4c46c, as
    requested by Alan:

      "... as it was wrong, the pty code is now fixed and the fact this
       isn't reverted is breaking pptp setups."

Opinions?

Thanks,
Tilman

- --
Tilman Schmidt                    E-Mail: tilman@imap.cc
Bonn, Germany
Diese Nachricht besteht zu 100% aus wiederverwerteten Bits.
Ungeöffnet mindestens haltbar bis: (siehe Rückseite)
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.4 (MingW32)
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org

iD8DBQFKw3AyQ3+did9BuFsRAiZXAKCGaos/qZNTlStEP4SE90PA0ZoMAQCdFtvf
U9chE7at35y8c6CGGS1IGg0=
=Vpq4
-----END PGP SIGNATURE-----

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox