Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next 6/8] bnx2x: Do not call load/unload functionality from DCC
From: Eilon Greenstein @ 2009-10-14 15:10 UTC (permalink / raw)
  To: David Miller; +Cc: netdev



There is really no need to clear the MAC or the FW filtering rules - this was
added for completeness, but caused race conditions with load/unload. Remove this
redundant code.

Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/bnx2x_main.c |   19 +++++--------------
 1 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 7c5c300..42cd957 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -2565,21 +2565,12 @@ static void bnx2x_set_rx_mode(struct net_device *dev);
 static void bnx2x_e1h_disable(struct bnx2x *bp)
 {
 	int port = BP_PORT(bp);
-	int i;
-
-	bp->rx_mode = BNX2X_RX_MODE_NONE;
-	bnx2x_set_storm_rx_mode(bp);
 
 	netif_tx_disable(bp->dev);
 	bp->dev->trans_start = jiffies;	/* prevent tx timeout */
 
 	REG_WR(bp, NIG_REG_LLH0_FUNC_EN + port*8, 0);
 
-	bnx2x_set_eth_mac_addr_e1h(bp, 0);
-
-	for (i = 0; i < MC_HASH_SIZE; i++)
-		REG_WR(bp, MC_HASH_OFFSET(bp, i), 0);
-
 	netif_carrier_off(bp->dev);
 }
 
@@ -2589,13 +2580,13 @@ static void bnx2x_e1h_enable(struct bnx2x *bp)
 
 	REG_WR(bp, NIG_REG_LLH0_FUNC_EN + port*8, 1);
 
-	bnx2x_set_eth_mac_addr_e1h(bp, 1);
-
 	/* Tx queue should be only reenabled */
 	netif_tx_wake_all_queues(bp->dev);
 
-	/* Initialize the receive filter. */
-	bnx2x_set_rx_mode(bp->dev);
+	/*
+	 * Should not call netif_carrier_on since it will be called if the link
+	 * is up when checking for link state
+	 */
 }
 
 static void bnx2x_update_min_max(struct bnx2x *bp)
@@ -10538,7 +10529,7 @@ static void bnx2x_self_test(struct net_device *dev,
 		/* disable input for TX port IF */
 		REG_WR(bp, NIG_REG_EGRESS_UMP0_IN_EN + port*4, 0);
 
-		link_up = bp->link_vars.link_up;
+		link_up = (bnx2x_link_test(bp) == 0);
 		bnx2x_nic_unload(bp, UNLOAD_NORMAL);
 		bnx2x_nic_load(bp, LOAD_DIAG);
 		/* wait until link state is restored */
-- 
1.5.4.3





^ permalink raw reply related

* [net-next 4/8] bnx2x: Changing the Disabled state to a flag
From: Eilon Greenstein @ 2009-10-14 15:09 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

When working with DCC, a function can be disabled or enabled (virtual link down
or up). Using the function state to track this introduced some race conditions
with the load/unload flow.
Use a separate flag instead to indicate that the function is disabled.

Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/bnx2x.h      |    2 +-
 drivers/net/bnx2x_main.c |   34 +++++++++++++++++++---------------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/net/bnx2x.h b/drivers/net/bnx2x.h
index 60fa14f..185a6ba 100644
--- a/drivers/net/bnx2x.h
+++ b/drivers/net/bnx2x.h
@@ -900,6 +900,7 @@ struct bnx2x {
 #define BP_NOMCP(bp)			(bp->flags & NO_MCP_FLAG)
 #define HW_VLAN_TX_FLAG			0x400
 #define HW_VLAN_RX_FLAG			0x800
+#define MF_FUNC_DIS			0x1000
 
 	int			func;
 #define BP_PORT(bp)			(bp->func % PORT_MAX)
@@ -965,7 +966,6 @@ struct bnx2x {
 #define BNX2X_STATE_CLOSING_WAIT4_HALT	0x4000
 #define BNX2X_STATE_CLOSING_WAIT4_DELETE 0x5000
 #define BNX2X_STATE_CLOSING_WAIT4_UNLOAD 0x6000
-#define BNX2X_STATE_DISABLED		0xd000
 #define BNX2X_STATE_DIAG		0xe000
 #define BNX2X_STATE_ERROR		0xf000
 
diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 691cf15..e7b3b27 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -1043,7 +1043,6 @@ static void bnx2x_sp_event(struct bnx2x_fastpath *fp,
 		break;
 
 	case (RAMROD_CMD_ID_ETH_SET_MAC | BNX2X_STATE_CLOSING_WAIT4_HALT):
-	case (RAMROD_CMD_ID_ETH_SET_MAC | BNX2X_STATE_DISABLED):
 		DP(NETIF_MSG_IFDOWN, "got (un)set mac ramrod\n");
 		bp->set_mac_pending--;
 		smp_wmb();
@@ -2157,7 +2156,7 @@ static void bnx2x_calc_fc_adv(struct bnx2x *bp)
 
 static void bnx2x_link_report(struct bnx2x *bp)
 {
-	if (bp->state == BNX2X_STATE_DISABLED) {
+	if (bp->flags & MF_FUNC_DIS) {
 		netif_carrier_off(bp->dev);
 		printk(KERN_ERR PFX "%s NIC Link is Down\n", bp->dev->name);
 		return;
@@ -2437,8 +2436,7 @@ static void bnx2x_link_attn(struct bnx2x *bp)
 			memset(&(pstats->mac_stx[0]), 0,
 			       sizeof(struct mac_stx));
 		}
-		if ((bp->state == BNX2X_STATE_OPEN) ||
-		    (bp->state == BNX2X_STATE_DISABLED))
+		if (bp->state == BNX2X_STATE_OPEN)
 			bnx2x_stats_handle(bp, STATS_EVENT_LINK_UP);
 	}
 
@@ -2481,9 +2479,7 @@ static void bnx2x_link_attn(struct bnx2x *bp)
 
 static void bnx2x__link_status_update(struct bnx2x *bp)
 {
-	int func = BP_FUNC(bp);
-
-	if (bp->state != BNX2X_STATE_OPEN)
+	if ((bp->state != BNX2X_STATE_OPEN) || (bp->flags & MF_FUNC_DIS))
 		return;
 
 	bnx2x_link_status_update(&bp->link_params, &bp->link_vars);
@@ -2640,14 +2636,19 @@ static void bnx2x_dcc_event(struct bnx2x *bp, u32 dcc_event)
 
 	if (dcc_event & DRV_STATUS_DCC_DISABLE_ENABLE_PF) {
 
+		/*
+		 * This is the only place besides the function initialization
+		 * where the bp->flags can change so it is done without any
+		 * locks
+		 */
 		if (bp->mf_config & FUNC_MF_CFG_FUNC_DISABLED) {
 			DP(NETIF_MSG_IFDOWN, "mf_cfg function disabled\n");
-			bp->state = BNX2X_STATE_DISABLED;
+			bp->flags |= MF_FUNC_DIS;
 
 			bnx2x_e1h_disable(bp);
 		} else {
 			DP(NETIF_MSG_IFUP, "mf_cfg function enabled\n");
-			bp->state = BNX2X_STATE_OPEN;
+			bp->flags &= ~MF_FUNC_DIS;
 
 			bnx2x_e1h_enable(bp);
 		}
@@ -4695,8 +4696,7 @@ static void bnx2x_timer(unsigned long data)
 		}
 	}
 
-	if ((bp->state == BNX2X_STATE_OPEN) ||
-	    (bp->state == BNX2X_STATE_DISABLED))
+	if (bp->state == BNX2X_STATE_OPEN)
 		bnx2x_stats_handle(bp, STATS_EVENT_UPDATE);
 
 timer_restart:
@@ -7629,7 +7629,7 @@ static int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	if (CHIP_IS_E1H(bp))
 		if (bp->mf_config & FUNC_MF_CFG_FUNC_DISABLED) {
 			DP(NETIF_MSG_IFUP, "mf_cfg function disabled\n");
-			bp->state = BNX2X_STATE_DISABLED;
+			bp->flags |= MF_FUNC_DIS;
 		}
 
 	if (bp->state == BNX2X_STATE_OPEN) {
@@ -9034,7 +9034,9 @@ static int bnx2x_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 	cmd->supported = bp->port.supported;
 	cmd->advertising = bp->port.advertising;
 
-	if (netif_carrier_ok(dev)) {
+	if ((bp->state == BNX2X_STATE_OPEN) &&
+	    !(bp->flags & MF_FUNC_DIS) &&
+	    (bp->link_vars.link_up)) {
 		cmd->speed = bp->link_vars.line_speed;
 		cmd->duplex = bp->link_vars.duplex;
 		if (IS_E1HMF(bp)) {
@@ -9433,6 +9435,9 @@ static u32 bnx2x_get_link(struct net_device *dev)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 
+	if (bp->flags & MF_FUNC_DIS)
+		return 0;
+
 	return bp->link_vars.link_up;
 }
 
@@ -9837,8 +9842,7 @@ static int bnx2x_set_eeprom(struct net_device *dev,
 
 	} else if (eeprom->magic == 0x50485952) {
 		/* 'PHYR' (0x50485952): re-init link after FW upgrade */
-		if ((bp->state == BNX2X_STATE_OPEN) ||
-		    (bp->state == BNX2X_STATE_DISABLED)) {
+		if (bp->state == BNX2X_STATE_OPEN) {
 			bnx2x_acquire_phy_lock(bp);
 			rc |= bnx2x_link_reset(&bp->link_params,
 					       &bp->link_vars, 1);
-- 
1.5.4.3





^ permalink raw reply related

* [net-next 5/8] bnx2x: Adding FW mailbox mutex
From: Eilon Greenstein @ 2009-10-14 15:10 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

DCC commands are not protected by the RTNL lock, so add a mutex to protect the
FW mailbox.

Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/bnx2x.h      |    3 +++
 drivers/net/bnx2x_main.c |    7 +++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnx2x.h b/drivers/net/bnx2x.h
index 185a6ba..c3b32f7 100644
--- a/drivers/net/bnx2x.h
+++ b/drivers/net/bnx2x.h
@@ -1023,6 +1023,9 @@ struct bnx2x {
 	/* used to synchronize dmae accesses */
 	struct mutex		dmae_mutex;
 
+	/* used to protect the FW mail box */
+	struct mutex		fw_mb_mutex;
+
 	/* used to synchronize stats collecting */
 	int			stats_state;
 	/* used by dmae command loader */
diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index e7b3b27..7c5c300 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -2528,6 +2528,7 @@ u32 bnx2x_fw_command(struct bnx2x *bp, u32 command)
 	u32 cnt = 1;
 	u8 delay = CHIP_REV_IS_SLOW(bp) ? 100 : 10;
 
+	mutex_lock(&bp->fw_mb_mutex);
 	SHMEM_WR(bp, func_mb[func].drv_mb_header, (command | seq));
 	DP(BNX2X_MSG_MCP, "wrote command (%x) to FW MB\n", (command | seq));
 
@@ -2537,8 +2538,8 @@ u32 bnx2x_fw_command(struct bnx2x *bp, u32 command)
 
 		rc = SHMEM_RD(bp, func_mb[func].fw_mb_header);
 
-		/* Give the FW up to 2 second (200*10ms) */
-	} while ((seq != (rc & FW_MSG_SEQ_NUMBER_MASK)) && (cnt++ < 200));
+		/* Give the FW up to 5 second (500*10ms) */
+	} while ((seq != (rc & FW_MSG_SEQ_NUMBER_MASK)) && (cnt++ < 500));
 
 	DP(BNX2X_MSG_MCP, "[after %d ms] read (%x) seq is (%x) from FW MB\n",
 	   cnt*delay, rc, seq);
@@ -2552,6 +2553,7 @@ u32 bnx2x_fw_command(struct bnx2x *bp, u32 command)
 		bnx2x_fw_dump(bp);
 		rc = 0;
 	}
+	mutex_unlock(&bp->fw_mb_mutex);
 
 	return rc;
 }
@@ -8956,6 +8958,7 @@ static int __devinit bnx2x_init_bp(struct bnx2x *bp)
 	smp_wmb(); /* Ensure that bp->intr_sem update is SMP-safe */
 
 	mutex_init(&bp->port.phy_mutex);
+	mutex_init(&bp->fw_mb_mutex);
 #ifdef BCM_CNIC
 	mutex_init(&bp->cnic_mutex);
 #endif
-- 
1.5.4.3





^ permalink raw reply related

* [net-next 7/8] bnx2x: Report the maximal available BW as link speed
From: Eilon Greenstein @ 2009-10-14 15:10 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

In multi-function mode the device is limited to its maximal BW allocation, so
that limit should be displayed as the link speed (when it is lower than the
line speed) to notify the user.

Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/bnx2x_main.c |   14 +++++++++++++-
 1 files changed, 13 insertions(+), 1 deletions(-)

diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index 42cd957..ba131f4 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -2163,11 +2163,23 @@ static void bnx2x_link_report(struct bnx2x *bp)
 	}
 
 	if (bp->link_vars.link_up) {
+		u16 line_speed;
+
 		if (bp->state == BNX2X_STATE_OPEN)
 			netif_carrier_on(bp->dev);
 		printk(KERN_INFO PFX "%s NIC Link is Up, ", bp->dev->name);
 
-		printk("%d Mbps ", bp->link_vars.line_speed);
+		line_speed = bp->link_vars.line_speed;
+		if (IS_E1HMF(bp)) {
+			u16 vn_max_rate;
+
+			vn_max_rate =
+				((bp->mf_config & FUNC_MF_CFG_MAX_BW_MASK) >>
+				 FUNC_MF_CFG_MAX_BW_SHIFT) * 100;
+			if (vn_max_rate < line_speed)
+				line_speed = vn_max_rate;
+		}
+		printk("%d Mbps ", line_speed);
 
 		if (bp->link_vars.duplex == DUPLEX_FULL)
 			printk("full duplex");
-- 
1.5.4.3





^ permalink raw reply related

* [net-next 8/8] bnx2x: Update to version 1.52.1-1
From: Eilon Greenstein @ 2009-10-14 15:10 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
 drivers/net/bnx2x_main.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnx2x_main.c b/drivers/net/bnx2x_main.c
index ba131f4..59b58d8 100644
--- a/drivers/net/bnx2x_main.c
+++ b/drivers/net/bnx2x_main.c
@@ -56,8 +56,8 @@
 #include "bnx2x_init_ops.h"
 #include "bnx2x_dump.h"
 
-#define DRV_MODULE_VERSION	"1.52.1"
-#define DRV_MODULE_RELDATE	"2009/08/12"
+#define DRV_MODULE_VERSION	"1.52.1-1"
+#define DRV_MODULE_RELDATE	"2009/10/13"
 #define BNX2X_BC_VER		0x040200
 
 #include <linux/firmware.h>
-- 
1.5.4.3





^ permalink raw reply related

* Re: [PATCH] ax25: unsigned cannot be less than 0 in ax25_ctl_ioctl()
From: Roel Kluin @ 2009-10-14 15:26 UTC (permalink / raw)
  To: Kevin Dawson; +Cc: wharms, linux-hams, netdev, Joerg Reuter, Andrew Morton
In-Reply-To: <4AD3A67B.6070703@cerebellum.kd>

struct ax25_ctl_struct member `arg' is unsigned and cannot be less
than 0.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
---
Op 12-10-09 23:58, Kevin Dawson schreef:

>>>  tmp_arg=ax25_ctl.arg * HZ;
>>>
>>>   if (arg == 0 || arg >  ULONG_MAX )
>>>         goto einval_put;
>>
>> I'm not sure, I think this would only work if we made `arg' an
>> unsigned long long.
> 
> That depends on the possible values of ax25_ctl.arg.
> 
>> +    if (ax25_ctl.arg * HZ > ULONG_MAX && ax25_ctl.cmd != AX25_KILL)
>> +        return -EINVAL;
> 
> Why the need to change arg before comparing it with a constant?  Let the
> compiler do the work:
> 
>     if (ax25_ctl.arg > ULONG_MAX / HZ && ...

Ok, How about this then?

diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index f454607..d923ac4 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -369,6 +369,9 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
 	if (ax25_ctl.digi_count > AX25_MAX_DIGIS)
 		return -EINVAL;
 
+	if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL)
+		return -EINVAL;
+
 	digi.ndigi = ax25_ctl.digi_count;
 	for (k = 0; k < digi.ndigi; k++)
 		digi.calls[k] = ax25_ctl.digi_addr[k];
@@ -418,14 +421,10 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
 		break;
 
 	case AX25_T3:
-		if (ax25_ctl.arg < 0)
-			goto einval_put;
 		ax25->t3 = ax25_ctl.arg * HZ;
 		break;
 
 	case AX25_IDLE:
-		if (ax25_ctl.arg < 0)
-			goto einval_put;
 		ax25->idle = ax25_ctl.arg * 60 * HZ;
 		break;
 

^ permalink raw reply related

* Re: query: tcpdump versus atomic?
From: William Allen Simpson @ 2009-10-14 15:20 UTC (permalink / raw)
  To: netdev
In-Reply-To: <4AD5522B.50101@gmail.com>

William Allen Simpson wrote:
> Anybody know what code path tcpdump changes to running atomic?
> 
> Is there a function to test whether you're running atomic?
> 
To partially answer my own question: after laboriously #if'ing out and
recompiling section by section, I found that it affects the tcp_minisocks.c
code at tcp_create_openreq_child().

I've not found a function to test.  I've found sk->sk_allocation, but
that doesn't seem to be dynamically updated to reflect the current state.

Anyway, sorry David, but there are at least two GFP_ATOMIC here (one existing,
one new).  I've managed to change the others, by careful rearrangement.  At
least, I hope so, until some future testing reveals otherwise....

^ permalink raw reply

* query: bnx2 and tg3 don't check tcp and/or ip header length validity?
From: William Allen Simpson @ 2009-10-14 15:50 UTC (permalink / raw)
  To: netdev

My question is whether it would be OK to add a simple test, and set it to
zero in case of bad values?

In both cases, they get a number that could be negative (in the case of a
badly formed header), and mash it into a flag vector of some sort.

No comments/documentation explaining purpose.

===

bnx2.c:
		u32 tcp_opt_len;
(ipv6 variant)
			vlan_tag_flags |= ((tcp_opt_len >> 2) << 8) |
					  TX_BD_FLAGS_SW_FLAGS;
(ipv4 variant)
			if (tcp_opt_len || (iph->ihl > 5)) {
				vlan_tag_flags |= ((iph->ihl - 5) +
						   (tcp_opt_len >> 2)) << 8;
			}

At least in the latter case, it bothers to check the IP header validity....

These are transmit-only, I cannot find where they use them on receive?

===

tg3.c:
		int tcp_opt_len, ip_tcp_len;

			tcp_opt_len = tcp_optlen(skb);
			ip_tcp_len = ip_hdrlen(skb) + sizeof(struct tcphdr);

			iph->check = 0;
			iph->tot_len = htons(mss + ip_tcp_len + tcp_opt_len);
			hdrlen = ip_tcp_len + tcp_opt_len;

...

		if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5717) {
			mss |= (hdrlen & 0xc) << 12;
			if (hdrlen & 0x10)
				base_flags |= 0x00000010;
			base_flags |= (hdrlen & 0x3e0) << 5;
		} else
			mss |= hdrlen << 9;

Likewise, transmit-only.  With completely different code later, in a dma
bug fix function.  But that's the overall picture....

Anybody have any idea what's going on here?

^ permalink raw reply

* [NET PATCH 0/9] ZC/L4RO enhancements to alacrityvm::vbus-enet driver
From: Gregory Haskins @ 2009-10-14 15:58 UTC (permalink / raw)
  To: alacrityvm-devel; +Cc: linux-kernel, netdev

The following series applies to the "linux-next" branch in the
alacrityvm tree:

git://git.kernel.org/pub/scm/linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git

These patches add support for zero-copy, and reassembly-offloading to the
venet driver.  This means we can transmit a guest GSO packet directly into
the host hardware, and receive fully reassembled LRO frames without
artificially segmenting them.

Unofficial testing against a ZC/L4RO capable backend shows that we are
supporting about 6.6Gb/s in throughput (vs 7.3Gb/s for native) which is
up from the prior result of 5.7Gb/s without affecting our latency numbers.
I will officially re-run my tests and update the graphs asap.

http://developer.novell.com/wiki/index.php/AlacrityVM

Kind Regards,
-Greg

---

Gregory Haskins (9):
      venet: add Layer-4 Reassembler Offload (L4RO) support
      venet: add a tx-complete event for out-of-order support
      venet: use an skblist for outstanding descriptors
      venet: add eventq protocol
      venet: cache the ringlen values at init
      venet: report actual used descriptor size
      venet: add pre-mapped tx descriptor feature
      venet: fix gso.hdr_len to report correct length
      venet: Update maintainer

 MAINTAINERS             |    7 
 drivers/net/vbus-enet.c |  770 +++++++++++++++++++++++++++++++++++++++++++----
 include/linux/venet.h   |   61 +++-
 3 files changed, 764 insertions(+), 74 deletions(-)

^ permalink raw reply

* [NET PATCH 1/9] venet: Update maintainer
From: Gregory Haskins @ 2009-10-14 15:58 UTC (permalink / raw)
  To: alacrityvm-devel; +Cc: linux-kernel, netdev
In-Reply-To: <20091014154457.18864.28382.stgit@dev.haskins.net>

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 MAINTAINERS |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index fe97eb1..55fabad 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5617,6 +5617,13 @@ S:	Maintained
 F:	include/linux/vbus*
 F:	drivers/vbus/*
 
+VBUS ETHERNET DRIVER
+M:	Gregory Haskins <ghaskins@novell.com>
+S:	Maintained
+W:	http://developer.novell.com/wiki/index.php/AlacrityVM
+F:	include/linux/venet.h
+F:	drivers/net/vbus-enet.c
+
 VFAT/FAT/MSDOS FILESYSTEM
 M:	OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
 S:	Maintained


^ permalink raw reply related

* [NET PATCH 2/9] venet: fix gso.hdr_len to report correct length
From: Gregory Haskins @ 2009-10-14 15:58 UTC (permalink / raw)
  To: alacrityvm-devel; +Cc: linux-kernel, netdev
In-Reply-To: <20091014154457.18864.28382.stgit@dev.haskins.net>

This seemed to have worked for TSO4/6 frames, but breaks for UFO.  In
either case, it's just plain wrong, so let's get the header set properly.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 drivers/net/vbus-enet.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 91c47a9..3d61444 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -512,7 +512,7 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
 
 			vsg->flags |= VENET_SG_FLAG_GSO;
 
-			vsg->gso.hdrlen = skb_transport_header(skb) - skb->data;
+			vsg->gso.hdrlen = skb_headlen(skb);
 			vsg->gso.size = sinfo->gso_size;
 			if (sinfo->gso_type & SKB_GSO_TCPV4)
 				vsg->gso.type = VENET_GSO_TYPE_TCPV4;

^ permalink raw reply related

* [NET PATCH 3/9] venet: add pre-mapped tx descriptor feature
From: Gregory Haskins @ 2009-10-14 15:59 UTC (permalink / raw)
  To: alacrityvm-devel; +Cc: linux-kernel, netdev
In-Reply-To: <20091014154457.18864.28382.stgit@dev.haskins.net>

What: Pre-allocate and map our scatter-gather descriptors.

Why: The host cannot directly access guest memory, and therefore any
indirection adds additional overhead.  We currently implement
scattergather by pushing a pointer to the sg-descriptor, which points
to the actual SKB.  This means the host must take an extra read
just to obtain the pointer to the SKB data.

Therefore we introduce a new shared-memory region that consists of
pre-allocated scattergather descriptors.  The host may then decode
a descriptor pointer as an offset to this pre-mapped region and
save time/overhead.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 drivers/net/vbus-enet.c |   62 +++++++++++++++++++++++++++++++++++++++++------
 include/linux/venet.h   |   12 +++++----
 2 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 3d61444..b3e9695 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -61,6 +61,10 @@ struct vbus_enet_priv {
 	struct vbus_enet_queue     txq;
 	struct tasklet_struct      txtask;
 	bool                       sg;
+	struct {
+		bool               enabled;
+		char              *pool;
+	} pmtd; /* pre-mapped transmit descriptors */
 };
 
 static void vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force);
@@ -201,7 +205,9 @@ rx_teardown(struct vbus_enet_priv *priv)
 static int
 tx_setup(struct vbus_enet_priv *priv)
 {
-	struct ioq *ioq = priv->txq.queue;
+	struct ioq *ioq    = priv->txq.queue;
+	size_t      iovlen = sizeof(struct venet_iov) * (MAX_SKB_FRAGS-1);
+	size_t      len    = sizeof(struct venet_sg) + iovlen;
 	struct ioq_iterator iter;
 	int i;
 	int ret;
@@ -213,6 +219,29 @@ tx_setup(struct vbus_enet_priv *priv)
 		 */
 		return 0;
 
+	/* pre-allocate our descriptor pool if pmtd is enabled */
+	if (priv->pmtd.enabled) {
+		struct vbus_device_proxy *dev = priv->vdev;
+		size_t poollen = len * tx_ringlen;
+		char *pool;
+		int shmid;
+
+		/* pmtdquery will return the shm-id to use for the pool */
+		ret = devcall(priv, VENET_FUNC_PMTDQUERY, NULL, 0);
+		BUG_ON(ret < 0);
+
+		shmid = ret;
+
+		pool = kzalloc(poollen, GFP_KERNEL | GFP_DMA);
+		if (!pool)
+			return -ENOMEM;
+
+		priv->pmtd.pool = pool;
+
+		ret = dev->ops->shm(dev, shmid, 0, pool, poollen, 0, NULL, 0);
+		BUG_ON(ret < 0);
+	}
+
 	ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
 	BUG_ON(ret < 0);
 
@@ -224,16 +253,22 @@ tx_setup(struct vbus_enet_priv *priv)
 	 */
 	for (i = 0; i < tx_ringlen; i++) {
 		struct venet_sg *vsg;
-		size_t iovlen = sizeof(struct venet_iov) * (MAX_SKB_FRAGS-1);
-		size_t len = sizeof(*vsg) + iovlen;
 
-		vsg = kzalloc(len, GFP_KERNEL);
-		if (!vsg)
-			return -ENOMEM;
+		if (priv->pmtd.enabled) {
+			size_t offset = (i * len);
+
+			vsg = (struct venet_sg *)&priv->pmtd.pool[offset];
+			iter.desc->ptr = (u64)offset;
+		} else {
+			vsg = kzalloc(len, GFP_KERNEL);
+			if (!vsg)
+				return -ENOMEM;
+
+			iter.desc->ptr = (u64)__pa(vsg);
+		}
 
 		iter.desc->cookie = (u64)vsg;
 		iter.desc->len    = len;
-		iter.desc->ptr    = (u64)__pa(vsg);
 
 		ret = ioq_iter_seek(&iter, ioq_seek_next, 0, 0);
 		BUG_ON(ret < 0);
@@ -259,6 +294,14 @@ tx_teardown(struct vbus_enet_priv *priv)
 		 */
 		return;
 
+	if (priv->pmtd.enabled) {
+		/*
+		 * PMTD mode means we only need to free the pool
+		 */
+		kfree(priv->pmtd.pool);
+		return;
+	}
+
 	ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
 	BUG_ON(ret < 0);
 
@@ -705,7 +748,7 @@ vbus_enet_negcap(struct vbus_enet_priv *priv)
 	if (sg_enabled) {
 		caps.gid = VENET_CAP_GROUP_SG;
 		caps.bits |= (VENET_CAP_SG|VENET_CAP_TSO4|VENET_CAP_TSO6
-			      |VENET_CAP_ECN);
+			      |VENET_CAP_ECN|VENET_CAP_PMTD);
 		/* note: exclude UFO for now due to stack bug */
 	}
 
@@ -726,6 +769,9 @@ vbus_enet_negcap(struct vbus_enet_priv *priv)
 			dev->features |= NETIF_F_TSO6;
 		if (caps.bits & VENET_CAP_ECN)
 			dev->features |= NETIF_F_TSO_ECN;
+
+		if (caps.bits & VENET_CAP_PMTD)
+			priv->pmtd.enabled = true;
 	}
 
 	return 0;
diff --git a/include/linux/venet.h b/include/linux/venet.h
index 47ed37d..57aeddd 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -45,6 +45,7 @@ struct venet_capabilities {
 #define VENET_CAP_TSO6   (1 << 2)
 #define VENET_CAP_ECN    (1 << 3)
 #define VENET_CAP_UFO    (1 << 4)
+#define VENET_CAP_PMTD   (1 << 5) /* pre-mapped tx desc */
 
 struct venet_iov {
 	__u32 len;
@@ -75,10 +76,11 @@ struct venet_sg {
 	struct venet_iov iov[1];
 };
 
-#define VENET_FUNC_LINKUP   0
-#define VENET_FUNC_LINKDOWN 1
-#define VENET_FUNC_MACQUERY 2
-#define VENET_FUNC_NEGCAP   3 /* negotiate capabilities */
-#define VENET_FUNC_FLUSHRX  4
+#define VENET_FUNC_LINKUP    0
+#define VENET_FUNC_LINKDOWN  1
+#define VENET_FUNC_MACQUERY  2
+#define VENET_FUNC_NEGCAP    3 /* negotiate capabilities */
+#define VENET_FUNC_FLUSHRX   4
+#define VENET_FUNC_PMTDQUERY 5
 
 #endif /* _LINUX_VENET_H */


^ permalink raw reply related

* [NET PATCH 4/9] venet: report actual used descriptor size
From: Gregory Haskins @ 2009-10-14 15:59 UTC (permalink / raw)
  To: alacrityvm-devel; +Cc: linux-kernel, netdev
In-Reply-To: <20091014154457.18864.28382.stgit@dev.haskins.net>

This should reduce wasted effort copying parts of the descriptor
which are not in use, since the descriptors are typically pre-allocated
to their maximum size.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 drivers/net/vbus-enet.c |    2 ++
 include/linux/venet.h   |    3 +++
 2 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index b3e9695..63237f3 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -582,6 +582,8 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
 			iov->ptr = (u64)sg_phys(sg);
 		}
 
+		iter.desc->len = (u64)VSG_DESC_SIZE(vsg->count);
+
 	} else {
 		/*
 		 * non scatter-gather mode: simply put the skb right onto the
diff --git a/include/linux/venet.h b/include/linux/venet.h
index 57aeddd..53b6958 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -76,6 +76,9 @@ struct venet_sg {
 	struct venet_iov iov[1];
 };
 
+#define VSG_DESC_SIZE(count) (sizeof(struct venet_sg) + \
+			      sizeof(struct venet_iov) * ((count) - 1))
+
 #define VENET_FUNC_LINKUP    0
 #define VENET_FUNC_LINKDOWN  1
 #define VENET_FUNC_MACQUERY  2


^ permalink raw reply related

* [NET PATCH 5/9] venet: cache the ringlen values at init
From: Gregory Haskins @ 2009-10-14 15:59 UTC (permalink / raw)
  To: alacrityvm-devel; +Cc: linux-kernel, netdev
In-Reply-To: <20091014154457.18864.28382.stgit@dev.haskins.net>

We want to prevent the condition where changes to the module-params
could affect the run-time validity of the ring state.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 drivers/net/vbus-enet.c |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 63237f3..fe9eeca 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -50,6 +50,7 @@ module_param(sg_enabled, int, 0444);
 struct vbus_enet_queue {
 	struct ioq              *queue;
 	struct ioq_notifier      notifier;
+	unsigned long            count;
 };
 
 struct vbus_enet_priv {
@@ -94,6 +95,8 @@ queue_init(struct vbus_enet_priv *priv,
 		q->queue->notifier = &q->notifier;
 	}
 
+	q->count = ringsize;
+
 	return 0;
 }
 
@@ -222,7 +225,7 @@ tx_setup(struct vbus_enet_priv *priv)
 	/* pre-allocate our descriptor pool if pmtd is enabled */
 	if (priv->pmtd.enabled) {
 		struct vbus_device_proxy *dev = priv->vdev;
-		size_t poollen = len * tx_ringlen;
+		size_t poollen = len * priv->txq.count;
 		char *pool;
 		int shmid;
 
@@ -251,7 +254,7 @@ tx_setup(struct vbus_enet_priv *priv)
 	/*
 	 * Now populate each descriptor with an empty SG descriptor
 	 */
-	for (i = 0; i < tx_ringlen; i++) {
+	for (i = 0; i < priv->txq.count; i++) {
 		struct venet_sg *vsg;
 
 		if (priv->pmtd.enabled) {

^ permalink raw reply related

* [NET PATCH 6/9] venet: add eventq protocol
From: Gregory Haskins @ 2009-10-14 15:59 UTC (permalink / raw)
  To: alacrityvm-devel; +Cc: linux-kernel, netdev
In-Reply-To: <20091014154457.18864.28382.stgit@dev.haskins.net>

This adds an event-channel for passing host->guest messages to the
guest driver.  We will use this later in the series for linkstate and
asynchronous transmit-complete events.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 drivers/net/vbus-enet.c |  203 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/venet.h   |   28 ++++++
 2 files changed, 229 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index fe9eeca..5fccfd1 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -66,6 +66,14 @@ struct vbus_enet_priv {
 		bool               enabled;
 		char              *pool;
 	} pmtd; /* pre-mapped transmit descriptors */
+	struct {
+		bool                   enabled;
+		bool                   linkstate;
+		unsigned long          evsize;
+		struct vbus_enet_queue veq;
+		struct tasklet_struct  task;
+		char                  *pool;
+	} evq;
 };
 
 static void vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force);
@@ -331,6 +339,16 @@ tx_teardown(struct vbus_enet_priv *priv)
 	}
 }
 
+static void
+evq_teardown(struct vbus_enet_priv *priv)
+{
+	if (!priv->evq.enabled)
+		return;
+
+	ioq_put(priv->evq.veq.queue);
+	kfree(priv->evq.pool);
+}
+
 /*
  * Open and close
  */
@@ -741,8 +759,91 @@ tx_isr(struct ioq_notifier *notifier)
        tasklet_schedule(&priv->txtask);
 }
 
+static void
+evq_linkstate_event(struct vbus_enet_priv *priv,
+		    struct venet_event_header *header)
+{
+	struct venet_event_linkstate *event =
+		(struct venet_event_linkstate *)header;
+
+	switch (event->state) {
+	case 0:
+		netif_carrier_off(priv->dev);
+		break;
+	case 1:
+		netif_carrier_on(priv->dev);
+		break;
+	default:
+		break;
+	}
+}
+
+static void
+deferred_evq_isr(unsigned long data)
+{
+	struct vbus_enet_priv *priv = (struct vbus_enet_priv *)data;
+	int nevents = 0;
+	struct ioq_iterator iter;
+	int ret;
+
+	PDEBUG(priv->dev, "evq: polling...\n");
+
+	/* We want to iterate on the head of the in-use index */
+	ret = ioq_iter_init(priv->evq.veq.queue, &iter, ioq_idxtype_inuse,
+			    IOQ_ITER_AUTOUPDATE);
+	BUG_ON(ret < 0);
+
+	ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+	BUG_ON(ret < 0);
+
+	/*
+	 * The EOM is indicated by finding a packet that is still owned by
+	 * the south side
+	 */
+	while (!iter.desc->sown) {
+		struct venet_event_header *header;
+
+		header = (struct venet_event_header *)iter.desc->cookie;
+
+		switch (header->id) {
+		case VENET_EVENT_LINKSTATE:
+			evq_linkstate_event(priv, header);
+			break;
+		default:
+			panic("venet: unexpected event id:%d of size %d\n",
+			      header->id, header->size);
+			break;
+		}
+
+		memset((void *)iter.desc->cookie, 0, priv->evq.evsize);
+
+		/* Advance the in-use tail */
+		ret = ioq_iter_pop(&iter, 0);
+		BUG_ON(ret < 0);
+
+		nevents++;
+	}
+
+	PDEBUG(priv->dev, "%d events received\n", nevents);
+
+	ioq_notify_enable(priv->evq.veq.queue, 0);
+}
+
+static void
+evq_isr(struct ioq_notifier *notifier)
+{
+       struct vbus_enet_priv *priv;
+
+       priv = container_of(notifier, struct vbus_enet_priv, evq.veq.notifier);
+
+       PDEBUG(priv->dev, "evq_isr\n");
+
+       ioq_notify_disable(priv->evq.veq.queue, 0);
+       tasklet_schedule(&priv->evq.task);
+}
+
 static int
-vbus_enet_negcap(struct vbus_enet_priv *priv)
+vbus_enet_sg_negcap(struct vbus_enet_priv *priv)
 {
 	struct net_device *dev = priv->dev;
 	struct venet_capabilities caps;
@@ -782,6 +883,103 @@ vbus_enet_negcap(struct vbus_enet_priv *priv)
 	return 0;
 }
 
+static int
+vbus_enet_evq_negcap(struct vbus_enet_priv *priv, unsigned long count)
+{
+	struct venet_capabilities caps;
+	int ret;
+
+	memset(&caps, 0, sizeof(caps));
+
+	caps.gid = VENET_CAP_GROUP_EVENTQ;
+	caps.bits |= VENET_CAP_EVQ_LINKSTATE;
+
+	ret = devcall(priv, VENET_FUNC_NEGCAP, &caps, sizeof(caps));
+	if (ret < 0)
+		return ret;
+
+	if (caps.bits) {
+		struct vbus_device_proxy *dev = priv->vdev;
+		struct venet_eventq_query query;
+		size_t                    poollen;
+		struct ioq_iterator       iter;
+		char                     *pool;
+		int                       i;
+
+		priv->evq.enabled = true;
+
+		if (caps.bits & VENET_CAP_EVQ_LINKSTATE) {
+			/*
+			 * We will assume there is no carrier until we get
+			 * an event telling us otherwise
+			 */
+			netif_carrier_off(priv->dev);
+			priv->evq.linkstate = true;
+		}
+
+		memset(&query, 0, sizeof(query));
+
+		ret = devcall(priv, VENET_FUNC_EVQQUERY, &query, sizeof(query));
+		if (ret < 0)
+			return ret;
+
+		priv->evq.evsize = query.evsize;
+		poollen = query.evsize * count;
+
+		pool = kzalloc(poollen, GFP_KERNEL | GFP_DMA);
+		if (!pool)
+			return -ENOMEM;
+
+		priv->evq.pool = pool;
+
+		ret = dev->ops->shm(dev, query.dpid, 0,
+				    pool, poollen, 0, NULL, 0);
+		if (ret < 0)
+			return ret;
+
+		queue_init(priv, &priv->evq.veq, query.qid, count, evq_isr);
+
+		ret = ioq_iter_init(priv->evq.veq.queue,
+				    &iter, ioq_idxtype_valid, 0);
+		BUG_ON(ret < 0);
+
+		ret = ioq_iter_seek(&iter, ioq_seek_set, 0, 0);
+		BUG_ON(ret < 0);
+
+		/* Now populate each descriptor with an empty event */
+		for (i = 0; i < count; i++) {
+			size_t offset = (i * query.evsize);
+			void *addr = &priv->evq.pool[offset];
+
+			iter.desc->ptr    = (u64)offset;
+			iter.desc->cookie = (u64)addr;
+			iter.desc->len    = query.evsize;
+
+			ret = ioq_iter_push(&iter, 0);
+			BUG_ON(ret < 0);
+		}
+
+		/* Finally, enable interrupts */
+		tasklet_init(&priv->evq.task, deferred_evq_isr,
+			     (unsigned long)priv);
+		ioq_notify_enable(priv->evq.veq.queue, 0);
+	}
+
+	return 0;
+}
+
+static int
+vbus_enet_negcap(struct vbus_enet_priv *priv)
+{
+	int ret;
+
+	ret = vbus_enet_sg_negcap(priv);
+	if (ret < 0)
+		return ret;
+
+	return vbus_enet_evq_negcap(priv, tx_ringlen);
+}
+
 static int vbus_enet_set_tx_csum(struct net_device *dev, u32 data)
 {
 	struct vbus_enet_priv *priv = netdev_priv(dev);
@@ -905,6 +1103,9 @@ vbus_enet_remove(struct vbus_device_proxy *vdev)
 	tx_teardown(priv);
 	ioq_put(priv->txq.queue);
 
+	if (priv->evq.enabled)
+		evq_teardown(priv);
+
 	dev->ops->close(dev, 0);
 
 	free_netdev(priv->dev);
diff --git a/include/linux/venet.h b/include/linux/venet.h
index 53b6958..16b0156 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -37,7 +37,8 @@ struct venet_capabilities {
 	__u32 bits;
 };
 
-#define VENET_CAP_GROUP_SG 0
+#define VENET_CAP_GROUP_SG     0
+#define VENET_CAP_GROUP_EVENTQ 1
 
 /* CAPABILITIES-GROUP SG */
 #define VENET_CAP_SG     (1 << 0)
@@ -47,6 +48,9 @@ struct venet_capabilities {
 #define VENET_CAP_UFO    (1 << 4)
 #define VENET_CAP_PMTD   (1 << 5) /* pre-mapped tx desc */
 
+/* CAPABILITIES-GROUP EVENTQ */
+#define VENET_CAP_EVQ_LINKSTATE  (1 << 0)
+
 struct venet_iov {
 	__u32 len;
 	__u64 ptr;
@@ -76,6 +80,27 @@ struct venet_sg {
 	struct venet_iov iov[1];
 };
 
+struct venet_eventq_query {
+	__u32 flags;
+	__u32 evsize;  /* size of each event */
+	__u32 dpid;    /* descriptor pool-id */
+	__u32 qid;
+	__u8  pad[16];
+};
+
+#define VENET_EVENT_LINKSTATE 0
+
+struct venet_event_header {
+	__u32 flags;
+	__u32 size;
+	__u32 id;
+};
+
+struct venet_event_linkstate {
+	struct venet_event_header header;
+	__u8                      state; /* 0 = down, 1 = up */
+};
+
 #define VSG_DESC_SIZE(count) (sizeof(struct venet_sg) + \
 			      sizeof(struct venet_iov) * ((count) - 1))
 
@@ -85,5 +110,6 @@ struct venet_sg {
 #define VENET_FUNC_NEGCAP    3 /* negotiate capabilities */
 #define VENET_FUNC_FLUSHRX   4
 #define VENET_FUNC_PMTDQUERY 5
+#define VENET_FUNC_EVQQUERY  6
 
 #endif /* _LINUX_VENET_H */


^ permalink raw reply related

* [NET PATCH 7/9] venet: use an skblist for outstanding descriptors
From: Gregory Haskins @ 2009-10-14 15:59 UTC (permalink / raw)
  To: alacrityvm-devel; +Cc: linux-kernel, netdev
In-Reply-To: <20091014154457.18864.28382.stgit@dev.haskins.net>

This will be useful later in the series so that we can switch to
an asynchronous model.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 drivers/net/vbus-enet.c |   59 +++++++++++++++++++++++++++--------------------
 1 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 5fccfd1..3032169 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -59,8 +59,11 @@ struct vbus_enet_priv {
 	struct vbus_device_proxy  *vdev;
 	struct napi_struct         napi;
 	struct vbus_enet_queue     rxq;
-	struct vbus_enet_queue     txq;
-	struct tasklet_struct      txtask;
+	struct {
+		struct vbus_enet_queue veq;
+		struct tasklet_struct  task;
+		struct sk_buff_head    outstanding;
+	} tx;
 	bool                       sg;
 	struct {
 		bool               enabled;
@@ -76,7 +79,7 @@ struct vbus_enet_priv {
 	} evq;
 };
 
-static void vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force);
+static void vbus_enet_tx_reap(struct vbus_enet_priv *priv);
 
 static struct vbus_enet_priv *
 napi_to_priv(struct napi_struct *napi)
@@ -216,7 +219,7 @@ rx_teardown(struct vbus_enet_priv *priv)
 static int
 tx_setup(struct vbus_enet_priv *priv)
 {
-	struct ioq *ioq    = priv->txq.queue;
+	struct ioq *ioq    = priv->tx.veq.queue;
 	size_t      iovlen = sizeof(struct venet_iov) * (MAX_SKB_FRAGS-1);
 	size_t      len    = sizeof(struct venet_sg) + iovlen;
 	struct ioq_iterator iter;
@@ -233,7 +236,7 @@ tx_setup(struct vbus_enet_priv *priv)
 	/* pre-allocate our descriptor pool if pmtd is enabled */
 	if (priv->pmtd.enabled) {
 		struct vbus_device_proxy *dev = priv->vdev;
-		size_t poollen = len * priv->txq.count;
+		size_t poollen = len * priv->tx.veq.count;
 		char *pool;
 		int shmid;
 
@@ -262,7 +265,7 @@ tx_setup(struct vbus_enet_priv *priv)
 	/*
 	 * Now populate each descriptor with an empty SG descriptor
 	 */
-	for (i = 0; i < priv->txq.count; i++) {
+	for (i = 0; i < priv->tx.veq.count; i++) {
 		struct venet_sg *vsg;
 
 		if (priv->pmtd.enabled) {
@@ -291,12 +294,14 @@ tx_setup(struct vbus_enet_priv *priv)
 static void
 tx_teardown(struct vbus_enet_priv *priv)
 {
-	struct ioq *ioq = priv->txq.queue;
+	struct ioq *ioq = priv->tx.veq.queue;
 	struct ioq_iterator iter;
+	struct sk_buff *skb;
 	int ret;
 
 	/* forcefully free all outstanding transmissions */
-	vbus_enet_tx_reap(priv, 1);
+	while ((skb = __skb_dequeue(&priv->tx.outstanding)))
+		dev_kfree_skb(skb);
 
 	if (!priv->sg)
 		/*
@@ -529,7 +534,7 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
 
 	spin_lock_irqsave(&priv->lock, flags);
 
-	if (ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
+	if (ioq_full(priv->tx.veq.queue, ioq_idxtype_valid)) {
 		/*
 		 * We must flow-control the kernel by disabling the
 		 * queue
@@ -544,7 +549,7 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
 	 * We want to iterate on the tail of both the "inuse" and "valid" index
 	 * so we specify the "both" index
 	 */
-	ret = ioq_iter_init(priv->txq.queue, &iter, ioq_idxtype_both,
+	ret = ioq_iter_init(priv->tx.veq.queue, &iter, ioq_idxtype_both,
 			    IOQ_ITER_AUTOUPDATE);
 	BUG_ON(ret < 0);
 
@@ -620,6 +625,8 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
 	priv->dev->stats.tx_packets++;
 	priv->dev->stats.tx_bytes += skb->len;
 
+	__skb_queue_tail(&priv->tx.outstanding, skb);
+
 	/*
 	 * This advances both indexes together implicitly, and then
 	 * signals the south side to consume the packet
@@ -629,7 +636,7 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
 
 	dev->trans_start = jiffies; /* save the timestamp */
 
-	if (ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
+	if (ioq_full(priv->tx.veq.queue, ioq_idxtype_valid)) {
 		/*
 		 * If the queue is congested, we must flow-control the kernel
 		 */
@@ -648,7 +655,7 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
  * assumes priv->lock held
  */
 static void
-vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
+vbus_enet_tx_reap(struct vbus_enet_priv *priv)
 {
 	struct ioq_iterator iter;
 	int ret;
@@ -658,7 +665,7 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
 	 * do not want the iter_pop (below) to flip the ownership, so
 	 * we set the NOFLIPOWNER option
 	 */
-	ret = ioq_iter_init(priv->txq.queue, &iter, ioq_idxtype_valid,
+	ret = ioq_iter_init(priv->tx.veq.queue, &iter, ioq_idxtype_valid,
 			    IOQ_ITER_NOFLIPOWNER);
 	BUG_ON(ret < 0);
 
@@ -669,7 +676,7 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
 	 * We are done once we find the first packet either invalid or still
 	 * owned by the south-side
 	 */
-	while (iter.desc->valid && (!iter.desc->sown || force)) {
+	while (iter.desc->valid && !iter.desc->sown) {
 		struct sk_buff *skb;
 
 		if (priv->sg) {
@@ -687,6 +694,7 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
 		/* Reset the descriptor */
 		iter.desc->valid  = 0;
 
+		__skb_unlink(skb, &priv->tx.outstanding);
 		dev_kfree_skb(skb);
 
 		/* Advance the valid-index head */
@@ -699,7 +707,7 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
 	 * processing
 	 */
 	if (netif_queue_stopped(priv->dev)
-	    && !ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
+	    && !ioq_full(priv->tx.veq.queue, ioq_idxtype_valid)) {
 		PDEBUG(priv->dev, "re-enabling tx queue\n");
 		netif_wake_queue(priv->dev);
 	}
@@ -714,7 +722,7 @@ vbus_enet_timeout(struct net_device *dev)
 	dev_dbg(&dev->dev, "Transmit timeout\n");
 
 	spin_lock_irqsave(&priv->lock, flags);
-	vbus_enet_tx_reap(priv, 0);
+	vbus_enet_tx_reap(priv);
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
@@ -740,10 +748,10 @@ deferred_tx_isr(unsigned long data)
 	PDEBUG(priv->dev, "deferred_tx_isr\n");
 
 	spin_lock_irqsave(&priv->lock, flags);
-	vbus_enet_tx_reap(priv, 0);
+	vbus_enet_tx_reap(priv);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	ioq_notify_enable(priv->txq.queue, 0);
+	ioq_notify_enable(priv->tx.veq.queue, 0);
 }
 
 static void
@@ -751,12 +759,12 @@ tx_isr(struct ioq_notifier *notifier)
 {
        struct vbus_enet_priv *priv;
 
-       priv = container_of(notifier, struct vbus_enet_priv, txq.notifier);
+       priv = container_of(notifier, struct vbus_enet_priv, tx.veq.notifier);
 
        PDEBUG(priv->dev, "tx_isr\n");
 
-       ioq_notify_disable(priv->txq.queue, 0);
-       tasklet_schedule(&priv->txtask);
+       ioq_notify_disable(priv->tx.veq.queue, 0);
+       tasklet_schedule(&priv->tx.task);
 }
 
 static void
@@ -1043,16 +1051,17 @@ vbus_enet_probe(struct vbus_device_proxy *vdev)
 		goto out_free;
 	}
 
-	tasklet_init(&priv->txtask, deferred_tx_isr, (unsigned long)priv);
+	tasklet_init(&priv->tx.task, deferred_tx_isr, (unsigned long)priv);
+	skb_queue_head_init(&priv->tx.outstanding);
 
 	queue_init(priv, &priv->rxq, VENET_QUEUE_RX, rx_ringlen, rx_isr);
-	queue_init(priv, &priv->txq, VENET_QUEUE_TX, tx_ringlen, tx_isr);
+	queue_init(priv, &priv->tx.veq, VENET_QUEUE_TX, tx_ringlen, tx_isr);
 
 	rx_setup(priv);
 	tx_setup(priv);
 
 	ioq_notify_enable(priv->rxq.queue, 0);  /* enable interrupts */
-	ioq_notify_enable(priv->txq.queue, 0);
+	ioq_notify_enable(priv->tx.veq.queue, 0);
 
 	dev->netdev_ops     = &vbus_enet_netdev_ops;
 	dev->watchdog_timeo = 5 * HZ;
@@ -1101,7 +1110,7 @@ vbus_enet_remove(struct vbus_device_proxy *vdev)
 	ioq_put(priv->rxq.queue);
 
 	tx_teardown(priv);
-	ioq_put(priv->txq.queue);
+	ioq_put(priv->tx.veq.queue);
 
 	if (priv->evq.enabled)
 		evq_teardown(priv);

^ permalink raw reply related

* [NET PATCH 8/9] venet: add a tx-complete event for out-of-order support
From: Gregory Haskins @ 2009-10-14 15:59 UTC (permalink / raw)
  To: alacrityvm-devel; +Cc: linux-kernel, netdev
In-Reply-To: <20091014154457.18864.28382.stgit@dev.haskins.net>

This paves the way for zero-copy support since we cannot predict
the order in which paged-skbs may actually be consumed.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 drivers/net/vbus-enet.c |   77 ++++++++++++++++++++++++++++++++++++++---------
 include/linux/venet.h   |    8 +++++
 2 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 3032169..e8a0553 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -72,6 +72,7 @@ struct vbus_enet_priv {
 	struct {
 		bool                   enabled;
 		bool                   linkstate;
+		bool                   txc;
 		unsigned long          evsize;
 		struct vbus_enet_queue veq;
 		struct tasklet_struct  task;
@@ -649,6 +650,17 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
+/* assumes priv->lock held */
+static void
+vbus_enet_skb_complete(struct vbus_enet_priv *priv, struct sk_buff *skb)
+{
+	PDEBUG(priv->dev, "completed sending %d bytes\n",
+	       skb->len);
+
+	__skb_unlink(skb, &priv->tx.outstanding);
+	dev_kfree_skb(skb);
+}
+
 /*
  * reclaim any outstanding completed tx packets
  *
@@ -677,26 +689,28 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv)
 	 * owned by the south-side
 	 */
 	while (iter.desc->valid && !iter.desc->sown) {
-		struct sk_buff *skb;
 
-		if (priv->sg) {
-			struct venet_sg *vsg;
+		if (!priv->evq.txc) {
+			struct sk_buff *skb;
 
-			vsg = (struct venet_sg *)iter.desc->cookie;
-			skb = (struct sk_buff *)vsg->cookie;
+			if (priv->sg) {
+				struct venet_sg *vsg;
 
-		} else {
-			skb = (struct sk_buff *)iter.desc->cookie;
-		}
+				vsg = (struct venet_sg *)iter.desc->cookie;
+				skb = (struct sk_buff *)vsg->cookie;
+			} else
+				skb = (struct sk_buff *)iter.desc->cookie;
 
-		PDEBUG(priv->dev, "completed sending %d bytes\n", skb->len);
+			/*
+			 * If TXC is not enabled, we are required to free
+			 * the buffer resources now
+			 */
+			vbus_enet_skb_complete(priv, skb);
+		}
 
 		/* Reset the descriptor */
 		iter.desc->valid  = 0;
 
-		__skb_unlink(skb, &priv->tx.outstanding);
-		dev_kfree_skb(skb);
-
 		/* Advance the valid-index head */
 		ret = ioq_iter_pop(&iter, 0);
 		BUG_ON(ret < 0);
@@ -787,6 +801,22 @@ evq_linkstate_event(struct vbus_enet_priv *priv,
 }
 
 static void
+evq_txc_event(struct vbus_enet_priv *priv,
+	      struct venet_event_header *header)
+{
+	struct venet_event_txc *event =
+		(struct venet_event_txc *)header;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	vbus_enet_tx_reap(priv);
+	vbus_enet_skb_complete(priv, (struct sk_buff *)event->cookie);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void
 deferred_evq_isr(unsigned long data)
 {
 	struct vbus_enet_priv *priv = (struct vbus_enet_priv *)data;
@@ -817,6 +847,9 @@ deferred_evq_isr(unsigned long data)
 		case VENET_EVENT_LINKSTATE:
 			evq_linkstate_event(priv, header);
 			break;
+		case VENET_EVENT_TXC:
+			evq_txc_event(priv, header);
+			break;
 		default:
 			panic("venet: unexpected event id:%d of size %d\n",
 			      header->id, header->size);
@@ -901,6 +934,7 @@ vbus_enet_evq_negcap(struct vbus_enet_priv *priv, unsigned long count)
 
 	caps.gid = VENET_CAP_GROUP_EVENTQ;
 	caps.bits |= VENET_CAP_EVQ_LINKSTATE;
+	caps.bits |= VENET_CAP_EVQ_TXC;
 
 	ret = devcall(priv, VENET_FUNC_NEGCAP, &caps, sizeof(caps));
 	if (ret < 0)
@@ -925,6 +959,9 @@ vbus_enet_evq_negcap(struct vbus_enet_priv *priv, unsigned long count)
 			priv->evq.linkstate = true;
 		}
 
+		if (caps.bits & VENET_CAP_EVQ_TXC)
+			priv->evq.txc = true;
+
 		memset(&query, 0, sizeof(query));
 
 		ret = devcall(priv, VENET_FUNC_EVQQUERY, &query, sizeof(query));
@@ -1051,7 +1088,6 @@ vbus_enet_probe(struct vbus_device_proxy *vdev)
 		goto out_free;
 	}
 
-	tasklet_init(&priv->tx.task, deferred_tx_isr, (unsigned long)priv);
 	skb_queue_head_init(&priv->tx.outstanding);
 
 	queue_init(priv, &priv->rxq, VENET_QUEUE_RX, rx_ringlen, rx_isr);
@@ -1060,8 +1096,19 @@ vbus_enet_probe(struct vbus_device_proxy *vdev)
 	rx_setup(priv);
 	tx_setup(priv);
 
-	ioq_notify_enable(priv->rxq.queue, 0);  /* enable interrupts */
-	ioq_notify_enable(priv->tx.veq.queue, 0);
+	ioq_notify_enable(priv->rxq.queue, 0);  /* enable rx interrupts */
+
+	if (!priv->evq.txc) {
+		/*
+		 * If the TXC feature is present, we will receive our
+		 * tx-complete notification via the event-channel.  Therefore,
+		 * we only enable txq interrupts if the TXC feature is not
+		 * present.
+		 */
+		tasklet_init(&priv->tx.task, deferred_tx_isr,
+			     (unsigned long)priv);
+		ioq_notify_enable(priv->tx.veq.queue, 0);
+	}
 
 	dev->netdev_ops     = &vbus_enet_netdev_ops;
 	dev->watchdog_timeo = 5 * HZ;
diff --git a/include/linux/venet.h b/include/linux/venet.h
index 16b0156..b6bfd91 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -50,6 +50,7 @@ struct venet_capabilities {
 
 /* CAPABILITIES-GROUP EVENTQ */
 #define VENET_CAP_EVQ_LINKSTATE  (1 << 0)
+#define VENET_CAP_EVQ_TXC        (1 << 1) /* tx-complete */
 
 struct venet_iov {
 	__u32 len;
@@ -89,6 +90,7 @@ struct venet_eventq_query {
 };
 
 #define VENET_EVENT_LINKSTATE 0
+#define VENET_EVENT_TXC       1
 
 struct venet_event_header {
 	__u32 flags;
@@ -101,6 +103,12 @@ struct venet_event_linkstate {
 	__u8                      state; /* 0 = down, 1 = up */
 };
 
+struct venet_event_txc {
+	struct venet_event_header header;
+	__u32                     txqid;
+	__u64                     cookie;
+};
+
 #define VSG_DESC_SIZE(count) (sizeof(struct venet_sg) + \
 			      sizeof(struct venet_iov) * ((count) - 1))
 


^ permalink raw reply related

* [NET PATCH 9/9] venet: add Layer-4 Reassembler Offload (L4RO) support
From: Gregory Haskins @ 2009-10-14 15:59 UTC (permalink / raw)
  To: alacrityvm-devel; +Cc: linux-kernel, netdev
In-Reply-To: <20091014154457.18864.28382.stgit@dev.haskins.net>

This is the converse to GSO.  It lets us receive fully reassembled L4
frames from the host.  This allows us to reduce the interrupt rate of
the guest, take advantage of host-based hardware that does reassembly,
and skip the segmentation-and-reassembly (SAR) overhead for localhost
(host->guest, guest->guest) connectivity.

We accomplish this by re-using the SG support from the transmit/GSO side
and supplying a "page-queue" of free pages for use when we need
frames larger than the MTU.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---

 drivers/net/vbus-enet.c |  384 +++++++++++++++++++++++++++++++++++++++++++----
 include/linux/venet.h   |   10 +
 2 files changed, 365 insertions(+), 29 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index e8a0553..6fe2241 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -47,6 +47,8 @@ module_param(sg_enabled, int, 0444);
 
 #define PDEBUG(_dev, fmt, args...) dev_dbg(&(_dev)->dev, fmt, ## args)
 
+#define SG_DESC_SIZE VSG_DESC_SIZE(MAX_SKB_FRAGS)
+
 struct vbus_enet_queue {
 	struct ioq              *queue;
 	struct ioq_notifier      notifier;
@@ -78,6 +80,14 @@ struct vbus_enet_priv {
 		struct tasklet_struct  task;
 		char                  *pool;
 	} evq;
+	struct {
+		bool                   available;
+		char                  *pool;
+		struct vbus_enet_queue pageq;
+	} l4ro;
+
+	struct sk_buff *(*import)(struct vbus_enet_priv *priv,
+				  struct ioq_ring_desc *desc);
 };
 
 static void vbus_enet_tx_reap(struct vbus_enet_priv *priv);
@@ -127,29 +137,88 @@ devcall(struct vbus_enet_priv *priv, u32 func, void *data, size_t len)
  */
 
 static void
-rxdesc_alloc(struct net_device *dev, struct ioq_ring_desc *desc, size_t len)
+rxdesc_alloc(struct vbus_enet_priv *priv, struct ioq_ring_desc *desc, size_t len)
 {
+	struct net_device *dev = priv->dev;
 	struct sk_buff *skb;
 
 	len += ETH_HLEN;
 
-	skb = netdev_alloc_skb(dev, len + 2);
+	skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
 	BUG_ON(!skb);
 
 	skb_reserve(skb, NET_IP_ALIGN); /* align IP on 16B boundary */
 
-	desc->cookie = (u64)skb;
-	desc->ptr    = (u64)__pa(skb->data);
-	desc->len    = len; /* total length  */
+	if (priv->l4ro.available) {
+		/*
+		 * We will populate an SG descriptor initially with one
+		 * IOV filled with an MTU SKB.  If the packet needs to be
+		 * larger than MTU, the host will grab pages out of the
+		 * page-queue and populate additional IOVs
+		 */
+		struct venet_sg *vsg = (struct venet_sg *)desc->cookie;
+		struct venet_iov *iov = &vsg->iov[0];
+
+		memset(vsg, 0, SG_DESC_SIZE);
+
+		vsg->cookie  = (u64)skb;
+		vsg->count   = 1;
+
+		iov->ptr     = (u64)__pa(skb->data);
+		iov->len     = len;
+	} else {
+		desc->cookie = (u64)skb;
+		desc->ptr    = (u64)__pa(skb->data);
+		desc->len    = len; /* total length  */
+	}
+
 	desc->valid  = 1;
 }
 
 static void
+rx_pageq_refill(struct vbus_enet_priv *priv)
+{
+	struct ioq *ioq = priv->l4ro.pageq.queue;
+	struct ioq_iterator iter;
+	int ret;
+
+	if (ioq_full(ioq, ioq_idxtype_inuse))
+		/* nothing to do if the pageq is already fully populated */
+		return;
+
+	ret = ioq_iter_init(ioq, &iter, ioq_idxtype_inuse, 0);
+	BUG_ON(ret < 0); /* will never fail unless seriously broken */
+
+	ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+	BUG_ON(ret < 0);
+
+	/*
+	 * Now populate each descriptor with an empty page
+	 */
+	while (!iter.desc->sown) {
+		struct page *page;
+
+		page = alloc_page(GFP_KERNEL);
+		BUG_ON(!page);
+
+		iter.desc->cookie = (u64)page;
+		iter.desc->ptr    = (u64)__pa(page_address(page));
+		iter.desc->len    = PAGE_SIZE;
+
+		ret = ioq_iter_push(&iter, 0);
+		BUG_ON(ret < 0);
+	}
+
+	ioq_signal(ioq, 0);
+}
+
+static void
 rx_setup(struct vbus_enet_priv *priv)
 {
 	struct ioq *ioq = priv->rxq.queue;
 	struct ioq_iterator iter;
 	int ret;
+	int i = 0;
 
 	/*
 	 * We want to iterate on the "valid" index.  By default the iterator
@@ -170,10 +239,19 @@ rx_setup(struct vbus_enet_priv *priv)
 	BUG_ON(ret < 0);
 
 	/*
-	 * Now populate each descriptor with an empty SKB and mark it valid
+	 * Now populate each descriptor with an empty buffer and mark it valid
 	 */
 	while (!iter.desc->valid) {
-		rxdesc_alloc(priv->dev, iter.desc, priv->dev->mtu);
+		if (priv->l4ro.available) {
+			size_t offset = (i * SG_DESC_SIZE);
+			void *addr = &priv->l4ro.pool[offset];
+
+			iter.desc->ptr    = (u64)offset;
+			iter.desc->cookie = (u64)addr;
+			iter.desc->len    = SG_DESC_SIZE;
+		}
+
+		rxdesc_alloc(priv, iter.desc, priv->dev->mtu);
 
 		/*
 		 * This push operation will simultaneously advance the
@@ -182,11 +260,16 @@ rx_setup(struct vbus_enet_priv *priv)
 		 */
 		ret = ioq_iter_push(&iter, 0);
 		BUG_ON(ret < 0);
+
+		i++;
 	}
+
+	if (priv->l4ro.available)
+		rx_pageq_refill(priv);
 }
 
 static void
-rx_teardown(struct vbus_enet_priv *priv)
+rx_rxq_teardown(struct vbus_enet_priv *priv)
 {
 	struct ioq *ioq = priv->rxq.queue;
 	struct ioq_iterator iter;
@@ -202,7 +285,25 @@ rx_teardown(struct vbus_enet_priv *priv)
 	 * free each valid descriptor
 	 */
 	while (iter.desc->valid) {
-		struct sk_buff *skb = (struct sk_buff *)iter.desc->cookie;
+		struct sk_buff *skb;
+
+		if (priv->l4ro.available) {
+			struct venet_sg *vsg;
+			int i;
+
+			vsg = (struct venet_sg *)iter.desc->cookie;
+
+			/* skip i=0, since that is the skb->data IOV */
+			for (i = 1; i < vsg->count; i++) {
+				struct venet_iov *iov = &vsg->iov[i];
+				struct page *page = (struct page *)iov->ptr;
+
+				put_page(page);
+			}
+
+			skb = (struct sk_buff *)vsg->cookie;
+		} else
+			skb = (struct sk_buff *)iter.desc->cookie;
 
 		iter.desc->valid = 0;
 		wmb();
@@ -217,12 +318,54 @@ rx_teardown(struct vbus_enet_priv *priv)
 	}
 }
 
+static void
+rx_l4ro_teardown(struct vbus_enet_priv *priv)
+{
+	struct ioq *ioq = priv->l4ro.pageq.queue;
+	struct ioq_iterator iter;
+	int ret;
+
+	ret = ioq_iter_init(ioq, &iter, ioq_idxtype_inuse, 0);
+	BUG_ON(ret < 0);
+
+	ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+	BUG_ON(ret < 0);
+
+	/*
+	 * free each valid descriptor
+	 */
+	while (iter.desc->sown) {
+		struct page *page = (struct page *)iter.desc->cookie;
+
+		iter.desc->valid = 0;
+		wmb();
+
+		iter.desc->ptr = 0;
+		iter.desc->cookie = 0;
+
+		ret = ioq_iter_pop(&iter, 0);
+		BUG_ON(ret < 0);
+
+		put_page(page);
+	}
+
+	ioq_put(ioq);
+	kfree(priv->l4ro.pool);
+}
+
+static void
+rx_teardown(struct vbus_enet_priv *priv)
+{
+	rx_rxq_teardown(priv);
+
+	if (priv->l4ro.available)
+		rx_l4ro_teardown(priv);
+}
+
 static int
 tx_setup(struct vbus_enet_priv *priv)
 {
 	struct ioq *ioq    = priv->tx.veq.queue;
-	size_t      iovlen = sizeof(struct venet_iov) * (MAX_SKB_FRAGS-1);
-	size_t      len    = sizeof(struct venet_sg) + iovlen;
 	struct ioq_iterator iter;
 	int i;
 	int ret;
@@ -237,7 +380,7 @@ tx_setup(struct vbus_enet_priv *priv)
 	/* pre-allocate our descriptor pool if pmtd is enabled */
 	if (priv->pmtd.enabled) {
 		struct vbus_device_proxy *dev = priv->vdev;
-		size_t poollen = len * priv->tx.veq.count;
+		size_t poollen = SG_DESC_SIZE * priv->tx.veq.count;
 		char *pool;
 		int shmid;
 
@@ -270,12 +413,12 @@ tx_setup(struct vbus_enet_priv *priv)
 		struct venet_sg *vsg;
 
 		if (priv->pmtd.enabled) {
-			size_t offset = (i * len);
+			size_t offset = (i * SG_DESC_SIZE);
 
 			vsg = (struct venet_sg *)&priv->pmtd.pool[offset];
 			iter.desc->ptr = (u64)offset;
 		} else {
-			vsg = kzalloc(len, GFP_KERNEL);
+			vsg = kzalloc(SG_DESC_SIZE, GFP_KERNEL);
 			if (!vsg)
 				return -ENOMEM;
 
@@ -283,7 +426,7 @@ tx_setup(struct vbus_enet_priv *priv)
 		}
 
 		iter.desc->cookie = (u64)vsg;
-		iter.desc->len    = len;
+		iter.desc->len    = SG_DESC_SIZE;
 
 		ret = ioq_iter_seek(&iter, ioq_seek_next, 0, 0);
 		BUG_ON(ret < 0);
@@ -444,6 +587,120 @@ vbus_enet_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+static struct sk_buff *
+vbus_enet_l4ro_import(struct vbus_enet_priv *priv, struct ioq_ring_desc *desc)
+{
+	struct venet_sg *vsg = (struct venet_sg *)desc->cookie;
+	struct sk_buff *skb = (struct sk_buff *)vsg->cookie;
+	struct skb_shared_info *sinfo = skb_shinfo(skb);
+	int i;
+
+	rx_pageq_refill(priv);
+
+	if (!vsg->len)
+		/*
+		 * the device may send a zero-length packet when it's
+		 * flushing references on the ring.  We can just drop
+		 * these on the floor
+		 */
+		goto fail;
+
+	/* advance only by the linear portion in IOV[0] */
+	skb_put(skb, vsg->iov[0].len);
+
+	/* skip i=0, since that is the skb->data IOV */
+	for (i = 1; i < vsg->count; i++) {
+		struct venet_iov *iov = &vsg->iov[i];
+		struct page *page = (struct page *)iov->ptr;
+		skb_frag_t *f = &sinfo->frags[i-1];
+
+		f->page        = page;
+		f->page_offset = 0;
+		f->size        = iov->len;
+
+		PDEBUG(priv->dev, "SG: Importing %d byte page[%i]\n",
+		       f->size, i);
+
+		skb->data_len += f->size;
+		skb->len      += f->size;
+		skb->truesize += f->size;
+		sinfo->nr_frags++;
+	}
+
+	if (vsg->flags & VENET_SG_FLAG_NEEDS_CSUM
+	    && !skb_partial_csum_set(skb, vsg->csum.start,
+				     vsg->csum.offset)) {
+		priv->dev->stats.rx_frame_errors++;
+		goto fail;
+	}
+
+	if (vsg->flags & VENET_SG_FLAG_GSO) {
+		PDEBUG(priv->dev, "L4RO packet detected\n");
+
+		switch (vsg->gso.type) {
+		case VENET_GSO_TYPE_TCPV4:
+			sinfo->gso_type = SKB_GSO_TCPV4;
+			break;
+		case VENET_GSO_TYPE_TCPV6:
+			sinfo->gso_type = SKB_GSO_TCPV6;
+			break;
+		case VENET_GSO_TYPE_UDP:
+			sinfo->gso_type = SKB_GSO_UDP;
+			break;
+		default:
+			PDEBUG(priv->dev, "Illegal L4RO type: %d\n",
+			       vsg->gso.type);
+			priv->dev->stats.rx_frame_errors++;
+			goto fail;
+		}
+
+		if (vsg->flags & VENET_SG_FLAG_ECN)
+			sinfo->gso_type |= SKB_GSO_TCP_ECN;
+
+		sinfo->gso_size = vsg->gso.size;
+		if (sinfo->gso_size == 0) {
+			PDEBUG(priv->dev, "Illegal L4RO size: %d\n",
+			       vsg->gso.size);
+			priv->dev->stats.rx_frame_errors++;
+			goto fail;
+		}
+
+		/*
+		 * Header must be checked, and gso_segs
+		 * computed.
+		 */
+		sinfo->gso_type |= SKB_GSO_DODGY;
+		sinfo->gso_segs = 0;
+	}
+
+	return skb;
+
+fail:
+	dev_kfree_skb(skb);
+
+	return NULL;
+}
+
+static struct sk_buff *
+vbus_enet_flat_import(struct vbus_enet_priv *priv, struct ioq_ring_desc *desc)
+{
+	struct sk_buff *skb = (struct sk_buff *)desc->cookie;
+
+	if (!desc->len) {
+		/*
+		 * the device may send a zero-length packet when it's
+		 * flushing references on the ring.  We can just drop
+		 * these on the floor
+		 */
+		dev_kfree_skb(skb);
+		return NULL;
+	}
+
+	skb_put(skb, desc->len);
+
+	return skb;
+}
+
 /*
  * The poll implementation.
  */
@@ -471,15 +728,14 @@ vbus_enet_poll(struct napi_struct *napi, int budget)
 	 * the south side
 	 */
 	while ((npackets < budget) && (!iter.desc->sown)) {
-		struct sk_buff *skb = (struct sk_buff *)iter.desc->cookie;
-
-		if (iter.desc->len) {
-			skb_put(skb, iter.desc->len);
+		struct sk_buff *skb;
 
+		skb = priv->import(priv, iter.desc);
+		if (skb) {
 			/* Maintain stats */
 			npackets++;
 			priv->dev->stats.rx_packets++;
-			priv->dev->stats.rx_bytes += iter.desc->len;
+			priv->dev->stats.rx_bytes += skb->len;
 
 			/* Pass the buffer up to the stack */
 			skb->dev      = priv->dev;
@@ -487,16 +743,10 @@ vbus_enet_poll(struct napi_struct *napi, int budget)
 			netif_receive_skb(skb);
 
 			mb();
-		} else
-			/*
-			 * the device may send a zero-length packet when its
-			 * flushing references on the ring.  We can just drop
-			 * these on the floor
-			 */
-			dev_kfree_skb(skb);
+		}
 
 		/* Grab a new buffer to put in the ring */
-		rxdesc_alloc(priv->dev, iter.desc, priv->dev->mtu);
+		rxdesc_alloc(priv, iter.desc, priv->dev->mtu);
 
 		/* Advance the in-use tail */
 		ret = ioq_iter_pop(&iter, 0);
@@ -1014,6 +1264,69 @@ vbus_enet_evq_negcap(struct vbus_enet_priv *priv, unsigned long count)
 }
 
 static int
+vbus_enet_l4ro_negcap(struct vbus_enet_priv *priv, unsigned long count)
+{
+	struct venet_capabilities caps;
+	int ret;
+
+	memset(&caps, 0, sizeof(caps));
+
+	caps.gid = VENET_CAP_GROUP_L4RO;
+	caps.bits |= (VENET_CAP_SG|VENET_CAP_TSO4|VENET_CAP_TSO6
+		      |VENET_CAP_ECN);
+
+	ret = devcall(priv, VENET_FUNC_NEGCAP, &caps, sizeof(caps));
+	if (ret < 0) {
+		printk(KERN_ERR "Error negotiating L4RO: %d\n", ret);
+		return ret;
+	}
+
+	if (caps.bits & VENET_CAP_SG) {
+		struct vbus_device_proxy *dev = priv->vdev;
+		size_t                    poollen = SG_DESC_SIZE * count;
+		struct venet_l4ro_query    query;
+		char                     *pool;
+
+		memset(&query, 0, sizeof(query));
+
+		ret = devcall(priv, VENET_FUNC_L4ROQUERY, &query, sizeof(query));
+		if (ret < 0) {
+			printk(KERN_ERR "Error querying L4RO: %d\n", ret);
+			return ret;
+		}
+
+		pool = kzalloc(poollen, GFP_KERNEL | GFP_DMA);
+		if (!pool)
+			return -ENOMEM;
+
+		/*
+		 * pre-mapped descriptor pool
+		 */
+		ret = dev->ops->shm(dev, query.dpid, 0,
+				    pool, poollen, 0, NULL, 0);
+		if (ret < 0) {
+			printk(KERN_ERR "Error registering L4RO pool: %d\n",
+			       ret);
+			kfree(pool);
+			return ret;
+		}
+
+		/*
+		 * page-queue: contains a ring of arbitrary pages for
+		 * consumption by the host for when the SG::IOV count exceeds
+		 * one MTU frame.  All we need to do is keep it populated
+		 * with free pages.
+		 */
+		queue_init(priv, &priv->l4ro.pageq, query.pqid, count, NULL);
+
+		priv->l4ro.pool      = pool;
+		priv->l4ro.available = true;
+	}
+
+	return 0;
+}
+
+static int
 vbus_enet_negcap(struct vbus_enet_priv *priv)
 {
 	int ret;
@@ -1022,7 +1335,15 @@ vbus_enet_negcap(struct vbus_enet_priv *priv)
 	if (ret < 0)
 		return ret;
 
-	return vbus_enet_evq_negcap(priv, tx_ringlen);
+	ret = vbus_enet_evq_negcap(priv, tx_ringlen);
+	if (ret < 0)
+		return ret;
+
+	ret = vbus_enet_l4ro_negcap(priv, rx_ringlen);
+	if (ret < 0)
+		return ret;
+
+	return 0;
 }
 
 static int vbus_enet_set_tx_csum(struct net_device *dev, u32 data)
@@ -1088,6 +1409,11 @@ vbus_enet_probe(struct vbus_device_proxy *vdev)
 		goto out_free;
 	}
 
+	if (priv->l4ro.available)
+		priv->import = &vbus_enet_l4ro_import;
+	else
+		priv->import = &vbus_enet_flat_import;
+
 	skb_queue_head_init(&priv->tx.outstanding);
 
 	queue_init(priv, &priv->rxq, VENET_QUEUE_RX, rx_ringlen, rx_isr);
diff --git a/include/linux/venet.h b/include/linux/venet.h
index b6bfd91..0578d79 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -39,6 +39,7 @@ struct venet_capabilities {
 
 #define VENET_CAP_GROUP_SG     0
 #define VENET_CAP_GROUP_EVENTQ 1
+#define VENET_CAP_GROUP_L4RO    2 /* layer-4 reassem offloading */
 
 /* CAPABILITIES-GROUP SG */
 #define VENET_CAP_SG     (1 << 0)
@@ -109,6 +110,14 @@ struct venet_event_txc {
 	__u64                     cookie;
 };
 
+struct venet_l4ro_query {
+	__u32 flags;
+	__u32 dpid;    /* descriptor pool-id */
+	__u32 pqid;    /* page queue-id */
+	__u8  pad[20];
+};
+
+
 #define VSG_DESC_SIZE(count) (sizeof(struct venet_sg) + \
 			      sizeof(struct venet_iov) * ((count) - 1))
 
@@ -119,5 +128,6 @@ struct venet_event_txc {
 #define VENET_FUNC_FLUSHRX   4
 #define VENET_FUNC_PMTDQUERY 5
 #define VENET_FUNC_EVQQUERY  6
+#define VENET_FUNC_L4ROQUERY  7
 
 #endif /* _LINUX_VENET_H */

^ permalink raw reply related

* Re: query: tcpdump versus atomic?
From: Stephen Hemminger @ 2009-10-14 15:59 UTC (permalink / raw)
  To: William Allen Simpson; +Cc: netdev
In-Reply-To: <4AD5EC2C.6070005@gmail.com>

On Wed, 14 Oct 2009 11:20:12 -0400
William Allen Simpson <william.allen.simpson@gmail.com> wrote:

> William Allen Simpson wrote:
> > Anybody know what code path tcpdump changes to running atomic?
> > 
> > Is there a function to test whether you're running atomic?
> > 
> To partially answer my own question, after laboriously #if'ing compiling
> section by section, it affects the tcp_minisockets.c code at
> tcp_create_openreq_child().
> 
> I've not found a function to test.  I've found sk->sk_allocation, but
> that doesn't seem to be dynamically updated to reflect the current state.
> 
> Anyway, sorry David, but there's at least two GFP_ATOMIC here (one existing,
> one new).  I've managed to change the others, by careful rearrangement.  At
> least, I hope so, until some future testing reveals otherwise....
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Did you look at your Ethernet driver's code for turning on promiscuous mode?
It could be leaving IRQs or bottom halves disabled.

-- 

^ permalink raw reply

* Re: PF_RING: Include in main line kernel?
From: Stephen Hemminger @ 2009-10-14 16:01 UTC (permalink / raw)
  To: Brad Doctor; +Cc: netdev, Luca Deri
In-Reply-To: <a07586b0910140733s1976cd05u39286de42d9fac23@mail.gmail.com>

On Wed, 14 Oct 2009 08:33:08 -0600
Brad Doctor <brad.doctor@gmail.com> wrote:

> Greetings,
> 
> On behalf of the users and developers of the PF_RING project, we would
> like to ask consideration to include the PF_RING module in the main
> line kernel.
> 
> PF_RING (http://www.ntop.org/PF_RING.html) is a kernel module that
> implements an mmap()-ed memory ring for accelerating packet capture
> and for providing all the basic features a network monitoring
> application needs. PF_RING includes several features such as packet
> filtering, balancing across capture applications, packet reflection
> (i.e. capture application can decide to bounce selected packets onto
> an as-specified interface). Packets are filtered both using BPF and
> using ACL-like rules (e.g. tcp and ports from 80 to 100). Using
> PF_RING it is also possible to exploit multiple RX queues provided by
> modern NIC adapters. PF_RING achieves a significant speedup by making
> only one copy of the packet. Additionally, PF_RING is able to operate
> in a capture-only installation, further increasing performance.
> 
> PF_RING has been around since 2003 and is very mature with an active
> contributing developer base. The developer and user community use a
> mailing list (http://listgateway.unipi.it/pipermail/ntop-misc/) for
> discussions and submissions. PF_RING is used in several projects,
> ranging from distributions such as DD-WRT/OpenWrt to improving
> performance of applications like Snort and Wireshark. Many commercial
> companies around the world in the field of intrusion detection and
> traffic analysis rely on PF_RING for accelerating their products and
> operations.
> 
> The PF_RING module relies on a small patch to net/core/dev.c that
> intercepts when a packet is received/transmitted so that it can be
> passed to the PF_RING module when present and with an active listener.
> Other than these minor changes, all the PF_RING code is
> self-contained, comprising just two files: ring.c and ring.h. PF_RING
> is the result of many years of research and development specifically
> into high-speed packet capture, and is homegrown. PF_RING uses the
> stock GPL license.
> 
> We feel that PF_RING is ready to be included with the mainline kernel.
> We are ready and eager to support PF_RING for the long term.
> 
> Thank you in advance for your consideration!

I was going to wrap pfring up for the staging tree.
The code you put in the network receive path is not necessary;
it would be cleaner to just use the existing packet type all hook, and
then PF_RING could be a loadable module without having to be compiled in.

-- 

^ permalink raw reply

* RE: [PATCH] iproute2: skbedit: Fix help message
From: Duyck, Alexander H @ 2009-10-14 16:04 UTC (permalink / raw)
  To: hadi@cyberus.ca; +Cc: netdev@vger.kernel.org, Stephen Hemminger
In-Reply-To: <1255522041.21940.7.camel@dogo.mojatatu.com>

jamal wrote:
> This fixes the help message on the skbedit action.
> Stephen, please apply if Alexander ACKs.
> 
> cheers,
> jamal

This looks good to me.

Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>

^ permalink raw reply

* Re: PF_RING: Include in main line kernel?
From: Jarek Poplawski @ 2009-10-14 16:46 UTC (permalink / raw)
  To: Brad Doctor; +Cc: netdev, Luca Deri
In-Reply-To: <a07586b0910140733s1976cd05u39286de42d9fac23@mail.gmail.com>

Brad Doctor wrote, On 10/14/2009 04:33 PM:

> into high-speed packet capture, and is homegrown. PF_RING uses the
> stock GPL license.

Are you sure you're using the stock GNU GPL?:

> Download ntop
> 
> ntop is distributed under the GNU GPL. In order to be entitled to download ntop you must accept the GNU license. 

I can find such a thing neither in GNU GPL v2:

"5.  You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. "

nor in GNU GPL v3:

"9. Acceptance Not Required for Having Copies.

You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so."

Thanks,
Jarek P.

^ permalink raw reply

* Re: PF_RING: Include in main line kernel?
From: Brad Doctor @ 2009-10-14 17:02 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: netdev, Luca Deri
In-Reply-To: <4AD60053.1030804@gmail.com>

This is for ntop, which uses PF_RING, but is a traffic analysis
application and is separate.

But you raise a good point and we will make the change for ntop.

thanks!
-brad

On Wed, Oct 14, 2009 at 10:46 AM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> Brad Doctor wrote, On 10/14/2009 04:33 PM:
>
>> into high-speed packet capture, and is homegrown. PF_RING uses the
>> stock GPL license.
>
>
> Are you sure you're using the stock GNU GPL?:
>
>> Download ntop
>>
>> ntop is distributed under the GNU GPL. In order to be entitled to download ntop you must accept the GNU license.
>
> I can find such a thing neither in GNU GPL v2:
>
> "5.  You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. "
>
> nor in GNU GPL v3:
>
> "9. Acceptance Not Required for Having Copies.
>
> You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so."
>
> Thanks,
> Jarek P.
>

^ permalink raw reply

* RE: configure VF parameters from PF domain
From: Rose, Gregory V @ 2009-10-14 17:04 UTC (permalink / raw)
  To: Satish Chowdhury, netdev@vger.kernel.org; +Cc: Williams, Mitch A
In-Reply-To: <be5d34890910131105g52ab497av2359b1f974af55a4@mail.gmail.com>


There is no such tool yet.  Proposals for such a tool are welcome.  There are some discussions going on between myself and a few other engineers as to how to proceed but nothing beyond that yet.

I agree completely that such a tool is necessary for SR-IOV networking to move from experimental to something actually useful in real world applications.

- Greg

>-----Original Message-----
>From: netdev-owner@vger.kernel.org 
>[mailto:netdev-owner@vger.kernel.org] On Behalf Of Satish Chowdhury
>Sent: Tuesday, October 13, 2009 11:06 AM
>To: netdev@vger.kernel.org
>Subject: configure VF parameters from PF domain
>
>Hi,
>
>I am verifying the functionality  of Intel 82576 based NIC card on
>Xen.  VF device is assigned(passthrough) to a VM.
>
>I can configure the MAC and VLAN parameters of the VF in the VM domain on the
>VF interface.  But I want to have control over the VF from the PF domain,
>i.e. I should be able to configure the VF MAC and VLAN parameters from the PF
>domain.
>
>Is there any tool available ? Else give me pointers how the above can
>be achieved? Is there a way to preconfigure VF parameters before
>assigning VF to VM.
>
>Regards,
>-Satish
>--
>To unsubscribe from this list: send the line "unsubscribe netdev" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: PF_RING: Include in main line kernel?
From: Jarek Poplawski @ 2009-10-14 17:18 UTC (permalink / raw)
  To: Brad Doctor; +Cc: netdev, Luca Deri
In-Reply-To: <a07586b0910141002o31f1a480t306f36fe8da19427@mail.gmail.com>

On Wed, Oct 14, 2009 at 11:02:01AM -0600, Brad Doctor wrote:
> This is for ntop, which uses PF_RING, but is a traffic analysis
> application and is separate.

For some strange reason I thought I could download PF_RING from the
download page too... (not for modifying or distributing! ;-)

Thanks,
Jarek P.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox