* [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails
@ 2011-06-16 22:02 Scott J. Goldman
  2011-06-16 22:56 ` [Pv-drivers] " Bhavesh Davda
  2011-06-17  4:16 ` David Miller
  0 siblings, 2 replies; 9+ messages in thread
From: Scott J. Goldman @ 2011-06-16 22:02 UTC (permalink / raw)
  To: netdev, pv-drivers; +Cc: Scott J. Goldman

If the rx ring is completely empty, then the device may never fire an rx
interrupt. Unfortunately, the rx interrupt is what triggers populating
the rx ring with fresh buffers, so this will cause networking to lock
up.

This patch recycles the last skb that we were about to indicate up to
the network stack (only if the rx ring is completely starved of skbs),
so the ring will never be completely empty. If we fail to allocate a
secondary page buffer, we just indicate a 0-length buffer to the device.

Signed-off-by: Scott J. Goldman <scottjg@vmware.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c |   98 +++++++++++++++++++++++++-----------
 drivers/net/vmxnet3/vmxnet3_int.h |    5 +-
 2 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 33097ec..cfa9ff7 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -558,6 +558,17 @@ vmxnet3_tq_cleanup_all(struct vmxnet3_adapter *adapter)
 		vmxnet3_tq_cleanup(&adapter->tx_queue[i], adapter);
 }
 
+static bool
+vmxnet3_rq_empty(const struct vmxnet3_rx_queue *rq, u32 ring_idx)
+{
+	const struct vmxnet3_cmd_ring *ring = &rq->rx_ring[ring_idx];
+	u32 i = ring->next2comp;
+	while (rq->buf_info[ring_idx][i].buf_type != VMXNET3_RX_BUF_SKB)
+		VMXNET3_INC_RING_IDX_ONLY(i, ring->size);
+
+	return (i == ring->next2fill);
+}
+
 /*
  *    starting from ring->next2fill, allocate rx buffers for the given ring
  *    of the rx queue and update the rx desc. stop after @num_to_alloc buffers
@@ -571,9 +582,12 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
 	int num_allocated = 0;
 	struct vmxnet3_rx_buf_info *rbi_base = rq->buf_info[ring_idx];
 	struct vmxnet3_cmd_ring *ring = &rq->rx_ring[ring_idx];
+	struct vmxnet3_rx_ctx *ctx = &rq->rx_ctx;
+	bool alloc_skb_failed = false;
 	u32 val;
+	u32 len;
 
-	while (num_allocated < num_to_alloc) {
+	while (num_allocated < num_to_alloc && !alloc_skb_failed) {
 		struct vmxnet3_rx_buf_info *rbi;
 		union Vmxnet3_GenericDesc *gd;
 
@@ -586,7 +600,27 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
 							 NET_IP_ALIGN);
 				if (unlikely(rbi->skb == NULL)) {
 					rq->stats.rx_buf_alloc_failure++;
-					break;
+					alloc_skb_failed = true;
+					/*
+					 * if allocation failed and ring is
+					 * empty, we recycle the last skb we
+					 * rx'ed so that we don't starve the
+					 * rx ring
+					 */
+					if (ctx->skb &&
+					    vmxnet3_rq_empty(rq, ring_idx)) {
+						rbi->skb = ctx->skb;
+						ctx->skb = NULL;
+						skb_recycle_check(rbi->skb,
+								  rbi->len +
+								  NET_IP_ALIGN);
+						/*
+						 * free any frags chained to
+						 * the skb
+						 */
+						__pskb_trim(rbi->skb, 0);
+					} else
+						break;
 				}
 				rbi->skb->dev = adapter->netdev;
 
@@ -594,8 +628,10 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
 				rbi->dma_addr = pci_map_single(adapter->pdev,
 						rbi->skb->data, rbi->len,
 						PCI_DMA_FROMDEVICE);
+				len = rbi->len;
 			} else {
 				/* rx buffer skipped by the device */
+				len = rbi->len;
 			}
 			val = VMXNET3_RXD_BTYPE_HEAD << VMXNET3_RXD_BTYPE_SHIFT;
 		} else {
@@ -606,13 +642,18 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
 				rbi->page = alloc_page(GFP_ATOMIC);
 				if (unlikely(rbi->page == NULL)) {
 					rq->stats.rx_buf_alloc_failure++;
-					break;
+					len = 0;
+				} else {
+					rbi->dma_addr = pci_map_page(
+							    adapter->pdev,
+							    rbi->page, 0,
+							    PAGE_SIZE,
+							    PCI_DMA_FROMDEVICE);
+					len = rbi->len;
 				}
-				rbi->dma_addr = pci_map_page(adapter->pdev,
-						rbi->page, 0, PAGE_SIZE,
-						PCI_DMA_FROMDEVICE);
 			} else {
 				/* rx buffers skipped by the device */
+				len = rbi->len;
 			}
 			val = VMXNET3_RXD_BTYPE_BODY << VMXNET3_RXD_BTYPE_SHIFT;
 		}
@@ -620,7 +661,7 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
 		BUG_ON(rbi->dma_addr == 0);
 		gd->rxd.addr = cpu_to_le64(rbi->dma_addr);
 		gd->dword[2] = cpu_to_le32((ring->gen << VMXNET3_RXD_GEN_SHIFT)
-					   | val | rbi->len);
+					   | val | len);
 
 		num_allocated++;
 		vmxnet3_cmd_ring_adv_next2fill(ring);
@@ -1148,7 +1189,6 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 			  &rxComp);
 	while (rcd->gen == rq->comp_ring.gen) {
 		struct vmxnet3_rx_buf_info *rbi;
-		struct sk_buff *skb;
 		int num_to_alloc;
 		struct Vmxnet3_RxDesc *rxd;
 		u32 idx, ring_idx;
@@ -1168,7 +1208,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 		rbi = rq->buf_info[ring_idx] + idx;
 
 		BUG_ON(rxd->addr != rbi->dma_addr ||
-		       rxd->len != rbi->len);
+		       (rxd->len != rbi->len && rbi->len != 0));
 
 		if (unlikely(rcd->eop && rcd->err)) {
 			vmxnet3_rx_error(rq, rcd, ctx, adapter);
@@ -1198,8 +1238,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 					 PCI_DMA_FROMDEVICE);
 
 			skb_put(ctx->skb, rcd->len);
-		} else {
-			BUG_ON(ctx->skb == NULL);
+		} else if (ctx->skb) {
 			/* non SOP buffer must be type 1 in most cases */
 			if (rbi->buf_type == VMXNET3_RX_BUF_PAGE) {
 				BUG_ON(rxd->btype != VMXNET3_RXD_BTYPE_BODY);
@@ -1222,25 +1261,6 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 			}
 		}
 
-		skb = ctx->skb;
-		if (rcd->eop) {
-			skb->len += skb->data_len;
-			skb->truesize += skb->data_len;
-
-			vmxnet3_rx_csum(adapter, skb,
-					(union Vmxnet3_GenericDesc *)rcd);
-			skb->protocol = eth_type_trans(skb, adapter->netdev);
-
-			if (unlikely(adapter->vlan_grp && rcd->ts)) {
-				vlan_hwaccel_receive_skb(skb,
-						adapter->vlan_grp, rcd->tci);
-			} else {
-				netif_receive_skb(skb);
-			}
-
-			ctx->skb = NULL;
-		}
-
 rcd_done:
 		/* device may skip some rx descs */
 		rq->rx_ring[ring_idx].next2comp = idx;
@@ -1264,6 +1284,24 @@ rcd_done:
 			}
 		}
 
+		if (rcd->eop && ctx->skb) {
+			ctx->skb->len += ctx->skb->data_len;
+			ctx->skb->truesize += ctx->skb->data_len;
+
+			vmxnet3_rx_csum(adapter, ctx->skb,
+					(union Vmxnet3_GenericDesc *)rcd);
+			ctx->skb->protocol = eth_type_trans(ctx->skb,
+							    adapter->netdev);
+			if (unlikely(adapter->vlan_grp && rcd->ts)) {
+				vlan_hwaccel_receive_skb(ctx->skb,
+							 adapter->vlan_grp,
+							 rcd->tci);
+			} else {
+				netif_receive_skb(ctx->skb);
+			}
+			ctx->skb = NULL;
+		}
+
 		vmxnet3_comp_ring_adv_next2proc(&rq->comp_ring);
 		vmxnet3_getRxComp(rcd,
 		     &rq->comp_ring.base[rq->comp_ring.next2proc].rcd, &rxComp);
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 0e567c24..cb13ed7 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -68,10 +68,10 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.1.9.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.1.10.0-k"
 
 /* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01010900
+#define VMXNET3_DRIVER_VERSION_NUM      0x01010A00
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */
@@ -255,7 +255,6 @@ struct vmxnet3_rx_buf_info {
 
 struct vmxnet3_rx_ctx {
 	struct sk_buff *skb;
-	u32 sop_idx;
 };
 
 struct vmxnet3_rq_driver_stats {
-- 
1.7.4.1
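
As an aside on the version bump in the last hunk: per the header comment,
each byte of VMXNET3_DRIVER_VERSION_NUM encodes one component of the
version string. A standalone decode of the new value (illustrative
userspace C, not driver code):

#include <stdio.h>

int main(void)
{
	unsigned int v = 0x01010A00;	/* VMXNET3_DRIVER_VERSION_NUM */

	/* one version component per byte, most significant byte first */
	printf("%u.%u.%u.%u-k\n", (v >> 24) & 0xff, (v >> 16) & 0xff,
	       (v >> 8) & 0xff, v & 0xff);
	/* prints 1.1.10.0-k, matching VMXNET3_DRIVER_VERSION_STRING */
	return 0;
}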



* RE: [Pv-drivers] [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails
  2011-06-16 22:02 [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails Scott J. Goldman
@ 2011-06-16 22:56 ` Bhavesh Davda
  2011-06-17  4:16 ` David Miller
  1 sibling, 0 replies; 9+ messages in thread
From: Bhavesh Davda @ 2011-06-16 22:56 UTC (permalink / raw)
  To: Scott Goldman, netdev@vger.kernel.org, pv-drivers@vmware.com

LGTM.

Signed-off-by: Bhavesh Davda <bhavesh@vmware.com>

> -----Original Message-----
> From: pv-drivers-bounces@vmware.com [mailto:pv-drivers-bounces@vmware.com] On
> Behalf Of Scott J. Goldman
> Sent: Thursday, June 16, 2011 3:02 PM
> To: netdev@vger.kernel.org; pv-drivers@vmware.com
> Subject: [Pv-drivers] [PATCH] vmxnet3: fix starving rx ring when alloc_skb
> fails
> 
> [snip]


* Re: [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails
  2011-06-16 22:02 [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails Scott J. Goldman
  2011-06-16 22:56 ` [Pv-drivers] " Bhavesh Davda
@ 2011-06-17  4:16 ` David Miller
  2011-06-17  4:40   ` Scott Goldman
  1 sibling, 1 reply; 9+ messages in thread
From: David Miller @ 2011-06-17  4:16 UTC (permalink / raw)
  To: scottjg; +Cc: netdev, pv-drivers

From: "Scott J. Goldman" <scottjg@vmware.com>
Date: Thu, 16 Jun 2011 15:02:27 -0700

> If the rx ring is completely empty, then the device may never fire an rx
> interrupt. Unfortunately, the rx interrupt is what triggers populating
> the rx ring with fresh buffers, so this will cause networking to lock
> up.
> 
> This patch recycles the last skb that we were about to indicate up to
> the network stack (only if the rx ring is completely starved of skbs),
> so the ring will never be completely empty. If we fail to allocate a
> secondary page buffer, we just indicate a 0-length buffer to the device.
> 
> Signed-off-by: Scott J. Goldman <scottjg@vmware.com>

This is why other drivers allocate the replacement skb _first_ before
handing the current receive packet to the stack.

And if the replacement allocation fails, they elide passing the packet
to the stack, and instead recycle it back onto the RX ring.

Please implement your RX policy in this manner, as we advise all Linux
networking drivers to do, and you simply won't have this problem.
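
For illustration, a minimal sketch of the allocate-first policy being
described, with hypothetical names (my_ring, my_rehang_rx_buf) standing
in for a driver's own ring bookkeeping; dev_alloc_skb(), skb_put(),
eth_type_trans() and netif_receive_skb() are the real kernel APIs:

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>

struct my_ring {
	struct net_device *netdev;
	struct sk_buff **skb;		/* one posted skb per descriptor */
	unsigned int buf_len;
};

/* hypothetical: remap the buffer and repost descriptor idx */
static void my_rehang_rx_buf(struct my_ring *ring, int idx,
			     struct sk_buff *skb)
{
	ring->skb[idx] = skb;
}

/* Allocate the replacement *before* indicating the received packet. */
static void my_rx_one(struct my_ring *ring, int idx, unsigned int len)
{
	struct sk_buff *rx_skb = ring->skb[idx];
	struct sk_buff *new_skb = dev_alloc_skb(ring->buf_len + NET_IP_ALIGN);

	if (!new_skb) {
		/* Allocation failed: drop this packet by recycling the
		 * just-received skb back onto the ring, so the slot is
		 * never left empty and the ring cannot starve. */
		my_rehang_rx_buf(ring, idx, rx_skb);
		return;
	}

	/* Success: repost a fresh buffer, then hand the old one up. */
	my_rehang_rx_buf(ring, idx, new_skb);
	skb_put(rx_skb, len);
	rx_skb->protocol = eth_type_trans(rx_skb, ring->netdev);
	netif_receive_skb(rx_skb);
}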


* RE: [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails
  2011-06-17  4:16 ` David Miller
@ 2011-06-17  4:40   ` Scott Goldman
  2011-06-17  4:53     ` David Miller
  0 siblings, 1 reply; 9+ messages in thread
From: Scott Goldman @ 2011-06-17  4:40 UTC (permalink / raw)
  To: David Miller; +Cc: netdev@vger.kernel.org, pv-drivers@vmware.com

Hi David. 

> This is why other drivers allocate the replacement skb _first_ before
> handing the current receive packet to the stack.
> 
> And if the replacement allocation fails, they elide passing the packet
> to the stack, and instead recycle it back onto the RX ring.
> 
> Please implement your RX policy in this manner, as we advise all Linux
> networking drivers to, and you simply won't have this problem.

I'm sorry if my patch description was unclear, but as I understand it, what you are describing is what this patch implements. The patched rx handler does something like:
1) poll the rx ring, peel off a pending skb (don't pass the packet up the stack yet)
2) if the ring needs to be repopulated, we do that
3) if the ring was repopulated successfully, then that's great, and we pass up the pending skb to the network stack.
4) if not and there are no skbs left on the ring, we reuse the pending skb and recycle it back on the ring (effectively dropping the packet we were about to pass up to the network stack)

Maybe the code structure is a bit unclear? Since we don't always repopulate the ring on every iteration (only when we hit a low watermark), it's true that we don't strictly always allocate a new skb before passing up the old one. But since this is only a problem when there are no skbs left on the ring, and we do elide passing up the pending skb in that case, is that not ok?

thanks,
-sjg


* Re: [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails
  2011-06-17  4:40   ` Scott Goldman
@ 2011-06-17  4:53     ` David Miller
  2011-06-17 12:14       ` Ben Hutchings
  0 siblings, 1 reply; 9+ messages in thread
From: David Miller @ 2011-06-17  4:53 UTC (permalink / raw)
  To: scottjg; +Cc: netdev, pv-drivers

From: Scott Goldman <scottjg@vmware.com>
Date: Thu, 16 Jun 2011 21:40:15 -0700

> I'm sorry if my patch description was unclear, but as I understand it, what you are describing is what this patch implements. The patched rx handler does something like:
> 1) poll the rx ring, peel off a pending skb (don't pass the packet up the stack yet)
> 2) if the ring needs to be repopulated, we do that
> 3) if the ring was repopulated successfully, then that's great, and we pass up the pending skb to the network stack.
> 4) if not and there are no skbs left on the ring, we reuse the pending skb and recycle it back on the ring (effectively dropping the packet we were about to pass up to the network stack)

You shouldn't restrict this logic to when the ring is empty; you
should never leave any RX ring slot empty.

Every RX ring entry should be processed with an alloc_skb() first,
and if the allocation fails you recycle the RX ring's SKB.


* Re: [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails
  2011-06-17  4:53     ` David Miller
@ 2011-06-17 12:14       ` Ben Hutchings
  2011-06-17 16:09         ` David Miller
  0 siblings, 1 reply; 9+ messages in thread
From: Ben Hutchings @ 2011-06-17 12:14 UTC (permalink / raw)
  To: David Miller; +Cc: scottjg, netdev, pv-drivers

On Fri, 2011-06-17 at 00:53 -0400, David Miller wrote:
> From: Scott Goldman <scottjg@vmware.com>
> Date: Thu, 16 Jun 2011 21:40:15 -0700
> 
> > I'm sorry if my patch description was unclear, but as I understand it, what you are describing is what this patch implements. The patched rx handler does something like:
> > 1) poll the rx ring, peel off a pending skb (don't pass the packet up the stack yet)
> > 2) if the ring needs to be repopulated, we do that
> > 3) if the ring was repopulated successfully, then that's great, and we pass up the pending skb to the network stack.
> > 4) if not and there are no skbs left on the ring, we reuse the pending skb and recycle it back on the ring (effectively dropping the packet we were about to pass up to the network stack)
> 
> You shouldn't restrict this logic to when the ring is empty; you
> should never leave any RX ring slot empty.
> 
> Every RX ring entry should be processed with an alloc_skb() first,
> and if the allocation fails you recycle the RX ring's SKB.

But this means the driver drops received packets as soon as there is an
allocation failure, rather than only if there are repeated failures.  I
don't see how that's preferable.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.



* Re: [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails
  2011-06-17 12:14       ` Ben Hutchings
@ 2011-06-17 16:09         ` David Miller
  2011-07-06  0:34           ` [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails Shreyas Bhatewara
  0 siblings, 1 reply; 9+ messages in thread
From: David Miller @ 2011-06-17 16:09 UTC (permalink / raw)
  To: bhutchings; +Cc: scottjg, netdev, pv-drivers

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 17 Jun 2011 13:14:20 +0100

> But this means the driver drops received packets as soon as there is an
> allocation failure, rather than only if there are repeated failures.  I
> don't see how that's preferable.

Because it decreases the possibility of RX ring starvation.

If you get one allocation failure during a NAPI poll run, you are
probably processing a whole set of packets, and they are all going to
fail.


* Re: [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails
  2011-06-17 16:09         ` David Miller
@ 2011-07-06  0:34           ` Shreyas Bhatewara
  2011-07-06  1:40             ` David Miller
  0 siblings, 1 reply; 9+ messages in thread
From: Shreyas Bhatewara @ 2011-07-06  0:34 UTC (permalink / raw)
  To: David Miller
  Cc: bhutchings@solarflare.com, pv-drivers@vmware.com,
	netdev@vger.kernel.org, scottjg


If the rx ring is completely empty, then the device may never fire an rx
interrupt. Unfortunately, the rx interrupt is what triggers populating the
rx ring with fresh buffers, so this will cause networking to lock up.

This patch replenishes the skb in the recv descriptor as soon as it is
peeled off while processing rx completions. If the skb/buffer
allocation fails, the existing one is recycled and the packet in hand is
dropped. This way no RX desc is ever left empty, thus avoiding
starvation.

Signed-off-by: Scott J. Goldman <scottjg@vmware.com>
Signed-off-by: Shreyas N Bhatewara <sbhatewara@vmware.com>
---

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 2c14736..fabcded 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -573,7 +573,7 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
 	struct vmxnet3_cmd_ring *ring = &rq->rx_ring[ring_idx];
 	u32 val;
 
-	while (num_allocated < num_to_alloc) {
+	while (num_allocated <= num_to_alloc) {
 		struct vmxnet3_rx_buf_info *rbi;
 		union Vmxnet3_GenericDesc *gd;
 
@@ -619,9 +619,15 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
 
 		BUG_ON(rbi->dma_addr == 0);
 		gd->rxd.addr = cpu_to_le64(rbi->dma_addr);
-		gd->dword[2] = cpu_to_le32((ring->gen << VMXNET3_RXD_GEN_SHIFT)
+		gd->dword[2] = cpu_to_le32((!ring->gen << VMXNET3_RXD_GEN_SHIFT)
 					   | val | rbi->len);
 
+		/* Fill the last buffer but dont mark it ready, or else the
+		 * device will think that the queue is full */
+		if (num_allocated == num_to_alloc)
+			break;
+
+		gd->dword[2] |= cpu_to_le32(ring->gen << VMXNET3_RXD_GEN_SHIFT);
 		num_allocated++;
 		vmxnet3_cmd_ring_adv_next2fill(ring);
 	}
@@ -1138,6 +1144,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 		VMXNET3_REG_RXPROD, VMXNET3_REG_RXPROD2
 	};
 	u32 num_rxd = 0;
+	bool skip_page_frags = false;
 	struct Vmxnet3_RxCompDesc *rcd;
 	struct vmxnet3_rx_ctx *ctx = &rq->rx_ctx;
 #ifdef __BIG_ENDIAN_BITFIELD
@@ -1148,11 +1155,12 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 			  &rxComp);
 	while (rcd->gen == rq->comp_ring.gen) {
 		struct vmxnet3_rx_buf_info *rbi;
-		struct sk_buff *skb;
+		struct sk_buff *skb, *new_skb = NULL;
+		struct page *new_page = NULL;
 		int num_to_alloc;
 		struct Vmxnet3_RxDesc *rxd;
 		u32 idx, ring_idx;
-
+		struct vmxnet3_cmd_ring	*ring = NULL;
 		if (num_rxd >= quota) {
 			/* we may stop even before we see the EOP desc of
 			 * the current pkt
@@ -1163,6 +1171,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 		BUG_ON(rcd->rqID != rq->qid && rcd->rqID != rq->qid2);
 		idx = rcd->rxdIdx;
 		ring_idx = rcd->rqID < adapter->num_rx_queues ? 0 : 1;
+		ring = rq->rx_ring + ring_idx;
 		vmxnet3_getRxDesc(rxd, &rq->rx_ring[ring_idx].base[idx].rxd,
 				  &rxCmdDesc);
 		rbi = rq->buf_info[ring_idx] + idx;
@@ -1191,37 +1200,80 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 				goto rcd_done;
 			}
 
+			skip_page_frags = false;
 			ctx->skb = rbi->skb;
-			rbi->skb = NULL;
+			new_skb = dev_alloc_skb(rbi->len + NET_IP_ALIGN);
+			if (new_skb == NULL) {
+				/* Skb allocation failed, do not handover this
+				 * skb to stack. Reuse it. Drop the existing pkt
+				 */
+				rq->stats.rx_buf_alloc_failure++;
+				ctx->skb = NULL;
+				rq->stats.drop_total++;
+				skip_page_frags = true;
+				goto rcd_done;
+			}
 
 			pci_unmap_single(adapter->pdev, rbi->dma_addr, rbi->len,
 					 PCI_DMA_FROMDEVICE);
 
 			skb_put(ctx->skb, rcd->len);
+
+			/* Immediate refill */
+			new_skb->dev = adapter->netdev;
+			skb_reserve(new_skb, NET_IP_ALIGN);
+			rbi->skb = new_skb;
+			rbi->dma_addr = pci_map_single(adapter->pdev,
+					rbi->skb->data, rbi->len,
+					PCI_DMA_FROMDEVICE);
+			rxd->addr = cpu_to_le64(rbi->dma_addr);
+			rxd->len = rbi->len;
+
 		} else {
-			BUG_ON(ctx->skb == NULL);
+			BUG_ON(ctx->skb == NULL && !skip_page_frags);
+
 			/* non SOP buffer must be type 1 in most cases */
-			if (rbi->buf_type == VMXNET3_RX_BUF_PAGE) {
-				BUG_ON(rxd->btype != VMXNET3_RXD_BTYPE_BODY);
+			BUG_ON(rbi->buf_type != VMXNET3_RX_BUF_PAGE);
+			BUG_ON(rxd->btype != VMXNET3_RXD_BTYPE_BODY);
 
-				if (rcd->len) {
-					pci_unmap_page(adapter->pdev,
-						       rbi->dma_addr, rbi->len,
-						       PCI_DMA_FROMDEVICE);
+			/* If an sop buffer was dropped, skip all
+			 * following non-sop fragments. They will be reused.
+			 */
+			if (skip_page_frags)
+				goto rcd_done;
 
-					vmxnet3_append_frag(ctx->skb, rcd, rbi);
-					rbi->page = NULL;
-				}
-			} else {
-				/*
-				 * The only time a non-SOP buffer is type 0 is
-				 * when it's EOP and error flag is raised, which
-				 * has already been handled.
+			new_page = alloc_page(GFP_ATOMIC);
+			if (unlikely(new_page == NULL)) {
+				/* Replacement page frag could not be allocated.
+				 * Reuse this page. Drop the pkt and free the
+				 * skb which contained this page as a frag. Skip
+				 * processing all the following non-sop frags.
 				 */
-				BUG_ON(true);
+				rq->stats.rx_buf_alloc_failure++;
+				dev_kfree_skb(ctx->skb);
+				ctx->skb = NULL;
+				skip_page_frags = true;
+				goto rcd_done;
+			}
+
+			if (rcd->len) {
+				pci_unmap_page(adapter->pdev,
+					       rbi->dma_addr, rbi->len,
+					       PCI_DMA_FROMDEVICE);
+
+				vmxnet3_append_frag(ctx->skb, rcd, rbi);
 			}
+
+			/* Immediate refill */
+			rbi->page = new_page;
+			rbi->dma_addr = pci_map_page(adapter->pdev, rbi->page,
+						     0, PAGE_SIZE,
+						     PCI_DMA_FROMDEVICE);
+			rxd->addr = cpu_to_le64(rbi->dma_addr);
+			rxd->len = rbi->len;
 		}
 
+
 		skb = ctx->skb;
 		if (rcd->eop) {
 			skb->len += skb->data_len;
@@ -1243,26 +1295,27 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 		}
 
 rcd_done:
-		/* device may skip some rx descs */
-		rq->rx_ring[ring_idx].next2comp = idx;
-		VMXNET3_INC_RING_IDX_ONLY(rq->rx_ring[ring_idx].next2comp,
-					  rq->rx_ring[ring_idx].size);
-
-		/* refill rx buffers frequently to avoid starving the h/w */
-		num_to_alloc = vmxnet3_cmd_ring_desc_avail(rq->rx_ring +
-							   ring_idx);
-		if (unlikely(num_to_alloc > VMXNET3_RX_ALLOC_THRESHOLD(rq,
-							ring_idx, adapter))) {
-			vmxnet3_rq_alloc_rx_buf(rq, ring_idx, num_to_alloc,
-						adapter);
-
-			/* if needed, update the register */
-			if (unlikely(rq->shared->updateRxProd)) {
-				VMXNET3_WRITE_BAR0_REG(adapter,
-					rxprod_reg[ring_idx] + rq->qid * 8,
-					rq->rx_ring[ring_idx].next2fill);
-				rq->uncommitted[ring_idx] = 0;
-			}
+		/* device may have skipped some rx descs */
+		ring->next2comp = idx;
+		num_to_alloc = vmxnet3_cmd_ring_desc_avail(ring);
+		ring = rq->rx_ring + ring_idx;
+		while (num_to_alloc) {
+			vmxnet3_getRxDesc(rxd, &ring->base[ring->next2fill].rxd,
+					  &rxCmdDesc);
+			BUG_ON(!rxd->addr);
+
+			/* Recv desc is ready to be used by the device */
+			rxd->gen = ring->gen;
+			vmxnet3_cmd_ring_adv_next2fill(ring);
+			num_to_alloc--;
+		}
+
+		/* if needed, update the register */
+		if (unlikely(rq->shared->updateRxProd)) {
+			VMXNET3_WRITE_BAR0_REG(adapter,
+				rxprod_reg[ring_idx] + rq->qid * 8,
+				ring->next2fill);
+			rq->uncommitted[ring_idx] = 0;
 		}
 
 		vmxnet3_comp_ring_adv_next2proc(&rq->comp_ring);
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index 2e37985..a9cb3fa 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -69,10 +69,10 @@
 /*
  * Version numbers
  */
-#define VMXNET3_DRIVER_VERSION_STRING   "1.1.9.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING   "1.1.14.0-k"
 
 /* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM      0x01010900
+#define VMXNET3_DRIVER_VERSION_NUM      0x01010E00
 
 #if defined(CONFIG_PCI_MSI)
 	/* RSS only makes sense if MSI-X is supported. */

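A note on the gen-bit handoff used in the patch above: a descriptor is
owned by the device only once its gen field matches the ring's current
generation, which is why the patch fills the last buffer but withholds
its gen bit ("dont mark it ready"), and why rxd->gen is written only
when a refilled descriptor is ready for reuse. A simplified sketch of
the idea, with hypothetical types for illustration (not the driver's
actual structures):

#include <linux/types.h>

struct my_rxd { u64 addr; u32 len; u32 gen; };
struct my_ring { struct my_rxd *base; u32 size, next2fill, gen; };

static void my_publish_desc(struct my_ring *ring, u64 dma, u32 len)
{
	struct my_rxd *rxd = &ring->base[ring->next2fill];

	rxd->addr = dma;
	rxd->len = len;
	/* The device consumes the descriptor only once gen matches the
	 * ring's current generation, so this store comes last. */
	rxd->gen = ring->gen;

	if (++ring->next2fill == ring->size) {
		ring->next2fill = 0;
		ring->gen ^= 1;		/* flip the generation on wrap */
	}
}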

* Re: [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails
  2011-07-06  0:34           ` [PATCH] vmxnet3: fix starving rx ring when alloc_skb fails Shreyas Bhatewara
@ 2011-07-06  1:40             ` David Miller
  0 siblings, 0 replies; 9+ messages in thread
From: David Miller @ 2011-07-06  1:40 UTC (permalink / raw)
  To: sbhatewara; +Cc: bhutchings, pv-drivers, netdev, scottjg

From: Shreyas Bhatewara <sbhatewara@vmware.com>
Date: Tue, 5 Jul 2011 17:34:05 -0700 (PDT)

> 
> If the rx ring is completely empty, then the device may never fire an rx 
> interrupt. Unfortunately, the rx interrupt is what triggers populating the 
> rx ring with fresh buffers, so this will cause networking to lock up.
> 
> This patch replenishes the skb in recv descriptor as soon as it is 
> peeled off while processing rx completions. If the skb/buffer 
> allocation fails, existing one is recycled and the packet in hand is 
> dropped. This way none of the RX desc is ever left empty, thus avoiding 
> starvation
> 
> Signed-off-by: Scott J. Goldman <scottjg@vmware.com>
> Signed-off-by: Shreyas N Bhatewara <sbhatewara@vmware.com>

Applied.

