From: Krishna Kumar <krkumar2@in.ibm.com>
To: davem@davemloft.net, rdreier@cisco.com
Cc: johnpol@2ka.mipt.ru, Robert.Olsson@data.slu.se,
peter.p.waskiewicz.jr@intel.com, herbert@gondor.apana.org.au,
gaagaan@gmail.com, kumarkr@linux.ibm.com, xma@us.ibm.com,
rick.jones2@hp.com, mcarlson@broadcom.com,
netdev@vger.kernel.org, jagana@us.ibm.com,
general@lists.openfabrics.org, mchan@broadcom.com, tgraf@suug.ch,
jeff@garzik.org, hadi@cyberus.ca, kaber@trash.net,
Krishna Kumar <krkumar2@in.ibm.com>,
sri@us.ibm.com
Subject: [PATCH 10/10] IPoIB batching in internal xmit/handler routines.
Date: Fri, 20 Jul 2007 12:03:48 +0530 [thread overview]
Message-ID: <20070720063348.26341.73753.sendpatchset@localhost.localdomain> (raw)
In-Reply-To: <20070720063149.26341.84076.sendpatchset@localhost.localdomain>
Add batching support to the IPoIB post_send and TX completion handler routines.
Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
ipoib_ib.c | 233 ++++++++++++++++++++++++++++++++++++++++++++++++-------------
1 files changed, 187 insertions(+), 46 deletions(-)
diff -ruNp org/drivers/infiniband/ulp/ipoib/ipoib_ib.c new/drivers/infiniband/ulp/ipoib/ipoib_ib.c
--- org/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2007-07-20 07:49:28.000000000 +0530
+++ new/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2007-07-20 08:30:22.000000000 +0530
@@ -242,8 +242,9 @@ repost:
static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int i = 0, num_completions;
+ int tx_ring_index = priv->tx_tail & (ipoib_sendq_size - 1);
unsigned int wr_id = wc->wr_id;
- struct ipoib_tx_buf *tx_req;
unsigned long flags;
ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
@@ -255,23 +256,60 @@ static void ipoib_ib_handle_tx_wc(struct
return;
}
- tx_req = &priv->tx_ring[wr_id];
+ num_completions = wr_id - tx_ring_index + 1;
+ if (num_completions <= 0)
+ num_completions += ipoib_sendq_size;
+
+ /*
+ * Handle skbs completion from tx_tail to wr_id. It is possible to
+ * handle WC's from earlier post_sends (possible multiple) in this
+ * iteration as we move from tx_tail to wr_id, since if the last
+ * WR (which is the one which had a completion request) failed to be
+ * sent for any of those earlier request(s), no completion
+ * notification is generated for successful WR's of those earlier
+ * request(s).
+ */
+ while (1) {
+ /*
+ * Could use while (i < num_completions), but it is costly
+ * since in most cases there is 1 completion, and we end up
+ * doing an extra "index = (index+1) & (ipoib_sendq_size-1)"
+ */
+ struct ipoib_tx_buf *tx_req = &priv->tx_ring[tx_ring_index];
+
+ if (likely(tx_req->skb)) {
+ ib_dma_unmap_single(priv->ca, tx_req->mapping,
+ tx_req->skb->len, DMA_TO_DEVICE);
- ib_dma_unmap_single(priv->ca, tx_req->mapping,
- tx_req->skb->len, DMA_TO_DEVICE);
+ ++priv->stats.tx_packets;
+ priv->stats.tx_bytes += tx_req->skb->len;
- ++priv->stats.tx_packets;
- priv->stats.tx_bytes += tx_req->skb->len;
+ dev_kfree_skb_any(tx_req->skb);
+ }
+ /*
+ * else this skb failed synchronously when posted and was
+ * freed immediately.
+ */
+
+ if (++i == num_completions)
+ break;
- dev_kfree_skb_any(tx_req->skb);
+ /* More WC's to handle */
+ tx_ring_index = (tx_ring_index + 1) & (ipoib_sendq_size - 1);
+ }
spin_lock_irqsave(&priv->tx_lock, flags);
- ++priv->tx_tail;
+
+ priv->tx_tail += num_completions;
if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
netif_wake_queue(dev);
}
+
+ /* Make more slots available for posts */
+ dev->xmit_slots = ipoib_sendq_size - (priv->tx_head - priv->tx_tail);
+
spin_unlock_irqrestore(&priv->tx_lock, flags);
if (wc->status != IB_WC_SUCCESS &&
@@ -340,78 +378,181 @@ void ipoib_ib_completion(struct ib_cq *c
netif_rx_schedule(dev_ptr);
}
-static inline int post_send(struct ipoib_dev_priv *priv,
- unsigned int wr_id,
- struct ib_ah *address, u32 qpn,
- u64 addr, int len)
+/*
+ * post_send : Post WR(s) to the device.
+ *
+ * num_skbs is the number of WR's, 'start_index' is the first slot in
+ * tx_wr[] or tx_sge[]. Note: 'start_index' is normally zero, unless a
+ * previous post_send returned error and we are trying to send the untried
+ * WR's, in which case start_index will point to the first untried WR.
+ *
+ * We also break the WR link before posting so that the driver knows how
+ * many WR's to process, and this is set back after the post.
+ */
+static inline int post_send(struct ipoib_dev_priv *priv, u32 qpn,
+ int start_index, int num_skbs,
+ struct ib_send_wr **bad_wr)
{
- struct ib_send_wr *bad_wr;
+ int ret;
+ struct ib_send_wr *last_wr, *next_wr;
+
+ last_wr = &priv->tx_wr[start_index + num_skbs - 1];
+
+ /* Set Completion Notification for last WR */
+ last_wr->send_flags = IB_SEND_SIGNALED;
- priv->tx_sge.addr = addr;
- priv->tx_sge.length = len;
+ /* Terminate the last WR */
+ next_wr = last_wr->next;
+ last_wr->next = NULL;
- priv->tx_wr.wr_id = wr_id;
- priv->tx_wr.wr.ud.remote_qpn = qpn;
- priv->tx_wr.wr.ud.ah = address;
+ /* Send all the WR's in one doorbell */
+ ret = ib_post_send(priv->qp, &priv->tx_wr[start_index], bad_wr);
- return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+ /* Restore send_flags & WR chain */
+ last_wr->send_flags = 0;
+ last_wr->next = next_wr;
+
+ return ret;
}
-void ipoib_send(struct net_device *dev, struct sk_buff *skb,
- struct ipoib_ah *address, u32 qpn)
+/*
+ * Map skb & store skb/mapping in tx_req; and details of the WR in tx_wr
+ * to pass to the driver.
+ *
+ * Returns :
+ * - 0 on successful processing of the skb
+ * - 1 if the skb was freed.
+ */
+int ipoib_process_skb(struct net_device *dev, struct sk_buff *skb,
+ struct ipoib_dev_priv *priv, int wr_num,
+ int tx_ring_index, struct ipoib_ah *address, u32 qpn)
{
- struct ipoib_dev_priv *priv = netdev_priv(dev);
- struct ipoib_tx_buf *tx_req;
u64 addr;
+ struct ipoib_tx_buf *tx_req;
if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
- ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+ ipoib_warn(priv, "packet len %d (> %d) too long to "
+ "send, dropping\n",
skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
++priv->stats.tx_dropped;
++priv->stats.tx_errors;
ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
- return;
+ return 1;
}
- ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
+ ipoib_dbg_data(priv, "sending packet, length=%d address=%p "
+ "qpn=0x%06x\n",
skb->len, address, qpn);
/*
* We put the skb into the tx_ring _before_ we call post_send()
* because it's entirely possible that the completion handler will
- * run before we execute anything after the post_send(). That
+ * run before we execute anything after the post_send(). That
* means we have to make sure everything is properly recorded and
* our state is consistent before we call post_send().
*/
- tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
- tx_req->skb = skb;
- addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
- DMA_TO_DEVICE);
+ addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
++priv->stats.tx_errors;
dev_kfree_skb_any(skb);
- return;
+ return 1;
}
+
+ tx_req = &priv->tx_ring[tx_ring_index];
+ tx_req->skb = skb;
tx_req->mapping = addr;
+ priv->tx_sge[wr_num].addr = addr;
+ priv->tx_sge[wr_num].length = skb->len;
+ priv->tx_wr[wr_num].wr_id = tx_ring_index;
+ priv->tx_wr[wr_num].wr.ud.remote_qpn = qpn;
+ priv->tx_wr[wr_num].wr.ud.ah = address->ah;
- if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
- address->ah, qpn, addr, skb->len))) {
- ipoib_warn(priv, "post_send failed\n");
- ++priv->stats.tx_errors;
- ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
- dev_kfree_skb_any(skb);
- } else {
- dev->trans_start = jiffies;
+ return 0;
+}
- address->last_send = priv->tx_head;
- ++priv->tx_head;
+/*
+ * If an skb is passed to this function, it is the single, unprocessed skb
+ * send case. Otherwise if skb is NULL, it means that all skbs are already
+ * processed and put on the priv->tx_wr,tx_sge,tx_ring, etc.
+ */
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+ struct ipoib_ah *address, u32 qpn, int num_skbs)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int start_index = 0;
+
+ if (skb && ipoib_process_skb(dev, skb, priv, 0, priv->tx_head &
+ (ipoib_sendq_size - 1), address, qpn))
+ return;
+
+ /* Send out all the skb's in one post */
+ while (num_skbs) {
+ struct ib_send_wr *bad_wr;
+
+ if (unlikely((post_send(priv, qpn, start_index, num_skbs,
+ &bad_wr)))) {
+ int done;
+
+ /*
+ * Better error handling can be done here, like free
+ * all untried skbs if err == -ENOMEM. However at this
+ * time, we re-try all the skbs, all of which will
+ * likely fail anyway (unless device finished sending
+ * some out in the meantime). This is not a regression
+ * since the earlier code is not doing this either.
+ */
+ ipoib_warn(priv, "post_send failed\n");
- if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
- ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
- netif_stop_queue(dev);
- set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+ /* Get #WR's that finished successfully */
+ done = bad_wr - &priv->tx_wr[start_index];
+
+ /* Handle 1 error */
+ priv->stats.tx_errors++;
+ ib_dma_unmap_single(priv->ca,
+ priv->tx_sge[start_index + done].addr,
+ priv->tx_sge[start_index + done].length,
+ DMA_TO_DEVICE);
+
+ /* Handle 'n' successes */
+ if (done) {
+ dev->trans_start = jiffies;
+ address->last_send = priv->tx_head;
+ }
+
+ /* Free failed WR & reset for WC handler to recognize */
+ dev_kfree_skb_any(priv->tx_ring[bad_wr->wr_id].skb);
+ priv->tx_ring[bad_wr->wr_id].skb = NULL;
+
+ /* Move head to first untried WR */
+ priv->tx_head += (done + 1);
+ /* + 1 for WR that was tried & failed */
+
+ /* Get count of skbs that were not tried */
+ num_skbs -= (done + 1);
+
+ /* Get start index for next iteration */
+ start_index += (done + 1);
+ } else {
+ dev->trans_start = jiffies;
+
+ address->last_send = priv->tx_head;
+ priv->tx_head += num_skbs;
+ num_skbs = 0;
}
}
+
+ if (unlikely(priv->tx_head - priv->tx_tail == ipoib_sendq_size)) {
+ /*
+ * Not accurate as some intermediate slots could have been
+ * freed on error, but no harm - only queue stopped earlier.
+ */
+ ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+ netif_stop_queue(dev);
+ set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+ }
+
+ /* Reduce the number of slots for sends */
+ dev->xmit_slots = ipoib_sendq_size - (priv->tx_head - priv->tx_tail);
}
static void __ipoib_reap_ah(struct net_device *dev)
next prev parent reply other threads:[~2007-07-20 6:35 UTC|newest]
Thread overview: 55+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-07-20 6:31 [ofa-general] [PATCH 00/10] Implement batching skb API Krishna Kumar
2007-07-20 6:32 ` [ofa-general] [PATCH 01/10] HOWTO documentation for Batching SKB Krishna Kumar
2007-07-20 6:32 ` [PATCH 02/10] Networking include file changes Krishna Kumar
2007-07-20 9:59 ` Patrick McHardy
2007-07-20 17:25 ` [ofa-general] " Sridhar Samudrala
2007-07-21 6:30 ` Krishna Kumar2
2007-07-23 5:59 ` Sridhar Samudrala
2007-07-23 6:27 ` Krishna Kumar2
2007-07-20 6:32 ` [ofa-general] [PATCH 03/10] dev.c changes Krishna Kumar
2007-07-20 10:04 ` [ofa-general] " Patrick McHardy
2007-07-20 10:27 ` Krishna Kumar2
2007-07-20 11:20 ` [ofa-general] " Patrick McHardy
2007-07-20 11:52 ` Krishna Kumar2
2007-07-20 11:55 ` Patrick McHardy
2007-07-20 12:09 ` Krishna Kumar2
2007-07-20 12:25 ` Krishna Kumar2
2007-07-20 12:37 ` Patrick McHardy
2007-07-20 17:44 ` Sridhar Samudrala
2007-07-21 6:44 ` Krishna Kumar2
2007-07-20 6:32 ` [PATCH 04/10] net-sysfs.c changes Krishna Kumar
2007-07-20 10:07 ` [ofa-general] " Patrick McHardy
2007-07-20 10:28 ` Krishna Kumar2
2007-07-20 11:21 ` Patrick McHardy
2007-07-20 16:22 ` Stephen Hemminger
2007-07-21 6:46 ` Krishna Kumar2
2007-07-23 9:56 ` Stephen Hemminger
2007-07-20 6:32 ` [ofa-general] [PATCH 05/10] sch_generic.c changes Krishna Kumar
2007-07-20 10:11 ` [ofa-general] " Patrick McHardy
2007-07-20 10:32 ` Krishna Kumar2
2007-07-20 11:24 ` Patrick McHardy
2007-07-20 18:16 ` Patrick McHardy
2007-07-21 6:56 ` Krishna Kumar2
2007-07-22 17:03 ` Patrick McHardy
2007-07-20 6:33 ` [ofa-general] [PATCH 06/10] IPoIB header file changes Krishna Kumar
2007-07-20 6:33 ` [ofa-general] [PATCH 07/10] IPoIB verb changes Krishna Kumar
2007-07-20 6:33 ` [ofa-general] [PATCH 08/10] IPoIB multicast/CM changes Krishna Kumar
2007-07-20 6:33 ` [PATCH 09/10] IPoIB batching xmit handler support Krishna Kumar
2007-07-20 6:33 ` Krishna Kumar [this message]
2007-07-20 7:18 ` [ofa-general] Re: [PATCH 00/10] Implement batching skb API Stephen Hemminger
2007-07-20 7:30 ` Krishna Kumar2
2007-07-20 7:57 ` [ofa-general] " Stephen Hemminger
2007-07-20 7:47 ` Krishna Kumar2
2007-07-21 13:46 ` [ofa-general] TCP and batching WAS(Re: " jamal
2007-07-23 9:44 ` Stephen Hemminger
2007-07-20 12:54 ` [ofa-general] " Evgeniy Polyakov
2007-07-20 13:02 ` Krishna Kumar2
2007-07-23 4:23 ` Krishna Kumar2
2007-07-21 13:18 ` [ofa-general] " jamal
2007-07-22 6:27 ` Krishna Kumar2
2007-07-22 12:51 ` jamal
2007-07-23 4:49 ` Krishna Kumar2
2007-07-23 12:32 ` jamal
2007-07-24 3:44 ` [ofa-general] " Krishna Kumar2
2007-07-24 19:28 ` jamal
2007-07-25 2:41 ` Krishna Kumar2
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070720063348.26341.73753.sendpatchset@localhost.localdomain \
--to=krkumar2@in.ibm.com \
--cc=Robert.Olsson@data.slu.se \
--cc=davem@davemloft.net \
--cc=gaagaan@gmail.com \
--cc=general@lists.openfabrics.org \
--cc=hadi@cyberus.ca \
--cc=herbert@gondor.apana.org.au \
--cc=jagana@us.ibm.com \
--cc=jeff@garzik.org \
--cc=johnpol@2ka.mipt.ru \
--cc=kaber@trash.net \
--cc=kumarkr@linux.ibm.com \
--cc=mcarlson@broadcom.com \
--cc=mchan@broadcom.com \
--cc=netdev@vger.kernel.org \
--cc=peter.p.waskiewicz.jr@intel.com \
--cc=rdreier@cisco.com \
--cc=rick.jones2@hp.com \
--cc=sri@us.ibm.com \
--cc=tgraf@suug.ch \
--cc=xma@us.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).