All of lore.kernel.org
 help / color / mirror / Atom feed
From: ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org
To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org,
	devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b@public.gmane.org
Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org,
	mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org,
	Mitko Haralanov
	<mitko.haralanov-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>,
	Ira Weiny <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Subject: [PATCH v2 12/22] staging/rdma/hfi1: Prevent silent data corruption with user SDMA
Date: Mon, 19 Oct 2015 22:11:27 -0400	[thread overview]
Message-ID: <1445307097-8244-13-git-send-email-ira.weiny@intel.com> (raw)
In-Reply-To: <1445307097-8244-1-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

From: Mitko Haralanov <mitko.haralanov-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

User SDMA keeps track of progress into the submitted IO vectors by tracking an
offset into the vectors when packets are submitted. This offset is updated
after a successful submission of a txreq to the SDMA engine.

The same offset was used when determining whether an IO vector should be
'freed' (pages unpinned) in the SDMA callback functions.

This was causing a silent data corruption in big jobs (> 2 nodes, 120 ranks
each) on the receive side because the send side was mistakenly unpinning the
vector pages before the HW has processed all descriptors referencing the
vector.

Reviewed-by: Mike Marciniszyn <mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Mitko Haralanov <mitko.haralanov-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Ira Weiny <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 drivers/staging/rdma/hfi1/user_sdma.c | 90 ++++++++++++++++++++---------------
 1 file changed, 52 insertions(+), 38 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index 10ab8073d3f2..36c838dcf023 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -146,7 +146,8 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12
 #define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))
 
 /* Last packet in the request */
-#define USER_SDMA_TXREQ_FLAGS_LAST_PKT   (1 << 0)
+#define TXREQ_FLAGS_REQ_LAST_PKT   (1 << 0)
+#define TXREQ_FLAGS_IOVEC_LAST_PKT (1 << 0)
 
 #define SDMA_REQ_IN_USE     0
 #define SDMA_REQ_FOR_THREAD 1
@@ -249,13 +250,22 @@ struct user_sdma_request {
 	unsigned long flags;
 };
 
+/*
+ * A single txreq could span up to 3 physical pages when the MTU
+ * is sufficiently large (> 4K). Each of the IOV pointers also
+ * needs it's own set of flags so the vector has been handled
+ * independently of each other.
+ */
 struct user_sdma_txreq {
 	/* Packet header for the txreq */
 	struct hfi1_pkt_header hdr;
 	struct sdma_txreq txreq;
 	struct user_sdma_request *req;
-	struct user_sdma_iovec *iovec1;
-	struct user_sdma_iovec *iovec2;
+	struct {
+		struct user_sdma_iovec *vec;
+		u8 flags;
+	} iovecs[3];
+	int idx;
 	u16 flags;
 	unsigned busycount;
 	u64 seqnum;
@@ -294,21 +304,6 @@ static int defer_packet_queue(
 	unsigned seq);
 static void activate_packet_queue(struct iowait *, int);
 
-static inline int iovec_may_free(struct user_sdma_iovec *iovec,
-				       void (*free)(struct user_sdma_iovec *))
-{
-	if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
-		free(iovec);
-		return 1;
-	}
-	return 0;
-}
-
-static inline void iovec_set_complete(struct user_sdma_iovec *iovec)
-{
-	iovec->offset = iovec->iov.iov_len;
-}
-
 static int defer_packet_queue(
 	struct sdma_engine *sde,
 	struct iowait *wait,
@@ -825,11 +820,11 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 		tx->flags = 0;
 		tx->req = req;
 		tx->busycount = 0;
-		tx->iovec1 = NULL;
-		tx->iovec2 = NULL;
+		tx->idx = -1;
+		memset(tx->iovecs, 0, sizeof(tx->iovecs));
 
 		if (req->seqnum == req->info.npkts - 1)
-			tx->flags |= USER_SDMA_TXREQ_FLAGS_LAST_PKT;
+			tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;
 
 		/*
 		 * Calculate the payload size - this is min of the fragment
@@ -858,7 +853,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 					goto free_tx;
 			}
 
-			tx->iovec1 = iovec;
+			tx->iovecs[++tx->idx].vec = iovec;
 			datalen = compute_data_length(req, tx);
 			if (!datalen) {
 				SDMA_DBG(req,
@@ -948,10 +943,17 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 					      iovec->pages[pageidx],
 					      offset, len);
 			if (ret) {
+				int i;
+
 				dd_dev_err(pq->dd,
 					   "SDMA txreq add page failed %d\n",
 					   ret);
-				iovec_set_complete(iovec);
+				/* Mark all assigned vectors as complete so they
+				 * are unpinned in the callback. */
+				for (i = tx->idx; i >= 0; i--) {
+					tx->iovecs[i].flags |=
+						TXREQ_FLAGS_IOVEC_LAST_PKT;
+				}
 				goto free_txreq;
 			}
 			iov_offset += len;
@@ -959,8 +961,11 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 			data_sent += len;
 			if (unlikely(queued < datalen &&
 				     pageidx == iovec->npages &&
-				     req->iov_idx < req->data_iovs - 1)) {
+				     req->iov_idx < req->data_iovs - 1 &&
+				     tx->idx < ARRAY_SIZE(tx->iovecs))) {
 				iovec->offset += iov_offset;
+				tx->iovecs[tx->idx].flags |=
+					TXREQ_FLAGS_IOVEC_LAST_PKT;
 				iovec = &req->iovs[++req->iov_idx];
 				if (!iovec->pages) {
 					ret = pin_vector_pages(req, iovec);
@@ -968,8 +973,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 						goto free_txreq;
 				}
 				iov_offset = 0;
-				tx->iovec2 = iovec;
-
+				tx->iovecs[++tx->idx].vec = iovec;
 			}
 		}
 		/*
@@ -981,11 +985,15 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 			req->tidoffset += datalen;
 		req->sent += data_sent;
 		if (req->data_len) {
-			if (tx->iovec1 && !tx->iovec2)
-				tx->iovec1->offset += iov_offset;
-			else if (tx->iovec2)
-				tx->iovec2->offset += iov_offset;
+			tx->iovecs[tx->idx].vec->offset += iov_offset;
+			/* If we've reached the end of the io vector, mark it
+			 * so the callback can unpin the pages and free it. */
+			if (tx->iovecs[tx->idx].vec->offset ==
+			    tx->iovecs[tx->idx].vec->iov.iov_len)
+				tx->iovecs[tx->idx].flags |=
+					TXREQ_FLAGS_IOVEC_LAST_PKT;
 		}
+
 		/*
 		 * It is important to increment this here as it is used to
 		 * generate the BTH.PSN and, therefore, can't be bulk-updated
@@ -1214,7 +1222,7 @@ static int set_txreq_header(struct user_sdma_request *req,
 				req->seqnum));
 
 	/* Set ACK request on last packet */
-	if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
 		hdr->bth[2] |= cpu_to_be32(1UL<<31);
 
 	/* Set the new offset */
@@ -1246,7 +1254,7 @@ static int set_txreq_header(struct user_sdma_request *req,
 		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
 			  EXP_TID_GET(tidval, IDX));
 		/* Clear KDETH.SH only on the last packet */
-		if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
 			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
 		/*
 		 * Set the KDETH.OFFSET and KDETH.OM based on size of
@@ -1290,7 +1298,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 	/* BTH.PSN and BTH.A */
 	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
 		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
-	if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT))
+	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
 		val32 |= 1UL << 31;
 	AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
 	AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
@@ -1331,7 +1339,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req,
 		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
 					(EXP_TID_GET(tidval, IDX) & 0x3ff));
 		/* Clear KDETH.SH on last packet */
-		if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT)) {
+		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
 			val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
 								INTR) >> 16);
 			val &= cpu_to_le16(~(1U << 13));
@@ -1358,10 +1366,16 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status,
 	if (unlikely(!req || !pq))
 		return;
 
-	if (tx->iovec1)
-		iovec_may_free(tx->iovec1, unpin_vector_pages);
-	if (tx->iovec2)
-		iovec_may_free(tx->iovec2, unpin_vector_pages);
+	/* If we have any io vectors associated with this txreq,
+	 * check whether they need to be 'freed'. */
+	if (tx->idx != -1) {
+		int i;
+
+		for (i = tx->idx; i >= 0; i--) {
+			if (tx->iovecs[i].flags & TXREQ_FLAGS_IOVEC_LAST_PKT)
+				unpin_vector_pages(tx->iovecs[i].vec);
+		}
+	}
 
 	tx_seqnum = tx->seqnum;
 	kmem_cache_free(pq->txreq_cache, tx);
-- 
1.8.2

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

  parent reply	other threads:[~2015-10-20  2:11 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-10-20  2:11 [PATCH v2 00/22] staging/rdma/hfi1: Fix bugs and performance issues ira.weiny-ral2JQCrhuEAvxtiuMwx3w
     [not found] ` <1445307097-8244-1-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2015-10-20  2:11   ` [PATCH v2 01/22] staging/rdma/hfi1: Fix regression in send performance ira.weiny-ral2JQCrhuEAvxtiuMwx3w
     [not found]     ` <1445307097-8244-2-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2015-10-21 13:18       ` Dan Carpenter
2015-10-26  2:10         ` ira.weiny
2015-10-20  2:11   ` [PATCH v2 02/22] staging/rdma/hfi1: Fix code to reset ASIC CSRs on FLR ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 03/22] staging/rdma/hfi1: Extend the offline timeout ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 04/22] staging/rdma/hfi1: Prevent host software lock up ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 05/22] staging/rdma/hfi1: Remove QSFP_ENABLED from HFI capability mask ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 06/22] staging/rdma/hfi1: Add coalescing support for SDMA TX descriptors ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 07/22] staging/rdma/hfi1: Fix sparse error in sdma.h file ira.weiny-ral2JQCrhuEAvxtiuMwx3w
     [not found]     ` <1445307097-8244-8-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2015-10-21 14:12       ` Dan Carpenter
2015-10-21 16:29         ` Weiny, Ira
2015-10-22 10:01           ` Dan Carpenter
2015-10-25  1:59             ` gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r
2015-10-20  2:11   ` [PATCH v2 08/22] staging/rdma/hfi1: close shared context security hole ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 09/22] staging/rdma/hfi1: Reset firmware instead of reloading Sbus ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 10/22] staging/rdma/hfi1: Add a schedule in send thread ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 11/22] staging/rdma/hfi1: Fix port bounce issues with 0.22 DC firmware ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` ira.weiny-ral2JQCrhuEAvxtiuMwx3w [this message]
2015-10-20  2:11   ` [PATCH v2 13/22] staging/rdma/hfi1: Macro code clean up ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 14/22] staging/rdma/hfi1: Implement Expected Receive TID caching ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-22 10:41     ` Dan Carpenter
2015-10-22 23:18       ` ira.weiny
     [not found]         ` <20151022231819.GB4019-W4f6Xiosr+yv7QzWx2u06xL4W9x8LtSr@public.gmane.org>
2015-10-23  3:53           ` Dan Carpenter
2015-10-20  2:11   ` [PATCH v2 15/22] staging/rdma/hfi1: Allow tuning of SDMA interrupt rate ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-22 10:54     ` Dan Carpenter
2015-10-22 22:27       ` ira.weiny
2015-10-20  2:11   ` [PATCH v2 16/22] staging/rdma/hfi1: Add irqsaves in the packet processing path ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 17/22] staging/rdma/hfi1: Thread the receive interrupt ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 18/22] staging/rdma/hfi: modify workqueue for parallelism ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 19/22] staging/rdma/hfi1: Load SBus firmware once per ASIC ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 20/22] staging/rdma/hfi1: Add unit # to verbs txreq cache name ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 21/22] staging/rdma/hfi1: add additional rc traces ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-20  2:11   ` [PATCH v2 22/22] staging/rdma/hfi1: Update driver version string to 0.9-294 ira.weiny-ral2JQCrhuEAvxtiuMwx3w

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1445307097-8244-13-git-send-email-ira.weiny@intel.com \
    --to=ira.weiny-ral2jqcrhueavxtiumwx3w@public.gmane.org \
    --cc=dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org \
    --cc=devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b@public.gmane.org \
    --cc=dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
    --cc=gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org \
    --cc=linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org \
    --cc=mitko.haralanov-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.