All of lore.kernel.org
 help / color / mirror / Atom feed
From: Long Li <longli@exchange.microsoft.com>
To: Steve French <sfrench@samba.org>,
	linux-cifs@vger.kernel.org, samba-technical@lists.samba.org,
	linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
	Christoph Hellwig <hch@infradead.org>,
	Tom Talpey <ttalpey@microsoft.com>,
	Matthew Wilcox <mawilcox@microsoft.com>,
	Stephen Hemminger <sthemmin@microsoft.com>
Cc: Long Li <longli@microsoft.com>
Subject: [Patch v7 16/22] CIFS: SMBD: Implement function to send data via RDMA send
Date: Tue,  7 Nov 2017 01:55:08 -0700	[thread overview]
Message-ID: <20171107085514.12693-17-longli@exchange.microsoft.com> (raw)
In-Reply-To: <20171107085514.12693-1-longli@exchange.microsoft.com>

From: Long Li <longli@microsoft.com>

The transport doesn't maintain send buffers or a send queue for transferring
payload via RDMA send. There is no data copy in the transport on send.

Signed-off-by: Long Li <longli@microsoft.com>
---
 fs/cifs/smbdirect.c | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/smbdirect.h |   3 +
 2 files changed, 249 insertions(+)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5a08015..0705f49 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -42,6 +42,12 @@ static int smbd_post_recv(
 		struct smbd_response *response);
 
 static int smbd_post_send_empty(struct smbd_connection *info);
+static int smbd_post_send_data(
+		struct smbd_connection *info,
+		struct kvec *iov, int n_vec, int remaining_data_length);
+static int smbd_post_send_page(struct smbd_connection *info,
+		struct page *page, unsigned long offset,
+		size_t size, int remaining_data_length);
 
 /* SMBD version number */
 #define SMBD_V1	0x0100
@@ -178,6 +184,10 @@ static void smbd_destroy_rdma_work(struct work_struct *work)
 	log_rdma_event(INFO, "cancelling send immediate work\n");
 	cancel_delayed_work_sync(&info->send_immediate_work);
 
+	log_rdma_event(INFO, "wait for all send to finish\n");
+	wait_event(info->wait_smbd_send_pending,
+		info->smbd_send_pending == 0);
+
 	log_rdma_event(INFO, "wait for all recv to finish\n");
 	wake_up_interruptible(&info->wait_reassembly_queue);
 	wait_event(info->wait_smbd_recv_pending,
@@ -1080,6 +1090,24 @@ static int smbd_post_send_sgl(struct smbd_connection *info,
 }
 
 /*
+ * Send a page
+ * page: the page to send
+ * offset: offset in the page to send
+ * size: length in the page to send
+ * remaining_data_length: remaining data to send in this payload
+ */
+static int smbd_post_send_page(struct smbd_connection *info, struct page *page,
+		unsigned long offset, size_t size, int remaining_data_length)
+{
+	struct scatterlist page_sg;
+
+	/* one-entry table describing size bytes at offset in the page */
+	sg_init_table(&page_sg, 1);
+	sg_set_page(&page_sg, page, size, offset);
+	return smbd_post_send_sgl(info, &page_sg, size, remaining_data_length);
+}
+
+/*
  * Send an empty message
  * Empty message is used to extend credits to peer to for keep live
  * while there is no upper layer payload to send at the time
@@ -1091,6 +1119,35 @@ static int smbd_post_send_empty(struct smbd_connection *info)
 }
 
 /*
+ * Send a data buffer
+ * iov: the iov array describing the data buffers
+ * n_vec: number of entries in the iov array
+ * remaining_data_length: remaining data to send following this packet
+ * in segmented SMBD packet
+ */
+static int smbd_post_send_data(
+	struct smbd_connection *info, struct kvec *iov, int n_vec,
+	int remaining_data_length)
+{
+	struct scatterlist sgl[SMBDIRECT_MAX_SGE];
+	u32 total_len = 0;
+	int idx;
+
+	/* sgl[] lives on the stack, so the vector count is capped */
+	if (n_vec > SMBDIRECT_MAX_SGE) {
+		cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
+		return -ENOMEM;
+	}
+	sg_init_table(sgl, n_vec);
+	for (idx = 0; idx < n_vec; idx++) {
+		sg_set_buf(&sgl[idx], iov[idx].iov_base, iov[idx].iov_len);
+		total_len += iov[idx].iov_len;
+	}
+
+	return smbd_post_send_sgl(info, sgl, total_len, remaining_data_length);
+}
+
+/*
  * Post a receive request to the transport
  * The remote peer can only send data when a receive request is posted
  * The interaction is controlled by send/receive credit system
@@ -1657,6 +1714,9 @@ struct smbd_connection *_smbd_get_connection(
 	queue_delayed_work(info->workqueue, &info->idle_timer_work,
 		info->keep_alive_interval*HZ);
 
+	init_waitqueue_head(&info->wait_smbd_send_pending);
+	info->smbd_send_pending = 0;
+
 	init_waitqueue_head(&info->wait_smbd_recv_pending);
 	info->smbd_recv_pending = 0;
 
@@ -1948,3 +2008,189 @@ int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
 		msg->msg_iter.count = 0;
 	return rc;
 }
+
+/*
+ * Send data to transport
+ * Each rqst is transported as a SMBDirect payload, split into packets of
+ * at most max_send_size - sizeof(struct smbd_data_transfer) data bytes
+ * info: the SMBDirect connection to send on
+ * rqst: the data to write
+ * return value: 0 if successfully written, otherwise error code
+ */
+int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst)
+{
+	struct kvec vec;
+	int nvecs;
+	int size;
+	int buflen = 0, remaining_data_length;
+	int start, i, j;
+	int max_iov_size =
+		info->max_send_size - sizeof(struct smbd_data_transfer);
+	struct kvec iov[SMBDIRECT_MAX_SGE];
+	int rc;
+
+	info->smbd_send_pending++;
+	if (info->transport_status != SMBD_CONNECTED) {
+		rc = -ENODEV;
+		goto done;
+	}
+
+	/*
+	 * This usually means a configuration error: we use RDMA read/write
+	 * for packet size > rdma_readwrite_threshold, so as long as it's
+	 * properly configured we should never get into this situation
+	 */
+	if (rqst->rq_nvec + rqst->rq_npages > SMBDIRECT_MAX_SGE) {
+		log_write(ERR, "maximum send segment %x exceeding %x\n",
+			 rqst->rq_nvec + rqst->rq_npages, SMBDIRECT_MAX_SGE);
+		rc = -EINVAL;
+		goto done;
+	}
+
+	/*
+	 * Remove the RFC1002 length defined in MS-SMB2 section 2.1
+	 * It is used only for TCP transport; in future we may want a transport
+	 * layer under protocol layer so this is only issued to TCP transport
+	 */
+	iov[0].iov_base = (char *)rqst->rq_iov[0].iov_base + 4;
+	iov[0].iov_len = rqst->rq_iov[0].iov_len - 4;
+	buflen += iov[0].iov_len;
+
+	/* total up iov array first */
+	for (i = 1; i < rqst->rq_nvec; i++) {
+		iov[i].iov_base = rqst->rq_iov[i].iov_base;
+		iov[i].iov_len = rqst->rq_iov[i].iov_len;
+		buflen += iov[i].iov_len;
+	}
+
+	/* add in the page array if there is one */
+	if (rqst->rq_npages) {
+		buflen += rqst->rq_pagesz * (rqst->rq_npages - 1);
+		buflen += rqst->rq_tailsz;
+	}
+
+	if (buflen + sizeof(struct smbd_data_transfer) >
+		info->max_fragmented_send_size) {
+		log_write(ERR, "payload size %d > max size %d\n",
+			buflen, info->max_fragmented_send_size);
+		rc = -EINVAL;
+		goto done;
+	}
+
+	remaining_data_length = buflen;
+
+	log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d "
+		"rq_tailsz=%d buflen=%d\n",
+		rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz,
+		rqst->rq_tailsz, buflen);
+
+	start = i = iov[0].iov_len ? 0 : 1;
+	buflen = 0;
+	while (true) {
+		buflen += iov[i].iov_len;
+		if (buflen > max_iov_size) {
+			if (i > start) {
+				remaining_data_length -=
+					(buflen-iov[i].iov_len);
+				log_write(INFO, "sending iov[] from start=%d "
+					"i=%d nvecs=%d "
+					"remaining_data_length=%d\n",
+					start, i, i-start,
+					remaining_data_length);
+				rc = smbd_post_send_data(
+					info, &iov[start], i-start,
+					remaining_data_length);
+				if (rc)
+					goto done;
+			} else {
+				/* iov[start] is too big, break it */
+				nvecs = (buflen+max_iov_size-1)/max_iov_size;
+				log_write(INFO, "iov[%d] iov_base=%p buflen=%d"
+					" break to %d vectors\n",
+					start, iov[start].iov_base,
+					buflen, nvecs);
+				for (j = 0; j < nvecs; j++) {
+					vec.iov_base =
+						(char *)iov[start].iov_base +
+						j*max_iov_size;
+					vec.iov_len = max_iov_size;
+					if (j == nvecs-1)
+						vec.iov_len = buflen -
+							max_iov_size*(nvecs-1);
+					remaining_data_length -= vec.iov_len;
+					log_write(INFO,
+						"sending vec j=%d iov_base=%p"
+						" iov_len=%zu "
+						"remaining_data_length=%d\n",
+						j, vec.iov_base, vec.iov_len,
+						remaining_data_length);
+					rc = smbd_post_send_data(
+						info, &vec, 1,
+						remaining_data_length);
+					if (rc)
+						goto done;
+				}
+				i++;
+				/* last iov consumed, nothing left to scan */
+				if (i == rqst->rq_nvec)
+					break;
+			}
+			start = i;
+			buflen = 0;
+		} else {
+			i++;
+			if (i == rqst->rq_nvec) {
+				/* send out all remaining vecs */
+				remaining_data_length -= buflen;
+				log_write(INFO,
+					"sending iov[] from start=%d i=%d "
+					"nvecs=%d remaining_data_length=%d\n",
+					start, i, i-start,
+					remaining_data_length);
+				rc = smbd_post_send_data(info, &iov[start],
+					i-start, remaining_data_length);
+				if (rc)
+					goto done;
+				break;
+			}
+		}
+		log_write(INFO, "looping i=%d buflen=%d\n", i, buflen);
+	}
+
+	/* now sending pages if there are any */
+	for (i = 0; i < rqst->rq_npages; i++) {
+		buflen = (i == rqst->rq_npages-1) ?
+			rqst->rq_tailsz : rqst->rq_pagesz;
+		nvecs = (buflen + max_iov_size - 1) / max_iov_size;
+		log_write(INFO, "sending pages buflen=%d nvecs=%d\n",
+			buflen, nvecs);
+		for (j = 0; j < nvecs; j++) {
+			size = max_iov_size;
+			if (j == nvecs-1)
+				size = buflen - j*max_iov_size;
+			remaining_data_length -= size;
+			log_write(INFO, "sending pages i=%d offset=%d size=%d"
+				" remaining_data_length=%d\n",
+				i, j*max_iov_size, size, remaining_data_length);
+			rc = smbd_post_send_page(
+				info, rqst->rq_pages[i], j*max_iov_size,
+				size, remaining_data_length);
+			if (rc)
+				goto done;
+		}
+	}
+
+done:
+	/*
+	 * As an optimization, we don't wait for individual I/O to finish
+	 * before sending the next one. Send them all and wait for pending
+	 * send count to get to 0: all the I/Os are out, we are good to return
+	 */
+	wait_event(info->wait_send_payload_pending,
+		atomic_read(&info->send_payload_pending) == 0);
+
+	info->smbd_send_pending--;
+	wake_up(&info->wait_smbd_send_pending);
+
+	return rc;
+}
diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h
index 65a431f..e777d77 100644
--- a/fs/cifs/smbdirect.h
+++ b/fs/cifs/smbdirect.h
@@ -92,6 +92,9 @@ struct smbd_connection {
 
 	/* Activity accoutning */
 	/* Pending reqeusts issued from upper layer */
+	int smbd_send_pending;
+	wait_queue_head_t wait_smbd_send_pending;
+
 	int smbd_recv_pending;
 	wait_queue_head_t wait_smbd_recv_pending;
 
-- 
2.7.4

  parent reply	other threads:[~2017-11-07  8:55 UTC|newest]

Thread overview: 63+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-11-07  8:54 [Patch v7 00/22] CIFS: Implement SMB Direct protocol Long Li
2017-11-07  8:54 ` [Patch v7 01/22] CIFS: SMBD: Add parameter rdata to smb2_new_read_req Long Li
     [not found]   ` <20171107085514.12693-2-longli-Lp/cVzEoVyZiJJESP9tAQJZ3qXmFLfmx@public.gmane.org>
2017-11-16 23:06     ` Pavel Shilovskiy
2017-11-16 23:06       ` Pavel Shilovskiy
2017-11-16 23:06       ` Pavel Shilovskiy
2017-11-20  5:28     ` Leif Sahlberg
2017-11-20  5:28       ` Leif Sahlberg
     [not found] ` <20171107085514.12693-1-longli-Lp/cVzEoVyZiJJESP9tAQJZ3qXmFLfmx@public.gmane.org>
2017-11-07  8:54   ` [Patch v7 02/22] CIFS: SMBD: Introduce kernel config option CONFIG_CIFS_SMB_DIRECT Long Li
2017-11-07  8:54     ` Long Li
     [not found]     ` <20171107085514.12693-3-longli-Lp/cVzEoVyZiJJESP9tAQJZ3qXmFLfmx@public.gmane.org>
2017-11-16 23:08       ` Pavel Shilovskiy
2017-11-16 23:08         ` Pavel Shilovskiy
2017-11-16 23:08         ` Pavel Shilovskiy
2017-11-20  5:28       ` Leif Sahlberg
2017-11-20  5:28         ` Leif Sahlberg
2017-11-07  8:54   ` [Patch v7 03/22] CIFS: SMBD: Add rdma mount option Long Li
2017-11-07  8:54     ` Long Li
     [not found]     ` <20171107085514.12693-4-longli-Lp/cVzEoVyZiJJESP9tAQJZ3qXmFLfmx@public.gmane.org>
2017-11-16 23:18       ` Pavel Shilovskiy
2017-11-16 23:18         ` Pavel Shilovskiy
2017-11-16 23:18         ` Pavel Shilovskiy
2017-11-20  5:30       ` Leif Sahlberg
2017-11-20  5:30         ` Leif Sahlberg
2017-11-07  8:54   ` [Patch v7 06/22] CIFS: SMBD: export protocol initial values Long Li
2017-11-07  8:54     ` Long Li
     [not found]     ` <20171107085514.12693-7-longli-Lp/cVzEoVyZiJJESP9tAQJZ3qXmFLfmx@public.gmane.org>
2017-11-20  7:37       ` Leif Sahlberg
2017-11-20  7:37         ` Leif Sahlberg
2017-11-20 16:55         ` Steve French
2017-11-07  8:55   ` [Patch v7 08/22] CIFS: SMBD: Upper layer connects to SMBDirect session Long Li
2017-11-07  8:55     ` Long Li
2017-11-07  8:55   ` [Patch v7 15/22] CIFS: SMBD: Upper layer receives data via RDMA receive Long Li
2017-11-07  8:55     ` Long Li
2017-11-21  5:16   ` [Patch v7 00/22] CIFS: Implement SMB Direct protocol Steve French
2017-11-21  5:16     ` Steve French
2017-11-07  8:54 ` [Patch v7 04/22] CIFS: SMBD: Add SMB Direct protocol initial values and constants Long Li
     [not found]   ` <20171107085514.12693-5-longli-Lp/cVzEoVyZiJJESP9tAQJZ3qXmFLfmx@public.gmane.org>
2017-11-20  5:31     ` Leif Sahlberg
2017-11-20  5:31       ` Leif Sahlberg
2017-11-07  8:54 ` [Patch v7 05/22] CIFS: SMBD: Establish SMB Direct connection Long Li
     [not found]   ` <20171107085514.12693-6-longli-Lp/cVzEoVyZiJJESP9tAQJZ3qXmFLfmx@public.gmane.org>
2017-11-20  1:36     ` ronnie sahlberg
2017-11-20  1:36       ` ronnie sahlberg
2017-11-20  5:46     ` Leif Sahlberg
2017-11-20  5:46       ` Leif Sahlberg
     [not found]       ` <817309867.28473523.1511156807466.JavaMail.zimbra-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2017-11-20  6:07         ` Long Li
2017-11-20  6:07           ` Long Li
2017-11-07  8:54 ` [Patch v7 07/22] CIFS: SMBD: Implement function to create a " Long Li
2017-11-07  8:55 ` [Patch v7 09/22] CIFS: SMBD: Implement function to reconnect to a SMB Direct transport Long Li
2017-11-07  8:55 ` [Patch v7 10/22] CIFS: SMBD: Upper layer reconnects to SMB Direct session Long Li
2017-11-07  8:55 ` [Patch v7 11/22] CIFS: SMBD: Implement function to destroy a SMB Direct connection Long Li
2017-11-07  8:55 ` [Patch v7 12/22] CIFS: SMBD: Upper layer destroys SMB Direct session on shutdown or umount Long Li
2017-11-07  8:55 ` [Patch v7 13/22] CIFS: SMBD: Set SMB Direct maximum read or write size for I/O Long Li
2017-11-07  8:55 ` [Patch v7 14/22] CIFS: SMBD: Implement function to receive data via RDMA receive Long Li
2017-11-07  8:55 ` Long Li [this message]
2017-11-07  8:55 ` [Patch v7 17/22] CIFS: SMBD: Upper layer sends data via RDMA send Long Li
2017-11-07  8:55 ` [Patch v7 18/22] CIFS: SMBD: Implement RDMA memory registration Long Li
2017-11-07  8:55 ` [Patch v7 19/22] CIFS: SMBD: Upper layer performs SMB write via RDMA read through " Long Li
2017-11-07  8:55 ` [Patch v7 20/22] CIFS: SMBD: Read correct returned data length for RDMA write (SMB read) I/O Long Li
2017-11-07  8:55 ` [Patch v7 21/22] CIFS: SMBD: Upper layer performs SMB read via RDMA write through memory registration Long Li
2018-09-19  5:59   ` Tom Talpey
2018-09-20 17:01     ` Long Li
2018-09-22  3:56     ` Stefan Metzmacher
2018-09-22 17:16       ` Tom Talpey
2018-09-23 21:24         ` Stefan Metzmacher
2018-09-24  4:00           ` Tom Talpey
2018-09-24  4:07             ` Stefan Metzmacher
2017-11-07  8:55 ` [Patch v7 22/22] CIFS: SMBD: Add SMB Direct debug counters Long Li

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20171107085514.12693-17-longli@exchange.microsoft.com \
    --to=longli@exchange.microsoft.com \
    --cc=hch@infradead.org \
    --cc=linux-cifs@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=longli@microsoft.com \
    --cc=mawilcox@microsoft.com \
    --cc=samba-technical@lists.samba.org \
    --cc=sfrench@samba.org \
    --cc=sthemmin@microsoft.com \
    --cc=ttalpey@microsoft.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.