[PATCH v15 06/20] nvme-tcp: Add DDP data-path

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Aurelien Aptel <aaptel@nvidia.com>
To: linux-nvme@lists.infradead.org, netdev@vger.kernel.org,
	sagi@grimberg.me, hch@lst.de, kbusch@kernel.org, axboe@fb.com,
	chaitanyak@nvidia.com, davem@davemloft.net, kuba@kernel.org
Cc: Boris Pismenny <borisp@nvidia.com>,
	aaptel@nvidia.com, aurelien.aptel@gmail.com, smalin@nvidia.com,
	malin1024@gmail.com, ogerlitz@nvidia.com, yorayz@nvidia.com,
	galshalom@nvidia.com, mgurtovoy@nvidia.com
Subject: [PATCH v15 06/20] nvme-tcp: Add DDP data-path
Date: Tue, 12 Sep 2023 09:59:35 +0000	[thread overview]
Message-ID: <20230912095949.5474-7-aaptel@nvidia.com> (raw)
In-Reply-To: <20230912095949.5474-1-aaptel@nvidia.com>

From: Boris Pismenny <borisp@nvidia.com>

Introduce the NVMe-TCP DDP data-path offload.
Using this interface, the NIC hardware will scatter TCP payload directly
to the BIO pages according to the command_id in the PDU.
To maintain the correctness of the network stack, the driver is expected
to construct SKBs that point to the BIO pages.

The data-path interface contains two routines: setup/teardown.
The setup provides the mapping from command_id to the request buffers,
while the teardown removes this mapping.

For efficiency, we introduce an asynchronous nvme completion, which is
split between NVMe-TCP and the NIC driver as follows:
NVMe-TCP performs the specific completion, while NIC driver performs the
generic mq_blk completion.

Signed-off-by: Boris Pismenny <borisp@nvidia.com>
Signed-off-by: Ben Ben-Ishay <benishay@nvidia.com>
Signed-off-by: Or Gerlitz <ogerlitz@nvidia.com>
Signed-off-by: Yoray Zack <yorayz@nvidia.com>
Signed-off-by: Shai Malin <smalin@nvidia.com>
Signed-off-by: Aurelien Aptel <aaptel@nvidia.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
---
 drivers/nvme/host/tcp.c | 111 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 106 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index f8322a07e27e..48bc8afe272d 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -112,6 +112,13 @@ struct nvme_tcp_request {
 	size_t			offset;
 	size_t			data_sent;
 	enum nvme_tcp_send_state state;
+
+#ifdef CONFIG_ULP_DDP
+	bool			offloaded;
+	struct ulp_ddp_io	ddp;
+	__le16			ddp_status;
+	union nvme_result	result;
+#endif
 };
 
 enum nvme_tcp_queue_flags {
@@ -319,10 +326,72 @@ static bool nvme_tcp_ddp_query_limits(struct nvme_tcp_ctrl *ctrl)
 }
 
 static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
+static void nvme_tcp_ddp_teardown_done(void *ddp_ctx);
 static const struct ulp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = {
 	.resync_request		= nvme_tcp_resync_request,
+	.ddp_teardown_done	= nvme_tcp_ddp_teardown_done,
 };
 
+static void nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue,
+				  struct request *rq)
+{
+	struct net_device *netdev = queue->ctrl->ddp_netdev;
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+	ulp_ddp_teardown(netdev, queue->sock->sk, &req->ddp, rq);
+	sg_free_table_chained(&req->ddp.sg_table, SG_CHUNK_SIZE);
+}
+
+static void nvme_tcp_ddp_teardown_done(void *ddp_ctx)
+{
+	struct request *rq = ddp_ctx;
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+	if (!nvme_try_complete_req(rq, req->ddp_status, req->result))
+		nvme_complete_rq(rq);
+}
+
+static void nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
+			       struct request *rq)
+{
+	struct net_device *netdev = queue->ctrl->ddp_netdev;
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	int ret;
+
+	if (rq_data_dir(rq) != READ ||
+	    queue->ctrl->ddp_threshold > blk_rq_payload_bytes(rq))
+		return;
+
+	/*
+	 * DDP offload is best-effort, errors are ignored.
+	 */
+
+	req->ddp.command_id = nvme_cid(rq);
+	req->ddp.sg_table.sgl = req->ddp.first_sgl;
+	ret = sg_alloc_table_chained(&req->ddp.sg_table,
+				     blk_rq_nr_phys_segments(rq),
+				     req->ddp.sg_table.sgl, SG_CHUNK_SIZE);
+	if (ret)
+		goto err;
+	req->ddp.nents = blk_rq_map_sg(rq->q, rq, req->ddp.sg_table.sgl);
+
+	ret = ulp_ddp_setup(netdev, queue->sock->sk, &req->ddp);
+	if (ret) {
+		sg_free_table_chained(&req->ddp.sg_table, SG_CHUNK_SIZE);
+		goto err;
+	}
+
+	/* if successful, sg table is freed in nvme_tcp_teardown_ddp() */
+	req->offloaded = true;
+
+	return;
+err:
+	WARN_ONCE(ret, "ddp setup failed (queue 0x%x, cid 0x%x, ret=%d)",
+		  nvme_tcp_queue_id(queue),
+		  nvme_cid(rq),
+		  ret);
+}
+
 static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue)
 {
 	struct ulp_ddp_config config = {.type = ULP_DDP_NVME};
@@ -427,6 +496,10 @@ static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
 
 #else
 
+static void nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
+			       struct request *rq)
+{}
+
 static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
 {}
 
@@ -705,6 +778,26 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 	queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
 }
 
+static void nvme_tcp_complete_request(struct request *rq,
+				      __le16 status,
+				      union nvme_result result,
+				      __u16 command_id)
+{
+#ifdef CONFIG_ULP_DDP
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+	if (req->offloaded) {
+		req->ddp_status = status;
+		req->result = result;
+		nvme_tcp_teardown_ddp(req->queue, rq);
+		return;
+	}
+#endif
+
+	if (!nvme_try_complete_req(rq, status, result))
+		nvme_complete_rq(rq);
+}
+
 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 		struct nvme_completion *cqe)
 {
@@ -724,10 +817,9 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 	if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
 		req->status = cqe->status;
 
-	if (!nvme_try_complete_req(rq, req->status, cqe->result))
-		nvme_complete_rq(rq);
+	nvme_tcp_complete_request(rq, req->status, cqe->result,
+				  cqe->command_id);
 	queue->nr_cqe++;
-
 	return 0;
 }
 
@@ -925,10 +1017,13 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 
 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 {
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_queue *queue = req->queue;
+	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 	union nvme_result res = {};
 
-	if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
-		nvme_complete_rq(rq);
+	nvme_tcp_complete_request(rq, cpu_to_le16(status << 1), res,
+				  pdu->command_id);
 }
 
 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
@@ -1235,6 +1330,9 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 	else
 		msg.msg_flags |= MSG_EOR;
 
+	if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags))
+		nvme_tcp_setup_ddp(queue, blk_mq_rq_from_pdu(req));
+
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
@@ -2541,6 +2639,9 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
 	if (ret)
 		return ret;
 
+#ifdef CONFIG_ULP_DDP
+	req->offloaded = false;
+#endif
 	req->state = NVME_TCP_SEND_CMD_PDU;
 	req->status = cpu_to_le16(NVME_SC_SUCCESS);
 	req->offset = 0;
-- 
2.34.1

next prev parent reply	other threads:[~2023-09-12 10:00 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-12  9:59 [PATCH v15 00/20] nvme-tcp receive offloads Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 01/20] net: Introduce direct data placement tcp offload Aurelien Aptel
2023-09-12 16:17   ` David Ahern
2023-09-21  7:43     ` Aurelien Aptel
2023-09-21 12:33       ` David Ahern
2023-09-21 13:02         ` Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 02/20] netlink: add new family to manage ULP_DDP enablement and stats Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 03/20] iov_iter: skip copy if src == dst for direct data placement Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 04/20] net/tls,core: export get_netdev_for_sock Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 05/20] nvme-tcp: Add DDP offload control path Aurelien Aptel
2023-09-12 13:24   ` Sagi Grimberg
2023-09-13  9:10     ` Aurelien Aptel
2023-09-13 10:46       ` Sagi Grimberg
2023-09-18 12:53         ` Aurelien Aptel
2023-09-13 10:49   ` Sagi Grimberg
2023-09-18 18:30     ` Aurelien Aptel
2023-09-12  9:59 ` Aurelien Aptel [this message]
2023-09-13 10:51   ` [PATCH v15 06/20] nvme-tcp: Add DDP data-path Sagi Grimberg
2023-09-18 18:26     ` Aurelien Aptel
2023-09-19  7:04       ` Sagi Grimberg
2023-09-20  8:39         ` Aurelien Aptel
2023-09-20 10:11           ` Sagi Grimberg
2023-09-20 16:04             ` Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 07/20] nvme-tcp: RX DDGST offload Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 08/20] nvme-tcp: Deal with netdevice DOWN events Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 09/20] Documentation: add ULP DDP offload documentation Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 10/20] net/mlx5e: Rename from tls to transport static params Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 11/20] net/mlx5e: Refactor ico sq polling to get budget Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 12/20] net/mlx5: Add NVMEoTCP caps, HW bits, 128B CQE and enumerations Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 13/20] net/mlx5e: NVMEoTCP, offload initialization Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 14/20] net/mlx5e: TCP flow steering for nvme-tcp acceleration Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 15/20] net/mlx5e: NVMEoTCP, use KLM UMRs for buffer registration Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 16/20] net/mlx5e: NVMEoTCP, queue init/teardown Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 17/20] net/mlx5e: NVMEoTCP, ddp setup and resync Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 18/20] net/mlx5e: NVMEoTCP, async ddp invalidation Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 19/20] net/mlx5e: NVMEoTCP, data-path for DDP+DDGST offload Aurelien Aptel
2023-09-12  9:59 ` [PATCH v15 20/20] net/mlx5e: NVMEoTCP, statistics Aurelien Aptel

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:f8322a07e27 dfblob:48bc8afe272 )
 OR (
bs:"[PATCH v15 06/20] nvme-tcp: Add DDP data-path" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230912095949.5474-7-aaptel@nvidia.com \
    --to=aaptel@nvidia.com \
    --cc=aurelien.aptel@gmail.com \
    --cc=axboe@fb.com \
    --cc=borisp@nvidia.com \
    --cc=chaitanyak@nvidia.com \
    --cc=davem@davemloft.net \
    --cc=galshalom@nvidia.com \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=kuba@kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=malin1024@gmail.com \
    --cc=mgurtovoy@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=ogerlitz@nvidia.com \
    --cc=sagi@grimberg.me \
    --cc=smalin@nvidia.com \
    --cc=yorayz@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.