From: Aurelien Aptel <aaptel@nvidia.com>
To: linux-nvme@lists.infradead.org, netdev@vger.kernel.org,
sagi@grimberg.me, hch@lst.de, kbusch@kernel.org, axboe@fb.com,
chaitanyak@nvidia.com, davem@davemloft.net, kuba@kernel.org
Cc: aaptel@nvidia.com, aurelien.aptel@gmail.com, smalin@nvidia.com,
malin1024@gmail.com, ogerlitz@nvidia.com, yorayz@nvidia.com,
borisp@nvidia.com, galshalom@nvidia.com, mgurtovoy@nvidia.com,
tariqt@nvidia.com, gus@collabora.com
Subject: [PATCH v28 18/20] net/mlx5e: NVMEoTCP, async ddp invalidation
Date: Wed, 30 Apr 2025 08:57:39 +0000
Message-ID: <20250430085741.5108-19-aaptel@nvidia.com>
In-Reply-To: <20250430085741.5108-1-aaptel@nvidia.com>
From: Ben Ben-Ishay <benishay@nvidia.com>
After the ULP has consumed the buffers of the offloaded request, it calls
the ddp_teardown op to release the NIC mapping for them and allow the NIC
to reuse the HW contexts associated with offloading this IO. We do a
fast/async un-mapping via UMR WQE. In this case, the ULP holds off
completing the request towards the upper/application layers until the HW
unmapping is done.
When the corresponding CQE is received, a notification is done via the
teardown_done ddp callback advertised by the ULP in the ddp context.
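To make the handshake concrete, here is a minimal ULP-side sketch. Only
ulp_ddp_ulp_ops and the ddp_teardown_done(ddp_ctx) signature come from
this series; the function names and the nvme_complete_rq() completion are
illustrative assumptions, not part of this patch:

/*
 * Hypothetical ULP-side flow, for illustration only.  After calling the
 * driver's ddp_teardown op, the ULP must not complete the request; it
 * waits for the driver to invoke ->ddp_teardown_done() from the UMR
 * invalidation CQE.
 */
static void my_ulp_teardown_done(void *ddp_ctx)
{
	struct request *rq = ddp_ctx;

	/* HW un-mapping is done; now it is safe to complete the IO */
	nvme_complete_rq(rq);
}

static const struct ulp_ddp_ulp_ops my_ulp_ddp_ops = {
	.ddp_teardown_done = my_ulp_teardown_done,
	/* .resync_request, ... */
};

On the driver side this maps to mlx5e_nvmeotcp_ddp_inv_done() below,
which runs on the invalidation CQE, unmaps the SGL and then fires this
callback.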
Signed-off-by: Ben Ben-Ishay <benishay@nvidia.com>
Signed-off-by: Boris Pismenny <borisp@nvidia.com>
Signed-off-by: Or Gerlitz <ogerlitz@nvidia.com>
Signed-off-by: Yoray Zack <yorayz@nvidia.com>
Signed-off-by: Aurelien Aptel <aaptel@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/en/txrx.h | 4 ++
.../mellanox/mlx5/core/en_accel/nvmeotcp.c | 58 +++++++++++++++++--
.../mellanox/mlx5/core/en_accel/nvmeotcp.h | 1 +
.../net/ethernet/mellanox/mlx5/core/en_rx.c | 6 ++
4 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index 528e5c97f5a4..b9264a7fe587 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -73,6 +73,7 @@ enum mlx5e_icosq_wqe_type {
#endif
#ifdef CONFIG_MLX5_EN_NVMEOTCP
MLX5E_ICOSQ_WQE_UMR_NVMEOTCP,
+ MLX5E_ICOSQ_WQE_UMR_NVMEOTCP_INVALIDATE,
MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP,
#endif
};
@@ -264,6 +265,9 @@ struct mlx5e_icosq_wqe_info {
struct {
struct mlx5e_nvmeotcp_queue *queue;
} nvmeotcp_q;
+ struct {
+ struct mlx5e_nvmeotcp_queue_entry *entry;
+ } nvmeotcp_qe;
#endif
};
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c
index 48dd242af2bb..639a9187d88c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c
@@ -173,6 +173,13 @@ build_nvmeotcp_klm_umr(struct mlx5e_nvmeotcp_queue *queue,
cseg->qpn_ds = cpu_to_be32((sqn << MLX5_WQE_CTRL_QPN_SHIFT) | ds_cnt);
cseg->general_id = cpu_to_be32(id);
+ if (!klm_entries) { /* this is invalidate */
+ ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
+ ucseg->flags = MLX5_UMR_INLINE;
+ mkc->status = MLX5_MKEY_STATUS_FREE;
+ return;
+ }
+
if (klm_type == KLM_UMR && !klm_offset) {
ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_XLT_OCT_SIZE |
MLX5_MKEY_MASK_LEN |
@@ -285,8 +292,8 @@ build_nvmeotcp_static_params(struct mlx5e_nvmeotcp_queue *queue,
static void
mlx5e_nvmeotcp_fill_wi(struct mlx5e_nvmeotcp_queue *nvmeotcp_queue,
- struct mlx5e_icosq *sq, u32 wqebbs, u16 pi,
- enum wqe_type type)
+ struct mlx5e_icosq *sq, u32 wqebbs,
+ u16 pi, u16 ccid, enum wqe_type type)
{
struct mlx5e_icosq_wqe_info *wi = &sq->db.wqe_info[pi];
@@ -298,6 +305,10 @@ mlx5e_nvmeotcp_fill_wi(struct mlx5e_nvmeotcp_queue *nvmeotcp_queue,
wi->wqe_type = MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP;
wi->nvmeotcp_q.queue = nvmeotcp_queue;
break;
+ case KLM_INV_UMR:
+ wi->wqe_type = MLX5E_ICOSQ_WQE_UMR_NVMEOTCP_INVALIDATE;
+ wi->nvmeotcp_qe.entry = &nvmeotcp_queue->ccid_table[ccid];
+ break;
default:
/* cases where no further action is required upon
* completion, such as ddp setup
@@ -319,7 +330,7 @@ mlx5e_nvmeotcp_rx_post_static_params_wqe(struct mlx5e_nvmeotcp_queue *queue,
wqebbs = MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS;
pi = mlx5e_icosq_get_next_pi(sq, wqebbs);
wqe = MLX5E_TRANSPORT_FETCH_SET_STATIC_PARAMS_WQE(sq, pi);
- mlx5e_nvmeotcp_fill_wi(NULL, sq, wqebbs, pi, BSF_UMR);
+ mlx5e_nvmeotcp_fill_wi(NULL, sq, wqebbs, pi, 0, BSF_UMR);
build_nvmeotcp_static_params(queue, wqe, resync_seq, queue->crc_rx);
sq->pc += wqebbs;
mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl);
@@ -337,7 +348,7 @@ mlx5e_nvmeotcp_rx_post_progress_params_wqe(struct mlx5e_nvmeotcp_queue *queue,
wqebbs = MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS;
pi = mlx5e_icosq_get_next_pi(sq, wqebbs);
wqe = MLX5E_NVMEOTCP_FETCH_PROGRESS_PARAMS_WQE(sq, pi);
- mlx5e_nvmeotcp_fill_wi(queue, sq, wqebbs, pi, SET_PSV_UMR);
+ mlx5e_nvmeotcp_fill_wi(queue, sq, wqebbs, pi, 0, SET_PSV_UMR);
build_nvmeotcp_progress_params(queue, wqe, seq);
sq->pc += wqebbs;
mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl);
@@ -363,7 +374,7 @@ post_klm_wqe(struct mlx5e_nvmeotcp_queue *queue,
wqebbs = DIV_ROUND_UP(wqe_sz, MLX5_SEND_WQE_BB);
pi = mlx5e_icosq_get_next_pi(sq, wqebbs);
wqe = MLX5E_NVMEOTCP_FETCH_KLM_WQE(sq, pi);
- mlx5e_nvmeotcp_fill_wi(queue, sq, wqebbs, pi, wqe_type);
+ mlx5e_nvmeotcp_fill_wi(queue, sq, wqebbs, pi, ccid, wqe_type);
build_nvmeotcp_klm_umr(queue, wqe, ccid, cur_klm_entries, klm_offset,
klm_length, wqe_type);
sq->pc += wqebbs;
@@ -378,7 +389,10 @@ mlx5e_nvmeotcp_post_klm_wqe(struct mlx5e_nvmeotcp_queue *queue,
struct mlx5e_icosq *sq = &queue->sq;
u32 klm_offset = 0, wqes, i;
- wqes = DIV_ROUND_UP(klm_length, queue->max_klms_per_wqe);
+ if (wqe_type == KLM_INV_UMR)
+ wqes = 1;
+ else
+ wqes = DIV_ROUND_UP(klm_length, queue->max_klms_per_wqe);
spin_lock_bh(&queue->sq_lock);
@@ -905,12 +919,44 @@ void mlx5e_nvmeotcp_ctx_complete(struct mlx5e_icosq_wqe_info *wi)
complete(&queue->static_params_done);
}
+void mlx5e_nvmeotcp_ddp_inv_done(struct mlx5e_icosq_wqe_info *wi)
+{
+ struct mlx5e_nvmeotcp_queue_entry *q_entry = wi->nvmeotcp_qe.entry;
+ struct mlx5e_nvmeotcp_queue *queue = q_entry->queue;
+ struct mlx5_core_dev *mdev = queue->priv->mdev;
+ struct ulp_ddp_io *ddp = q_entry->ddp;
+ const struct ulp_ddp_ulp_ops *ulp_ops;
+
+ dma_unmap_sg(mdev->device, ddp->sg_table.sgl,
+ ddp->nents, DMA_FROM_DEVICE);
+
+ q_entry->sgl_length = 0;
+
+ ulp_ops = inet_csk(queue->sk)->icsk_ulp_ddp_ops;
+ if (ulp_ops && ulp_ops->ddp_teardown_done)
+ ulp_ops->ddp_teardown_done(q_entry->ddp_ctx);
+}
+
static void
mlx5e_nvmeotcp_ddp_teardown(struct net_device *netdev,
struct sock *sk,
struct ulp_ddp_io *ddp,
void *ddp_ctx)
{
+ struct mlx5e_nvmeotcp_queue_entry *q_entry;
+ struct mlx5e_nvmeotcp_queue *queue;
+
+ queue = container_of(ulp_ddp_get_ctx(sk), struct mlx5e_nvmeotcp_queue,
+ ulp_ddp_ctx);
+ q_entry = &queue->ccid_table[ddp->command_id];
+ WARN_ONCE(q_entry->sgl_length == 0,
+ "Invalidation of empty sgl (CID 0x%x, queue 0x%x)\n",
+ ddp->command_id, queue->id);
+
+ q_entry->ddp_ctx = ddp_ctx;
+ q_entry->queue = queue;
+
+ mlx5e_nvmeotcp_post_klm_wqe(queue, KLM_INV_UMR, ddp->command_id, 0);
}
static void
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h
index 4850c19e18c7..67805adc6fdf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h
@@ -113,6 +113,7 @@ void mlx5e_nvmeotcp_cleanup(struct mlx5e_priv *priv);
struct mlx5e_nvmeotcp_queue *
mlx5e_nvmeotcp_get_queue(struct mlx5e_nvmeotcp *nvmeotcp, int id);
void mlx5e_nvmeotcp_put_queue(struct mlx5e_nvmeotcp_queue *queue);
+void mlx5e_nvmeotcp_ddp_inv_done(struct mlx5e_icosq_wqe_info *wi);
void mlx5e_nvmeotcp_ctx_complete(struct mlx5e_icosq_wqe_info *wi);
static inline void mlx5e_nvmeotcp_init_rx(struct mlx5e_priv *priv) {}
void mlx5e_nvmeotcp_cleanup_rx(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 38c8825d8678..8af51d1886bd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -958,6 +958,9 @@ void mlx5e_free_icosq_descs(struct mlx5e_icosq *sq)
break;
#endif
#ifdef CONFIG_MLX5_EN_NVMEOTCP
+ case MLX5E_ICOSQ_WQE_UMR_NVMEOTCP_INVALIDATE:
+ mlx5e_nvmeotcp_ddp_inv_done(wi);
+ break;
case MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP:
mlx5e_nvmeotcp_ctx_complete(wi);
break;
@@ -1068,6 +1071,9 @@ int mlx5e_poll_ico_cq(struct mlx5e_cq *cq, int budget)
#ifdef CONFIG_MLX5_EN_NVMEOTCP
case MLX5E_ICOSQ_WQE_UMR_NVMEOTCP:
break;
+ case MLX5E_ICOSQ_WQE_UMR_NVMEOTCP_INVALIDATE:
+ mlx5e_nvmeotcp_ddp_inv_done(wi);
+ break;
case MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP:
mlx5e_nvmeotcp_ctx_complete(wi);
break;
--
2.34.1
Thread overview: 37+ messages
2025-04-30 8:57 [PATCH v28 00/20] nvme-tcp receive offloads Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 01/20] net: Introduce direct data placement tcp offload Aurelien Aptel
2025-05-14 7:12 ` Eric Dumazet
2025-05-16 14:47 ` Aurelien Aptel
2025-05-16 22:31 ` Jakub Kicinski
2025-05-17 7:38 ` Eric Dumazet
2025-05-22 15:01 ` Aurelien Aptel
2025-06-04 12:33 ` Aurelien Aptel
2025-06-04 12:55 ` Eric Dumazet
2025-06-05 11:54 ` Aurelien Aptel
2025-06-05 12:44 ` Eric Dumazet
2025-04-30 8:57 ` [PATCH v28 02/20] netlink: add new family to manage ULP_DDP enablement and stats Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 03/20] iov_iter: skip copy if src == dst for direct data placement Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 04/20] net/tls,core: export get_netdev_for_sock Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 05/20] nvme-tcp: Add DDP offload control path Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 06/20] nvme-tcp: Add DDP data-path Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 07/20] nvme-tcp: RX DDGST offload Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 08/20] nvme-tcp: Deal with netdevice DOWN events Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 09/20] Documentation: add ULP DDP offload documentation Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 10/20] net/mlx5e: Rename from tls to transport static params Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 11/20] net/mlx5e: Refactor ico sq polling to get budget Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 12/20] net/mlx5: Add NVMEoTCP caps, HW bits, 128B CQE and enumerations Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 13/20] net/mlx5e: NVMEoTCP, offload initialization Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 14/20] net/mlx5e: TCP flow steering for nvme-tcp acceleration Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 15/20] net/mlx5e: NVMEoTCP, use KLM UMRs for buffer registration Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 16/20] net/mlx5e: NVMEoTCP, queue init/teardown Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 17/20] net/mlx5e: NVMEoTCP, ddp setup and resync Aurelien Aptel
2025-04-30 8:57 ` Aurelien Aptel [this message]
2025-04-30 8:57 ` [PATCH v28 19/20] net/mlx5e: NVMEoTCP, data-path for DDP+DDGST offload Aurelien Aptel
2025-04-30 8:57 ` [PATCH v28 20/20] net/mlx5e: NVMEoTCP, statistics Aurelien Aptel
2025-04-30 12:52 ` [PATCH v28 00/20] nvme-tcp receive offloads Gustavo Padovan
2025-05-05 20:43 ` Jakub Kicinski
2025-05-05 21:51 ` Keith Busch
2025-05-05 22:51 ` Jakub Kicinski
2025-05-13 12:56 ` Aurelien Aptel
2025-05-13 14:36 ` Jakub Kicinski
2025-05-06 13:34 ` Sagi Grimberg