From: Michael Guralnik <michaelgur@nvidia.com>
To: <leonro@nvidia.com>, <jgg@nvidia.com>
Cc: <linux-rdma@vger.kernel.org>, <saeedm@nvidia.com>,
	<tariqt@nvidia.com>, Michael Guralnik <michaelgur@nvidia.com>
Subject: [PATCH v2 rdma-next 7/8] RDMA/mlx5: Add implicit MR handling to ODP memory scheme
Date: Mon, 9 Sep 2024 13:05:03 +0300
Message-ID: <20240909100504.29797-8-michaelgur@nvidia.com>
In-Reply-To: <20240909100504.29797-1-michaelgur@nvidia.com>

In the ODP memory scheme, implicit MRs must allocate a private null mkey
per MR, and each entry of the parent KSM mkey is programmed with that key
and a per-range VA, rather than the device-wide reserved null mkey with
VA 0.
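
As a rough illustration (plain C with hypothetical names, not the kernel
code; populate_klm() in the diff below holds the real logic), an
unpopulated KSM entry is filled differently under the two schemes:

  #include <stddef.h>
  #include <stdint.h>
  #include <stdbool.h>

  /* Illustrative only: bytes covered by one implicit child MR. */
  #define IMR_MTT_SIZE (1ULL << 30)

  struct ksm_entry {
  	uint32_t key; /* mkey that backs this range */
  	uint64_t va;  /* VA programmed into the entry */
  };

  /*
   * Transport scheme: every unpopulated range points at the device-wide
   * reserved null mkey with va = 0. Memory scheme: unpopulated ranges
   * point at the implicit MR's private null mkey, with va advancing per
   * index so a fault on it identifies which range was touched.
   */
  static void fill_unpopulated(struct ksm_entry *e, size_t idx,
  			     bool memory_scheme,
  			     uint32_t reserved_null_mkey,
  			     uint32_t private_null_mkey)
  {
  	if (memory_scheme) {
  		e->key = private_null_mkey;
  		e->va = idx * IMR_MTT_SIZE;
  	} else {
  		e->key = reserved_null_mkey;
  		e->va = 0;
  	}
  }
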
Page faults are received on the private null mkey, so the null mkey is
also stored in the odp_mkeys xarray; implicit child mkeys are stored
there as well, so the fault handler can resolve either one back to the
MR that services the fault.
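
The fault handler then resolves a faulting mkey back to an MR along
these lines (a simplified sketch with made-up types; the real dispatch
is mlx5_ib_mr_memory_pfault_handler() in the diff):

  #include <stddef.h>

  #define container_of(ptr, type, member) \
  	((type *)((char *)(ptr) - offsetof(type, member)))

  enum mkey_type { MKEY_MR, MKEY_NULL, MKEY_IMPLICIT_CHILD };

  struct mkey { enum mkey_type type; };

  struct mr {
  	struct mkey mmkey;      /* the MR's own mkey */
  	struct mkey null_mmkey; /* private null mkey; implicit parents only */
  	struct mr *parent;      /* set on implicit child MRs */
  };

  /* Map the faulting mkey back to the MR that services the fault. */
  static struct mr *fault_mr(struct mkey *mmkey)
  {
  	switch (mmkey->type) {
  	case MKEY_IMPLICIT_CHILD:
  		/* A fault on a child is serviced through its parent. */
  		return container_of(mmkey, struct mr, mmkey)->parent;
  	case MKEY_NULL:
  		/* The private null mkey is embedded in the parent MR. */
  		return container_of(mmkey, struct mr, null_mmkey);
  	default:
  		return container_of(mmkey, struct mr, mmkey);
  	}
  }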

Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   3 +
 drivers/infiniband/hw/mlx5/odp.c     | 116 +++++++++++++++++++++++++--
 2 files changed, 111 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index ea8eb368108f..227dbaf7a754 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -630,6 +630,8 @@ enum mlx5_mkey_type {
 	MLX5_MKEY_MR = 1,
 	MLX5_MKEY_MW,
 	MLX5_MKEY_INDIRECT_DEVX,
+	MLX5_MKEY_NULL,
+	MLX5_MKEY_IMPLICIT_CHILD,
 };
 
 struct mlx5r_cache_rb_key {
@@ -715,6 +717,7 @@ struct mlx5_ib_mr {
 			struct mlx5_ib_mr *dd_crossed_mr;
 			struct list_head dd_node;
 			u8 revoked :1;
+			struct mlx5_ib_mkey null_mmkey;
 		};
 	};
 };
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 841725557f2a..4b37446758fd 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -107,13 +107,20 @@ static u64 mlx5_imr_ksm_entries;
 static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 			struct mlx5_ib_mr *imr, int flags)
 {
+	struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev;
 	struct mlx5_klm *end = pklm + nentries;
+	int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0;
+	__be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ?
+			     cpu_to_be32(imr->null_mmkey.key) :
+			     mr_to_mdev(imr)->mkeys.null_mkey;
+	u64 va =
+		MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0;
 
 	if (flags & MLX5_IB_UPD_XLT_ZAP) {
-		for (; pklm != end; pklm++, idx++) {
+		for (; pklm != end; pklm++, idx++, va += step) {
 			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
-			pklm->key = mr_to_mdev(imr)->mkeys.null_mkey;
-			pklm->va = 0;
+			pklm->key = key;
+			pklm->va = cpu_to_be64(va);
 		}
 		return;
 	}
@@ -137,7 +144,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	 */
 	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
 
-	for (; pklm != end; pklm++, idx++) {
+	for (; pklm != end; pklm++, idx++, va += step) {
 		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
 
 		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
@@ -145,8 +152,8 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
 			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
 		} else {
-			pklm->key = mr_to_mdev(imr)->mkeys.null_mkey;
-			pklm->va = 0;
+			pklm->key = key;
+			pklm->va = cpu_to_be64(va);
 		}
 	}
 }
@@ -225,6 +232,9 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
 		return;
 
 	xa_erase(&imr->implicit_children, idx);
+	if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
+		xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+			 mlx5_base_mkey(mr->mmkey.key));
 
 	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
 	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
@@ -492,6 +502,16 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	}
 	xa_unlock(&imr->implicit_children);
 
+	if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
+		ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+			       &mr->mmkey, GFP_KERNEL);
+		if (xa_is_err(ret)) {
+			ret = ERR_PTR(xa_err(ret));
+			xa_erase(&imr->implicit_children, idx);
+			goto out_mr;
+		}
+		mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
+	}
 	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
 	return mr;
 
@@ -502,6 +522,57 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	return ret;
 }
 
+/*
+ * When using the memory scheme of ODP, implicit MRs can't use the reserved
+ * null mkey; each implicit MR needs its own private null mkey to receive
+ * page faults on.
+ * The null mkey is created with properties that make every access to it
+ * trigger a page fault, and with all relevant access flags set.
+ */
+static int alloc_implicit_mr_null_mkey(struct mlx5_ib_dev *dev,
+				       struct mlx5_ib_mr *imr,
+				       struct mlx5_ib_pd *pd)
+{
+	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 64;
+	void *mkc;
+	u32 *in;
+	int err;
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 4);
+	MLX5_SET(create_mkey_in, in, pg_access, 1);
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, a, 1);
+	MLX5_SET(mkc, mkc, rw, 1);
+	MLX5_SET(mkc, mkc, rr, 1);
+	MLX5_SET(mkc, mkc, lw, 1);
+	MLX5_SET(mkc, mkc, lr, 1);
+	MLX5_SET(mkc, mkc, free, 0);
+	MLX5_SET(mkc, mkc, umr_en, 0);
+	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+
+	MLX5_SET(mkc, mkc, translations_octword_size, 4);
+	MLX5_SET(mkc, mkc, log_page_size, 61);
+	MLX5_SET(mkc, mkc, length64, 1);
+	MLX5_SET(mkc, mkc, pd, pd->pdn);
+	MLX5_SET64(mkc, mkc, start_addr, 0);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+
+	err = mlx5_core_create_mkey(dev->mdev, &imr->null_mmkey.key, in, inlen);
+	if (err)
+		goto free_in;
+
+	imr->null_mmkey.type = MLX5_MKEY_NULL;
+
+free_in:
+	kfree(in);
+	return err;
+}
+
 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 					     int access_flags)
 {
@@ -534,6 +605,16 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	imr->is_odp_implicit = true;
 	xa_init(&imr->implicit_children);
 
+	if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
+		err = alloc_implicit_mr_null_mkey(dev, imr, pd);
+		if (err)
+			goto out_mr;
+
+		err = mlx5r_store_odp_mkey(dev, &imr->null_mmkey);
+		if (err)
+			goto out_mr;
+	}
+
 	err = mlx5r_umr_update_xlt(imr, 0,
 				   mlx5_imr_ksm_entries,
 				   MLX5_KSM_PAGE_SHIFT,
@@ -568,6 +649,14 @@ void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
 		xa_erase(&mr->implicit_children, idx);
 		mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
 	}
+
+	if (mr->null_mmkey.key) {
+		xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+			 mlx5_base_mkey(mr->null_mmkey.key));
+
+		mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev,
+				       mr->null_mmkey.key);
+	}
 }
 
 #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
@@ -1410,14 +1499,25 @@ static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev,
 			       pfault->memory.fault_byte_count +
 			       pfault->memory.prefetch_after_byte_count;
 	struct mlx5_ib_mkey *mmkey;
-	struct mlx5_ib_mr *mr;
+	struct mlx5_ib_mr *mr, *child_mr;
 	int ret = 0;
 
 	mmkey = find_odp_mkey(dev, pfault->memory.mkey);
 	if (IS_ERR(mmkey))
 		goto err;
 
-	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+	switch (mmkey->type) {
+	case MLX5_MKEY_IMPLICIT_CHILD:
+		child_mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+		mr = child_mr->parent;
+		break;
+	case MLX5_MKEY_NULL:
+		mr = container_of(mmkey, struct mlx5_ib_mr, null_mmkey);
+		break;
+	default:
+		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+		break;
+	}
 
 	/* If prefetch fails, handle only demanded page fault */
 	ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true);
-- 
2.17.2


Thread overview: 10+ messages
2024-09-09 10:04 [PATCH v2 rdma-next 0/8] Introduce mlx5 Memory Scheme ODP Michael Guralnik
2024-09-09 10:04 ` [PATCH v2 rdma-next 1/8] net/mlx5: Expand mkey page size to support 6 bits Michael Guralnik
2024-09-09 10:04 ` [PATCH v2 rdma-next 2/8] net/mlx5: Expose HW bits for Memory scheme ODP Michael Guralnik
2024-09-09 10:04 ` [PATCH v2 rdma-next 3/8] RDMA/mlx5: Add new ODP memory scheme eqe format Michael Guralnik
2024-09-09 10:05 ` [PATCH v2 rdma-next 4/8] RDMA/mlx5: Enforce umem boundaries for explicit ODP page faults Michael Guralnik
2024-09-09 10:05 ` [PATCH v2 rdma-next 5/8] RDMA/mlx5: Split ODP mkey search logic Michael Guralnik
2024-09-09 10:05 ` [PATCH v2 rdma-next 6/8] RDMA/mlx5: Add handling for memory scheme page fault events Michael Guralnik
2024-09-09 10:05 ` [PATCH v2 rdma-next 7/8] RDMA/mlx5: Add implicit MR handling to ODP memory scheme Michael Guralnik [this message]
2024-09-09 10:05 ` [PATCH v2 rdma-next 8/8] net/mlx5: Handle memory scheme ODP capabilities Michael Guralnik
2024-09-11 12:07 ` [PATCH v2 rdma-next 0/8] Introduce mlx5 Memory Scheme ODP Leon Romanovsky
