public inbox for linux-rdma@vger.kernel.org
 help / color / mirror / Atom feed
From: Michael Guralnik <michaelgur@nvidia.com>
To: <leonro@nvidia.com>, <jgg@nvidia.com>
Cc: <linux-rdma@vger.kernel.org>, <saeedm@nvidia.com>,
	<tariqt@nvidia.com>, Michael Guralnik <michaelgur@nvidia.com>
Subject: [PATCH v2 rdma-next 6/8] RDMA/mlx5: Add handling for memory scheme page fault events
Date: Mon, 9 Sep 2024 13:05:02 +0300	[thread overview]
Message-ID: <20240909100504.29797-7-michaelgur@nvidia.com> (raw)
In-Reply-To: <20240909100504.29797-1-michaelgur@nvidia.com>

The memory scheme page fault event is a new approch in handling page fault
on mkeys using the on-demand-paging feature.
The major shift in handling the page fault in this scheme is that the HW
is taking responsibilty for parsing the faulted mkey instead of the
previous approach where the driver would read and parse the wqes and
query the mkeys to get to the direct mkey that we need to handle.

Therefore, the event we get from FW in this scheme will contain the
direct mkey and address we need to handle and require much less work
from driver.

Additionally, to optimize performance, the FW can generate the event on
a memory area that is larger than the faulted memory operation is
requiring, to 'prefetch' memory that is around it and will likely be
used soon.

Unlike previous types of page fault, the memory page scheme fault does
not always require a resume command after handling the page fault as the FW
can post multiple events on same mkey and will set the 'last' flag only on
the page fault that requires the resume command.

Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/odp.c | 120 +++++++++++++++++++++++++++++--
 1 file changed, 114 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 05b92f4cac0e..841725557f2a 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -401,12 +401,24 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
 
 	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
 
-	info = MLX5_ADDR_OF(page_fault_resume_in, in,
-			    page_fault_info.trans_page_fault_info);
-	MLX5_SET(trans_page_fault_info, info, page_fault_type, pfault->type);
-	MLX5_SET(trans_page_fault_info, info, fault_token, pfault->token);
-	MLX5_SET(trans_page_fault_info, info, wq_number, wq_num);
-	MLX5_SET(trans_page_fault_info, info, error, !!error);
+	if (pfault->event_subtype == MLX5_PFAULT_SUBTYPE_MEMORY) {
+		info = MLX5_ADDR_OF(page_fault_resume_in, in,
+				    page_fault_info.mem_page_fault_info);
+		MLX5_SET(mem_page_fault_info, info, fault_token_31_0,
+			 pfault->token & 0xffffffff);
+		MLX5_SET(mem_page_fault_info, info, fault_token_47_32,
+			 (pfault->token >> 32) & 0xffff);
+		MLX5_SET(mem_page_fault_info, info, error, !!error);
+	} else {
+		info = MLX5_ADDR_OF(page_fault_resume_in, in,
+				    page_fault_info.trans_page_fault_info);
+		MLX5_SET(trans_page_fault_info, info, page_fault_type,
+			 pfault->type);
+		MLX5_SET(trans_page_fault_info, info, fault_token,
+			 pfault->token);
+		MLX5_SET(trans_page_fault_info, info, wq_number, wq_num);
+		MLX5_SET(trans_page_fault_info, info, error, !!error);
+	}
 
 	err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
 	if (err)
@@ -1388,6 +1400,63 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 	}
 }
 
+#define MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST BIT(7)
+static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev,
+					     struct mlx5_pagefault *pfault)
+{
+	u64 prefetch_va =
+		pfault->memory.va - pfault->memory.prefetch_before_byte_count;
+	size_t prefetch_size = pfault->memory.prefetch_before_byte_count +
+			       pfault->memory.fault_byte_count +
+			       pfault->memory.prefetch_after_byte_count;
+	struct mlx5_ib_mkey *mmkey;
+	struct mlx5_ib_mr *mr;
+	int ret = 0;
+
+	mmkey = find_odp_mkey(dev, pfault->memory.mkey);
+	if (IS_ERR(mmkey))
+		goto err;
+
+	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+
+	/* If prefetch fails, handle only demanded page fault */
+	ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true);
+	if (ret < 0) {
+		ret = pagefault_mr(mr, pfault->memory.va,
+				   pfault->memory.fault_byte_count, NULL, 0,
+				   true);
+		if (ret < 0)
+			goto err;
+	}
+
+	mlx5_update_odp_stats(mr, faults, ret);
+	mlx5r_deref_odp_mkey(mmkey);
+
+	if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST)
+		mlx5_ib_page_fault_resume(dev, pfault, 0);
+
+	mlx5_ib_dbg(
+		dev,
+		"PAGE FAULT completed %s. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x\n",
+		pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST ?
+			"" :
+			"without resume cmd",
+		pfault->token, pfault->memory.mkey, pfault->memory.va,
+		pfault->memory.fault_byte_count);
+
+	return;
+
+err:
+	if (!IS_ERR(mmkey))
+		mlx5r_deref_odp_mkey(mmkey);
+	mlx5_ib_page_fault_resume(dev, pfault, 1);
+	mlx5_ib_dbg(
+		dev,
+		"PAGE FAULT error. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x, err: %d\n",
+		pfault->token, pfault->memory.mkey, pfault->memory.va,
+		pfault->memory.fault_byte_count, ret);
+}
+
 static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
 {
 	u8 event_subtype = pfault->event_subtype;
@@ -1399,6 +1468,9 @@ static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfaul
 	case MLX5_PFAULT_SUBTYPE_RDMA:
 		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
 		break;
+	case MLX5_PFAULT_SUBTYPE_MEMORY:
+		mlx5_ib_mr_memory_pfault_handler(dev, pfault);
+		break;
 	default:
 		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
 			    event_subtype);
@@ -1417,6 +1489,7 @@ static void mlx5_ib_eqe_pf_action(struct work_struct *work)
 	mempool_free(pfault, eq->pool);
 }
 
+#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096
 static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
 {
 	struct mlx5_eqe_page_fault *pf_eqe;
@@ -1487,6 +1560,41 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
 				pfault->wqe.wqe_index);
 			break;
 
+		case MLX5_PFAULT_SUBTYPE_MEMORY:
+			/* Memory based event */
+			pfault->bytes_committed = 0;
+			pfault->token =
+				be32_to_cpu(pf_eqe->memory.token31_0) |
+				((u64)be16_to_cpu(pf_eqe->memory.token47_32)
+				 << 32);
+			pfault->memory.va = be64_to_cpu(pf_eqe->memory.va);
+			pfault->memory.mkey = be32_to_cpu(pf_eqe->memory.mkey);
+			pfault->memory.fault_byte_count = (be32_to_cpu(
+				pf_eqe->memory.demand_fault_pages) >> 12) *
+				MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
+			pfault->memory.prefetch_before_byte_count =
+				be16_to_cpu(
+					pf_eqe->memory.pre_demand_fault_pages) *
+				MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
+			pfault->memory.prefetch_after_byte_count =
+				be16_to_cpu(
+					pf_eqe->memory.post_demand_fault_pages) *
+				MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
+			pfault->memory.flags = pf_eqe->memory.flags;
+			mlx5_ib_dbg(
+				eq->dev,
+				"PAGE_FAULT: subtype: 0x%02x, token: 0x%06llx, mkey: 0x%06x, fault_byte_count: 0x%06x, va: 0x%016llx, flags: 0x%02x\n",
+				eqe->sub_type, pfault->token,
+				pfault->memory.mkey,
+				pfault->memory.fault_byte_count,
+				pfault->memory.va, pfault->memory.flags);
+			mlx5_ib_dbg(
+				eq->dev,
+				"PAGE_FAULT: prefetch size: before: 0x%06x, after 0x%06x\n",
+				pfault->memory.prefetch_before_byte_count,
+				pfault->memory.prefetch_after_byte_count);
+			break;
+
 		default:
 			mlx5_ib_warn(eq->dev,
 				     "Unsupported page fault event sub-type: 0x%02hhx\n",
-- 
2.17.2


  parent reply	other threads:[~2024-09-09 10:05 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-09 10:04 [PATCH v2 rdma-next 0/8] Introduce mlx5 Memory Scheme ODP Michael Guralnik
2024-09-09 10:04 ` [PATCH v2 rdma-next 1/8] net/mlx5: Expand mkey page size to support 6 bits Michael Guralnik
2024-09-09 10:04 ` [PATCH v2 rdma-next 2/8] net/mlx5: Expose HW bits for Memory scheme ODP Michael Guralnik
2024-09-09 10:04 ` [PATCH v2 rdma-next 3/8] RDMA/mlx5: Add new ODP memory scheme eqe format Michael Guralnik
2024-09-09 10:05 ` [PATCH v2 rdma-next 4/8] RDMA/mlx5: Enforce umem boundaries for explicit ODP page faults Michael Guralnik
2024-09-09 10:05 ` [PATCH v2 rdma-next 5/8] RDMA/mlx5: Split ODP mkey search logic Michael Guralnik
2024-09-09 10:05 ` Michael Guralnik [this message]
2024-09-09 10:05 ` [PATCH v2 rdma-next 7/8] RDMA/mlx5: Add implicit MR handling to ODP memory scheme Michael Guralnik
2024-09-09 10:05 ` [PATCH v2 rdma-next 8/8] net/mlx5: Handle memory scheme ODP capabilities Michael Guralnik
2024-09-11 12:07 ` [PATCH v2 rdma-next 0/8] Introduce mlx5 Memory Scheme ODP Leon Romanovsky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240909100504.29797-7-michaelgur@nvidia.com \
    --to=michaelgur@nvidia.com \
    --cc=jgg@nvidia.com \
    --cc=leonro@nvidia.com \
    --cc=linux-rdma@vger.kernel.org \
    --cc=saeedm@nvidia.com \
    --cc=tariqt@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox