linux-mm.kvack.org archive mirror
From: jglisse@redhat.com
To: akpm@linux-foundation.org
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	"Linus Torvalds" <torvalds@linux-foundation.org>,
	joro@8bytes.org, "Mel Gorman" <mgorman@suse.de>,
	"H. Peter Anvin" <hpa@zytor.com>,
	"Peter Zijlstra" <peterz@infradead.org>,
	"Andrea Arcangeli" <aarcange@redhat.com>,
	"Johannes Weiner" <jweiner@redhat.com>,
	"Larry Woodman" <lwoodman@redhat.com>,
	"Rik van Riel" <riel@redhat.com>,
	"Dave Airlie" <airlied@redhat.com>,
	"Brendan Conoboy" <blc@redhat.com>,
	"Joe Donohue" <jdonohue@redhat.com>,
	"Duncan Poole" <dpoole@nvidia.com>,
	"Sherry Cheung" <SCheung@nvidia.com>,
	"Subhash Gutti" <sgutti@nvidia.com>,
	"John Hubbard" <jhubbard@nvidia.com>,
	"Mark Hairgrove" <mhairgrove@nvidia.com>,
	"Lucien Dunning" <ldunning@nvidia.com>,
	"Cameron Buschardt" <cabuschardt@nvidia.com>,
	"Arvind Gopalakrishnan" <arvindg@nvidia.com>,
	"Haggai Eran" <haggaie@mellanox.com>,
	"Shachar Raindel" <raindel@mellanox.com>,
	"Liran Liss" <liranl@mellanox.com>,
	"Roland Dreier" <roland@purestorage.com>,
	"Ben Sander" <ben.sander@amd.com>,
	"Greg Stoner" <Greg.Stoner@amd.com>,
	"John Bridgman" <John.Bridgman@amd.com>,
	"Michael Mantor" <Michael.Mantor@amd.com>,
	"Paul Blinzer" <Paul.Blinzer@amd.com>,
	"Laurent Morichetti" <Laurent.Morichetti@amd.com>,
	"Alexander Deucher" <Alexander.Deucher@amd.com>,
	"Oded Gabbay" <Oded.Gabbay@amd.com>,
	"Jérôme Glisse" <jglisse@redhat.com>,
	linux-rdma@vger.kernel.org
Subject: [PATCH 35/36] IB/mlx5/hmm: add page fault support for ODP on HMM.
Date: Thu, 21 May 2015 16:23:11 -0400	[thread overview]
Message-ID: <1432239792-5002-16-git-send-email-jglisse@redhat.com> (raw)
In-Reply-To: <1432239792-5002-1-git-send-email-jglisse@redhat.com>

From: Jérôme Glisse <jglisse@redhat.com>

This patch adds HMM-specific support for hardware page faulting of
user memory regions.
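
In outline: a hardware page fault on an ODP memory region reaches
pagefault_single_data_segment(), which looks up the MR by lkey under
SRCU, builds an hmm_event covering the faulted pages and calls
hmm_mirror_fault(). HMM resolves the fault and calls back into
mlx5_hmm_update(), which forwards HMM_DEVICE_RFAULT/WFAULT events to
mlx5_hmm_pfault(); that walks the mirror page table, sets the ODP
read/write allowed bits and pushes the updated translations to the
HCA with mlx5_ib_update_mtt().

The index and byte-count arithmetic this relies on can be checked in
isolation. The following stand-alone userspace sketch mirrors it (4K
pages assumed; all names below are local stand-ins, not kernel API):

  /* Userspace sketch of the page-fault arithmetic in
   * pagefault_single_data_segment(). PAGE_* here are local 4K
   * stand-ins, not the kernel definitions. */
  #include <stdio.h>
  #include <stdint.h>

  #define PAGE_SHIFT 12
  #define PAGE_SIZE  (1ULL << PAGE_SHIFT)
  #define PAGE_MASK  (~(PAGE_SIZE - 1))
  #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

  int main(void)
  {
      uint64_t iova    = 0x7f0000001000ULL; /* MR base (mmr.iova) */
      uint64_t io_virt = 0x7f0000003234ULL; /* faulting address */
      uint64_t bcnt    = 0x2000;            /* bytes requested */

      /* First MTT index touched (hmm_pfault.start_idx). */
      uint64_t start_idx = (io_virt - (iova & PAGE_MASK)) >> PAGE_SHIFT;
      /* Fault range expanded to page boundaries (event.start/end). */
      uint64_t start  = io_virt & PAGE_MASK;
      uint64_t end    = PAGE_ALIGN(io_virt + bcnt);
      uint64_t npages = (end - start) >> PAGE_SHIFT;
      /* bytes_mapped counts only bytes in [io_virt, io_virt + bcnt). */
      uint64_t new_mappings = npages * PAGE_SIZE - (io_virt - start);
      uint64_t mapped = new_mappings < bcnt ? new_mappings : bcnt;

      printf("start_idx=%llu npages=%llu bytes_mapped+=%llu\n",
             (unsigned long long)start_idx,
             (unsigned long long)npages,
             (unsigned long long)mapped);
      return 0; /* start_idx=2 npages=3 bytes_mapped+=8192 */
  }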

Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
cc: <linux-rdma@vger.kernel.org>
---
 drivers/infiniband/hw/mlx5/odp.c | 147 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 146 insertions(+), 1 deletion(-)
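
As a reading aid for the mlx5_hmm_pfault() loop below, here is a
stand-alone model of its per-PTE permission check. The flag values
and helpers are invented stand-ins for hmm_pte_test_valid_dma(),
hmm_pte_test_write() and the ODP_*_ALLOWED bits; this is purely
illustrative, not the real HMM API:

  #include <stdio.h>
  #include <stdbool.h>
  #include <stdint.h>

  /* Invented stand-in flags; the real bits live in the HMM and
   * mlx5 ODP headers. */
  #define PTE_VALID_DMA     (1u << 0)
  #define PTE_WRITE         (1u << 1)
  #define ODP_READ_ALLOWED  (1u << 2)
  #define ODP_WRITE_ALLOWED (1u << 3)

  /* Handle one PTE the way the loop does: reject an empty entry, or
   * a write fault on a read-only entry; otherwise grant ODP access. */
  static int fault_one_pte(uint32_t *pte, bool want_write)
  {
      if (!(*pte & PTE_VALID_DMA))
          return -1; /* empty mirror PTE -> -EINVAL upstream */
      if (want_write) {
          if (!(*pte & PTE_WRITE))
              return -1; /* wrong protection -> -EINVAL upstream */
          *pte |= ODP_WRITE_ALLOWED;
      }
      *pte |= ODP_READ_ALLOWED;
      return 0;
  }

  int main(void)
  {
      uint32_t ro = PTE_VALID_DMA;
      uint32_t rw = PTE_VALID_DMA | PTE_WRITE;
      int ret;

      ret = fault_one_pte(&ro, true);
      printf("write fault on RO pte: %d\n", ret);              /* -1 */
      ret = fault_one_pte(&rw, true);
      printf("write fault on RW pte: %d pte=0x%x\n", ret, rw); /* 0 0xf */
      return 0;
  }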

diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index bd29155..093f5b8 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -56,6 +56,55 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
 
 
+struct mlx5_hmm_pfault {
+	struct mlx5_ib_mr	*mlx5_ib_mr;
+	u64			start_idx;
+	dma_addr_t		access_mask;
+	unsigned		npages;
+	struct hmm_event	event;
+};
+
+static int mlx5_hmm_pfault(struct mlx5_ib_dev *mlx5_ib_dev,
+			   struct hmm_mirror *mirror,
+			   const struct hmm_event *event)
+{
+	struct mlx5_hmm_pfault *pfault;
+	struct hmm_pt_iter iter;
+	unsigned long addr, cnt;
+	int ret;
+
+	pfault = container_of(event, struct mlx5_hmm_pfault, event);
+	hmm_pt_iter_init(&iter);
+
+	for (addr = event->start, cnt = 0; addr < event->end;
+	     addr += PAGE_SIZE, ++cnt) {
+		dma_addr_t *ptep;
+
+		/* Get and lock pointer to mirror page table. */
+		ptep = hmm_pt_iter_update(&iter, &mirror->pt, addr);
+		/* This could be BUG_ON() as it cannot happen. */
+		if (!ptep || !hmm_pte_test_valid_dma(ptep)) {
+			pr_warn("got empty mirror page table on pagefault.\n");
+			return -EINVAL;
+		}
+		if ((pfault->access_mask & ODP_WRITE_ALLOWED_BIT)) {
+			if (!hmm_pte_test_write(ptep)) {
+				pr_warn("got wrong protection permission on "
+					"pagefault.\n");
+				return -EINVAL;
+			}
+			hmm_pte_set_bit(ptep, ODP_WRITE_ALLOWED_SHIFT);
+		}
+		hmm_pte_set_bit(ptep, ODP_READ_ALLOWED_SHIFT);
+		pfault->npages++;
+	}
+	ret = mlx5_ib_update_mtt(pfault->mlx5_ib_mr,
+				 pfault->start_idx,
+				 cnt, 0, &iter);
+	hmm_pt_iter_fini(&iter, &mirror->pt);
+	return ret;
+}
+
 int mlx5_ib_umem_invalidate(struct ib_umem *umem, u64 start,
 			    u64 end, void *cookie)
 {
@@ -178,12 +227,19 @@ static int mlx5_hmm_update(struct hmm_mirror *mirror,
 			   const struct hmm_event *event)
 {
 	struct device *device = mirror->device->dev;
+	struct mlx5_ib_dev *mlx5_ib_dev;
+	struct ib_device *ib_device;
 	int ret = 0;
 
+	ib_device = container_of(mirror->device, struct ib_device, hmm_dev);
+	mlx5_ib_dev = to_mdev(ib_device);
+
 	switch (event->etype) {
 	case HMM_DEVICE_RFAULT:
 	case HMM_DEVICE_WFAULT:
-		/* FIXME implement. */
+		ret = mlx5_hmm_pfault(mlx5_ib_dev, mirror, event);
+		if (ret)
+			return ret;
 		break;
 	case HMM_ISDIRTY:
 		hmm_mirror_range_dirty(mirror, event->start, event->end);
@@ -228,6 +284,95 @@ void mlx5_dev_fini_odp_hmm(struct ib_device *ib_device)
 	hmm_device_unregister(&ib_device->hmm_dev);
 }
 
+/*
+ * Handle a single data segment in a page-fault WQE.
+ *
+ * Returns number of pages retrieved on success. The caller will continue to
+ * the next data segment.
+ * Can return the following error codes:
+ * -EAGAIN to designate a temporary error. The caller will abort handling the
+ *  page fault and resolve it.
+ * -EFAULT when there's an error mapping the requested pages. The caller will
+ *  abort the page fault handling and possibly move the QP to an error state.
+ * On other errors the QP should also be closed with an error.
+ */
+static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
+					 struct mlx5_ib_pfault *pfault,
+					 u32 key, u64 io_virt, size_t bcnt,
+					 u32 *bytes_mapped)
+{
+	struct mlx5_ib_dev *mlx5_ib_dev = to_mdev(qp->ibqp.pd->device);
+	struct ib_mirror *ib_mirror;
+	struct mlx5_hmm_pfault hmm_pfault;
+	int srcu_key;
+	int ret = 0;
+
+	srcu_key = srcu_read_lock(&mlx5_ib_dev->mr_srcu);
+	hmm_pfault.mlx5_ib_mr = mlx5_ib_odp_find_mr_lkey(mlx5_ib_dev, key);
+	/*
+	 * If we didn't find the MR, it means the MR was closed while we were
+	 * handling the ODP event. In this case we return -EFAULT so that the
+	 * QP will be closed.
+	 */
+	if (!hmm_pfault.mlx5_ib_mr || !hmm_pfault.mlx5_ib_mr->ibmr.pd) {
+		pr_err("Failed to find relevant mr for lkey=0x%06x, probably "
+		       "the MR was destroyed\n", key);
+		ret = -EFAULT;
+		goto srcu_unlock;
+	}
+	if (!hmm_pfault.mlx5_ib_mr->umem->odp_data) {
+		pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault "
+		         "handler.\n", key);
+		if (bytes_mapped)
+			*bytes_mapped +=
+				(bcnt - pfault->mpfault.bytes_committed);
+		goto srcu_unlock;
+	}
+	if (hmm_pfault.mlx5_ib_mr->ibmr.pd != qp->ibqp.pd) {
+		pr_err("Page-fault with different PDs for QP and MR.\n");
+		ret = -EFAULT;
+		goto srcu_unlock;
+	}
+
+	ib_mirror = hmm_pfault.mlx5_ib_mr->umem->odp_data->ib_mirror;
+	if (ib_mirror->base.hmm == NULL) {
+		/* Somehow the mirror was killed from under us. */
+		ret = -EFAULT;
+		goto srcu_unlock;
+	}
+
+	/*
+	 * Avoid branches - this code will perform correctly
+	 * in all iterations (in iteration 2 and above,
+	 * bytes_committed == 0).
+	 */
+	io_virt += pfault->mpfault.bytes_committed;
+	bcnt -= pfault->mpfault.bytes_committed;
+
+	hmm_pfault.npages = 0;
+	hmm_pfault.start_idx = (io_virt - (hmm_pfault.mlx5_ib_mr->mmr.iova &
+					   PAGE_MASK)) >> PAGE_SHIFT;
+	hmm_pfault.access_mask = ODP_READ_ALLOWED_BIT;
+	hmm_pfault.access_mask |= hmm_pfault.mlx5_ib_mr->umem->writable ?
+				  ODP_WRITE_ALLOWED_BIT : 0;
+	hmm_pfault.event.start = io_virt & PAGE_MASK;
+	hmm_pfault.event.end = PAGE_ALIGN(io_virt + bcnt);
+	hmm_pfault.event.etype = hmm_pfault.mlx5_ib_mr->umem->writable ?
+				 HMM_DEVICE_WFAULT : HMM_DEVICE_RFAULT;
+	ret = hmm_mirror_fault(&ib_mirror->base, &hmm_pfault.event);
+
+	if (!ret && hmm_pfault.npages && bytes_mapped) {
+		u32 new_mappings = hmm_pfault.npages * PAGE_SIZE -
+				   (io_virt - round_down(io_virt, PAGE_SIZE));
+		*bytes_mapped += min_t(u32, new_mappings, bcnt);
+	}
+
+srcu_unlock:
+	srcu_read_unlock(&mlx5_ib_dev->mr_srcu, srcu_key);
+	pfault->mpfault.bytes_committed = 0;
+	return ret ? ret : hmm_pfault.npages;
+}
+
 
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */
 
-- 
1.9.3


Thread overview: 79+ messages
2015-05-21 19:31 HMM (Heterogeneous Memory Management) v8 j.glisse
2015-05-21 19:31 ` [PATCH 01/36] mmu_notifier: add event information to address invalidation v7 j.glisse
2015-05-30  3:43   ` John Hubbard
2015-06-01 19:03     ` Jerome Glisse
2015-06-01 23:10       ` John Hubbard
2015-06-03 16:07         ` Jerome Glisse
2015-06-03 23:02           ` John Hubbard
2015-05-21 19:31 ` [PATCH 02/36] mmu_notifier: keep track of active invalidation ranges v3 j.glisse
2015-05-27  5:09   ` Aneesh Kumar K.V
2015-05-27 14:32     ` Jerome Glisse
2015-06-02  9:32   ` John Hubbard
2015-06-03 17:15     ` Jerome Glisse
2015-06-05  3:29       ` John Hubbard
2015-05-21 19:31 ` [PATCH 03/36] mmu_notifier: pass page pointer to mmu_notifier_invalidate_page() j.glisse
2015-05-27  5:17   ` Aneesh Kumar K.V
2015-05-27 14:33     ` Jerome Glisse
2015-06-03  4:25   ` John Hubbard
2015-05-21 19:31 ` [PATCH 04/36] mmu_notifier: allow range invalidation to exclude a specific mmu_notifier j.glisse
2015-05-21 19:31 ` [PATCH 05/36] HMM: introduce heterogeneous memory management v3 j.glisse
2015-05-27  5:50   ` Aneesh Kumar K.V
2015-05-27 14:38     ` Jerome Glisse
2015-06-08 19:40   ` Mark Hairgrove
2015-06-08 21:17     ` Jerome Glisse
2015-06-09  1:54       ` Mark Hairgrove
2015-06-09 15:56         ` Jerome Glisse
2015-06-10  3:33           ` Mark Hairgrove
2015-06-10 15:42             ` Jerome Glisse
2015-06-11  1:15               ` Mark Hairgrove
2015-06-11 14:23                 ` Jerome Glisse
2015-06-11 22:26                   ` Mark Hairgrove
2015-06-15 14:32                     ` Jerome Glisse
2015-05-21 19:31 ` [PATCH 06/36] HMM: add HMM page table v2 j.glisse
2015-06-19  2:06   ` Mark Hairgrove
2015-06-19 18:07     ` Jerome Glisse
2015-06-20  2:34       ` Mark Hairgrove
2015-06-25 22:57   ` Mark Hairgrove
2015-06-26 16:30     ` Jerome Glisse
2015-06-27  1:34       ` Mark Hairgrove
2015-06-29 14:43         ` Jerome Glisse
2015-07-01  2:51           ` Mark Hairgrove
2015-07-01 15:07             ` Jerome Glisse
2015-05-21 19:31 ` [PATCH 07/36] HMM: add per mirror page table v3 j.glisse
2015-06-25 23:05   ` Mark Hairgrove
2015-06-26 16:43     ` Jerome Glisse
2015-06-27  3:02       ` Mark Hairgrove
2015-06-29 14:50         ` Jerome Glisse
2015-05-21 19:31 ` [PATCH 08/36] HMM: add device page fault support v3 j.glisse
2015-05-21 19:31 ` [PATCH 09/36] HMM: add mm page table iterator helpers j.glisse
2015-05-21 19:31 ` [PATCH 10/36] HMM: use CPU page table during invalidation j.glisse
2015-05-21 19:31 ` [PATCH 11/36] HMM: add discard range helper (to clear and free resources for a range) j.glisse
2015-05-21 19:31 ` [PATCH 12/36] HMM: add dirty range helper (to toggle dirty bit inside mirror page table) j.glisse
2015-05-21 19:31 ` [PATCH 13/36] HMM: DMA map memory on behalf of device driver j.glisse
2015-05-21 19:31 ` [PATCH 14/36] fork: pass the dst vma to copy_page_range() and its sub-functions j.glisse
2015-05-21 19:31 ` [PATCH 15/36] memcg: export get_mem_cgroup_from_mm() j.glisse
2015-05-21 19:31 ` [PATCH 16/36] HMM: add special swap filetype for memory migrated to HMM device memory j.glisse
2015-06-24  7:49   ` Haggai Eran
2015-05-21 19:31 ` [PATCH 17/36] HMM: add new HMM page table flag (valid device memory) j.glisse
2015-05-21 19:31 ` [PATCH 18/36] HMM: add new HMM page table flag (select flag) j.glisse
2015-05-21 19:31 ` [PATCH 19/36] HMM: handle HMM device page table entry on mirror page table fault and update j.glisse
2015-05-21 20:22 ` [PATCH 20/36] HMM: mm add helper to update page table when migrating memory back jglisse
2015-05-21 20:22   ` [PATCH 21/36] HMM: mm add helper to update page table when migrating memory jglisse
2015-05-21 20:22   ` [PATCH 22/36] HMM: add new callback for copying memory from and to device memory jglisse
2015-05-21 20:22   ` [PATCH 23/36] HMM: allow to get pointer to spinlock protecting a directory jglisse
2015-05-21 20:23   ` [PATCH 24/36] HMM: split DMA mapping function in two jglisse
2015-05-21 20:23   ` [PATCH 25/36] HMM: add helpers for migration back to system memory jglisse
2015-05-21 20:23   ` [PATCH 26/36] HMM: fork copy migrated memory into system memory for child process jglisse
2015-05-21 20:23   ` [PATCH 27/36] HMM: CPU page fault on migrated memory jglisse
2015-05-21 20:23   ` [PATCH 28/36] HMM: add mirror fault support for system to device memory migration jglisse
2015-05-21 20:23   ` [PATCH 29/36] IB/mlx5: add a new paramter to __mlx_ib_populated_pas for ODP with HMM jglisse
2015-05-21 20:23   ` [PATCH 30/36] IB/mlx5: add a new paramter to mlx5_ib_update_mtt() " jglisse
2015-05-21 20:23   ` [PATCH 31/36] IB/odp: export rbt_ib_umem_for_each_in_range() jglisse
2015-05-21 20:23   ` [PATCH 32/36] IB/odp/hmm: add new kernel option to use HMM for ODP jglisse
2015-05-21 20:23   ` [PATCH 33/36] IB/odp/hmm: add core infiniband structure and helper for ODP with HMM jglisse
2015-06-24 13:59     ` Haggai Eran
2015-05-21 20:23   ` [PATCH 34/36] IB/mlx5/hmm: add mlx5 HMM device initialization and callback jglisse
2015-05-21 20:23   ` jglisse [this message]
2015-05-21 20:23   ` [PATCH 36/36] IB/mlx5/hmm: enable ODP using HMM jglisse
2015-05-30  3:01 ` HMM (Heterogeneous Memory Management) v8 John Hubbard
2015-05-31  6:56 ` Haggai Eran
