* [PATCH 1/3] vfio/type1: change success value of vaddr_get_pfn()
From: Daniel Jordan @ 2021-02-03 20:47 UTC
To: Alex Williamson, Cornelia Huck
Cc: Jason Gunthorpe, Matthew Wilcox, Pavel Tatashin, Steven Sistare,
kvm, linux-kernel, Daniel Jordan
vaddr_get_pfn() simply returns 0 on success. Have it report the number
of pfns successfully obtained instead, whether from page pinning or
follow_fault_pfn(), which will be used later when pinning is batched.

Change the last check in vfio_pin_pages_remote() for consistency with
the other two.
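For illustration only (not part of this patch), the convention callers
follow after this change is roughly:

    ret = vaddr_get_pfn(mm, vaddr, prot, &pfn);
    if (ret < 0)            /* negative errno, no pfn was obtained */
        return ret;
    /* ret == 1: one pfn obtained, via pinning or follow_fault_pfn() */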
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
drivers/vfio/vfio_iommu_type1.c | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0b4dedaa9128..4d608bc552a4 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -441,6 +441,10 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
return ret;
}
+/*
+ * Returns the positive number of pfns successfully obtained or a negative
+ * error code.
+ */
static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
int prot, unsigned long *pfn)
{
@@ -457,7 +461,6 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
page, NULL, NULL);
if (ret == 1) {
*pfn = page_to_pfn(page[0]);
- ret = 0;
goto done;
}
@@ -471,8 +474,12 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
if (ret == -EAGAIN)
goto retry;
- if (!ret && !is_invalid_reserved_pfn(*pfn))
- ret = -EFAULT;
+ if (!ret) {
+ if (is_invalid_reserved_pfn(*pfn))
+ ret = 1;
+ else
+ ret = -EFAULT;
+ }
}
done:
mmap_read_unlock(mm);
@@ -498,7 +505,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
return -ENODEV;
ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
- if (ret)
+ if (ret < 0)
return ret;
pinned++;
@@ -525,7 +532,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
- if (ret)
+ if (ret < 0)
break;
if (pfn != *pfn_base + pinned ||
@@ -551,7 +558,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
ret = vfio_lock_acct(dma, lock_acct, false);
unpin_out:
- if (ret) {
+ if (ret < 0) {
if (!rsvd) {
for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
put_pfn(pfn, dma->prot);
@@ -595,7 +602,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
return -ENODEV;
ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
- if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
+ if (ret == 1 && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
ret = vfio_lock_acct(dma, 1, true);
if (ret) {
put_pfn(*pfn_base, dma->prot);
--
2.30.0
* [PATCH 2/3] vfio/type1: prepare for batched pinning with struct vfio_batch
From: Daniel Jordan @ 2021-02-03 20:47 UTC
To: Alex Williamson, Cornelia Huck
Cc: Jason Gunthorpe, Matthew Wilcox, Pavel Tatashin, Steven Sistare,
kvm, linux-kernel, Daniel Jordan
Get ready to pin more pages at once with struct vfio_batch, which
represents a batch of pinned pages.
The struct has a fallback page pointer to avoid two unlikely scenarios:
pointlessly allocating a page if disable_hugepages is enabled, and
failing the whole pinning operation if the kernel can't allocate memory.
vaddr_get_pfn() becomes vaddr_get_pfns() to prepare for handling
multiple pages, though for now only one page is stored in the pages
array.
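For illustration only (not part of this patch), a caller's batch
lifecycle looks roughly like the sketch below; the loop condition is a
placeholder:

    struct vfio_batch batch;

    vfio_batch_init(&batch);      /* page-sized pages[] or 1-entry fallback */
    while (/* pages remain to be pinned */)
        vfio_pin_pages_remote(dma, vaddr, npage, &pfn, limit, &batch);
    vfio_batch_fini(&batch);      /* frees pages[] if it was allocated */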
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
drivers/vfio/vfio_iommu_type1.c | 71 +++++++++++++++++++++++++++------
1 file changed, 58 insertions(+), 13 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 4d608bc552a4..c26c1a4697e5 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -97,6 +97,12 @@ struct vfio_dma {
unsigned long *bitmap;
};
+struct vfio_batch {
+ struct page **pages; /* for pin_user_pages_remote */
+ struct page *fallback_page; /* if pages alloc fails */
+ int capacity; /* length of pages array */
+};
+
struct vfio_group {
struct iommu_group *iommu_group;
struct list_head next;
@@ -415,6 +421,31 @@ static int put_pfn(unsigned long pfn, int prot)
return 0;
}
+#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
+
+static void vfio_batch_init(struct vfio_batch *batch)
+{
+ if (unlikely(disable_hugepages))
+ goto fallback;
+
+ batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!batch->pages)
+ goto fallback;
+
+ batch->capacity = VFIO_BATCH_MAX_CAPACITY;
+ return;
+
+fallback:
+ batch->pages = &batch->fallback_page;
+ batch->capacity = 1;
+}
+
+static void vfio_batch_fini(struct vfio_batch *batch)
+{
+ if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
+ free_page((unsigned long)batch->pages);
+}
+
static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
unsigned long vaddr, unsigned long *pfn,
bool write_fault)
@@ -445,10 +476,10 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
* Returns the positive number of pfns successfully obtained or a negative
* error code.
*/
-static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
- int prot, unsigned long *pfn)
+static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
+ long npages, int prot, unsigned long *pfn,
+ struct page **pages)
{
- struct page *page[1];
struct vm_area_struct *vma;
unsigned int flags = 0;
int ret;
@@ -457,10 +488,10 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
flags |= FOLL_WRITE;
mmap_read_lock(mm);
- ret = pin_user_pages_remote(mm, vaddr, 1, flags | FOLL_LONGTERM,
- page, NULL, NULL);
- if (ret == 1) {
- *pfn = page_to_pfn(page[0]);
+ ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
+ pages, NULL, NULL);
+ if (ret > 0) {
+ *pfn = page_to_pfn(pages[0]);
goto done;
}
@@ -493,7 +524,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
*/
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
long npage, unsigned long *pfn_base,
- unsigned long limit)
+ unsigned long limit, struct vfio_batch *batch)
{
unsigned long pfn = 0;
long ret, pinned = 0, lock_acct = 0;
@@ -504,7 +535,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
if (!current->mm)
return -ENODEV;
- ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
+ ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, pfn_base,
+ batch->pages);
if (ret < 0)
return ret;
@@ -531,7 +563,8 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
/* Lock all the consecutive pages from pfn_base */
for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
- ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
+ ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, &pfn,
+ batch->pages);
if (ret < 0)
break;
@@ -594,6 +627,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
unsigned long *pfn_base, bool do_accounting)
{
+ struct page *pages[1];
struct mm_struct *mm;
int ret;
@@ -601,7 +635,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
if (!mm)
return -ENODEV;
- ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
+ ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
if (ret == 1 && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
ret = vfio_lock_acct(dma, 1, true);
if (ret) {
@@ -1246,15 +1280,19 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
{
dma_addr_t iova = dma->iova;
unsigned long vaddr = dma->vaddr;
+ struct vfio_batch batch;
size_t size = map_size;
long npage;
unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
int ret = 0;
+ vfio_batch_init(&batch);
+
while (size) {
/* Pin a contiguous chunk of memory */
npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
- size >> PAGE_SHIFT, &pfn, limit);
+ size >> PAGE_SHIFT, &pfn, limit,
+ &batch);
if (npage <= 0) {
WARN_ON(!npage);
ret = (int)npage;
@@ -1274,6 +1312,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
dma->size += npage << PAGE_SHIFT;
}
+ vfio_batch_fini(&batch);
dma->iommu_mapped = true;
if (ret)
@@ -1432,6 +1471,7 @@ static int vfio_bus_type(struct device *dev, void *data)
static int vfio_iommu_replay(struct vfio_iommu *iommu,
struct vfio_domain *domain)
{
+ struct vfio_batch batch;
struct vfio_domain *d = NULL;
struct rb_node *n;
unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -1442,6 +1482,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
d = list_first_entry(&iommu->domain_list,
struct vfio_domain, next);
+ vfio_batch_init(&batch);
+
n = rb_first(&iommu->dma_list);
for (; n; n = rb_next(n)) {
@@ -1489,7 +1531,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
npage = vfio_pin_pages_remote(dma, vaddr,
n >> PAGE_SHIFT,
- &pfn, limit);
+ &pfn, limit,
+ &batch);
if (npage <= 0) {
WARN_ON(!npage);
ret = (int)npage;
@@ -1522,6 +1565,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
dma->iommu_mapped = true;
}
+ vfio_batch_fini(&batch);
return 0;
unwind:
@@ -1562,6 +1606,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
}
}
+ vfio_batch_fini(&batch);
return ret;
}
--
2.30.0
* [PATCH 3/3] vfio/type1: batch page pinning
From: Daniel Jordan @ 2021-02-03 20:47 UTC
To: Alex Williamson, Cornelia Huck
Cc: Jason Gunthorpe, Matthew Wilcox, Pavel Tatashin, Steven Sistare,
kvm, linux-kernel, Daniel Jordan
Pinning one 4K page at a time is inefficient, so do it in batches of 512
instead. This is just an optimization with no functional change
intended, and in particular the driver still calls iommu_map() with the
largest physically contiguous range possible.
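For reference, 512 falls out of the VFIO_BATCH_MAX_CAPACITY definition
added in the previous patch, assuming 4K pages and 8-byte page pointers:

    #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
    /* = 4096 / 8 = 512 entries, so the pages array fills exactly one page */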
Add two fields in vfio_batch to remember where to start between calls to
vfio_pin_pages_remote(), and use vfio_batch_unpin() to handle remaining
pages in the batch in case of error.
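As a rough picture (illustrative only), size and offset track the
unconsumed tail of the pages array between calls:

    /*
     * pages[]: [ consumed | pages[offset] ... pages[offset + size - 1] | unused ]
     *
     * vfio_batch_unpin() walks that tail with put_pfn() when an error
     * forces the caller to back out.
     */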
qemu pins pages for guests around 8% faster on my test system, a
two-node Broadwell server with 128G memory per node. The qemu process
was bound to one node with its allocations constrained there as well.
                               base              test
  guest                ----------------  ----------------
 mem (GB)   speedup     avg sec (std)     avg sec (std)
        1      7.4%       0.61 (0.00)       0.56 (0.00)
        2      8.3%       0.93 (0.00)       0.85 (0.00)
        4      8.4%       1.46 (0.00)       1.34 (0.00)
        8      8.6%       2.54 (0.01)       2.32 (0.00)
       16      8.3%       4.66 (0.00)       4.27 (0.01)
       32      8.3%       8.94 (0.01)       8.20 (0.01)
       64      8.2%      17.47 (0.01)      16.04 (0.03)
      120      8.5%      32.45 (0.13)      29.69 (0.01)
perf diff confirms less time spent in pup (pin_user_pages). Here are
the top ten functions:
  Baseline  Delta Abs  Symbol

    78.63%     +6.64%  clear_page_erms
     1.50%     -1.50%  __gup_longterm_locked
     1.27%     -0.78%  __get_user_pages
               +0.76%  kvm_zap_rmapp.constprop.0
     0.54%     -0.53%  vmacache_find
     0.55%     -0.51%  get_pfnblock_flags_mask
     0.48%     -0.48%  __get_user_pages_remote
               +0.39%  slot_rmap_walk_next
               +0.32%  vfio_pin_map_dma
               +0.26%  kvm_handle_hva_range
  ...
Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
drivers/vfio/vfio_iommu_type1.c | 133 +++++++++++++++++++++-----------
1 file changed, 88 insertions(+), 45 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index c26c1a4697e5..ac59bfc4e332 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -101,6 +101,8 @@ struct vfio_batch {
struct page **pages; /* for pin_user_pages_remote */
struct page *fallback_page; /* if pages alloc fails */
int capacity; /* length of pages array */
+ int size; /* of batch currently */
+ int offset; /* of next entry in pages */
};
struct vfio_group {
@@ -425,6 +427,9 @@ static int put_pfn(unsigned long pfn, int prot)
static void vfio_batch_init(struct vfio_batch *batch)
{
+ batch->size = 0;
+ batch->offset = 0;
+
if (unlikely(disable_hugepages))
goto fallback;
@@ -440,6 +445,17 @@ static void vfio_batch_init(struct vfio_batch *batch)
batch->capacity = 1;
}
+static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
+{
+ while (batch->size) {
+ unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
+
+ put_pfn(pfn, dma->prot);
+ batch->offset++;
+ batch->size--;
+ }
+}
+
static void vfio_batch_fini(struct vfio_batch *batch)
{
if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
@@ -526,65 +542,88 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
long npage, unsigned long *pfn_base,
unsigned long limit, struct vfio_batch *batch)
{
- unsigned long pfn = 0;
+ unsigned long pfn;
+ struct mm_struct *mm = current->mm;
long ret, pinned = 0, lock_acct = 0;
bool rsvd;
dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
/* This code path is only user initiated */
- if (!current->mm)
+ if (!mm)
return -ENODEV;
- ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, pfn_base,
- batch->pages);
- if (ret < 0)
- return ret;
-
- pinned++;
- rsvd = is_invalid_reserved_pfn(*pfn_base);
-
- /*
- * Reserved pages aren't counted against the user, externally pinned
- * pages are already counted against the user.
- */
- if (!rsvd && !vfio_find_vpfn(dma, iova)) {
- if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
- put_pfn(*pfn_base, dma->prot);
- pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
- limit << PAGE_SHIFT);
- return -ENOMEM;
- }
- lock_acct++;
+ if (batch->size) {
+ /* Leftover pages in batch from an earlier call. */
+ *pfn_base = page_to_pfn(batch->pages[batch->offset]);
+ pfn = *pfn_base;
+ rsvd = is_invalid_reserved_pfn(*pfn_base);
+ } else {
+ *pfn_base = 0;
}
- if (unlikely(disable_hugepages))
- goto out;
+ while (npage) {
+ if (!batch->size) {
+ /* Empty batch, so refill it. */
+ long req_pages = min_t(long, npage, batch->capacity);
- /* Lock all the consecutive pages from pfn_base */
- for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
- pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
- ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, &pfn,
- batch->pages);
- if (ret < 0)
- break;
+ ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
+ &pfn, batch->pages);
+ if (ret < 0)
+ return ret;
- if (pfn != *pfn_base + pinned ||
- rsvd != is_invalid_reserved_pfn(pfn)) {
- put_pfn(pfn, dma->prot);
- break;
+ batch->size = ret;
+ batch->offset = 0;
+
+ if (!*pfn_base) {
+ *pfn_base = pfn;
+ rsvd = is_invalid_reserved_pfn(*pfn_base);
+ }
}
- if (!rsvd && !vfio_find_vpfn(dma, iova)) {
- if (!dma->lock_cap &&
- current->mm->locked_vm + lock_acct + 1 > limit) {
- put_pfn(pfn, dma->prot);
- pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
- __func__, limit << PAGE_SHIFT);
- ret = -ENOMEM;
- goto unpin_out;
+ /*
+ * pfn is preset for the first iteration of this inner loop and
+ * updated at the end to handle a VM_PFNMAP pfn. In that case,
+ * batch->pages isn't valid (there's no struct page), so allow
+ * batch->pages to be touched only when there's more than one
+ * pfn to check, which guarantees the pfns are from a
+ * !VM_PFNMAP vma.
+ */
+ while (true) {
+ if (pfn != *pfn_base + pinned ||
+ rsvd != is_invalid_reserved_pfn(pfn))
+ goto out;
+
+ /*
+ * Reserved pages aren't counted against the user,
+ * externally pinned pages are already counted against
+ * the user.
+ */
+ if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+ if (!dma->lock_cap &&
+ mm->locked_vm + lock_acct + 1 > limit) {
+ pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+ __func__, limit << PAGE_SHIFT);
+ ret = -ENOMEM;
+ goto unpin_out;
+ }
+ lock_acct++;
}
- lock_acct++;
+
+ pinned++;
+ npage--;
+ vaddr += PAGE_SIZE;
+ iova += PAGE_SIZE;
+ batch->offset++;
+ batch->size--;
+
+ if (!batch->size)
+ break;
+
+ pfn = page_to_pfn(batch->pages[batch->offset]);
}
+
+ if (unlikely(disable_hugepages))
+ break;
}
out:
@@ -596,6 +635,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
put_pfn(pfn, dma->prot);
}
+ vfio_batch_unpin(batch, dma);
return ret;
}
@@ -1305,6 +1345,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
if (ret) {
vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
npage, true);
+ vfio_batch_unpin(&batch, dma);
break;
}
@@ -1546,11 +1587,13 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
ret = iommu_map(domain->domain, iova, phys,
size, dma->prot | domain->prot);
if (ret) {
- if (!dma->iommu_mapped)
+ if (!dma->iommu_mapped) {
vfio_unpin_pages_remote(dma, iova,
phys >> PAGE_SHIFT,
size >> PAGE_SHIFT,
true);
+ vfio_batch_unpin(&batch, dma);
+ }
goto unwind;
}
--
2.30.0
* Re: [PATCH 3/3] vfio/type1: batch page pinning
From: Alex Williamson @ 2021-02-18 19:01 UTC
To: Daniel Jordan
Cc: Cornelia Huck, Jason Gunthorpe, Matthew Wilcox, Pavel Tatashin,
Steven Sistare, kvm, linux-kernel
On Wed, 3 Feb 2021 15:47:56 -0500
Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
> Pinning one 4K page at a time is inefficient, so do it in batches of 512
> instead. This is just an optimization with no functional change
> intended, and in particular the driver still calls iommu_map() with the
> largest physically contiguous range possible.
>
> Add two fields in vfio_batch to remember where to start between calls to
> vfio_pin_pages_remote(), and use vfio_batch_unpin() to handle remaining
> pages in the batch in case of error.
>
> qemu pins pages for guests around 8% faster on my test system, a
> two-node Broadwell server with 128G memory per node. The qemu process
> was bound to one node with its allocations constrained there as well.
>
>                                base              test
>   guest                ----------------  ----------------
>  mem (GB)   speedup     avg sec (std)     avg sec (std)
>         1      7.4%       0.61 (0.00)       0.56 (0.00)
>         2      8.3%       0.93 (0.00)       0.85 (0.00)
>         4      8.4%       1.46 (0.00)       1.34 (0.00)
>         8      8.6%       2.54 (0.01)       2.32 (0.00)
>        16      8.3%       4.66 (0.00)       4.27 (0.01)
>        32      8.3%       8.94 (0.01)       8.20 (0.01)
>        64      8.2%      17.47 (0.01)      16.04 (0.03)
>       120      8.5%      32.45 (0.13)      29.69 (0.01)
>
> perf diff confirms less time spent in pup. Here are the top ten
> functions:
>
>   Baseline  Delta Abs  Symbol
>
>     78.63%     +6.64%  clear_page_erms
>      1.50%     -1.50%  __gup_longterm_locked
>      1.27%     -0.78%  __get_user_pages
>                +0.76%  kvm_zap_rmapp.constprop.0
>      0.54%     -0.53%  vmacache_find
>      0.55%     -0.51%  get_pfnblock_flags_mask
>      0.48%     -0.48%  __get_user_pages_remote
>                +0.39%  slot_rmap_walk_next
>                +0.32%  vfio_pin_map_dma
>                +0.26%  kvm_handle_hva_range
> ...
>
> Suggested-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
> ---
> drivers/vfio/vfio_iommu_type1.c | 133 +++++++++++++++++++++-----------
> 1 file changed, 88 insertions(+), 45 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index c26c1a4697e5..ac59bfc4e332 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -101,6 +101,8 @@ struct vfio_batch {
> struct page **pages; /* for pin_user_pages_remote */
> struct page *fallback_page; /* if pages alloc fails */
> int capacity; /* length of pages array */
> + int size; /* of batch currently */
> + int offset; /* of next entry in pages */
> };
>
> struct vfio_group {
> @@ -425,6 +427,9 @@ static int put_pfn(unsigned long pfn, int prot)
>
> static void vfio_batch_init(struct vfio_batch *batch)
> {
> + batch->size = 0;
> + batch->offset = 0;
> +
> if (unlikely(disable_hugepages))
> goto fallback;
>
> @@ -440,6 +445,17 @@ static void vfio_batch_init(struct vfio_batch *batch)
> batch->capacity = 1;
> }
>
> +static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
> +{
> + while (batch->size) {
> + unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
> +
> + put_pfn(pfn, dma->prot);
> + batch->offset++;
> + batch->size--;
> + }
> +}
> +
> static void vfio_batch_fini(struct vfio_batch *batch)
> {
> if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
> @@ -526,65 +542,88 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> long npage, unsigned long *pfn_base,
> unsigned long limit, struct vfio_batch *batch)
> {
> - unsigned long pfn = 0;
> + unsigned long pfn;
> + struct mm_struct *mm = current->mm;
> long ret, pinned = 0, lock_acct = 0;
> bool rsvd;
> dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
>
> /* This code path is only user initiated */
> - if (!current->mm)
> + if (!mm)
> return -ENODEV;
>
> - ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, pfn_base,
> - batch->pages);
> - if (ret < 0)
> - return ret;
> -
> - pinned++;
> - rsvd = is_invalid_reserved_pfn(*pfn_base);
> -
> - /*
> - * Reserved pages aren't counted against the user, externally pinned
> - * pages are already counted against the user.
> - */
> - if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> - if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
> - put_pfn(*pfn_base, dma->prot);
> - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> - limit << PAGE_SHIFT);
> - return -ENOMEM;
> - }
> - lock_acct++;
> + if (batch->size) {
> + /* Leftover pages in batch from an earlier call. */
> + *pfn_base = page_to_pfn(batch->pages[batch->offset]);
> + pfn = *pfn_base;
> + rsvd = is_invalid_reserved_pfn(*pfn_base);
> + } else {
> + *pfn_base = 0;
> }
>
> - if (unlikely(disable_hugepages))
> - goto out;
> + while (npage) {
> + if (!batch->size) {
> + /* Empty batch, so refill it. */
> + long req_pages = min_t(long, npage, batch->capacity);
>
> - /* Lock all the consecutive pages from pfn_base */
> - for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> - pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> - ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, &pfn,
> - batch->pages);
> - if (ret < 0)
> - break;
> + ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
> + &pfn, batch->pages);
> + if (ret < 0)
> + return ret;
This might not be the first batch we fill; I think this needs to unwind
rather than return directly. Series looks good otherwise. Thanks,
Alex
>
> - if (pfn != *pfn_base + pinned ||
> - rsvd != is_invalid_reserved_pfn(pfn)) {
> - put_pfn(pfn, dma->prot);
> - break;
> + batch->size = ret;
> + batch->offset = 0;
> +
> + if (!*pfn_base) {
> + *pfn_base = pfn;
> + rsvd = is_invalid_reserved_pfn(*pfn_base);
> + }
> }
>
> - if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> - if (!dma->lock_cap &&
> - current->mm->locked_vm + lock_acct + 1 > limit) {
> - put_pfn(pfn, dma->prot);
> - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> - __func__, limit << PAGE_SHIFT);
> - ret = -ENOMEM;
> - goto unpin_out;
> + /*
> + * pfn is preset for the first iteration of this inner loop and
> + * updated at the end to handle a VM_PFNMAP pfn. In that case,
> + * batch->pages isn't valid (there's no struct page), so allow
> + * batch->pages to be touched only when there's more than one
> + * pfn to check, which guarantees the pfns are from a
> + * !VM_PFNMAP vma.
> + */
> + while (true) {
> + if (pfn != *pfn_base + pinned ||
> + rsvd != is_invalid_reserved_pfn(pfn))
> + goto out;
> +
> + /*
> + * Reserved pages aren't counted against the user,
> + * externally pinned pages are already counted against
> + * the user.
> + */
> + if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> + if (!dma->lock_cap &&
> + mm->locked_vm + lock_acct + 1 > limit) {
> + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> + __func__, limit << PAGE_SHIFT);
> + ret = -ENOMEM;
> + goto unpin_out;
> + }
> + lock_acct++;
> }
> - lock_acct++;
> +
> + pinned++;
> + npage--;
> + vaddr += PAGE_SIZE;
> + iova += PAGE_SIZE;
> + batch->offset++;
> + batch->size--;
> +
> + if (!batch->size)
> + break;
> +
> + pfn = page_to_pfn(batch->pages[batch->offset]);
> }
> +
> + if (unlikely(disable_hugepages))
> + break;
> }
>
> out:
> @@ -596,6 +635,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> put_pfn(pfn, dma->prot);
> }
> + vfio_batch_unpin(batch, dma);
>
> return ret;
> }
> @@ -1305,6 +1345,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> if (ret) {
> vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
> npage, true);
> + vfio_batch_unpin(&batch, dma);
> break;
> }
>
> @@ -1546,11 +1587,13 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
> ret = iommu_map(domain->domain, iova, phys,
> size, dma->prot | domain->prot);
> if (ret) {
> - if (!dma->iommu_mapped)
> + if (!dma->iommu_mapped) {
> vfio_unpin_pages_remote(dma, iova,
> phys >> PAGE_SHIFT,
> size >> PAGE_SHIFT,
> true);
> + vfio_batch_unpin(&batch, dma);
> + }
> goto unwind;
> }
>
* Re: [PATCH 3/3] vfio/type1: batch page pinning
From: Daniel Jordan @ 2021-02-19 1:50 UTC
To: Alex Williamson
Cc: Cornelia Huck, Jason Gunthorpe, Matthew Wilcox, Pavel Tatashin,
Steven Sistare, kvm, linux-kernel
Alex Williamson <alex.williamson@redhat.com> writes:
> This might not be the first batch we fill, I think this needs to unwind
> rather than direct return.
So it does, well spotted. And it's the same thing with the ENODEV case
farther up.
> Series looks good otherwise.
Thanks for going through it!
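For context, a minimal sketch of the fix being discussed (illustrative
only; the actual follow-up may differ): route the failed refill through
the existing unwind path instead of returning directly, so both the
already-pinned range and any leftover batch entries are released:

    ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
                         &pfn, batch->pages);
    if (ret < 0)
        goto unpin_out;  /* was "return ret"; unpin_out drops the pinned
                          * range via put_pfn() and the leftover batch
                          * via vfio_batch_unpin() */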