From: Alexey Kardashevskiy <aik@ozlabs.ru>
To: linuxppc-dev@lists.ozlabs.org
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>,
kvm-ppc@vger.kernel.org,
David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH kernel] KVM: PPC: Allocate guest TCEs on demand too
Date: Wed, 27 Feb 2019 19:42:35 +1100 [thread overview]
Message-ID: <20190227084235.22795-1-aik@ozlabs.ru> (raw)
We already allocate hardware TCE tables in multiple levels and skip
intermediate levels when we can, now it is a turn of the KVM TCE tables.
Thankfully these are allocated already in 2 levels.
This moves the table's last level allocation from the creating helper to
kvmppc_tce_put() and kvm_spapr_tce_fault().
This adds kvmppc_rm_ioba_validate() to do an additional test if
the consequent kvmppc_tce_put() needs a page which has not been allocated;
if this is the case, we bail out to virtual mode handlers.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
For NVLink2 passthrough guests with 128TiB DMA windows (when we push GPU RAM
above the PCI MMIO window in the guest) and very fragmented system RAM
the difference is about 16GiB of RAM before and after this patch.
---
arch/powerpc/kvm/book3s_64_vio.c | 21 ++++------
arch/powerpc/kvm/book3s_64_vio_hv.c | 62 ++++++++++++++++++++++++++---
2 files changed, 64 insertions(+), 19 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index f02b049..281b56b 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head)
unsigned long i, npages = kvmppc_tce_pages(stt->size);
for (i = 0; i < npages; i++)
- __free_page(stt->pages[i]);
+ if (stt->pages[i])
+ __free_page(stt->pages[i]);
kfree(stt);
}
@@ -241,6 +242,12 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
return VM_FAULT_SIGBUS;
+ if (!stt->pages[vmf->pgoff]) {
+ stt->pages[vmf->pgoff] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!stt->pages[vmf->pgoff])
+ return VM_FAULT_OOM;
+ }
+
page = stt->pages[vmf->pgoff];
get_page(page);
vmf->page = page;
@@ -296,7 +303,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvmppc_spapr_tce_table *siter;
unsigned long npages, size = args->size;
int ret = -ENOMEM;
- int i;
if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
@@ -320,12 +326,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
stt->kvm = kvm;
INIT_LIST_HEAD_RCU(&stt->iommu_tables);
- for (i = 0; i < npages; i++) {
- stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!stt->pages[i])
- goto fail;
- }
-
mutex_lock(&kvm->lock);
/* Check this LIOBN hasn't been previously allocated */
@@ -352,11 +352,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
if (ret >= 0)
return ret;
- fail:
- for (i = 0; i < npages; i++)
- if (stt->pages[i])
- __free_page(stt->pages[i]);
-
kfree(stt);
fail_acct:
kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 2206bc7..fbb920d 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -158,23 +158,69 @@ static u64 *kvmppc_page_address(struct page *page)
return (u64 *) page_address(page);
}
+/*
+ * TCEs pages are allocated in kvmppc_tce_put() which won't be able to do so
+ * in real mode.
+ * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCEs page is
+ * allocated or not required (when clearing a tce entry).
+ */
+static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+ unsigned long ioba, unsigned long npages, bool clearing)
+{
+ unsigned long i, sttpage, sttpages;
+ unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
+
+ if (ret)
+ return ret;
+ /*
+ * clearing==true says kvmppc_tce_put won't be allocating pages
+ * for empty tces.
+ */
+ if (clearing)
+ return H_SUCCESS;
+
+ sttpage = ((ioba >> stt->page_shift) - stt->offset) / TCES_PER_PAGE;
+ sttpages = (npages + TCES_PER_PAGE - 1) / TCES_PER_PAGE;
+ for (i = sttpage; i < sttpage + sttpages; ++i)
+ if (!stt->pages[i])
+ return H_TOO_HARD;
+
+ return H_SUCCESS;
+}
+
/*
* Handles TCE requests for emulated devices.
* Puts guest TCE values to the table and expects user space to convert them.
* Called in both real and virtual modes.
* Cannot fail so kvmppc_tce_validate must be called before it.
*
- * WARNING: This will be called in real-mode on HV KVM and virtual
- * mode on PR KVM
+ * WARNING: This will be called in real-mode on HV HPT KVM and virtual
+ * mode on PR KVM or HV radix KVM
*/
void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
unsigned long idx, unsigned long tce)
{
struct page *page;
u64 *tbl;
+ unsigned long sttpage;
idx -= stt->offset;
- page = stt->pages[idx / TCES_PER_PAGE];
+ sttpage = idx / TCES_PER_PAGE;
+ page = stt->pages[sttpage];
+
+ if (!page) {
+ /* We allow any TCE, not just with read|write permissions */
+ if (!tce)
+ return;
+ /*
+ * We must not end up here in real mode,
+ * kvmppc_rm_ioba_validate() takes care of this.
+ */
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (WARN_ON_ONCE(!page))
+ return;
+ stt->pages[sttpage] = page;
+ }
tbl = kvmppc_page_address(page);
tbl[idx % TCES_PER_PAGE] = tce;
@@ -381,7 +427,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
if (!stt)
return H_TOO_HARD;
- ret = kvmppc_ioba_validate(stt, ioba, 1);
+ ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0);
if (ret != H_SUCCESS)
return ret;
@@ -480,7 +526,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
if (tce_list & (SZ_4K - 1))
return H_PARAMETER;
- ret = kvmppc_ioba_validate(stt, ioba, npages);
+ ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
if (ret != H_SUCCESS)
return ret;
@@ -583,7 +629,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
if (!stt)
return H_TOO_HARD;
- ret = kvmppc_ioba_validate(stt, ioba, npages);
+ ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
if (ret != H_SUCCESS)
return ret;
@@ -635,6 +681,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
idx = (ioba >> stt->page_shift) - stt->offset;
page = stt->pages[idx / TCES_PER_PAGE];
+ if (!page) {
+ vcpu->arch.regs.gpr[4] = 0;
+ return H_SUCCESS;
+ }
tbl = (u64 *)page_address(page);
vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];
--
2.17.1
next reply other threads:[~2019-02-27 8:44 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-02-27 8:42 Alexey Kardashevskiy [this message]
2019-02-28 0:15 ` [PATCH kernel] KVM: PPC: Allocate guest TCEs on demand too David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190227084235.22795-1-aik@ozlabs.ru \
--to=aik@ozlabs.ru \
--cc=david@gibson.dropbear.id.au \
--cc=kvm-ppc@vger.kernel.org \
--cc=linuxppc-dev@lists.ozlabs.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).