* [PATCH v2] mmap_vmcore: skip non-ram pages reported by hypervisors
@ 2014-07-09 11:40 Vitaly Kuznetsov
2014-07-10 14:48 ` Vivek Goyal
2014-07-10 14:52 ` Vivek Goyal
0 siblings, 2 replies; 3+ messages in thread
From: Vitaly Kuznetsov @ 2014-07-09 11:40 UTC (permalink / raw)
To: Andrew Morton, Michael Holzheu, Vivek Goyal, David Vrabel
Cc: Andrew Jones, xen-devel, linux-kernel
We have a special check in read_vmcore() handler to check if the page was
reported as ram or not by the hypervisor (pfn_is_ram()). However, when
vmcore is read with mmap() no such check is performed. That can lead to
unpredictable results, e.g. when running a Xen PVHVM guest, memcpy() after
mmap() on /proc/vmcore will hang processing HVMMEM_mmio_dm pages, creating
enormous load in both DomU and Dom0.
Fix the issue by mapping each non-ram page to the zero page. Keep the direct
path with remap_oldmem_pfn_range() to avoid looping through all pages on
bare metal.
The issue can also be solved by overriding remap_oldmem_pfn_range() in
xen-specific code, which is what remap_oldmem_pfn_range() was designed for.
That, however, would involve a non-obvious xen code path for all x86 builds
with CONFIG_XEN_PVHVM=y and would prevent all other hypervisor-specific
code on x86 arch from doing the same override.
Changes from v1:
- comment style changes
- change remap_oldmem_pfn_checked() interface to more closely match the
remap_oldmem_pfn() interface
- preserve formal parameters within the loop, make the loop conditions
easier to understand
- use my_zero_pfn() for the zero page
- return remapped length instead of new offset
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Andrew Jones <drjones@redhat.com>
---
fs/proc/vmcore.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 84 insertions(+), 5 deletions(-)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 382aa89..5cd13f8 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -328,6 +328,67 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
* virtually contiguous user-space in ELF layout.
*/
#ifdef CONFIG_MMU
+/*
+ * remap_oldmem_pfn_checked - do remap_oldmem_pfn replacing all pages reported
+ * as not being ram with the zero page.
+ *
+ * @vma: vm_area_struct describing requested mapping
+ * @vma_addr: start remapping from
+ * @pfn: page frame number to start remapping to
+ * @size: remapping size
+ *
+ * Returns the remapped length. If no errors were hit during the remapping it
+ * should be equal to size.
+ */
+static u64 remap_oldmem_pfn_checked(struct vm_area_struct *vma,
+ unsigned long vma_addr, unsigned long pfn,
+ unsigned long size)
+{
+ size_t map_size;
+ unsigned long pos_start, pos_end, pos;
+ unsigned long zeropage_pfn = my_zero_pfn(0);
+ u64 len = 0;
+
+ pos_start = pfn;
+ pos_end = pfn + (size >> PAGE_SHIFT);
+
+ for (pos = pos_start; pos < pos_end; ++pos) {
+ if (!pfn_is_ram(pos)) {
+ /* We hit a page which is not ram. Remap the continuous
+ * region between pos_start and pos-1 and replace
+ * the non-ram page at pos with the zero page.
+ */
+ if (pos > pos_start) {
+ /* Remap continuous region */
+ map_size = (pos - pos_start) << PAGE_SHIFT;
+ if (remap_oldmem_pfn_range(vma, vma_addr + len,
+ pos_start, map_size,
+ vma->vm_page_prot))
+ return len;
+ len += map_size;
+ }
+ /* Remap the zero page */
+ if (remap_oldmem_pfn_range(vma, vma_addr + len,
+ zeropage_pfn,
+ PAGE_SIZE,
+ vma->vm_page_prot))
+ return len;
+ len += PAGE_SIZE;
+ pos_start = pos + 1;
+ }
+ }
+ if (pos > pos_start) {
+ /* Remap the rest */
+ map_size = (pos - pos_start) << PAGE_SHIFT;
+ if (remap_oldmem_pfn_range(vma, vma_addr + len, pos_start,
+ map_size,
+ vma->vm_page_prot))
+ return len;
+ len += map_size;
+ }
+ return len;
+}
+
static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
{
size_t size = vma->vm_end - vma->vm_start;
@@ -387,13 +448,31 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
tsz = min_t(size_t, m->offset + m->size - start, size);
paddr = m->paddr + start - m->offset;
- if (remap_oldmem_pfn_range(vma, vma->vm_start + len,
- paddr >> PAGE_SHIFT, tsz,
- vma->vm_page_prot))
- goto fail;
+
+ /* Check if oldmem_pfn_is_ram was registered to avoid
+ looping over all pages without a reason. */
+ if (oldmem_pfn_is_ram) {
+ u64 original_len;
+ unsigned long pfn, vma_addr;
+
+ pfn = paddr >> PAGE_SHIFT;
+ vma_addr = vma->vm_start + len;
+ original_len = len;
+ len += remap_oldmem_pfn_checked(vma, vma_addr,
+ pfn, tsz);
+ if (len != original_len + tsz)
+ goto fail;
+ } else {
+ if (remap_oldmem_pfn_range(vma,
+ vma->vm_start + len,
+ paddr >> PAGE_SHIFT,
+ tsz,
+ vma->vm_page_prot))
+ goto fail;
+ len += tsz;
+ }
size -= tsz;
start += tsz;
- len += tsz;
if (size == 0)
return 0;
--
1.9.3
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH v2] mmap_vmcore: skip non-ram pages reported by hypervisors
2014-07-09 11:40 [PATCH v2] mmap_vmcore: skip non-ram pages reported by hypervisors Vitaly Kuznetsov
@ 2014-07-10 14:48 ` Vivek Goyal
2014-07-10 14:52 ` Vivek Goyal
1 sibling, 0 replies; 3+ messages in thread
From: Vivek Goyal @ 2014-07-10 14:48 UTC (permalink / raw)
To: Vitaly Kuznetsov
Cc: Andrew Morton, Michael Holzheu, David Vrabel, Andrew Jones,
xen-devel, linux-kernel
On Wed, Jul 09, 2014 at 01:40:22PM +0200, Vitaly Kuznetsov wrote:
> We have a special check in read_vmcore() handler to check if the page was
> reported as ram or not by the hypervisor (pfn_is_ram()). However, when
> vmcore is read with mmap() no such check is performed. That can lead to
> unpredictable results, e.g. when running Xen PVHVM guest memcpy() after
> mmap() on /proc/vmcore will hang processing HVMMEM_mmio_dm pages creating
> enormous load in both DomU and Dom0.
>
> Fix the issue by mapping each non-ram page to the zero page. Keep direct
> path with remap_oldmem_pfn_range() to avoid looping through all pages on
> bare metal.
>
> The issue can also be solved by overriding remap_oldmem_pfn_range() in
> xen-specific code, as remap_oldmem_pfn_range() was been designed for.
> That, however, would involve non-obvious xen code path for all x86 builds
> with CONFIG_XEN_PVHVM=y and would prevent all other hypervisor-specific
> code on x86 arch from doing the same override.
>
> Changes from v1:
> - comment style changes
> - change remap_oldmem_pfn_checked() interface to closer match the
> remap_oldmem_pfn() interface
> - preserve formal parameters within the loop, make the loop conditions
> easier to understand
> - use my_zero_pfn() for the zero page
> - return remapped length instead of new offset
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> Reviewed-by: Andrew Jones <drjones@redhat.com>
> ---
> fs/proc/vmcore.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 84 insertions(+), 5 deletions(-)
>
> diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
> index 382aa89..5cd13f8 100644
> --- a/fs/proc/vmcore.c
> +++ b/fs/proc/vmcore.c
> @@ -328,6 +328,67 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
> * virtually contiguous user-space in ELF layout.
> */
> #ifdef CONFIG_MMU
> +/*
> + * remap_oldmem_pfn_checked - do remap_oldmem_pfn replacing all pages reported
> + * as not being ram with the zero page.
> + *
> + * @vma: vm_area_struct describing requested mapping
> + * @vma_addr: start remapping from
> + * @pfn: page frame number to start remapping to
> + * @size: remapping size
> + *
> + * Returns the remapped length. If no errors were hit during the remapping it
> + * should be equal to size.
> + */
Hi,
So we are returning the length of the successful mapping so that it can be
unmapped in the outer loop in case of failure? If yes, can't we handle the
failure in this function itself instead of relying on the outer function?
IOW, why not unmap whatever we have mapped in this function and return
the error code? That sounds simpler to me.
> +static u64 remap_oldmem_pfn_checked(struct vm_area_struct *vma,
> + unsigned long vma_addr, unsigned long pfn,
> + unsigned long size)
> +{
> + size_t map_size;
> + unsigned long pos_start, pos_end, pos;
> + unsigned long zeropage_pfn = my_zero_pfn(0);
> + u64 len = 0;
> +
> + pos_start = pfn;
> + pos_end = pfn + (size >> PAGE_SHIFT);
> +
> + for (pos = pos_start; pos < pos_end; ++pos) {
> + if (!pfn_is_ram(pos)) {
> + /* We hit a page which is not ram. Remap the continuous
> + * region between pos_start and pos-1 and replace
> + * the non-ram page at pos with the zero page.
> + */
> + if (pos > pos_start) {
> + /* Remap continuous region */
> + map_size = (pos - pos_start) << PAGE_SHIFT;
> + if (remap_oldmem_pfn_range(vma, vma_addr + len,
> + pos_start, map_size,
> + vma->vm_page_prot))
> + return len;
> + len += map_size;
> + }
> + /* Remap the zero page */
> + if (remap_oldmem_pfn_range(vma, vma_addr + len,
> + zeropage_pfn,
> + PAGE_SIZE,
> + vma->vm_page_prot))
> + return len;
> + len += PAGE_SIZE;
> + pos_start = pos + 1;
> + }
> + }
> + if (pos > pos_start) {
> + /* Remap the rest */
> + map_size = (pos - pos_start) << PAGE_SHIFT;
> + if (remap_oldmem_pfn_range(vma, vma_addr + len, pos_start,
> + map_size,
> + vma->vm_page_prot))
> + return len;
> + len += map_size;
> + }
> + return len;
> +}
> +
> static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
> {
> size_t size = vma->vm_end - vma->vm_start;
> @@ -387,13 +448,31 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
>
> tsz = min_t(size_t, m->offset + m->size - start, size);
> paddr = m->paddr + start - m->offset;
> - if (remap_oldmem_pfn_range(vma, vma->vm_start + len,
> - paddr >> PAGE_SHIFT, tsz,
> - vma->vm_page_prot))
> - goto fail;
> +
> + /* Check if oldmem_pfn_is_ram was registered to avoid
> + looping over all pages without a reason. */
> + if (oldmem_pfn_is_ram) {
> + u64 original_len;
> + unsigned long pfn, vma_addr;
> +
> + pfn = paddr >> PAGE_SHIFT;
> + vma_addr = vma->vm_start + len;
> + original_len = len;
> + len += remap_oldmem_pfn_checked(vma, vma_addr,
How about keeping the argument list the same as remap_oldmem_pfn_range()? It
just becomes easier to read.
> + pfn, tsz);
> + if (len != original_len + tsz)
> + goto fail;
If remap_oldmem_pfn_checked() can take care of internal error handling
then this should simplify too.
> + } else {
> + if (remap_oldmem_pfn_range(vma,
> + vma->vm_start + len,
> + paddr >> PAGE_SHIFT,
> + tsz,
> + vma->vm_page_prot))
> + goto fail;
> + len += tsz;
> + }
Can we move this code inside another wrapper function? This is looking
a little ugly inline.
Maybe something like:
vmcore_remap_pfn_range() {
if (oldmem_pfn_is_ram)
remap_oldmem_pfn_checked()
else
remap_pfn_range();
}
This will also reduce the depth of braces and you might not have to use
5 lines to pass arguments to remap_oldmem_pfn_range().
Thanks
Vivek
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH v2] mmap_vmcore: skip non-ram pages reported by hypervisors
2014-07-09 11:40 [PATCH v2] mmap_vmcore: skip non-ram pages reported by hypervisors Vitaly Kuznetsov
2014-07-10 14:48 ` Vivek Goyal
@ 2014-07-10 14:52 ` Vivek Goyal
1 sibling, 0 replies; 3+ messages in thread
From: Vivek Goyal @ 2014-07-10 14:52 UTC (permalink / raw)
To: Vitaly Kuznetsov
Cc: Andrew Morton, Michael Holzheu, David Vrabel, Andrew Jones,
xen-devel, linux-kernel, HATAYAMA Daisuke
On Wed, Jul 09, 2014 at 01:40:22PM +0200, Vitaly Kuznetsov wrote:
> We have a special check in read_vmcore() handler to check if the page was
> reported as ram or not by the hypervisor (pfn_is_ram()). However, when
> vmcore is read with mmap() no such check is performed. That can lead to
> unpredictable results, e.g. when running Xen PVHVM guest memcpy() after
> mmap() on /proc/vmcore will hang processing HVMMEM_mmio_dm pages creating
> enormous load in both DomU and Dom0.
>
> Fix the issue by mapping each non-ram page to the zero page. Keep direct
> path with remap_oldmem_pfn_range() to avoid looping through all pages on
> bare metal.
>
> The issue can also be solved by overriding remap_oldmem_pfn_range() in
> xen-specific code, as remap_oldmem_pfn_range() was been designed for.
> That, however, would involve non-obvious xen code path for all x86 builds
> with CONFIG_XEN_PVHVM=y and would prevent all other hypervisor-specific
> code on x86 arch from doing the same override.
>
> Changes from v1:
> - comment style changes
> - change remap_oldmem_pfn_checked() interface to closer match the
> remap_oldmem_pfn() interface
> - preserve formal parameters within the loop, make the loop conditions
> easier to understand
> - use my_zero_pfn() for the zero page
> - return remapped length instead of new offset
>
> Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> Reviewed-by: Andrew Jones <drjones@redhat.com>
Also, please cc HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com> in the next
posting. He did the mmap() work and might be interested in having a
look at the patch.
Thanks
Vivek
> ---
> fs/proc/vmcore.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 84 insertions(+), 5 deletions(-)
>
> diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
> index 382aa89..5cd13f8 100644
> --- a/fs/proc/vmcore.c
> +++ b/fs/proc/vmcore.c
> @@ -328,6 +328,67 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
> * virtually contiguous user-space in ELF layout.
> */
> #ifdef CONFIG_MMU
> +/*
> + * remap_oldmem_pfn_checked - do remap_oldmem_pfn replacing all pages reported
> + * as not being ram with the zero page.
> + *
> + * @vma: vm_area_struct describing requested mapping
> + * @vma_addr: start remapping from
> + * @pfn: page frame number to start remapping to
> + * @size: remapping size
> + *
> + * Returns the remapped length. If no errors were hit during the remapping it
> + * should be equal to size.
> + */
> +static u64 remap_oldmem_pfn_checked(struct vm_area_struct *vma,
> + unsigned long vma_addr, unsigned long pfn,
> + unsigned long size)
> +{
> + size_t map_size;
> + unsigned long pos_start, pos_end, pos;
> + unsigned long zeropage_pfn = my_zero_pfn(0);
> + u64 len = 0;
> +
> + pos_start = pfn;
> + pos_end = pfn + (size >> PAGE_SHIFT);
> +
> + for (pos = pos_start; pos < pos_end; ++pos) {
> + if (!pfn_is_ram(pos)) {
> + /* We hit a page which is not ram. Remap the continuous
> + * region between pos_start and pos-1 and replace
> + * the non-ram page at pos with the zero page.
> + */
> + if (pos > pos_start) {
> + /* Remap continuous region */
> + map_size = (pos - pos_start) << PAGE_SHIFT;
> + if (remap_oldmem_pfn_range(vma, vma_addr + len,
> + pos_start, map_size,
> + vma->vm_page_prot))
> + return len;
> + len += map_size;
> + }
> + /* Remap the zero page */
> + if (remap_oldmem_pfn_range(vma, vma_addr + len,
> + zeropage_pfn,
> + PAGE_SIZE,
> + vma->vm_page_prot))
> + return len;
> + len += PAGE_SIZE;
> + pos_start = pos + 1;
> + }
> + }
> + if (pos > pos_start) {
> + /* Remap the rest */
> + map_size = (pos - pos_start) << PAGE_SHIFT;
> + if (remap_oldmem_pfn_range(vma, vma_addr + len, pos_start,
> + map_size,
> + vma->vm_page_prot))
> + return len;
> + len += map_size;
> + }
> + return len;
> +}
> +
> static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
> {
> size_t size = vma->vm_end - vma->vm_start;
> @@ -387,13 +448,31 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
>
> tsz = min_t(size_t, m->offset + m->size - start, size);
> paddr = m->paddr + start - m->offset;
> - if (remap_oldmem_pfn_range(vma, vma->vm_start + len,
> - paddr >> PAGE_SHIFT, tsz,
> - vma->vm_page_prot))
> - goto fail;
> +
> + /* Check if oldmem_pfn_is_ram was registered to avoid
> + looping over all pages without a reason. */
> + if (oldmem_pfn_is_ram) {
> + u64 original_len;
> + unsigned long pfn, vma_addr;
> +
> + pfn = paddr >> PAGE_SHIFT;
> + vma_addr = vma->vm_start + len;
> + original_len = len;
> + len += remap_oldmem_pfn_checked(vma, vma_addr,
> + pfn, tsz);
> + if (len != original_len + tsz)
> + goto fail;
> + } else {
> + if (remap_oldmem_pfn_range(vma,
> + vma->vm_start + len,
> + paddr >> PAGE_SHIFT,
> + tsz,
> + vma->vm_page_prot))
> + goto fail;
> + len += tsz;
> + }
> size -= tsz;
> start += tsz;
> - len += tsz;
>
> if (size == 0)
> return 0;
> --
> 1.9.3
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2014-07-10 14:52 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-09 11:40 [PATCH v2] mmap_vmcore: skip non-ram pages reported by hypervisors Vitaly Kuznetsov
2014-07-10 14:48 ` Vivek Goyal
2014-07-10 14:52 ` Vivek Goyal
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox