All of lore.kernel.org
 help / color / mirror / Atom feed
From: Stephen Brennan <stephen.s.brennan@oracle.com>
To: Baoquan He <bhe@redhat.com>, linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org, akpm@linux-foundation.org, urezki@gmail.com,
	hch@infradead.org, Baoquan He <bhe@redhat.com>
Subject: Re: [PATCH RFC 3/3] mm/vmalloc.c: allow vread() to read out vm_map_ram areas
Date: Wed, 09 Nov 2022 16:59:38 -0800	[thread overview]
Message-ID: <87v8nnfwud.fsf@oracle.com> (raw)
In-Reply-To: <20221109033535.269229-4-bhe@redhat.com>

Baoquan He <bhe@redhat.com> writes:
> Currently, vread() can read out vmalloc areas which is associated with
> a vm_struct. While this doesn't work for areas created by vm_map_ram()
> interface because it doesn't allocate a vm_struct. Then in vread(),
> these areas will be skipped.
>
> Here, add a new function vb_vread() to read out areas managed by
> vmap_block specifically. Then recognize vm_map_ram areas via vmap->flags
> and handle  them respectively.
>
> Stephen Brennan <stephen.s.brennan@oracle.com>
> Signed-off-by: Baoquan He <bhe@redhat.com>
> Link: https://lore.kernel.org/all/87ilk6gos2.fsf@oracle.com/T/#u
> ---
>  mm/vmalloc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 51 insertions(+), 6 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 41d82dc07e13..5a8d5659bfb0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -3518,6 +3518,46 @@ static int aligned_vread(char *buf, char *addr, unsigned long count)
>  	return copied;
>  }
>  
> +static void vb_vread(char *buf, char *addr, int count)
> +{
> +	char *start;
> +	struct vmap_block *vb;
> +	unsigned long offset;
> +	unsigned int rs, re, n;
> +
> +	offset = ((unsigned long)addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
> +	vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr));
> +
> +	spin_lock(&vb->lock);
> +	if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
> +		spin_unlock(&vb->lock);
> +		memset(buf, 0, count);
> +		return;
> +	}
> +	for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
> +		if (!count)
> +			break;
> +		start = vmap_block_vaddr(vb->va->va_start, rs);
> +		if (addr < start) {
> +			if (count == 0)
> +				break;
> +			*buf = '\0';
> +			buf++;
> +			addr++;
> +			count--;
> +		}
> +		n = (re - rs + 1) << PAGE_SHIFT;
> +		if (n > count)
> +			n = count;
> +		aligned_vread(buf, start, n);
> +
> +		buf += n;
> +		addr += n;
> +		count -= n;
> +	}
> +	spin_unlock(&vb->lock);
> +}
> +
>  /**
>   * vread() - read vmalloc area in a safe way.
>   * @buf:     buffer for reading data
> @@ -3548,7 +3588,7 @@ long vread(char *buf, char *addr, unsigned long count)
>  	struct vm_struct *vm;
>  	char *vaddr, *buf_start = buf;
>  	unsigned long buflen = count;
> -	unsigned long n;
> +	unsigned long n, size;
>  
>  	addr = kasan_reset_tag(addr);
>  
> @@ -3569,12 +3609,14 @@ long vread(char *buf, char *addr, unsigned long count)
>  		if (!count)
>  			break;
>  
> -		if (!va->vm)
> +		if (!(va->flags & VMAP_RAM) && !va->vm)
>  			continue;
>  
>  		vm = va->vm;
> -		vaddr = (char *) vm->addr;
> -		if (addr >= vaddr + get_vm_area_size(vm))
> +		vaddr = (char *) va->va_start;
> +		size = vm ? get_vm_area_size(vm) : va_size(va);

Hi Baoquan,

Thanks for working on this. I tested your patches out by using drgn to
debug /proc/kcore. I have a kernel module[1] to do a vm_map_ram() call
and print the virtual address to the kernel log so I can try to read
that memory address in drgn. When I did this test, I got a panic on the
above line of code.

[  167.101113] BUG: kernel NULL pointer dereference, address: 0000000000000013
[  167.104538] #PF: supervisor read access in kernel mode
[  167.106446] #PF: error_code(0x0000) - not-present page
[  167.108474] PGD 0 P4D 0
[  167.109311] Oops: 0000 [#1] PREEMPT SMP NOPTI
[  167.111727] CPU: 3 PID: 7647 Comm: drgn Kdump: loaded Tainted: G           OE      6.1.0-rc4.bugvreadtest.el8.dev02.x86_64 #1
[  167.115076] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.5.1 06/16/2021
[  167.117348] RIP: 0010:vread+0xaf/0x210
[  167.118345] Code: 86 3e 01 00 00 48 85 db 0f 84 35 01 00 00 49 8d 47 28 48 3d 10 f8 a7 8f 0f 84 25 01 00 00 4d 89 f4 49 8b 57 38 48 85 d2 74 21 <48> 8b 42 10 f6 42 18 40 48 89 d6 49 8b 0f 48 8d b8 00 f0 ff ff 48
[  167.123776] RSP: 0018:ffffaeb380a1fb90 EFLAGS: 00010206
[  167.125669] RAX: ffff9853a1397b28 RBX: 0000000000000040 RCX: 0000000000000000
[  167.128401] RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000000
[  167.130948] RBP: ffffaeb382400000 R08: 0000000000000000 R09: 0000000000000000
[  167.133372] R10: 0000000000000000 R11: 0000000000000000 R12: ffff985385877000
[  167.135397] R13: 0000000000000040 R14: ffff985385877000 R15: ffff9853a1397b00
[  167.137533] FS:  00007f71eae33b80(0000) GS:ffff9856afd80000(0000) knlGS:0000000000000000
[  167.140210] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  167.142440] CR2: 0000000000000013 CR3: 000000012048a000 CR4: 00000000003506e0
[  167.144640] Call Trace:
[  167.145494]  <TASK>
[  167.146263]  read_kcore+0x33a/0xa30
[  167.147392]  ? remove_entity_load_avg+0x2e/0x70
[  167.148425]  ? _raw_spin_unlock_irqrestore+0x11/0x60
[  167.150657]  ? __wake_up_common_lock+0x8b/0xd0
[  167.152261]  ? tty_set_termios+0x211/0x280
[  167.153397]  ? set_termios+0x16b/0x1d0
[  167.154698]  ? _raw_spin_unlock+0xe/0x40
[  167.155737]  ? wp_page_reuse+0x60/0x80
[  167.157138]  ? do_wp_page+0x169/0x3a0
[  167.158752]  ? pmd_pfn+0x9/0x50
[  167.159645]  ? __handle_mm_fault+0x3b0/0x690
[  167.160837]  ? inode_security+0x22/0x60
[  167.161761]  proc_reg_read+0x5a/0xb0
[  167.162777]  vfs_read+0xa7/0x320
[  167.163512]  ? handle_mm_fault+0xb6/0x2c0
[  167.164400]  __x64_sys_pread64+0x9c/0xd0
[  167.165763]  do_syscall_64+0x3f/0xa0
[  167.167610]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[  167.169951] RIP: 0033:0x7f71e9c123d7

I debugged the resulting core dump and found the reason:

>>> stack_trace = prog.crashed_thread().stack_trace()
>>> stack_trace
#0  crash_setup_regs (./arch/x86/include/asm/kexec.h:95:3)
#1  __crash_kexec (kernel/kexec_core.c:974:4)
#2  panic (kernel/panic.c:330:3)
#3  oops_end (arch/x86/kernel/dumpstack.c:379:3)
#4  page_fault_oops (arch/x86/mm/fault.c:729:2)
#5  handle_page_fault (arch/x86/mm/fault.c:1519:3)
#6  exc_page_fault (arch/x86/mm/fault.c:1575:2)
#7  asm_exc_page_fault+0x26/0x2b (./arch/x86/include/asm/idtentry.h:570)
#8  get_vm_area_size (./include/linux/vmalloc.h:203:14)
#9  vread (mm/vmalloc.c:3617:15)
#10 read_kcore (fs/proc/kcore.c:510:4)
#11 pde_read (fs/proc/inode.c:316:10)
#12 proc_reg_read (fs/proc/inode.c:328:8)
#13 vfs_read (fs/read_write.c:468:9)
#14 ksys_pread64 (fs/read_write.c:665:10)
#15 __do_sys_pread64 (fs/read_write.c:675:9)
#16 __se_sys_pread64 (fs/read_write.c:672:1)
#17 __x64_sys_pread64 (fs/read_write.c:672:1)
#18 do_syscall_x64 (arch/x86/entry/common.c:50:14)
#19 do_syscall_64 (arch/x86/entry/common.c:80:7)
#20 entry_SYSCALL_64+0x9f/0x19b (arch/x86/entry/entry_64.S:120)
#21 0x7f71e9c123d7
>>> stack_trace[9]["va"]
*(struct vmap_area *)0xffff9853a1397b00 = {
        .va_start = (unsigned long)18446654684740452352,
        .va_end = (unsigned long)18446654684741500928,
        .rb_node = (struct rb_node){
                .__rb_parent_color = (unsigned long)18446630083335569168,
                .rb_right = (struct rb_node *)0x0,
                .rb_left = (struct rb_node *)0x0,
        },
        .list = (struct list_head){
                .next = (struct list_head *)0xffff98538c403f28,
                .prev = (struct list_head *)0xffff98538c54e1e8,
        },
        .subtree_max_size = (unsigned long)3,
        .vm = (struct vm_struct *)0x3,
        .flags = (unsigned long)3,
}

Since flags is in a union, it shadows "vm" and causes the condition to
be true, and then get_vm_area_size() tries to follow the pointer defined
by flags. I'm not sure if the fix is to have flags be a separate field
inside vmap_area, or to have more careful handling in the vread path.

Thanks,
Stephen

> +
> +		if (addr >= vaddr + size)
>  			continue;
>  		while (addr < vaddr) {
>  			if (count == 0)
> @@ -3584,10 +3626,13 @@ long vread(char *buf, char *addr, unsigned long count)
>  			addr++;
>  			count--;
>  		}
> -		n = vaddr + get_vm_area_size(vm) - addr;
> +		n = vaddr + size - addr;
>  		if (n > count)
>  			n = count;
> -		if (!(vm->flags & VM_IOREMAP))
> +
> +		if ((va->flags & (VMAP_RAM|VMAP_BLOCK)) == (VMAP_RAM|VMAP_BLOCK))
> +			vb_vread(buf, addr, n);
> +		else if ((va->flags & VMAP_RAM) || !(vm->flags & VM_IOREMAP))
>  			aligned_vread(buf, addr, n);
>  		else /* IOREMAP area is treated as memory hole */
>  			memset(buf, 0, n);
> -- 
> 2.34.1


  reply	other threads:[~2022-11-10  1:07 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-09  3:35 [PATCH RFC 0/3] mm/vmalloc.c: allow vread() to read out vm_map_ram areas Baoquan He
2022-11-09  3:35 ` [PATCH RFC 1/3] mm/vmalloc.c: add used_map into vmap_block to track space of vmap_block Baoquan He
2022-11-09  3:35 ` [PATCH RFC 2/3] mm/vmalloc.c: add flags to mark vm_map_ram area Baoquan He
2022-11-09  3:35 ` [PATCH RFC 3/3] mm/vmalloc.c: allow vread() to read out vm_map_ram areas Baoquan He
2022-11-10  0:59   ` Stephen Brennan [this message]
2022-11-10 10:23     ` Baoquan He
2022-11-10 18:48       ` Stephen Brennan
2022-11-14 10:06         ` Baoquan He
2022-11-11  9:48   ` kernel test robot
2022-11-18  8:01   ` Matthew Wilcox
2022-11-23  3:38     ` Baoquan He
2022-11-23 13:24       ` Matthew Wilcox
2022-11-24  9:52         ` Baoquan He
2022-11-30 13:06           ` Uladzislau Rezki
2022-12-01  4:46             ` Baoquan He

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87v8nnfwud.fsf@oracle.com \
    --to=stephen.s.brennan@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=bhe@redhat.com \
    --cc=hch@infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=urezki@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.