From: Luis Chamberlain <mcgrof@kernel.org>
To: Song Liu <song@kernel.org>, Vlastimil Babka <vbabka@suse.cz>,
	Mel Gorman <mgorman@suse.de>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	akpm@linux-foundation.org, x86@kernel.org, peterz@infradead.org,
	hch@lst.de, kernel-team@fb.com, rick.p.edgecombe@intel.com,
	dave.hansen@intel.com
Subject: Re: [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec
Date: Thu, 6 Oct 2022 16:15:17 -0700
Message-ID: <Yz9hhTRzMr0ZEnA/@bombadil.infradead.org>
In-Reply-To: <20220818224218.2399791-2-song@kernel.org>

On Thu, Aug 18, 2022 at 03:42:14PM -0700, Song Liu wrote:
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
>  }
>  EXPORT_SYMBOL(vm_map_pages_zero);
>  
> +void *vmalloc_exec(unsigned long size, unsigned long align)
> +{
> +	return NULL;
> +}

Well that's not so nice for no-mmu systems. Shouldn't we have a
fallback?
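
Untested hand-waving, but on !MMU there are no page permissions to
manage, so I'd expect plain vmalloc() to be good enough as a fallback
(glossing over the align argument in this sketch):

	void *vmalloc_exec(unsigned long size, unsigned long align)
	{
		return vmalloc(size);
	}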

> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index effd1ff6a4b4..472287e71bf1 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  	va->va_end = addr + size;
>  	va->vm = NULL;
>  
> -	spin_lock(&vmap_area_lock);
> -	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> -	spin_unlock(&vmap_area_lock);
> +	if (vm_flags & VM_KERNEL_EXEC) {
> +		spin_lock(&free_text_area_lock);
> +		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
> +		/* update subtree_max_size now as we need this soon */
> +		augment_tree_propagate_from(va);

Sorry, it's not clear to me why this is needed only for the exec case.
Can you elaborate a bit more?

> +		spin_unlock(&free_text_area_lock);
> +	} else {
> +		spin_lock(&vmap_area_lock);
> +		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> +		spin_unlock(&vmap_area_lock);
> +	}
>  
>  	BUG_ON(!IS_ALIGNED(va->va_start, align));
>  	BUG_ON(va->va_start < vstart);

<-- snip -->

> @@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
>  }
>  EXPORT_SYMBOL(vmalloc);
>  
> +void *vmalloc_exec(unsigned long size, unsigned long align)
> +{
> +	struct vmap_area *va, *tmp;
> +	unsigned long addr;
> +	enum fit_type type;
> +	int ret;
> +
> +	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
> +	if (unlikely(!va))
> +		return ERR_PTR(-ENOMEM);
> +
> +again:
> +	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
> +	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
> +				     size, align, 1, false);
> +
> +	if (!tmp) {
> +		unsigned long alloc_size;
> +		void *ptr;
> +
> +		spin_unlock(&free_text_area_lock);
> +
> +		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
> +		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
> +					   MODULES_END, GFP_KERNEL, PAGE_KERNEL,
> +					   VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
> +					   NUMA_NO_NODE, __builtin_return_address(0));

We can review the guard stuff on the other thread with Peter.

> +		if (unlikely(!ptr)) {
> +			ret = -ENOMEM;
> +			goto err_out;
> +		}
> +		memset(ptr, 0, alloc_size);
> +		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
> +		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);

I *really* like that this is no longer something users have to muck
with themselves, thanks!
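
For context, this is roughly the dance text/JIT users have to
open-code today after emitting their code (sketch from memory,
alloc_text() is a made-up name and the details vary per caller):

	void *alloc_text(unsigned long size)
	{
		void *ptr = module_alloc(size);

		if (!ptr)
			return NULL;
		/* ... caller writes the generated code into ptr ... */
		set_vm_flush_reset_perms(ptr);
		set_memory_ro((unsigned long)ptr, size >> PAGE_SHIFT);
		set_memory_x((unsigned long)ptr, size >> PAGE_SHIFT);
		return ptr;
	}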

> +
> +		goto again;
> +	}
> +
> +	addr = roundup(tmp->va_start, align);
> +	type = classify_va_fit_type(tmp, addr, size);
> +	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
> +		addr = -ENOMEM;
> +		goto err_out;
> +	}
> +
> +	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
> +				    tmp, addr, size, type);
> +	if (ret) {
> +		addr = ret;
> +		goto err_out;
> +	}
> +	spin_unlock(&free_text_area_lock);
> +
> +	va->va_start = addr;
> +	va->va_end = addr + size;
> +	va->vm = tmp->vm;
> +
> +	spin_lock(&vmap_area_lock);
> +	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> +	spin_unlock(&vmap_area_lock);
> +
> +	return (void *)addr;
> +
> +err_out:
> +	spin_unlock(&free_text_area_lock);
> +	return ERR_PTR(ret);
> +}
> +
> +void vfree_exec(const void *addr)
> +{
> +	struct vmap_area *va;
> +
> +	might_sleep();
> +
> +	spin_lock(&vmap_area_lock);
> +	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
> +	if (WARN_ON_ONCE(!va)) {
> +		spin_unlock(&vmap_area_lock);
> +		return;
> +	}
> +
> +	unlink_va(va, &vmap_area_root);

I'm curious why we don't memset the range to 0 before
merge_or_add_vmap_area_augment(). I realize other code doesn't seem to
do it, though.

> +	spin_unlock(&vmap_area_lock);
> +
> +	spin_lock(&free_text_area_lock);
> +	merge_or_add_vmap_area_augment(va,
> +		&free_text_area_root, &free_text_area_list);

I'm concerned that we can end up using precious physically contiguous
memory from huge pages only to then create our own pool and allow
things to become non-contiguous afterwards.

I'm starting to suspect that if the allocation is > PAGE_SIZE we
should just give it back to the system. Otherwise wouldn't the
fragmentation eventually eat up most of the available huge pages?
Probably not for eBPF, but if we use this on a system with tons of
module insertions / deletions this seems like it could happen?
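
Hand-waving what I have in mind for vfree_exec(), where
release_text_range() is a made-up helper just to illustrate the
policy:

	/* Sketch only: pool small text chunks, hand big ones back. */
	if (va->va_end - va->va_start > PAGE_SIZE) {
		release_text_range(va);	/* made-up: give it back to the system */
	} else {
		spin_lock(&free_text_area_lock);
		merge_or_add_vmap_area_augment(va, &free_text_area_root,
					       &free_text_area_list);
		spin_unlock(&free_text_area_lock);
	}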

  Luis

> +	spin_unlock(&free_text_area_lock);
> +	/* TODO: when the whole vm_struct is not in use, free it */
> +}
> +
>  /**
>   * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
>   * @size:      allocation size
> @@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>  			/* It is a BUG(), but trigger recovery instead. */
>  			goto recovery;
>  
> -		ret = adjust_va_to_fit_type(va, start, size, type);
> +		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
> +					    va, start, size, type);
>  		if (unlikely(ret))
>  			goto recovery;
>  
> -- 
> 2.30.2
> 

