All of lore.kernel.org
 help / color / mirror / Atom feed
From: Song Liu <song@kernel.org>
To: <linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>
Cc: <akpm@linux-foundation.org>, <x86@kernel.org>,
	<peterz@infradead.org>, <hch@lst.de>, <kernel-team@fb.com>,
	<rick.p.edgecombe@intel.com>, <mcgrof@kernel.org>,
	<dave.hansen@intel.com>, Song Liu <song@kernel.org>
Subject: [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec
Date: Thu, 18 Aug 2022 15:42:14 -0700	[thread overview]
Message-ID: <20220818224218.2399791-2-song@kernel.org> (raw)
In-Reply-To: <20220818224218.2399791-1-song@kernel.org>

This is a prototype to host dynamic kernel text (modules, BPF programs,
etc.) with huge pages. This is similar to the proposal by Peter in [1].

A new tree of vmap_area, the free_text_area_* tree, is introduced in
addition to free_vmap_area_* and vmap_area_*. vmalloc_exec allocates pages
from free_text_area_*. When there isn't enough space left in
free_text_area_*, new PMD_SIZE page(s) are allocated from free_vmap_area_*
and added to free_text_area_*.

The new tree allows separate handling of < PAGE_SIZE allocations, as
current vmalloc code mostly assumes PAGE_SIZE aligned allocations. This
version of vmalloc_exec can handle bpf programs, which use 64-byte
aligned allocations, and modules, which use PAGE_SIZE aligned
allocations.

[1] https://lore.kernel.org/bpf/Ys6cWUMHO8XwyYgr@hirez.programming.kicks-ass.net/
---
 include/linux/vmalloc.h |   4 +
 mm/nommu.c              |   7 ++
 mm/vmalloc.c            | 163 +++++++++++++++++++++++++++++++++-------
 3 files changed, 147 insertions(+), 27 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 096d48aa3437..691c02ffe3db 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -35,6 +35,8 @@ struct notifier_block;		/* in notifier.h */
 #define VM_DEFER_KMEMLEAK	0
 #endif
 
+#define VM_KERNEL_EXEC		0x00001000	/* kernel text mapped as RO+X */
+
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -154,6 +156,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_exec(unsigned long size, unsigned long align) __alloc_size(1);
+void vfree_exec(const void *addr);
 
 extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
 extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/nommu.c b/mm/nommu.c
index 9d7afc2d959e..11e0fc996006 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
 }
 EXPORT_SYMBOL(vm_map_pages_zero);
 
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+	return NULL;	/* NOTE(review): vmalloc.c variant returns ERR_PTR() */
+}
+
+void vfree_exec(const void *addr) { }	/* stub: vmalloc_exec() above never allocates */
+
 /*
  *  sys_brk() for the most part doesn't need the global kernel
  *  lock, except when an application is doing something nasty
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index effd1ff6a4b4..472287e71bf1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -753,6 +753,10 @@ static LIST_HEAD(free_vmap_area_list);
  */
 static struct rb_root free_vmap_area_root = RB_ROOT;
 
+static DEFINE_SPINLOCK(free_text_area_lock);
+static LIST_HEAD(free_text_area_list);
+static struct rb_root free_text_area_root = RB_ROOT;
+
 /*
  * Preload a CPU with one object for "no edge" split case. The
  * aim is to get rid of allocations from the atomic context, thus
@@ -814,9 +818,11 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
 	return va;
 }
 
-static struct vmap_area *__find_vmap_area(unsigned long addr)
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_node *root)
 {
-	struct rb_node *n = vmap_area_root.rb_node;
+	struct rb_node *n;
+
+	n = root ? root : vmap_area_root.rb_node;
 
 	addr = (unsigned long)kasan_reset_tag((void *)addr);
 
@@ -926,7 +932,7 @@ link_va(struct vmap_area *va, struct rb_root *root,
 
 	/* Insert to the rb-tree */
 	rb_link_node(&va->rb_node, parent, link);
-	if (root == &free_vmap_area_root) {
+	if (root == &free_vmap_area_root || root == &free_text_area_root) {
 		/*
 		 * Some explanation here. Just perform simple insertion
 		 * to the tree. We do not set va->subtree_max_size to
@@ -955,7 +961,7 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
 	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
 		return;
 
-	if (root == &free_vmap_area_root)
+	if (root == &free_vmap_area_root || root == &free_text_area_root)
 		rb_erase_augmented(&va->rb_node,
 			root, &free_vmap_area_rb_augment_cb);
 	else
@@ -1198,15 +1204,15 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
  * overhead.
  */
 static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size, unsigned long align,
-	unsigned long vstart, bool adjust_search_size)
+find_vmap_lowest_match(struct rb_node *root, unsigned long size,
+       unsigned long align, unsigned long vstart, bool adjust_search_size)
 {
 	struct vmap_area *va;
 	struct rb_node *node;
 	unsigned long length;
 
 	/* Start from the root. */
-	node = free_vmap_area_root.rb_node;
+	node = root;
 
 	/* Adjust the search size for alignment overhead. */
 	length = adjust_search_size ? size + align - 1 : size;
@@ -1290,8 +1296,9 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
 	get_random_bytes(&rnd, sizeof(rnd));
 	vstart = VMALLOC_START + rnd;
 
-	va_1 = find_vmap_lowest_match(size, align, vstart, false);
-	va_2 = find_vmap_lowest_linear_match(size, align, vstart);
+	va_1 = find_vmap_lowest_match(free_vmap_area_root.rb_node, size,
+				      align, vstart, false);
+	va_2 = find_vmap_lowest_linear_match(root, size, align, vstart);
 
 	if (va_1 != va_2)
 		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1334,7 +1341,8 @@ classify_va_fit_type(struct vmap_area *va,
 }
 
 static __always_inline int
-adjust_va_to_fit_type(struct vmap_area *va,
+adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
+	struct vmap_area *va,
 	unsigned long nva_start_addr, unsigned long size,
 	enum fit_type type)
 {
@@ -1348,7 +1356,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		 * V      NVA      V
 		 * |---------------|
 		 */
-		unlink_va(va, &free_vmap_area_root);
+		unlink_va(va, root);
 		kmem_cache_free(vmap_area_cachep, va);
 	} else if (type == LE_FIT_TYPE) {
 		/*
@@ -1426,8 +1434,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
 		augment_tree_propagate_from(va);
 
 		if (lva)	/* type == NE_FIT_TYPE */
-			insert_vmap_area_augment(lva, &va->rb_node,
-				&free_vmap_area_root, &free_vmap_area_list);
+			insert_vmap_area_augment(lva, &va->rb_node, root, head);
 	}
 
 	return 0;
@@ -1459,7 +1466,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
 	if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
 		adjust_search_size = false;
 
-	va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
+	va = find_vmap_lowest_match(free_vmap_area_root.rb_node,
+				    size, align, vstart, adjust_search_size);
 	if (unlikely(!va))
 		return vend;
 
@@ -1478,7 +1486,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
 		return vend;
 
 	/* Update the free vmap_area. */
-	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+	ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+				    va, nva_start_addr, size, type);
 	if (ret)
 		return vend;
 
@@ -1539,7 +1548,7 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
 static struct vmap_area *alloc_vmap_area(unsigned long size,
 				unsigned long align,
 				unsigned long vstart, unsigned long vend,
-				int node, gfp_t gfp_mask)
+				int node, unsigned long vm_flags, gfp_t gfp_mask)
 {
 	struct vmap_area *va;
 	unsigned long freed;
@@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->va_end = addr + size;
 	va->vm = NULL;
 
-	spin_lock(&vmap_area_lock);
-	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
-	spin_unlock(&vmap_area_lock);
+	if (vm_flags & VM_KERNEL_EXEC) {
+		spin_lock(&free_text_area_lock);
+		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
+		/* update subtree_max_size now as we need this soon */
+		augment_tree_propagate_from(va);
+		spin_unlock(&free_text_area_lock);
+	} else {
+		spin_lock(&vmap_area_lock);
+		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+		spin_unlock(&vmap_area_lock);
+	}
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -1803,7 +1820,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
 	struct vmap_area *va;
 
 	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr);
+	va = __find_vmap_area(addr, vmap_area_root.rb_node);
 	spin_unlock(&vmap_area_lock);
 
 	return va;
@@ -1912,8 +1929,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 		return ERR_PTR(-ENOMEM);
 
 	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
-					VMALLOC_START, VMALLOC_END,
-					node, gfp_mask);
+				     VMALLOC_START, VMALLOC_END,
+				     node, 0, gfp_mask);
 	if (IS_ERR(va)) {
 		kfree(vb);
 		return ERR_CAST(va);
@@ -2209,8 +2226,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
 		addr = (unsigned long)mem;
 	} else {
 		struct vmap_area *va;
-		va = alloc_vmap_area(size, PAGE_SIZE,
-				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+		va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END,
+				     node, 0, GFP_KERNEL);
 		if (IS_ERR(va))
 			return NULL;
 
@@ -2450,7 +2467,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 	if (!(flags & VM_NO_GUARD))
 		size += PAGE_SIZE;
 
-	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+	va = alloc_vmap_area(size, align, start, end, node, flags, gfp_mask);
 	if (IS_ERR(va)) {
 		kfree(area);
 		return NULL;
@@ -2546,7 +2563,7 @@ struct vm_struct *remove_vm_area(const void *addr)
 	might_sleep();
 
 	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area((unsigned long)addr);
+	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
 	if (va && va->vm) {
 		struct vm_struct *vm = va->vm;
 
@@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vmalloc);
 
+/* Allocate RO+X kernel text from free_text_area_*; ERR_PTR() on failure. */
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+	struct vmap_area *va, *tmp;
+	struct vm_struct *vm;
+	unsigned long addr;
+	enum fit_type type;
+	int ret;
+
+	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
+	if (unlikely(!va))
+		return ERR_PTR(-ENOMEM);
+
+again:
+	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
+	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
+				     size, align, 1, false);
+	if (!tmp) {
+		unsigned long alloc_size;
+		void *ptr;
+
+		spin_unlock(&free_text_area_lock);
+		/* No room: grow free_text_area_* (VM_KERNEL_EXEC), then retry. */
+		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
+		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
+					   MODULES_END, GFP_KERNEL, PAGE_KERNEL,
+					   VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
+					   NUMA_NO_NODE, __builtin_return_address(0));
+		if (unlikely(!ptr)) {
+			ret = -ENOMEM;
+			goto err_free;	/* lock already dropped above */
+		}
+		memset(ptr, 0, alloc_size);
+		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+		goto again;
+	}
+
+	addr = roundup(tmp->va_start, align);
+	type = classify_va_fit_type(tmp, addr, size);
+	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
+		ret = -ENOMEM;
+		goto err_unlock;
+	}
+	/* adjust_va_to_fit_type() may free tmp, so read ->vm first. */
+	vm = tmp->vm;
+	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
+				    tmp, addr, size, type);
+	if (ret)
+		goto err_unlock;
+	spin_unlock(&free_text_area_lock);
+
+	va->va_start = addr;
+	va->va_end = addr + size;
+	va->vm = vm;
+
+	spin_lock(&vmap_area_lock);
+	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+	spin_unlock(&vmap_area_lock);
+	return (void *)addr;
+
+err_unlock:
+	spin_unlock(&free_text_area_lock);
+err_free:
+	kmem_cache_free(vmap_area_cachep, va);	/* never inserted in any tree */
+	return ERR_PTR(ret);
+}
+
+void vfree_exec(const void *addr)
+{
+	struct vmap_area *va;
+
+	might_sleep();
+	/* Find @addr's vmap_area in vmap_area_root; warn once if absent. */
+	spin_lock(&vmap_area_lock);
+	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
+	if (WARN_ON_ONCE(!va)) {
+		spin_unlock(&vmap_area_lock);
+		return;
+	}
+	/* Unlink while still holding vmap_area_lock, which guards that tree. */
+	unlink_va(va, &vmap_area_root);
+	spin_unlock(&vmap_area_lock);
+	/* Give the range back to free_text_area_* under its own lock. */
+	spin_lock(&free_text_area_lock);
+	merge_or_add_vmap_area_augment(va,
+		&free_text_area_root, &free_text_area_list);
+	spin_unlock(&free_text_area_lock);
+	/* TODO: when the whole vm_struct is not in use, free it */
+}
+
 /**
  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
  * @size:      allocation size
@@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 			/* It is a BUG(), but trigger recovery instead. */
 			goto recovery;
 
-		ret = adjust_va_to_fit_type(va, start, size, type);
+		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+					    va, start, size, type);
 		if (unlikely(ret))
 			goto recovery;
 
-- 
2.30.2



  reply	other threads:[~2022-08-18 23:22 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-18 22:42 [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
2022-08-18 22:42 ` Song Liu [this message]
2022-10-06 23:15   ` [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec Luis Chamberlain
2022-10-07  6:39     ` Song Liu
2022-08-18 22:42 ` [RFC 2/5] bpf: use vmalloc_exec Song Liu
2022-08-18 22:42 ` [RFC 3/5] modules, x86: use vmalloc_exec for module core Song Liu
2022-10-06 23:38   ` Luis Chamberlain
2022-10-07  6:46     ` Song Liu
2022-08-18 22:42 ` [RFC 4/5] vmalloc_exec: share a huge page with kernel text Song Liu
2022-10-06 23:44   ` Luis Chamberlain
2022-10-07  6:53     ` Song Liu
2022-08-18 22:42 ` [RFC 5/5] vmalloc: vfree_exec: free unused vm_struct Song Liu
2022-08-22 15:46 ` [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
2022-08-22 16:34   ` Peter Zijlstra
2022-08-22 16:56     ` Song Liu
2022-08-23  5:42       ` Peter Zijlstra
2022-08-23  6:39         ` Christophe Leroy
2022-08-23  6:57           ` Song Liu
2022-08-23  6:55         ` Song Liu
2022-08-24 17:06       ` Song Liu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220818224218.2399791-2-song@kernel.org \
    --to=song@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=dave.hansen@intel.com \
    --cc=hch@lst.de \
    --cc=kernel-team@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mcgrof@kernel.org \
    --cc=peterz@infradead.org \
    --cc=rick.p.edgecombe@intel.com \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.