From: Song Liu <song@kernel.org>
To: <linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>
Cc: <akpm@linux-foundation.org>, <x86@kernel.org>,
<peterz@infradead.org>, <hch@lst.de>, <kernel-team@fb.com>,
<rick.p.edgecombe@intel.com>, <mcgrof@kernel.org>,
<dave.hansen@intel.com>, Song Liu <song@kernel.org>
Subject: [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec
Date: Thu, 18 Aug 2022 15:42:14 -0700
Message-ID: <20220818224218.2399791-2-song@kernel.org>
In-Reply-To: <20220818224218.2399791-1-song@kernel.org>

This is a prototype for hosting dynamic kernel text (modules, BPF
programs, etc.) on huge pages, similar to the proposal by Peter in [1].

A new tree of vmap_areas, free_text_area_*, is introduced in addition to
free_vmap_area_* and vmap_area_*. vmalloc_exec() allocates memory from
free_text_area_*. When there is not enough space left in
free_text_area_*, new PMD_SIZE page(s) are allocated from
free_vmap_area_* and added to free_text_area_*.

The new tree allows allocations smaller than PAGE_SIZE to be handled
separately, as the current vmalloc code mostly assumes PAGE_SIZE aligned
allocations. This version of vmalloc_exec() can handle BPF programs,
which use 64-byte aligned allocations, and modules, which use PAGE_SIZE
aligned allocations.
[1] https://lore.kernel.org/bpf/Ys6cWUMHO8XwyYgr@hirez.programming.kicks-ass.net/
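
For reference, a minimal usage sketch of the new API (illustrative only:
image and image_size are placeholders, the actual BPF and module call
sites are added later in this series, and since the returned mapping is
already RO+X the caller must fill it with an arch text-poking helper,
e.g. text_poke_copy() on x86, rather than a plain memcpy()):

	void *image;

	/* 64-byte alignment as the BPF JIT uses; modules pass PAGE_SIZE */
	image = vmalloc_exec(image_size, 64);
	if (IS_ERR(image))
		return PTR_ERR(image);

	/* ... copy instructions in via the text-poking helper ... */

	vfree_exec(image);	/* returns the range to free_text_area_* */
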
---
include/linux/vmalloc.h | 4 +
mm/nommu.c | 7 ++
mm/vmalloc.c | 163 +++++++++++++++++++++++++++++++++-------
3 files changed, 147 insertions(+), 27 deletions(-)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 096d48aa3437..691c02ffe3db 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -35,6 +35,8 @@ struct notifier_block; /* in notifier.h */
#define VM_DEFER_KMEMLEAK 0
#endif
+#define VM_KERNEL_EXEC 0x00001000 /* kernel text mapped as RO+X */
+
/* bits [20..32] reserved for arch specific ioremap internals */
/*
@@ -154,6 +156,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
int node, const void *caller) __alloc_size(1);
void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_exec(unsigned long size, unsigned long align) __alloc_size(1);
+void vfree_exec(const void *addr);
extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/nommu.c b/mm/nommu.c
index 9d7afc2d959e..11e0fc996006 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
}
EXPORT_SYMBOL(vm_map_pages_zero);
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+ return NULL;
+}
+
+void vfree_exec(const void *addr) { }
+
/*
* sys_brk() for the most part doesn't need the global kernel
* lock, except when an application is doing something nasty
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index effd1ff6a4b4..472287e71bf1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -753,6 +753,10 @@ static LIST_HEAD(free_vmap_area_list);
*/
static struct rb_root free_vmap_area_root = RB_ROOT;
+static DEFINE_SPINLOCK(free_text_area_lock);
+static LIST_HEAD(free_text_area_list);
+static struct rb_root free_text_area_root = RB_ROOT;
+
/*
* Preload a CPU with one object for "no edge" split case. The
* aim is to get rid of allocations from the atomic context, thus
@@ -814,9 +818,11 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
return va;
}
-static struct vmap_area *__find_vmap_area(unsigned long addr)
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_node *root)
{
- struct rb_node *n = vmap_area_root.rb_node;
+ struct rb_node *n;
+
+ n = root ? root : vmap_area_root.rb_node;
addr = (unsigned long)kasan_reset_tag((void *)addr);
@@ -926,7 +932,7 @@ link_va(struct vmap_area *va, struct rb_root *root,
/* Insert to the rb-tree */
rb_link_node(&va->rb_node, parent, link);
- if (root == &free_vmap_area_root) {
+ if (root == &free_vmap_area_root || root == &free_text_area_root) {
/*
* Some explanation here. Just perform simple insertion
* to the tree. We do not set va->subtree_max_size to
@@ -955,7 +961,7 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
return;
- if (root == &free_vmap_area_root)
+ if (root == &free_vmap_area_root || root == &free_text_area_root)
rb_erase_augmented(&va->rb_node,
root, &free_vmap_area_rb_augment_cb);
else
@@ -1198,15 +1204,15 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
* overhead.
*/
static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size, unsigned long align,
- unsigned long vstart, bool adjust_search_size)
+find_vmap_lowest_match(struct rb_node *root, unsigned long size,
+ unsigned long align, unsigned long vstart, bool adjust_search_size)
{
struct vmap_area *va;
struct rb_node *node;
unsigned long length;
/* Start from the root. */
- node = free_vmap_area_root.rb_node;
+ node = root;
/* Adjust the search size for alignment overhead. */
length = adjust_search_size ? size + align - 1 : size;
@@ -1290,8 +1296,9 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
- va_1 = find_vmap_lowest_match(size, align, vstart, false);
- va_2 = find_vmap_lowest_linear_match(size, align, vstart);
+ va_1 = find_vmap_lowest_match(free_vmap_area_root.rb_node, size,
+ align, vstart, false);
+ va_2 = find_vmap_lowest_linear_match(root, size, align, vstart);
if (va_1 != va_2)
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1334,7 +1341,8 @@ classify_va_fit_type(struct vmap_area *va,
}
static __always_inline int
-adjust_va_to_fit_type(struct vmap_area *va,
+adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
+ struct vmap_area *va,
unsigned long nva_start_addr, unsigned long size,
enum fit_type type)
{
@@ -1348,7 +1356,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
* V NVA V
* |---------------|
*/
- unlink_va(va, &free_vmap_area_root);
+ unlink_va(va, root);
kmem_cache_free(vmap_area_cachep, va);
} else if (type == LE_FIT_TYPE) {
/*
@@ -1426,8 +1434,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
augment_tree_propagate_from(va);
if (lva) /* type == NE_FIT_TYPE */
- insert_vmap_area_augment(lva, &va->rb_node,
- &free_vmap_area_root, &free_vmap_area_list);
+ insert_vmap_area_augment(lva, &va->rb_node, root, head);
}
return 0;
@@ -1459,7 +1466,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
adjust_search_size = false;
- va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
+ va = find_vmap_lowest_match(free_vmap_area_root.rb_node,
+ size, align, vstart, adjust_search_size);
if (unlikely(!va))
return vend;
@@ -1478,7 +1486,8 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
return vend;
/* Update the free vmap_area. */
- ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+ ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+ va, nva_start_addr, size, type);
if (ret)
return vend;
@@ -1539,7 +1548,7 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
- int node, gfp_t gfp_mask)
+ int node, unsigned long vm_flags, gfp_t gfp_mask)
{
struct vmap_area *va;
unsigned long freed;
@@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->va_end = addr + size;
va->vm = NULL;
- spin_lock(&vmap_area_lock);
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
- spin_unlock(&vmap_area_lock);
+ if (vm_flags & VM_KERNEL_EXEC) {
+ spin_lock(&free_text_area_lock);
+ insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
+ /* update subtree_max_size now as we need this soon */
+ augment_tree_propagate_from(va);
+ spin_unlock(&free_text_area_lock);
+ } else {
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+ }
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
@@ -1803,7 +1820,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
struct vmap_area *va;
spin_lock(&vmap_area_lock);
- va = __find_vmap_area(addr);
+ va = __find_vmap_area(addr, vmap_area_root.rb_node);
spin_unlock(&vmap_area_lock);
return va;
@@ -1912,8 +1929,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_PTR(-ENOMEM);
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
- VMALLOC_START, VMALLOC_END,
- node, gfp_mask);
+ VMALLOC_START, VMALLOC_END,
+ node, 0, gfp_mask);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
@@ -2209,8 +2226,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
addr = (unsigned long)mem;
} else {
struct vmap_area *va;
- va = alloc_vmap_area(size, PAGE_SIZE,
- VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+ va = alloc_vmap_area(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END,
+ node, 0, GFP_KERNEL);
if (IS_ERR(va))
return NULL;
@@ -2450,7 +2467,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
- va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+ va = alloc_vmap_area(size, align, start, end, node, flags, gfp_mask);
if (IS_ERR(va)) {
kfree(area);
return NULL;
@@ -2546,7 +2563,7 @@ struct vm_struct *remove_vm_area(const void *addr)
might_sleep();
spin_lock(&vmap_area_lock);
- va = __find_vmap_area((unsigned long)addr);
+ va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
if (va && va->vm) {
struct vm_struct *vm = va->vm;
@@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
}
EXPORT_SYMBOL(vmalloc);
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+ struct vmap_area *va, *tmp;
+ unsigned long addr;
+ enum fit_type type;
+ int ret;
+
+ va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
+ if (unlikely(!va))
+ return ERR_PTR(-ENOMEM);
+
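+ /*
+ * Find the lowest free slot in the free_text_area_ tree. If the tree
+ * cannot satisfy the request, back-fill it below and retry.
+ */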
+again:
+ preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
+ tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
+ size, align, 1, false);
+
+ if (!tmp) {
+ unsigned long alloc_size;
+ void *ptr;
+
+ spin_unlock(&free_text_area_lock);
+
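+ /*
+ * Back-fill the pool: allocate PMD_SIZE aligned, huge-page backed
+ * (where possible) memory from the modules area and make it RO+X up
+ * front, so later carve-outs do not need to change permissions.
+ */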
+ alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
+ ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
+ MODULES_END, GFP_KERNEL, PAGE_KERNEL,
+ VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
+ NUMA_NO_NODE, __builtin_return_address(0));
+ if (unlikely(!ptr)) {
+ /* free_text_area_lock was dropped above, so don't go through err_out */
+ kmem_cache_free(vmap_area_cachep, va);
+ return ERR_PTR(-ENOMEM);
+ }
+ memset(ptr, 0, alloc_size);
+ set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+ set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
+
+ goto again;
+ }
+
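+ /*
+ * Carve [addr, addr + size) out of the chosen free area, then publish
+ * it in the busy vmap_area tree under vmap_area_lock.
+ */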
+ addr = roundup(tmp->va_start, align);
+ type = classify_va_fit_type(tmp, addr, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT)) {
+ ret = -ENOMEM;
+ goto err_out;
+ }
+
+ ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
+ tmp, addr, size, type);
+ if (ret)
+ goto err_out;
+ spin_unlock(&free_text_area_lock);
+
+ va->va_start = addr;
+ va->va_end = addr + size;
+ va->vm = tmp->vm;
+
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+
+ return (void *)addr;
+
+err_out:
+ spin_unlock(&free_text_area_lock);
+ kmem_cache_free(vmap_area_cachep, va);
+ return ERR_PTR(ret);
+}
+
+void vfree_exec(const void *addr)
+{
+ struct vmap_area *va;
+
+ might_sleep();
+
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
+ if (WARN_ON_ONCE(!va)) {
+ spin_unlock(&vmap_area_lock);
+ return;
+ }
+
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
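+ /*
+ * Return the range to the free_text_area_ pool instead of unmapping it;
+ * the underlying RO+X mapping stays in place for reuse.
+ */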
+ spin_lock(&free_text_area_lock);
+ merge_or_add_vmap_area_augment(va,
+ &free_text_area_root, &free_text_area_list);
+ spin_unlock(&free_text_area_lock);
+ /* TODO: when the whole vm_struct is not in use, free it */
+}
+
/**
* vmalloc_huge - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
@@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
- ret = adjust_va_to_fit_type(va, start, size, type);
+ ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
+ va, start, size, type);
if (unlikely(ret))
goto recovery;
--
2.30.2
Thread overview: 20+ messages
2022-08-18 22:42 [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
2022-08-18 22:42 ` Song Liu [this message]
2022-10-06 23:15 ` [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec Luis Chamberlain
2022-10-07 6:39 ` Song Liu
2022-08-18 22:42 ` [RFC 2/5] bpf: use vmalloc_exec Song Liu
2022-08-18 22:42 ` [RFC 3/5] modules, x86: use vmalloc_exec for module core Song Liu
2022-10-06 23:38 ` Luis Chamberlain
2022-10-07 6:46 ` Song Liu
2022-08-18 22:42 ` [RFC 4/5] vmalloc_exec: share a huge page with kernel text Song Liu
2022-10-06 23:44 ` Luis Chamberlain
2022-10-07 6:53 ` Song Liu
2022-08-18 22:42 ` [RFC 5/5] vmalloc: vfree_exec: free unused vm_struct Song Liu
2022-08-22 15:46 ` [RFC 0/5] vmalloc_exec for modules and BPF programs Song Liu
2022-08-22 16:34 ` Peter Zijlstra
2022-08-22 16:56 ` Song Liu
2022-08-23 5:42 ` Peter Zijlstra
2022-08-23 6:39 ` Christophe Leroy
2022-08-23 6:57 ` Song Liu
2022-08-23 6:55 ` Song Liu
2022-08-24 17:06 ` Song Liu