From: Song Liu <song@kernel.org>
To: <linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>
Cc: <akpm@linux-foundation.org>, <x86@kernel.org>,
<peterz@infradead.org>, <hch@lst.de>, <kernel-team@fb.com>,
<rick.p.edgecombe@intel.com>, <dave.hansen@intel.com>,
<urezki@gmail.com>, Song Liu <song@kernel.org>
Subject: [RFC v2 1/4] vmalloc: introduce vmalloc_exec and vfree_exec
Date: Fri, 7 Oct 2022 16:43:12 -0700
Message-ID: <20221007234315.2877365-2-song@kernel.org>
In-Reply-To: <20221007234315.2877365-1-song@kernel.org>
vmalloc_exec is used to allocate memory to host dynamic kernel text
(modules, BPF programs, etc.) with huge pages. This is similar to the
proposal by Peter in [1].
A new tree of vmap_area, the free_text_area_* tree, is introduced in addition
to free_vmap_area_* and vmap_area_*. vmalloc_exec allocates pages from
free_text_area_*. When there isn't enough space left in free_text_area_*,
new PMD_SIZE page(s) are allocated from free_vmap_area_* and added to
free_text_area_*. To be more accurate, the vmap_area is first added to the
vmap_area_* tree and then moved to free_text_area_*. This extra move
simplifies the logic of vmalloc_exec.
vmap_areas in the free_text_area_* tree are backed with memory, but their
vm pointer cannot be used, because tree operations need subtree_max_size,
which shares a union with vm in struct vmap_area. Therefore, the vm_structs
for these vmap_areas are stored in a separate list, all_text_vm.
The new tree allows separate handling of allocations smaller than PAGE_SIZE,
as the current vmalloc code mostly assumes PAGE_SIZE aligned allocations.
This version of vmalloc_exec can handle both BPF programs, which use 64-byte
aligned allocations, and modules, which use PAGE_SIZE aligned allocations.
In vfree_exec(), the memory is first added to free_text_area_*. If this free
creates at least PMD_SIZE of PMD-aligned contiguous free space, vfree_exec()
will try to free the backing vm_struct.
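To illustrate the intended calling convention, here is a minimal,
hypothetical caller (e.g. a JIT engine). It is only a sketch and assumes
the x86 text_poke_copy() helper as the write primitive, since the returned
memory is already mapped RO+X:

  /* Hypothetical example: publish a JIT image into RO+X memory. */
  static void *jit_publish_image(const void *image, size_t len)
  {
          void *dst;

          /* BPF-style sub-page allocation, 64-byte aligned */
          dst = vmalloc_exec(len, 64);
          if (!dst)
                  return NULL;

          /* The region is RO+X, so a plain memcpy() would fault */
          if (!text_poke_copy(dst, image, len)) {
                  vfree_exec(dst);
                  return NULL;
          }

          return dst;
  }

Modules would do the same, but with align == PAGE_SIZE instead of 64.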
[1] https://lore.kernel.org/bpf/Ys6cWUMHO8XwyYgr@hirez.programming.kicks-ass.net/
Signed-off-by: Song Liu <song@kernel.org>
---
include/linux/vmalloc.h | 2 +
mm/nommu.c | 7 ++
mm/vmalloc.c | 269 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 278 insertions(+)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 096d48aa3437..16c0adc1daee 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -154,6 +154,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
int node, const void *caller) __alloc_size(1);
void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+void *vmalloc_exec(unsigned long size, unsigned long align) __alloc_size(1);
+void vfree_exec(const void *addr);
extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
diff --git a/mm/nommu.c b/mm/nommu.c
index e819cbc21b39..c7dcd920ec26 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
}
EXPORT_SYMBOL(vm_map_pages_zero);
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+ return NULL;
+}
+
+void vfree_exec(const void *addr) { }
+
/*
* sys_brk() for the most part doesn't need the global kernel
* lock, except when an application is doing something nasty
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 088b421601c4..9212ff96b871 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -72,6 +72,9 @@ early_param("nohugevmalloc", set_nohugevmalloc);
static const bool vmap_allow_huge = false;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+#define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
+#define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE)
+
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)kasan_reset_tag(x);
@@ -753,6 +756,39 @@ static LIST_HEAD(free_vmap_area_list);
*/
static struct rb_root free_vmap_area_root = RB_ROOT;
+/*
+ * free_text_area_* tree and list, used by vmalloc_exec() and
+ * vfree_exec().
+ */
+static DEFINE_SPINLOCK(free_text_area_lock);
+/*
+ * This linked list is used in pair with free_text_area_root.
+ * It gives O(1) access to prev/next to perform fast coalescing.
+ */
+static LIST_HEAD(free_text_area_list);
+
+/*
+ * This augmented red-black tree represents the free text space.
+ * All vmap_area objects in this tree are sorted by va->va_start
+ * address. It is used for allocation and merging when a vmap
+ * object is released.
+ *
+ * Each vmap_area node contains a maximum available free block
+ * of its sub-tree, right or left. Therefore it is possible to
+ * find a lowest match of free area.
+ *
+ * vmap_areas in this tree are backed by RO+X memory, but their vm
+ * pointer is not valid (it shares a union with subtree_max_size).
+ * The vm_structs for these vmap_areas are stored in all_text_vm.
+ */
+static struct rb_root free_text_area_root = RB_ROOT;
+
+/*
+ * List of vm_struct for free_text_area_root. This list is rarely
+ * accessed, so the O(N) complexity is not likely a real issue.
+ */
+struct vm_struct *all_text_vm;
+
/*
* Preload a CPU with one object for "no edge" split case. The
* aim is to get rid of allocations from the atomic context, thus
@@ -3297,6 +3333,239 @@ void *vmalloc(unsigned long size)
}
EXPORT_SYMBOL(vmalloc);
+#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
+#define VMALLOC_EXEC_START MODULES_VADDR
+#define VMALLOC_EXEC_END MODULES_END
+#else
+#define VMALLOC_EXEC_START VMALLOC_START
+#define VMALLOC_EXEC_END VMALLOC_END
+#endif
+
+static void move_vmap_to_free_text_tree(void *addr)
+{
+ struct vmap_area *va;
+
+ /* remove from vmap_area_root */
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
+ if (WARN_ON_ONCE(!va)) {
+ spin_unlock(&vmap_area_lock);
+ return;
+ }
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ /* zero the memory, then make it RO+X */
+ memset(addr, 0, va->va_end - va->va_start);
+ set_memory_ro(va->va_start, (va->va_end - va->va_start) >> PAGE_SHIFT);
+ set_memory_x(va->va_start, (va->va_end - va->va_start) >> PAGE_SHIFT);
+
+ /* add to all_text_vm */
+ va->vm->next = all_text_vm;
+ all_text_vm = va->vm;
+
+ /* add to free_text_area_root */
+ spin_lock(&free_text_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_text_area_root, &free_text_area_list);
+ spin_unlock(&free_text_area_lock);
+}
+
+/**
+ * vmalloc_exec - allocate virtually contiguous RO+X memory
+ * @size: allocation size
+ * @align: desired alignment
+ *
+ * Allocate dynamic kernel text, such as module text and BPF programs.
+ * Callers must use text_poke() to write to the allocated memory.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_exec(unsigned long size, unsigned long align)
+{
+ struct vmap_area *va, *tmp;
+ unsigned long addr;
+ enum fit_type type;
+ int ret;
+
+ va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
+ if (unlikely(!va))
+ return NULL;
+
+again:
+ preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
+ tmp = find_vmap_lowest_match(&free_text_area_root, size, align, 1, false);
+
+ if (!tmp) {
+ unsigned long alloc_size;
+ void *ptr;
+
+ spin_unlock(&free_text_area_lock);
+
+ /*
+ * Not enough continuous space in free_text_area_root, try
+ * allocate more memory. The memory is first added to
+ * vmap_area_root, and then moved to free_text_area_root.
+ */
+ alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
+ ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, VMALLOC_EXEC_START,
+ VMALLOC_EXEC_END, GFP_KERNEL, PAGE_KERNEL,
+ VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
+ NUMA_NO_NODE, __builtin_return_address(0));
+ if (unlikely(!ptr))
+ goto err_out;
+
+ move_vmap_to_free_text_tree(ptr);
+ goto again;
+ }
+
+ addr = roundup(tmp->va_start, align);
+ type = classify_va_fit_type(tmp, addr, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
+ goto err_out;
+
+ ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
+ tmp, addr, size);
+ if (ret)
+ goto err_out;
+
+ spin_unlock(&free_text_area_lock);
+
+ va->va_start = addr;
+ va->va_end = addr + size;
+ va->vm = NULL;
+
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+
+ return (void *)addr;
+
+err_out:
+ spin_unlock(&free_text_area_lock);
+ kmem_cache_free(vmap_area_cachep, va);
+ return NULL;
+}
+
+static struct vm_struct *find_and_unlink_text_vm(unsigned long start, unsigned long end)
+{
+ struct vm_struct *vm, *prev_vm;
+
+ lockdep_assert_held(&free_text_area_lock);
+
+ vm = all_text_vm;
+ while (vm) {
+ unsigned long vm_addr = (unsigned long)vm->addr;
+
+ /* vm is within this free space, we can free it */
+ if ((vm_addr >= start) && ((vm_addr + vm->size) <= end))
+ goto unlink_vm;
+ vm = vm->next;
+ }
+ return NULL;
+
+unlink_vm:
+ if (all_text_vm == vm) {
+ all_text_vm = vm->next;
+ } else {
+ prev_vm = all_text_vm;
+ while (prev_vm->next != vm)
+ prev_vm = prev_vm->next;
+ prev_vm->next = vm->next;
+ }
+ return vm;
+}
+
+/**
+ * vfree_exec - Release memory allocated by vmalloc_exec()
+ * @addr: Memory base address
+ *
+ * If @addr is NULL, no operation is performed.
+ */
+void vfree_exec(const void *addr)
+{
+ unsigned long free_start, free_end, free_addr;
+ struct vm_struct *vm;
+ struct vmap_area *va;
+
+ might_sleep();
+
+ if (!addr)
+ return;
+
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
+ if (WARN_ON_ONCE(!va)) {
+ spin_unlock(&vmap_area_lock);
+ return;
+ }
+ WARN_ON_ONCE(va->vm);
+
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ spin_lock(&free_text_area_lock);
+ va = merge_or_add_vmap_area_augment(va,
+ &free_text_area_root, &free_text_area_list);
+
+ if (WARN_ON_ONCE(!va))
+ goto out;
+
+ free_start = PMD_ALIGN(va->va_start);
+ free_end = PMD_ALIGN_DOWN(va->va_end);
+
+ /*
+ * Only try to free the vm when there is at least PMD_SIZE of
+ * PMD-aligned contiguous free memory.
+ */
+ if (free_start >= free_end)
+ goto out;
+
+ /*
+ * TODO: It is possible that multiple vm are ready to be freed
+ * after one vfree_exec(). But we free at most one vm for now.
+ */
+ vm = find_and_unlink_text_vm(free_start, free_end);
+ if (!vm)
+ goto out;
+
+ va = kmem_cache_alloc_node(vmap_area_cachep, GFP_ATOMIC, NUMA_NO_NODE);
+ if (unlikely(!va))
+ goto out_save_vm;
+
+ free_addr = __alloc_vmap_area(&free_text_area_root, &free_text_area_list,
+ vm->size, 1, (unsigned long)vm->addr,
+ (unsigned long)vm->addr + vm->size);
+
+ if (WARN_ON_ONCE(free_addr != (unsigned long)vm->addr))
+ goto out_save_vm;
+
+ va->va_start = (unsigned long)vm->addr;
+ va->va_end = va->va_start + vm->size;
+ va->vm = vm;
+ spin_unlock(&free_text_area_lock);
+
+ set_memory_nx(va->va_start, vm->size >> PAGE_SHIFT);
+ set_memory_rw(va->va_start, vm->size >> PAGE_SHIFT);
+
+ /* put the va to vmap_area_root, and then free it with vfree */
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+
+ vfree(vm->addr);
+ return;
+
+out_save_vm:
+ /*
+ * vm is removed from all_text_vm, but not freed. Add it back,
+ * so that we can use or free it later.
+ */
+ vm->next = all_text_vm;
+ all_text_vm = vm;
+out:
+ spin_unlock(&free_text_area_lock);
+}
+
/**
* vmalloc_huge - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
--
2.30.2