From: Song Liu <song@kernel.org>
To: <bpf@vger.kernel.org>, <linux-mm@kvack.org>
Cc: <akpm@linux-foundation.org>, <x86@kernel.org>,
<peterz@infradead.org>, <hch@lst.de>,
<rick.p.edgecombe@intel.com>, <rppt@kernel.org>,
<mcgrof@kernel.org>, Song Liu <song@kernel.org>
Subject: [PATCH bpf-next v5 4/6] bpf: use execmem_alloc for bpf program and bpf dispatcher
Date: Mon, 28 Nov 2022 11:02:43 -0800 [thread overview]
Message-ID: <20221128190245.2337461-5-song@kernel.org> (raw)
In-Reply-To: <20221128190245.2337461-1-song@kernel.org>
Use execmem_alloc, execmem_free, and execmem_fill instead of
bpf_prog_pack_alloc, bpf_prog_pack_free, and bpf_arch_text_copy.
execmem_free doesn't require extra size information. Therefore, the free
and error handling path can be simplified.
There are some tests that show the benefit of execmem_alloc.
Run 100 instances of the following benchmark from bpf selftests:
tools/testing/selftests/bpf/bench -w2 -d100 -a trig-kprobe
which loads 7 BPF programs, and triggers one of them.
Then use perf to monitor TLB related counters:
perf stat -e iTLB-load-misses,itlb_misses.walk_completed_4k, \
itlb_misses.walk_completed_2m_4m -a
The following results are from a qemu VM with 32 cores.
Before bpf_prog_pack:
iTLB-load-misses: 350k/s
itlb_misses.walk_completed_4k: 90k/s
itlb_misses.walk_completed_2m_4m: 0.1/s
With bpf_prog_pack (current upstream):
iTLB-load-misses: 220k/s
itlb_misses.walk_completed_4k: 68k/s
itlb_misses.walk_completed_2m_4m: 0.2/s
With execmem_alloc (with this set):
iTLB-load-misses: 185k/s
itlb_misses.walk_completed_4k: 58k/s
itlb_misses.walk_completed_2m_4m: 1/s
Signed-off-by: Song Liu <song@kernel.org>
---
arch/x86/net/bpf_jit_comp.c | 23 +----
include/linux/bpf.h | 3 -
include/linux/filter.h | 5 -
kernel/bpf/core.c | 180 +++---------------------------------
kernel/bpf/dispatcher.c | 11 +--
5 files changed, 21 insertions(+), 201 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index cec5195602bc..43b93570d8f9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -229,11 +229,6 @@ static void jit_fill_hole(void *area, unsigned int size)
memset(area, 0xcc, size);
}
-int bpf_arch_text_invalidate(void *dst, size_t len)
-{
- return IS_ERR_OR_NULL(text_poke_set(dst, 0xcc, len));
-}
-
struct jit_context {
int cleanup_addr; /* Epilogue code offset */
@@ -2509,11 +2504,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (proglen <= 0) {
out_image:
image = NULL;
- if (header) {
- bpf_arch_text_copy(&header->size, &rw_header->size,
- sizeof(rw_header->size));
+ if (header)
bpf_jit_binary_pack_free(header, rw_header);
- }
+
/* Fall back to interpreter mode */
prog = orig_prog;
if (extra_pass) {
@@ -2563,8 +2556,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
if (!prog->is_func || extra_pass) {
/*
* bpf_jit_binary_pack_finalize fails in two scenarios:
- * 1) header is not pointing to proper module memory;
- * 2) the arch doesn't support bpf_arch_text_copy().
+ * 1) header is not pointing to memory allocated by
+ * execmem_alloc;
+ * 2) the arch doesn't support execmem_free().
*
* Both cases are serious bugs and justify WARN_ON.
*/
@@ -2610,13 +2604,6 @@ bool bpf_jit_supports_kfunc_call(void)
return true;
}
-void *bpf_arch_text_copy(void *dst, void *src, size_t len)
-{
- if (text_poke_copy(dst, src, len) == NULL)
- return ERR_PTR(-EINVAL);
- return dst;
-}
-
/* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */
bool bpf_jit_supports_subprog_tailcalls(void)
{
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c6aa6912ea16..e04c63b621da 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2775,9 +2775,6 @@ enum bpf_text_poke_type {
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2);
-void *bpf_arch_text_copy(void *dst, void *src, size_t len);
-int bpf_arch_text_invalidate(void *dst, size_t len);
-
struct btf_id_set;
bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index bf701976056e..4373ff388759 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1023,8 +1023,6 @@ extern long bpf_jit_limit_max;
typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
-void bpf_jit_fill_hole_with_zero(void *area, unsigned int size);
-
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
@@ -1037,9 +1035,6 @@ void bpf_jit_free(struct bpf_prog *fp);
struct bpf_binary_header *
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
-void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
-void bpf_prog_pack_free(struct bpf_binary_header *hdr);
-
static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
{
return list_empty(&fp->aux->ksym.lnode) ||
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2e57fc839a5c..56397556c622 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -810,149 +810,6 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
-/*
- * BPF program pack allocator.
- *
- * Most BPF programs are pretty small. Allocating a hole page for each
- * program is sometime a waste. Many small bpf program also adds pressure
- * to instruction TLB. To solve this issue, we introduce a BPF program pack
- * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
- * to host BPF programs.
- */
-#define BPF_PROG_CHUNK_SHIFT 6
-#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
-#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
-
-struct bpf_prog_pack {
- struct list_head list;
- void *ptr;
- unsigned long bitmap[];
-};
-
-void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
-{
- memset(area, 0, size);
-}
-
-#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
-
-static DEFINE_MUTEX(pack_mutex);
-static LIST_HEAD(pack_list);
-
-/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
- * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
- */
-#ifdef PMD_SIZE
-#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
-#else
-#define BPF_PROG_PACK_SIZE PAGE_SIZE
-#endif
-
-#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
-
-static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
-{
- struct bpf_prog_pack *pack;
-
- pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
- GFP_KERNEL);
- if (!pack)
- return NULL;
- pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
- if (!pack->ptr) {
- kfree(pack);
- return NULL;
- }
- bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
- bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
- list_add_tail(&pack->list, &pack_list);
-
- set_vm_flush_reset_perms(pack->ptr);
- set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
- set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
- return pack;
-}
-
-void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
-{
- unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
- struct bpf_prog_pack *pack;
- unsigned long pos;
- void *ptr = NULL;
-
- mutex_lock(&pack_mutex);
- if (size > BPF_PROG_PACK_SIZE) {
- size = round_up(size, PAGE_SIZE);
- ptr = module_alloc(size);
- if (ptr) {
- bpf_fill_ill_insns(ptr, size);
- set_vm_flush_reset_perms(ptr);
- set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
- set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
- }
- goto out;
- }
- list_for_each_entry(pack, &pack_list, list) {
- pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
- nbits, 0);
- if (pos < BPF_PROG_CHUNK_COUNT)
- goto found_free_area;
- }
-
- pack = alloc_new_pack(bpf_fill_ill_insns);
- if (!pack)
- goto out;
-
- pos = 0;
-
-found_free_area:
- bitmap_set(pack->bitmap, pos, nbits);
- ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
-
-out:
- mutex_unlock(&pack_mutex);
- return ptr;
-}
-
-void bpf_prog_pack_free(struct bpf_binary_header *hdr)
-{
- struct bpf_prog_pack *pack = NULL, *tmp;
- unsigned int nbits;
- unsigned long pos;
-
- mutex_lock(&pack_mutex);
- if (hdr->size > BPF_PROG_PACK_SIZE) {
- module_memfree(hdr);
- goto out;
- }
-
- list_for_each_entry(tmp, &pack_list, list) {
- if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
- pack = tmp;
- break;
- }
- }
-
- if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
- goto out;
-
- nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
- pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
-
- WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
- "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
-
- bitmap_clear(pack->bitmap, pos, nbits);
- if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
- BPF_PROG_CHUNK_COUNT, 0) == 0) {
- list_del(&pack->list);
- module_memfree(pack->ptr);
- kfree(pack);
- }
-out:
- mutex_unlock(&pack_mutex);
-}
-
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -1052,6 +909,9 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
bpf_jit_uncharge_modmem(size);
}
+#define BPF_PROG_EXEC_ALIGN 64
+#define BPF_PROG_EXEC_MASK (~(BPF_PROG_EXEC_ALIGN - 1))
+
/* Allocate jit binary from bpf_prog_pack allocator.
* Since the allocated memory is RO+X, the JIT engine cannot write directly
* to the memory. To solve this problem, a RW buffer is also allocated at
@@ -1074,11 +934,11 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
alignment > BPF_IMAGE_ALIGNMENT);
/* add 16 bytes for a random section of illegal instructions */
- size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_EXEC_ALIGN);
if (bpf_jit_charge_modmem(size))
return NULL;
- ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns);
+ ro_header = execmem_alloc(size, BPF_PROG_EXEC_ALIGN);
if (!ro_header) {
bpf_jit_uncharge_modmem(size);
return NULL;
@@ -1086,8 +946,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
*rw_header = kvmalloc(size, GFP_KERNEL);
if (!*rw_header) {
- bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
- bpf_prog_pack_free(ro_header);
+ execmem_free(ro_header);
bpf_jit_uncharge_modmem(size);
return NULL;
}
@@ -1097,7 +956,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
(*rw_header)->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
- BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+ BPF_PROG_EXEC_ALIGN - sizeof(*ro_header));
start = prandom_u32_max(hole) & ~(alignment - 1);
*image_ptr = &ro_header->image[start];
@@ -1113,12 +972,12 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
{
void *ptr;
- ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+ ptr = execmem_fill(ro_header, rw_header, rw_header->size);
kvfree(rw_header);
if (IS_ERR(ptr)) {
- bpf_prog_pack_free(ro_header);
+ execmem_free(ro_header);
return PTR_ERR(ptr);
}
return 0;
@@ -1128,18 +987,13 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
* 1) when the program is freed after;
* 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
* For case 2), we need to free both the RO memory and the RW buffer.
- *
- * bpf_jit_binary_pack_free requires proper ro_header->size. However,
- * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
- * must be set with either bpf_jit_binary_pack_finalize (normal path) or
- * bpf_arch_text_copy (when jit fails).
*/
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
struct bpf_binary_header *rw_header)
{
- u32 size = ro_header->size;
+ u32 size = rw_header ? rw_header->size : ro_header->size;
- bpf_prog_pack_free(ro_header);
+ execmem_free(ro_header);
kvfree(rw_header);
bpf_jit_uncharge_modmem(size);
}
@@ -1150,7 +1004,7 @@ bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
unsigned long real_start = (unsigned long)fp->bpf_func;
unsigned long addr;
- addr = real_start & BPF_PROG_CHUNK_MASK;
+ addr = real_start & BPF_PROG_EXEC_MASK;
return (void *)addr;
}
@@ -2740,16 +2594,6 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
return -ENOTSUPP;
}
-void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
-{
- return ERR_PTR(-ENOTSUPP);
-}
-
-int __weak bpf_arch_text_invalidate(void *dst, size_t len)
-{
- return -ENOTSUPP;
-}
-
#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
{
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 04f0a045dcaa..c41dff3379db 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -126,11 +126,11 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
tmp = d->num_progs ? d->rw_image + noff : NULL;
if (new) {
/* Prepare the dispatcher in d->rw_image. Then use
- * bpf_arch_text_copy to update d->image, which is RO+X.
+ * execmem_fill to update d->image, which is RO+X.
*/
if (bpf_dispatcher_prepare(d, new, tmp))
return;
- if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
+ if (IS_ERR(execmem_fill(new, tmp, PAGE_SIZE / 2)))
return;
}
@@ -152,15 +152,12 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
mutex_lock(&d->mutex);
if (!d->image) {
- d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
+ d->image = execmem_alloc(PAGE_SIZE, PAGE_SIZE /* align */);
if (!d->image)
goto out;
d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!d->rw_image) {
- u32 size = PAGE_SIZE;
-
- bpf_arch_text_copy(d->image, &size, sizeof(size));
- bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+ execmem_free((struct bpf_binary_header *)d->image);
d->image = NULL;
goto out;
}
--
2.30.2
next prev parent reply other threads:[~2022-11-28 19:05 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-11-28 19:02 [PATCH bpf-next v5 0/6] execmem_alloc for BPF programs Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 1/6] vmalloc: introduce execmem_alloc, execmem_free, and execmem_fill Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 2/6] x86/alternative: support execmem_alloc() and execmem_free() Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 3/6] selftests/vm: extend test_vmalloc to test execmem_* APIs Song Liu
2022-11-29 8:35 ` Christoph Hellwig
2022-11-29 17:31 ` Song Liu
2022-11-29 20:22 ` Christoph Hellwig
2022-11-28 19:02 ` Song Liu [this message]
2022-11-28 19:02 ` [PATCH bpf-next v5 5/6] vmalloc: introduce register_text_tail_vm() Song Liu
2022-11-28 19:02 ` [PATCH bpf-next v5 6/6] x86: use register_text_tail_vm Song Liu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20221128190245.2337461-5-song@kernel.org \
--to=song@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=bpf@vger.kernel.org \
--cc=hch@lst.de \
--cc=linux-mm@kvack.org \
--cc=mcgrof@kernel.org \
--cc=peterz@infradead.org \
--cc=rick.p.edgecombe@intel.com \
--cc=rppt@kernel.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.