- * [PATCH v4 bpf-next 1/7] x86/Kconfig: select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP
  2022-01-19 23:06 [PATCH v4 bpf-next 0/7] bpf_prog_pack allocator Song Liu
@ 2022-01-19 23:06 ` Song Liu
  2022-01-19 23:06 ` [PATCH v4 bpf-next 2/7] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem Song Liu
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Song Liu @ 2022-01-19 23:06 UTC (permalink / raw)
  To: bpf, netdev, linux-kernel
  Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
From: Song Liu <songliubraving@fb.com>
This enables module_alloc() to allocate huge pages for 2MB+ requests.
To observe the difference, enable CONFIG_PTDUMP_DEBUGFS and call
module_alloc(2MB). Before the change, /sys/kernel/debug/page_tables/kernel
shows a pte for this mapping. With the change, it shows a pmd for this
mapping.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8910b09b5601..b5d1582ea848 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -158,6 +158,7 @@ config X86
 	select HAVE_ALIGNED_STRUCT_PAGE		if SLUB
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_HUGE_VMAP		if X86_64 || X86_PAE
+	select HAVE_ARCH_HUGE_VMALLOC		if HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
 	select HAVE_ARCH_KASAN			if X86_64
-- 
2.30.2
- * [PATCH v4 bpf-next 2/7] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem
  2022-01-19 23:06 [PATCH v4 bpf-next 0/7] bpf_prog_pack allocator Song Liu
  2022-01-19 23:06 ` [PATCH v4 bpf-next 1/7] x86/Kconfig: select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP Song Liu
@ 2022-01-19 23:06 ` Song Liu
  2022-01-20  4:10   ` Alexei Starovoitov
  2022-01-19 23:06 ` [PATCH v4 bpf-next 3/7] bpf: use size instead of pages in bpf_binary_header Song Liu
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 13+ messages in thread
From: Song Liu @ 2022-01-19 23:06 UTC (permalink / raw)
  To: bpf, netdev, linux-kernel
  Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
From: Song Liu <songliubraving@fb.com>
This enables sub-page memory charge and allocation.
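A hypothetical caller, for illustration (it mirrors the
bpf_jit_binary_alloc() hunk below): the charge is now the exact byte size
instead of size / PAGE_SIZE.

	size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);

	if (bpf_jit_charge_modmem(size))	/* charge bytes, not pages */
		return NULL;
	hdr = bpf_jit_alloc_exec(size);
	if (!hdr) {
		bpf_jit_uncharge_modmem(size);	/* undo the byte charge */
		return NULL;
	}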
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/bpf.h     |  4 ++--
 kernel/bpf/core.c       | 19 +++++++++----------
 kernel/bpf/trampoline.c |  6 +++---
 3 files changed, 14 insertions(+), 15 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6d7346c54d83..920940f7be22 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -827,8 +827,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
 void bpf_ksym_del(struct bpf_ksym *ksym);
-int bpf_jit_charge_modmem(u32 pages);
-void bpf_jit_uncharge_modmem(u32 pages);
+int bpf_jit_charge_modmem(u32 size);
+void bpf_jit_uncharge_modmem(u32 size);
 bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
 #else
 static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index de3e5bc6781f..495e3b2c36ff 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -808,7 +808,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
 	return slot;
 }
 
-static atomic_long_t bpf_jit_current;
+static atomic64_t bpf_jit_current;
 
 /* Can be overridden by an arch's JIT compiler if it has a custom,
  * dedicated BPF backend memory area, or if neither of the two
@@ -833,12 +833,11 @@ static int __init bpf_jit_charge_init(void)
 }
 pure_initcall(bpf_jit_charge_init);
 
-int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
 {
-	if (atomic_long_add_return(pages, &bpf_jit_current) >
-	    (bpf_jit_limit >> PAGE_SHIFT)) {
+	if (atomic64_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
 		if (!bpf_capable()) {
-			atomic_long_sub(pages, &bpf_jit_current);
+			atomic64_sub(size, &bpf_jit_current);
 			return -EPERM;
 		}
 	}
@@ -846,9 +845,9 @@ int bpf_jit_charge_modmem(u32 pages)
 	return 0;
 }
 
-void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
 {
-	atomic_long_sub(pages, &bpf_jit_current);
+	atomic64_sub(size, &bpf_jit_current);
 }
 
 void *__weak bpf_jit_alloc_exec(unsigned long size)
@@ -879,11 +878,11 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
 	pages = size / PAGE_SIZE;
 
-	if (bpf_jit_charge_modmem(pages))
+	if (bpf_jit_charge_modmem(size))
 		return NULL;
 	hdr = bpf_jit_alloc_exec(size);
 	if (!hdr) {
-		bpf_jit_uncharge_modmem(pages);
+		bpf_jit_uncharge_modmem(size);
 		return NULL;
 	}
 
@@ -906,7 +905,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 	u32 pages = hdr->pages;
 
 	bpf_jit_free_exec(hdr);
-	bpf_jit_uncharge_modmem(pages);
+	bpf_jit_uncharge_modmem(pages << PAGE_SHIFT);
 }
 
 /* This symbol is only overridden by archs that have different
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 4b6974a195c1..e76a488c09c3 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -213,7 +213,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
 	im = container_of(work, struct bpf_tramp_image, work);
 	bpf_image_ksym_del(&im->ksym);
 	bpf_jit_free_exec(im->image);
-	bpf_jit_uncharge_modmem(1);
+	bpf_jit_uncharge_modmem(PAGE_SIZE);
 	percpu_ref_exit(&im->pcref);
 	kfree_rcu(im, rcu);
 }
@@ -310,7 +310,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
 	if (!im)
 		goto out;
 
-	err = bpf_jit_charge_modmem(1);
+	err = bpf_jit_charge_modmem(PAGE_SIZE);
 	if (err)
 		goto out_free_im;
 
@@ -332,7 +332,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
 out_free_image:
 	bpf_jit_free_exec(im->image);
 out_uncharge:
-	bpf_jit_uncharge_modmem(1);
+	bpf_jit_uncharge_modmem(PAGE_SIZE);
 out_free_im:
 	kfree(im);
 out:
-- 
2.30.2
- * Re: [PATCH v4 bpf-next 2/7] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem
  2022-01-19 23:06 ` [PATCH v4 bpf-next 2/7] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem Song Liu
@ 2022-01-20  4:10   ` Alexei Starovoitov
  2022-01-20  4:45     ` Song Liu
  0 siblings, 1 reply; 13+ messages in thread
From: Alexei Starovoitov @ 2022-01-20  4:10 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, netdev, linux-kernel, ast, daniel, andrii, kernel-team,
	peterz, x86, Song Liu
On Wed, Jan 19, 2022 at 03:06:15PM -0800, Song Liu wrote:
> From: Song Liu <songliubraving@fb.com>
> 
> This enables sub-page memory charge and allocation.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  include/linux/bpf.h     |  4 ++--
>  kernel/bpf/core.c       | 19 +++++++++----------
>  kernel/bpf/trampoline.c |  6 +++---
>  3 files changed, 14 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 6d7346c54d83..920940f7be22 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -827,8 +827,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
>  void bpf_image_ksym_del(struct bpf_ksym *ksym);
>  void bpf_ksym_add(struct bpf_ksym *ksym);
>  void bpf_ksym_del(struct bpf_ksym *ksym);
> -int bpf_jit_charge_modmem(u32 pages);
> -void bpf_jit_uncharge_modmem(u32 pages);
> +int bpf_jit_charge_modmem(u32 size);
> +void bpf_jit_uncharge_modmem(u32 size);
>  bool bpf_prog_has_trampoline(const struct bpf_prog *prog);
>  #else
>  static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index de3e5bc6781f..495e3b2c36ff 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -808,7 +808,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
>  	return slot;
>  }
>  
> -static atomic_long_t bpf_jit_current;
> +static atomic64_t bpf_jit_current;
I don't understand the motivation for this change.
bpf_jit_limit is type "long" and it's counting in bytes.
So why change jit_current to atomic64?
atomic_long will be fine even on 32-bit arch.
What did I miss?
- * Re: [PATCH v4 bpf-next 2/7] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem
  2022-01-20  4:10   ` Alexei Starovoitov
@ 2022-01-20  4:45     ` Song Liu
  0 siblings, 0 replies; 13+ messages in thread
From: Song Liu @ 2022-01-20  4:45 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Song Liu, bpf@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, ast@kernel.org,
	daniel@iogearbox.net, andrii@kernel.org, Kernel Team,
	peterz@infradead.org, x86@kernel.org
> On Jan 19, 2022, at 8:10 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Wed, Jan 19, 2022 at 03:06:15PM -0800, Song Liu wrote:
>> From: Song Liu <songliubraving@fb.com>
>> 
>> This enables sub-page memory charge and allocation.
>> 
>> Signed-off-by: Song Liu <songliubraving@fb.com>
>> 
[...]
>> @@ -808,7 +808,7 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
>> 	return slot;
>> }
>> 
>> -static atomic_long_t bpf_jit_current;
>> +static atomic64_t bpf_jit_current;
> 
> I don't understand the motivation for this change.
> bpf_jit_limit is type "long" and it's counting in bytes.
> So why change jit_current to atomic64?
> atomic_long will be fine even on 32-bit arch.
> What did I miss?
Hmm.. I think you are right. I guess I missed the fact that we use
long bpf_jit_limit. Will remove this change in the next version. 
Thanks,
Song
- * [PATCH v4 bpf-next 3/7] bpf: use size instead of pages in bpf_binary_header
  2022-01-19 23:06 [PATCH v4 bpf-next 0/7] bpf_prog_pack allocator Song Liu
  2022-01-19 23:06 ` [PATCH v4 bpf-next 1/7] x86/Kconfig: select HAVE_ARCH_HUGE_VMALLOC with HAVE_ARCH_HUGE_VMAP Song Liu
  2022-01-19 23:06 ` [PATCH v4 bpf-next 2/7] bpf: use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem Song Liu
@ 2022-01-19 23:06 ` Song Liu
  2022-01-19 23:06 ` [PATCH v4 bpf-next 4/7] bpf: add a pointer of bpf_binary_header to bpf_prog Song Liu
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Song Liu @ 2022-01-19 23:06 UTC (permalink / raw)
  To: bpf, netdev, linux-kernel
  Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
From: Song Liu <songliubraving@fb.com>
This is necessary to charge sub-page memory for the BPF program.
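Call sites that still need a page count now derive it from the byte size,
as in this snippet from the hunk below:

	set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
	set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT);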
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/filter.h |  6 +++---
 kernel/bpf/core.c      | 11 +++++------
 2 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 71fa57b88bfc..6cc1ebf1f840 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -548,7 +548,7 @@ struct sock_fprog_kern {
 #define BPF_IMAGE_ALIGNMENT 8
 
 struct bpf_binary_header {
-	u32 pages;
+	u32 size;
 	u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
 };
 
@@ -886,8 +886,8 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 {
 	set_vm_flush_reset_perms(hdr);
-	set_memory_ro((unsigned long)hdr, hdr->pages);
-	set_memory_x((unsigned long)hdr, hdr->pages);
+	set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
+	set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
 }
 
 static inline struct bpf_binary_header *
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 495e3b2c36ff..684a8a972adf 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -543,7 +543,7 @@ bpf_prog_ksym_set_addr(struct bpf_prog *prog)
 	WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
 
 	prog->aux->ksym.start = (unsigned long) prog->bpf_func;
-	prog->aux->ksym.end   = addr + hdr->pages * PAGE_SIZE;
+	prog->aux->ksym.end   = addr + hdr->size;
 }
 
 static void
@@ -866,7 +866,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	struct bpf_binary_header *hdr;
-	u32 size, hole, start, pages;
+	u32 size, hole, start;
 
 	WARN_ON_ONCE(!is_power_of_2(alignment) ||
 		     alignment > BPF_IMAGE_ALIGNMENT);
@@ -876,7 +876,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	 * random section of illegal instructions.
 	 */
 	size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
-	pages = size / PAGE_SIZE;
 
 	if (bpf_jit_charge_modmem(size))
 		return NULL;
@@ -889,7 +888,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	/* Fill space with illegal/arch-dep instructions. */
 	bpf_fill_ill_insns(hdr, size);
 
-	hdr->pages = pages;
+	hdr->size = size;
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
 	start = (get_random_int() % hole) & ~(alignment - 1);
@@ -902,10 +901,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 
 void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
-	u32 pages = hdr->pages;
+	u32 size = hdr->size;
 
 	bpf_jit_free_exec(hdr);
-	bpf_jit_uncharge_modmem(pages << PAGE_SHIFT);
+	bpf_jit_uncharge_modmem(size);
 }
 
 /* This symbol is only overridden by archs that have different
-- 
2.30.2
- * [PATCH v4 bpf-next 4/7] bpf: add a pointer of bpf_binary_header to bpf_prog
  2022-01-19 23:06 [PATCH v4 bpf-next 0/7] bpf_prog_pack allocator Song Liu
                   ` (2 preceding siblings ...)
  2022-01-19 23:06 ` [PATCH v4 bpf-next 3/7] bpf: use size instead of pages in bpf_binary_header Song Liu
@ 2022-01-19 23:06 ` Song Liu
  2022-01-19 23:06 ` [PATCH v4 bpf-next 5/7] x86/alternative: introduce text_poke_copy Song Liu
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 13+ messages in thread
From: Song Liu @ 2022-01-19 23:06 UTC (permalink / raw)
  To: bpf, netdev, linux-kernel
  Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
With sub-page allocation, we cannot simply use bpf_func & PAGE_MASK to
find the bpf_binary_header. Add a pointer to struct bpf_prog to avoid
this logic.
Use this pointer for x86_64. If the pointer is not set by the JIT engine,
fall back to the original logic.
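An illustrative (hypothetical) failure case for the old logic: with 64-byte
packing, two programs can share one page, so masking no longer finds the
right header.

	/* progA and progB packed into the same page */
	addr = (unsigned long)progB->bpf_func & PAGE_MASK;
	/* addr is now the start of the shared page, which may be inside
	 * progA's region, not at progB's bpf_binary_header
	 */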
Signed-off-by: Song Liu <song@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c |  2 ++
 include/linux/filter.h      | 10 ++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ce1f86f245c9..fe4f08e25a1d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2339,6 +2339,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 			if (header)
 				bpf_jit_binary_free(header);
 			prog = orig_prog;
+			header = NULL;
 			goto out_addrs;
 		}
 		if (image) {
@@ -2406,6 +2407,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	if (tmp_blinded)
 		bpf_jit_prog_release_other(prog, prog == orig_prog ?
 					   tmp : orig_prog);
+	prog->hdr = header;
 	return prog;
 }
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6cc1ebf1f840..6bb00f343cc5 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -584,6 +584,7 @@ struct bpf_prog {
 					    const struct bpf_insn *insn);
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
+	struct bpf_binary_header *hdr;
 	/* Instructions for interpreter */
 	union {
 		DECLARE_FLEX_ARRAY(struct sock_filter, insns);
@@ -893,9 +894,14 @@ static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 static inline struct bpf_binary_header *
 bpf_jit_binary_hdr(const struct bpf_prog *fp)
 {
-	unsigned long real_start = (unsigned long)fp->bpf_func;
-	unsigned long addr = real_start & PAGE_MASK;
+	unsigned long real_start;
+	unsigned long addr;
 
+	if (fp->hdr)
+		return fp->hdr;
+
+	real_start = (unsigned long)fp->bpf_func;
+	addr = real_start & PAGE_MASK;
 	return (void *)addr;
 }
 
-- 
2.30.2
- * [PATCH v4 bpf-next 5/7] x86/alternative: introduce text_poke_copy
  2022-01-19 23:06 [PATCH v4 bpf-next 0/7] bpf_prog_pack allocator Song Liu
                   ` (3 preceding siblings ...)
  2022-01-19 23:06 ` [PATCH v4 bpf-next 4/7] bpf: add a pointer of bpf_binary_header to bpf_prog Song Liu
@ 2022-01-19 23:06 ` Song Liu
  2022-01-19 23:06 ` [PATCH v4 bpf-next 6/7] bpf: introduce bpf_prog_pack allocator Song Liu
  2022-01-19 23:06 ` [PATCH v4 bpf-next 7/7] bpf, x86_64: use " Song Liu
  6 siblings, 0 replies; 13+ messages in thread
From: Song Liu @ 2022-01-19 23:06 UTC (permalink / raw)
  To: bpf, netdev, linux-kernel
  Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
This will be used by the BPF JIT compiler to dump the JITed binary into an
RX huge page, and thus allow multiple BPF programs to share a huge (2MB)
page.
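A hypothetical JIT-side usage sketch (rx_addr and tmp_image are made-up
names):

	/* image was prepared in a temporary RW buffer */
	text_poke_copy(rx_addr, tmp_image, prog_size);	/* rx_addr is RO+X */

Since __text_poke() can only patch a range that fits within two pages, the
implementation below copies in chunks of at most 2 * PAGE_SIZE.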
Signed-off-by: Song Liu <song@kernel.org>
---
 arch/x86/include/asm/text-patching.h |  1 +
 arch/x86/kernel/alternative.c        | 32 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index b7421780e4e9..4cc18ba1b75e 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -44,6 +44,7 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
 extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void text_poke_sync(void);
 extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
+extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
 extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 23fb4d51a5da..903a415c19fa 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1102,6 +1102,38 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
 	return __text_poke(addr, opcode, len);
 }
 
+/**
+ * text_poke_copy - Copy instructions into (an unused part of) RX memory
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * Not safe against concurrent execution; useful for JITs to dump
+ * new code blocks into unused regions of RX memory. Can be used in
+ * conjunction with synchronize_rcu_tasks() to wait for existing
+ * execution to quiesce after having made sure no existing function
+ * pointers are live.
+ */
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+	unsigned long start = (unsigned long)addr;
+	size_t patched = 0;
+
+	if (WARN_ON_ONCE(core_kernel_text(start)))
+		return NULL;
+
+	while (patched < len) {
+		unsigned long ptr = start + patched;
+		size_t s;
+
+		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
+
+		__text_poke((void *)ptr, opcode + patched, s);
+		patched += s;
+	}
+	return addr;
+}
+
 static void do_sync_core(void *info)
 {
 	sync_core();
-- 
2.30.2
- * [PATCH v4 bpf-next 6/7] bpf: introduce bpf_prog_pack allocator
  2022-01-19 23:06 [PATCH v4 bpf-next 0/7] bpf_prog_pack allocator Song Liu
                   ` (4 preceding siblings ...)
  2022-01-19 23:06 ` [PATCH v4 bpf-next 5/7] x86/alternative: introduce text_poke_copy Song Liu
@ 2022-01-19 23:06 ` Song Liu
  2022-01-20  4:14   ` Alexei Starovoitov
  2022-01-19 23:06 ` [PATCH v4 bpf-next 7/7] bpf, x86_64: use " Song Liu
  6 siblings, 1 reply; 13+ messages in thread
From: Song Liu @ 2022-01-19 23:06 UTC (permalink / raw)
  To: bpf, netdev, linux-kernel
  Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
From: Song Liu <songliubraving@fb.com>
Most BPF programs are small, but they consume a page each. For systems
with busy traffic and many BPF programs, this could add significant
pressure to the instruction TLB.
Introduce the bpf_prog_pack allocator to pack multiple BPF programs into a
huge page. The memory is then handed out in 64-byte chunks.
Memory allocated by the bpf_prog_pack allocator is RO protected after
initial allocation. To write to it, the user (JIT engine) needs to use the
text poke API.
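Chunk accounting, with illustrative numbers: a 300-byte program occupies
round_up(300, 64) / 64 = 5 chunks, i.e. it is charged 320 bytes of pack
space. The relevant macros from the patch below:

	#define BPF_PROG_CHUNK_SHIFT	6
	#define BPF_PROG_CHUNK_SIZE	(1 << BPF_PROG_CHUNK_SHIFT)	/* 64 */
	#define BPF_PROG_SIZE_TO_NBITS(size)	\
		(round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)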
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/filter.h |   7 ++
 kernel/bpf/core.c      | 187 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 190 insertions(+), 4 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6bb00f343cc5..175c97f7535b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1074,6 +1074,13 @@ void *bpf_jit_alloc_exec(unsigned long size);
 void bpf_jit_free_exec(void *addr);
 void bpf_jit_free(struct bpf_prog *fp);
 
+struct bpf_binary_header *
+bpf_jit_binary_alloc_pack(unsigned int proglen, u8 **image_r_ptr,
+			  unsigned int alignment,
+			  bpf_jit_fill_hole_t bpf_fill_ill_insns);
+void bpf_jit_binary_free_pack(struct bpf_binary_header *hdr);
+int bpf_prog_pack_max_size(void);
+
 int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
 				struct bpf_jit_poke_descriptor *poke);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 684a8a972adf..94c7d9ff9d6c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -808,6 +808,116 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
 	return slot;
 }
 
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a whole page for each
+ * program is sometimes a waste. Many small bpf programs also add pressure
+ * to the instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_PACK_SIZE	HPAGE_PMD_SIZE
+#define BPF_PROG_CHUNK_SHIFT	6
+#define BPF_PROG_CHUNK_SIZE	(1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_COUNT	(BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
+
+struct bpf_prog_pack {
+	struct list_head list;
+	void *ptr;
+	unsigned long bitmap[BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)];
+};
+
+#define BPF_PROG_MAX_PACK_PROG_SIZE	HPAGE_PMD_SIZE
+#define BPF_PROG_SIZE_TO_NBITS(size)	(round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+static struct bpf_prog_pack *alloc_new_pack(void)
+{
+	struct bpf_prog_pack *pack;
+
+	pack = kzalloc(sizeof(*pack), GFP_KERNEL);
+	if (!pack)
+		return NULL;
+	pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
+	if (!pack->ptr) {
+		kfree(pack);
+		return NULL;
+	}
+	bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
+	list_add_tail(&pack->list, &pack_list);
+
+	set_vm_flush_reset_perms(pack->ptr);
+	set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+	set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+	return pack;
+}
+
+static void *bpf_prog_pack_alloc(u32 size)
+{
+	unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+	struct bpf_prog_pack *pack;
+	unsigned long pos;
+	void *ptr = NULL;
+
+	mutex_lock(&pack_mutex);
+	list_for_each_entry(pack, &pack_list, list) {
+		pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+						 nbits, 0);
+		if (pos < BPF_PROG_CHUNK_COUNT)
+			goto found_free_area;
+	}
+
+	pack = alloc_new_pack();
+	if (!pack)
+		goto out;
+
+	pos = 0;
+
+found_free_area:
+	bitmap_set(pack->bitmap, pos, nbits);
+	ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+	mutex_unlock(&pack_mutex);
+	return ptr;
+}
+
+static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+{
+	void *pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1));
+	struct bpf_prog_pack *pack = NULL, *tmp;
+	unsigned int nbits;
+	unsigned long pos;
+
+	mutex_lock(&pack_mutex);
+
+	list_for_each_entry(tmp, &pack_list, list) {
+		if (tmp->ptr == pack_ptr) {
+			pack = tmp;
+			break;
+		}
+	}
+
+	if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+		goto out;
+
+	nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
+	pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+	bitmap_clear(pack->bitmap, pos, nbits);
+	if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+				       BPF_PROG_CHUNK_COUNT, 0) == 0) {
+		list_del(&pack->list);
+		module_memfree(pack->ptr);
+		kfree(pack);
+	}
+out:
+	mutex_unlock(&pack_mutex);
+}
+
 static atomic64_t bpf_jit_current;
 
 /* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -860,10 +970,59 @@ void __weak bpf_jit_free_exec(void *addr)
 	module_memfree(addr);
 }
 
+static struct bpf_binary_header *
+__bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+		       unsigned int alignment,
+		       bpf_jit_fill_hole_t bpf_fill_ill_insns,
+		       u32 round_up_to)
+{
+	struct bpf_binary_header *hdr;
+	u32 size, hole, start;
+
+	WARN_ON_ONCE(!is_power_of_2(alignment) ||
+		     alignment > BPF_IMAGE_ALIGNMENT);
+
+	/* Most of BPF filters are really small, but if some of them
+	 * fill a page, allow at least 128 extra bytes to insert a
+	 * random section of illegal instructions.
+	 */
+	size = round_up(proglen + sizeof(*hdr) + 128, round_up_to);
+
+	if (bpf_jit_charge_modmem(size))
+		return NULL;
+	hdr = bpf_jit_alloc_exec(size);
+	if (!hdr) {
+		bpf_jit_uncharge_modmem(size);
+		return NULL;
+	}
+
+	/* Fill space with illegal/arch-dep instructions. */
+	bpf_fill_ill_insns(hdr, size);
+
+	hdr->size = size;
+	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
+		     PAGE_SIZE - sizeof(*hdr));
+	start = (get_random_int() % hole) & ~(alignment - 1);
+
+	/* Leave a random number of instructions before BPF code. */
+	*image_ptr = &hdr->image[start];
+
+	return hdr;
+}
+
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     unsigned int alignment,
 		     bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+	return __bpf_jit_binary_alloc(proglen, image_ptr, alignment,
+				      bpf_fill_ill_insns, PAGE_SIZE);
+}
+
+struct bpf_binary_header *
+bpf_jit_binary_alloc_pack(unsigned int proglen, u8 **image_ptr,
+			  unsigned int alignment,
+			  bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	struct bpf_binary_header *hdr;
 	u32 size, hole, start;
@@ -875,11 +1034,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	 * fill a page, allow at least 128 extra bytes to insert a
 	 * random section of illegal instructions.
 	 */
-	size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+	size = round_up(proglen + sizeof(*hdr) + 128, BPF_PROG_CHUNK_SIZE);
+
+	/* For programs that are too big, use __bpf_jit_binary_alloc() with
+	 * round_up_to of BPF_PROG_MAX_PACK_PROG_SIZE.
+	 */
+	if (size > BPF_PROG_MAX_PACK_PROG_SIZE)
+		return __bpf_jit_binary_alloc(proglen, image_ptr,
+					      alignment, bpf_fill_ill_insns,
+					      BPF_PROG_MAX_PACK_PROG_SIZE);
 
 	if (bpf_jit_charge_modmem(size))
 		return NULL;
-	hdr = bpf_jit_alloc_exec(size);
+	hdr = bpf_prog_pack_alloc(size);
 	if (!hdr) {
 		bpf_jit_uncharge_modmem(size);
 		return NULL;
@@ -888,9 +1055,8 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	/* Fill space with illegal/arch-dep instructions. */
 	bpf_fill_ill_insns(hdr, size);
 
-	hdr->size = size;
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
-		     PAGE_SIZE - sizeof(*hdr));
+		     BPF_PROG_CHUNK_SIZE - sizeof(*hdr));
 	start = (get_random_int() % hole) & ~(alignment - 1);
 
 	/* Leave a random number of instructions before BPF code. */
@@ -907,6 +1073,19 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 	bpf_jit_uncharge_modmem(size);
 }
 
+void bpf_jit_binary_free_pack(struct bpf_binary_header *hdr)
+{
+	u32 size = hdr->size;
+
+	bpf_prog_pack_free(hdr);
+	bpf_jit_uncharge_modmem(size);
+}
+
+int bpf_prog_pack_max_size(void)
+{
+	return BPF_PROG_MAX_PACK_PROG_SIZE;
+}
+
 /* This symbol is only overridden by archs that have different
  * requirements than the usual eBPF JITs, f.e. when they only
  * implement cBPF JIT, do not set images read-only, etc.
-- 
2.30.2
- * Re: [PATCH v4 bpf-next 6/7] bpf: introduce bpf_prog_pack allocator
  2022-01-19 23:06 ` [PATCH v4 bpf-next 6/7] bpf: introduce bpf_prog_pack allocator Song Liu
@ 2022-01-20  4:14   ` Alexei Starovoitov
  2022-01-20  4:48     ` Song Liu
  0 siblings, 1 reply; 13+ messages in thread
From: Alexei Starovoitov @ 2022-01-20  4:14 UTC (permalink / raw)
  To: Song Liu
  Cc: bpf, netdev, linux-kernel, ast, daniel, andrii, kernel-team,
	peterz, x86, Song Liu
On Wed, Jan 19, 2022 at 03:06:19PM -0800, Song Liu wrote:
>  
> +/*
> + * BPF program pack allocator.
> + *
> + * Most BPF programs are pretty small. Allocating a whole page for each
> + * program is sometimes a waste. Many small bpf programs also add pressure
> + * to the instruction TLB. To solve this issue, we introduce a BPF program pack
> + * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
> + * to host BPF programs.
> + */
> +#define BPF_PROG_PACK_SIZE	HPAGE_PMD_SIZE
> +#define BPF_PROG_MAX_PACK_PROG_SIZE	HPAGE_PMD_SIZE
We have a synthetic test with 1M bpf instructions. How did it JIT?
Are you saying we were lucky that every BPF insn was JITed to <2 bytes x86?
Did I misread the 2MB limit?
- * Re: [PATCH v4 bpf-next 6/7] bpf: introduce bpf_prog_pack allocator
  2022-01-20  4:14   ` Alexei Starovoitov
@ 2022-01-20  4:48     ` Song Liu
  2022-01-20  4:50       ` Alexei Starovoitov
  0 siblings, 1 reply; 13+ messages in thread
From: Song Liu @ 2022-01-20  4:48 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Song Liu, bpf@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, ast@kernel.org,
	daniel@iogearbox.net, andrii@kernel.org, Kernel Team,
	peterz@infradead.org, x86@kernel.org
> On Jan 19, 2022, at 8:14 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> On Wed, Jan 19, 2022 at 03:06:19PM -0800, Song Liu wrote:
>> 
>> +/*
>> + * BPF program pack allocator.
>> + *
>> + * Most BPF programs are pretty small. Allocating a whole page for each
>> + * program is sometimes a waste. Many small bpf programs also add pressure
>> + * to the instruction TLB. To solve this issue, we introduce a BPF program pack
>> + * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
>> + * to host BPF programs.
>> + */
>> +#define BPF_PROG_PACK_SIZE	HPAGE_PMD_SIZE
>> +#define BPF_PROG_MAX_PACK_PROG_SIZE	HPAGE_PMD_SIZE
> 
> We have a synthetic test with 1M bpf instructions. How did it JIT?
> Are you saying we were lucky that every BPF insn was JITed to <2 bytes x86?
> Did I misread the 2MB limit?
The logic is: if the program is bigger than 2MB, we fall back to using
module_alloc(). This limitation simplifies the bpf_prog_pack allocator.
Thanks,
Song
- * Re: [PATCH v4 bpf-next 6/7] bpf: introduce bpf_prog_pack allocator
  2022-01-20  4:48     ` Song Liu
@ 2022-01-20  4:50       ` Alexei Starovoitov
  0 siblings, 0 replies; 13+ messages in thread
From: Alexei Starovoitov @ 2022-01-20  4:50 UTC (permalink / raw)
  To: Song Liu
  Cc: Song Liu, bpf@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, ast@kernel.org,
	daniel@iogearbox.net, andrii@kernel.org, Kernel Team,
	peterz@infradead.org, x86@kernel.org
On Wed, Jan 19, 2022 at 8:48 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Jan 19, 2022, at 8:14 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> > On Wed, Jan 19, 2022 at 03:06:19PM -0800, Song Liu wrote:
> >>
> >> +/*
> >> + * BPF program pack allocator.
> >> + *
> >> + * Most BPF programs are pretty small. Allocating a whole page for each
> >> + * program is sometimes a waste. Many small bpf programs also add pressure
> >> + * to the instruction TLB. To solve this issue, we introduce a BPF program pack
> >> + * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
> >> + * to host BPF programs.
> >> + */
> >> +#define BPF_PROG_PACK_SIZE  HPAGE_PMD_SIZE
> >> +#define BPF_PROG_MAX_PACK_PROG_SIZE HPAGE_PMD_SIZE
> >
> > We have a synthetic test with 1M bpf instructions. How did it JIT?
> > Are you saying we were lucky that every BPF insn was JITed to <2 bytes x86?
> > Did I misread the 2MB limit?
>
> The logic is: if the program is bigger than 2MB, we fall back to using
> module_alloc(). This limitation simplifies the bpf_prog_pack allocator.
Ahh. Missed this part of the diff. Makes sense.
- * [PATCH v4 bpf-next 7/7] bpf, x86_64: use bpf_prog_pack allocator
  2022-01-19 23:06 [PATCH v4 bpf-next 0/7] bpf_prog_pack allocator Song Liu
                   ` (5 preceding siblings ...)
  2022-01-19 23:06 ` [PATCH v4 bpf-next 6/7] bpf: introduce bpf_prog_pack allocator Song Liu
@ 2022-01-19 23:06 ` Song Liu
  6 siblings, 0 replies; 13+ messages in thread
From: Song Liu @ 2022-01-19 23:06 UTC (permalink / raw)
  To: bpf, netdev, linux-kernel
  Cc: ast, daniel, andrii, kernel-team, peterz, x86, Song Liu
From: Song Liu <songliubraving@fb.com>
Use the bpf_prog_pack allocator in the x86_64 JIT.
The program header from bpf_prog_pack is read-only during the JIT process.
Therefore, the binary is first written to a temporary buffer and later
copied to its final location with text_poke_copy().
Similarly, jit_fill_hole() is updated to fill the hole with 0xcc using
text_poke_copy().
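The overall flow, reduced to a hypothetical sketch (the details are in the
diff below):

	/* JIT writes into an RW scratch buffer ... */
	tmp_header = kzalloc(header->size, GFP_KERNEL);
	do_jit(prog, addrs, image, tmp_image, oldproglen, &ctx, padding);

	/* ... then the result is published to the RO+X pack */
	text_poke_copy(header, tmp_header, header->size);
	bpf_tail_call_direct_fixup(prog, true);	/* fixup after the final copy */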
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 134 +++++++++++++++++++++++++++---------
 1 file changed, 103 insertions(+), 31 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index fe4f08e25a1d..6d97f7c24df2 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -216,11 +216,34 @@ static u8 simple_alu_opcodes[] = {
 	[BPF_ARSH] = 0xF8,
 };
 
+static char jit_hole_buffer[PAGE_SIZE] = {};
+
 static void jit_fill_hole(void *area, unsigned int size)
+{
+	struct bpf_binary_header *hdr = area;
+	int i;
+
+	for (i = 0; i < roundup(size, PAGE_SIZE); i += PAGE_SIZE) {
+		int s;
+
+		s = min_t(int, PAGE_SIZE, size - i);
+		text_poke_copy(area + i, jit_hole_buffer, s);
+	}
+
+	/*
+	 * bpf_jit_binary_alloc_pack cannot write size directly to the ro
+	 * mapping. Write it here with text_poke_copy().
+	 */
+	text_poke_copy(&hdr->size, &size, sizeof(size));
+}
+
+static int __init x86_jit_fill_hole_init(void)
 {
 	/* Fill whole space with INT3 instructions */
-	memset(area, 0xcc, size);
+	memset(jit_hole_buffer, 0xcc, PAGE_SIZE);
+	return 0;
 }
+pure_initcall(x86_jit_fill_hole_init);
 
 struct jit_context {
 	int cleanup_addr; /* Epilogue code offset */
@@ -361,14 +384,11 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 
 	ret = -EBUSY;
 	mutex_lock(&text_mutex);
-	if (memcmp(ip, old_insn, X86_PATCH_SIZE))
+	if (text_live && memcmp(ip, old_insn, X86_PATCH_SIZE))
 		goto out;
 	ret = 1;
 	if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
-		if (text_live)
-			text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
-		else
-			memcpy(ip, new_insn, X86_PATCH_SIZE);
+		text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
 		ret = 0;
 	}
 out:
@@ -537,7 +557,7 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
 	*pprog = prog;
 }
 
-static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
+static void bpf_tail_call_direct_fixup(struct bpf_prog *prog, bool text_live)
 {
 	struct bpf_jit_poke_descriptor *poke;
 	struct bpf_array *array;
@@ -558,24 +578,15 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
 		mutex_lock(&array->aux->poke_mutex);
 		target = array->ptrs[poke->tail_call.key];
 		if (target) {
-			/* Plain memcpy is used when image is not live yet
-			 * and still not locked as read-only. Once poke
-			 * location is active (poke->tailcall_target_stable),
-			 * any parallel bpf_arch_text_poke() might occur
-			 * still on the read-write image until we finally
-			 * locked it as read-only. Both modifications on
-			 * the given image are under text_mutex to avoid
-			 * interference.
-			 */
 			ret = __bpf_arch_text_poke(poke->tailcall_target,
 						   BPF_MOD_JUMP, NULL,
 						   (u8 *)target->bpf_func +
-						   poke->adj_off, false);
+						   poke->adj_off, text_live);
 			BUG_ON(ret < 0);
 			ret = __bpf_arch_text_poke(poke->tailcall_bypass,
 						   BPF_MOD_JUMP,
 						   (u8 *)poke->tailcall_target +
-						   X86_PATCH_SIZE, NULL, false);
+						   X86_PATCH_SIZE, NULL, text_live);
 			BUG_ON(ret < 0);
 		}
 		WRITE_ONCE(poke->tailcall_target_stable, true);
@@ -867,7 +878,7 @@ static void emit_nops(u8 **pprog, int len)
 
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
-static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *tmp_image,
 		  int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
 	bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
@@ -894,8 +905,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	push_callee_regs(&prog, callee_regs_used);
 
 	ilen = prog - temp;
-	if (image)
-		memcpy(image + proglen, temp, ilen);
+	if (tmp_image)
+		memcpy(tmp_image + proglen, temp, ilen);
 	proglen += ilen;
 	addrs[0] = proglen;
 	prog = temp;
@@ -1324,8 +1335,10 @@ st:			if (is_imm8(insn->off))
 					pr_err("extable->insn doesn't fit into 32-bit\n");
 					return -EFAULT;
 				}
-				ex->insn = delta;
+				/* switch ex to temporary buffer for writes */
+				ex = (void *)tmp_image + ((void *)ex - (void *)image);
 
+				ex->insn = delta;
 				ex->type = EX_TYPE_BPF;
 
 				if (dst_reg > BPF_REG_9) {
@@ -1706,7 +1719,7 @@ st:			if (is_imm8(insn->off))
 				pr_err("bpf_jit: fatal error\n");
 				return -EFAULT;
 			}
-			memcpy(image + proglen, temp, ilen);
+			memcpy(tmp_image + proglen, temp, ilen);
 		}
 		proglen += ilen;
 		addrs[i] = proglen;
@@ -2248,8 +2261,10 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
 
 struct x64_jit_data {
 	struct bpf_binary_header *header;
+	struct bpf_binary_header *tmp_header;
 	int *addrs;
 	u8 *image;
+	u8 *tmp_image;
 	int proglen;
 	struct jit_context ctx;
 };
@@ -2259,6 +2274,7 @@ struct x64_jit_data {
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
+	struct bpf_binary_header *tmp_header = NULL;
 	struct bpf_binary_header *header = NULL;
 	struct bpf_prog *tmp, *orig_prog = prog;
 	struct x64_jit_data *jit_data;
@@ -2267,6 +2283,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	bool tmp_blinded = false;
 	bool extra_pass = false;
 	bool padding = false;
+	u8 *tmp_image = NULL;
 	u8 *image = NULL;
 	int *addrs;
 	int pass;
@@ -2301,7 +2318,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		ctx = jit_data->ctx;
 		oldproglen = jit_data->proglen;
 		image = jit_data->image;
+		tmp_image = jit_data->tmp_image;
 		header = jit_data->header;
+		tmp_header = jit_data->tmp_header;
 		extra_pass = true;
 		padding = true;
 		goto skip_init_addrs;
@@ -2332,14 +2351,18 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	for (pass = 0; pass < MAX_PASSES || image; pass++) {
 		if (!padding && pass >= PADDING_PASSES)
 			padding = true;
-		proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
+		proglen = do_jit(prog, addrs, image, tmp_image, oldproglen, &ctx, padding);
 		if (proglen <= 0) {
 out_image:
 			image = NULL;
-			if (header)
-				bpf_jit_binary_free(header);
+			tmp_image = NULL;
+			if (header) {
+				bpf_jit_binary_free_pack(header);
+				kfree(tmp_header);
+			}
 			prog = orig_prog;
 			header = NULL;
+			tmp_header = NULL;
 			goto out_addrs;
 		}
 		if (image) {
@@ -2362,13 +2385,27 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 				sizeof(struct exception_table_entry);
 
 			/* allocate module memory for x86 insns and extable */
-			header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
-						      &image, align, jit_fill_hole);
+			header = bpf_jit_binary_alloc_pack(roundup(proglen, align) + extable_size,
+							   &image, align, jit_fill_hole);
 			if (!header) {
 				prog = orig_prog;
 				goto out_addrs;
 			}
-			prog->aux->extable = (void *) image + roundup(proglen, align);
+			if (header->size > bpf_prog_pack_max_size()) {
+				tmp_header = header;
+				tmp_image = image;
+			} else {
+				tmp_header = kzalloc(header->size, GFP_KERNEL);
+				if (!tmp_header) {
+					bpf_jit_binary_free_pack(header);
+					header = NULL;
+					prog = orig_prog;
+					goto out_addrs;
+				}
+				tmp_header->size = header->size;
+				tmp_image = (void *)tmp_header + ((void *)image - (void *)header);
+			}
+			prog->aux->extable = (void *)image + roundup(proglen, align);
 		}
 		oldproglen = proglen;
 		cond_resched();
@@ -2379,14 +2416,24 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 
 	if (image) {
 		if (!prog->is_func || extra_pass) {
-			bpf_tail_call_direct_fixup(prog);
-			bpf_jit_binary_lock_ro(header);
+			if (header->size > bpf_prog_pack_max_size()) {
+				/*
+				 * bpf_prog_pack cannot handle programs
+				 * bigger than ~2MB. Fall back to regular
+				 * module_alloc(), and do the fixup and
+				 * lock_ro here.
+				 */
+				bpf_tail_call_direct_fixup(prog, false);
+				bpf_jit_binary_lock_ro(header);
+			}
 		} else {
 			jit_data->addrs = addrs;
 			jit_data->ctx = ctx;
 			jit_data->proglen = proglen;
 			jit_data->image = image;
+			jit_data->tmp_image = tmp_image;
 			jit_data->header = header;
+			jit_data->tmp_header = tmp_header;
 		}
 		prog->bpf_func = (void *)image;
 		prog->jited = 1;
@@ -2402,6 +2449,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		kvfree(addrs);
 		kfree(jit_data);
 		prog->aux->jit_data = NULL;
+		jit_data = NULL;
+		if (tmp_header != header) {
+			text_poke_copy(header, tmp_header, header->size);
+			kfree(tmp_header);
+			/*
+			 * Do the fixup after final text_poke_copy().
+			 * Otherwise, the fixup will be overwritten by
+			 * text_poke_copy().
+			 */
+			bpf_tail_call_direct_fixup(prog, true);
+		}
 	}
 out:
 	if (tmp_blinded)
@@ -2415,3 +2473,17 @@ bool bpf_jit_supports_kfunc_call(void)
 {
 	return true;
 }
+
+void bpf_jit_free(struct bpf_prog *fp)
+{
+	if (fp->jited) {
+		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
+
+		if (hdr->size > bpf_prog_pack_max_size())
+			bpf_jit_binary_free(hdr);
+		else
+			bpf_jit_binary_free_pack(hdr);
+	}
+
+	bpf_prog_unlock_free(fp);
+}
-- 
2.30.2