bpf.vger.kernel.org archive mirror
From: Pingfan Liu <piliu@redhat.com>
To: bpf@vger.kernel.org
Cc: Pingfan Liu <piliu@redhat.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	John Fastabend <john.fastabend@gmail.com>,
	Andrii Nakryiko <andrii@kernel.org>,
	Martin KaFai Lau <martin.lau@linux.dev>,
	Eduard Zingerman <eddyz87@gmail.com>, Song Liu <song@kernel.org>,
	Yonghong Song <yonghong.song@linux.dev>,
	Jeremy Linton <jeremy.linton@arm.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Will Deacon <will@kernel.org>, Ard Biesheuvel <ardb@kernel.org>,
	Simon Horman <horms@kernel.org>,
	Gerd Hoffmann <kraxel@redhat.com>,
	Vitaly Kuznetsov <vkuznets@redhat.com>,
	Philipp Rudo <prudo@redhat.com>, Viktor Malik <vmalik@redhat.com>,
	Jan Hendrik Farr <kernel@jfarr.cc>, Baoquan He <bhe@redhat.com>,
	Dave Young <dyoung@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	kexec@lists.infradead.org, systemd-devel@lists.freedesktop.org,
	KP Singh <kpsingh@kernel.org>,
	Stanislav Fomichev <sdf@fomichev.me>, Hao Luo <haoluo@google.com>,
	Jiri Olsa <jolsa@kernel.org>
Subject: [PATCHv5 04/12] bpf: Introduce decompressor kfunc
Date: Tue, 19 Aug 2025 09:24:20 +0800
Message-ID: <20250819012428.6217-5-piliu@redhat.com>
In-Reply-To: <20250819012428.6217-1-piliu@redhat.com>

This commit bridges the gap between bpf-prog and the kernel decompression
routines by exposing them through a kfunc. At present, a single global memory
allocator backs the decompression. Later, if needed, the decompress_fn
prototype can be changed to pass in a per-task allocator.
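
For reference, the in-kernel decompressors are reached through the common
decompress_fn callback interface (quoted below from
include/linux/decompress/generic.h; see the header for the authoritative
definition). It carries no private-data argument, which is why a single
global allocator is used for now:

  typedef int (*decompress_fn) (unsigned char *inbuf, long len,
                                long (*fill)(void *, unsigned long),
                                long (*flush)(void *, unsigned long),
                                unsigned char *outbuf,
                                long *posp,
                                void (*error)(char *x));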

The allocator hands out one CHUNK_SIZE (8 MB) chunk at a time under a
transient virtual mapping, up to a MAX_UNCOMPRESSED_BUF_SIZE (256 MB) limit.
After decompression finishes, it presents all of the decompressed data in a
single, unified virtual address range.
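
Concretely, with the constants used below in helpers.c, and assuming 4 KB
pages and 64-bit pointers, the sizing works out as:

  CHUNK_SIZE                = 1 << 23  ->   8 MB per vmap()-ed chunk
  MAX_UNCOMPRESSED_BUF_SIZE = 1 << 28  -> 256 MB output cap
  pages[] tracking array    = (1 << 28) >> PAGE_SHIFT pointers
                            = 65536 * 8 bytes = 512 KB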

Signed-off-by: Pingfan Liu <piliu@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Eduard Zingerman <eddyz87@gmail.com>
Cc: Song Liu <song@kernel.org>
Cc: Yonghong Song <yonghong.song@linux.dev>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Stanislav Fomichev <sdf@fomichev.me>
Cc: Hao Luo <haoluo@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
To: bpf@vger.kernel.org
---
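A minimal sketch of the call pattern expected from a bpf-prog is included
below for reviewers. The declarations, program type and parse_ctx layout are
illustrative assumptions only, not definitions from this series:

  /* Illustrative sketch only -- not part of this patch. */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  /* Assumed kfunc prototypes; the real ones come from this and the
   * previous patch.
   */
  extern struct mem_range_result *
  bpf_decompress(char *image_gz_payload, int image_gz_sz) __ksym;
  extern int bpf_mem_range_result_put(struct mem_range_result *range) __ksym;

  struct parse_ctx {            /* hypothetical caller-provided context */
          char *payload;        /* compressed image payload */
          int payload_sz;
  };

  SEC("syscall")
  int parse_image(struct parse_ctx *ctx)
  {
          struct mem_range_result *r;

          r = bpf_decompress(ctx->payload, ctx->payload_sz);
          if (!r)
                  return -1;

          /* r->buf holds r->data_sz bytes of decompressed output */
          bpf_mem_range_result_put(r);  /* KF_RELEASE: drop the reference */
          return 0;
  }
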
 kernel/bpf/helpers.c | 226 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bd83ec9a2b2a6..895fe8fdaa78d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -25,6 +25,7 @@
 #include <linux/kasan.h>
 #include <linux/bpf_verifier.h>
 #include <linux/uaccess.h>
+#include <linux/decompress/generic.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -3703,6 +3704,230 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
 	return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
 }
 
+#ifdef CONFIG_KEXEC_PE_IMAGE
+
+#define MAX_UNCOMPRESSED_BUF_SIZE	(1 << 28)
+/* a chunk should be large enough to hold a single round of decompressed output */
+#define CHUNK_SIZE	(1 << 23)
+
+/*
+ * At present, a single global allocator serves all decompression. Later, if needed,
+ * the prototype of decompress_fn can be changed to pass in a per-task allocator.
+ */
+static DEFINE_MUTEX(output_buf_mutex);
+
+struct decompress_mem_allocator {
+	struct page **pages;
+	unsigned int pg_idx;
+	void *chunk_start;
+	unsigned int chunk_size;
+	void *chunk_cur;
+};
+
+static struct decompress_mem_allocator dcmpr_allocator;
+
+/*
+ * Set up an active chunk to hold partially decompressed data.
+ */
+static void *vmap_decompressed_chunk(void)
+{
+	struct decompress_mem_allocator *a = &dcmpr_allocator;
+	unsigned int i, pg_cnt = a->chunk_size >> PAGE_SHIFT;
+	struct page **pg_start = &a->pages[a->pg_idx];
+
+	for (i = 0; i < pg_cnt; i++)
+		a->pages[a->pg_idx++] = alloc_page(GFP_KERNEL | __GFP_ACCOUNT);
+
+	return vmap(pg_start, pg_cnt, VM_MAP, PAGE_KERNEL);
+}
+
+/*
+ * Present the scattered pages containing decompressed data at a unified virtual
+ * address.
+ */
+static int decompress_mem_allocator_handover(struct decompress_mem_allocator *a,
+		struct mem_range_result *range)
+{
+	unsigned long pg_array_sz = a->pg_idx * sizeof(struct page *);
+
+	range->pages = vmalloc(pg_array_sz);
+	if (!range->pages)
+		return -ENOMEM;
+
+	range->pg_cnt = a->pg_idx;
+	memcpy(range->pages, a->pages, pg_array_sz);
+	range->buf = vmap(range->pages, range->pg_cnt, VM_MAP, PAGE_KERNEL);
+	if (!range->buf) {
+		vfree(range->pages);
+		return -1;
+	}
+	/*
+	 * Free the page-tracking array; the pages themselves are freed when the
+	 * mem_range_result is released.
+	 */
+	vfree(a->pages);
+	a->pages = NULL;
+
+	/* vmap-ed */
+	range->alloc_type = TYPE_VMAP;
+	range->buf_sz = a->pg_idx << PAGE_SHIFT;
+	range->data_sz = range->buf_sz - a->chunk_size;
+	range->data_sz += a->chunk_cur - a->chunk_start;
+
+	return 0;
+}
+
+static int decompress_mem_allocator_init(
+	struct decompress_mem_allocator *allocator,
+	unsigned int chunk_size)
+{
+	unsigned long sz = (MAX_UNCOMPRESSED_BUF_SIZE >> PAGE_SHIFT) * sizeof(struct page *);
+
+	allocator->pages = __vmalloc(sz, GFP_KERNEL | __GFP_ACCOUNT);
+	if (!allocator->pages)
+		return -ENOMEM;
+
+	allocator->pg_idx = 0;
+	allocator->chunk_start = NULL;
+	allocator->chunk_size = chunk_size;
+	allocator->chunk_cur = NULL;
+	return 0;
+}
+
+static void decompress_mem_allocator_fini(struct decompress_mem_allocator *allocator)
+{
+	unsigned int i;
+
+	/* unmap the active chunk */
+	if (allocator->chunk_start)
+		vunmap(allocator->chunk_start);
+	if (allocator->pages) {
+		for (i = 0; i < allocator->pg_idx; i++)
+			__free_pages(allocator->pages[i], 0);
+		vfree(allocator->pages);
+	}
+}
+
+/*
+ * This is a callback for decompress_fn.
+ *
+ * It copies the partially decompressed content in [buf, buf + len) into the
+ * active chunk. If the active chunk runs out of free space, retire it and
+ * activate a new chunk to hold the remaining data.
+ */
+static long flush(void *buf, unsigned long len)
+{
+	struct decompress_mem_allocator *a = &dcmpr_allocator;
+	long free, copied = 0;
+
+	/* First call: set up the initial active chunk */
+	if (unlikely(!a->chunk_start)) {
+		a->chunk_start = a->chunk_cur = vmap_decompressed_chunk();
+		if (unlikely(!a->chunk_start))
+			return -1;
+	}
+
+	free = a->chunk_start + a->chunk_size - a->chunk_cur;
+	BUG_ON(free < 0);
+	if (free < len) {
+		/*
+		 * If the total size would exceed MAX_UNCOMPRESSED_BUF_SIZE,
+		 * return -1 to signal to the decompressor that something
+		 * went wrong.
+		 */
+		if (unlikely((a->pg_idx >= MAX_UNCOMPRESSED_BUF_SIZE >> PAGE_SHIFT)))
+			return -1;
+		memcpy(a->chunk_cur, buf, free);
+		copied += free;
+		a->chunk_cur += free;
+		len -= free;
+		/*
+		 * When retiring the active chunk, release its virtual address
+		 * but do not release the contents in the pages.
+		 */
+		vunmap(a->chunk_start);
+		a->chunk_start = a->chunk_cur = vmap_decompressed_chunk();
+		if (unlikely(!a->chunk_start))
+			return -1;
+	}
+	memcpy(a->chunk_cur, buf + copied, len);
+	copied += len;
+	a->chunk_cur += len;
+	return copied;
+}
+
+__bpf_kfunc struct mem_range_result *bpf_decompress(char *image_gz_payload, int image_gz_sz)
+{
+	struct decompress_mem_allocator *a = &dcmpr_allocator;
+	decompress_fn decompressor;
+	struct mem_cgroup *memcg, *old_memcg;
+	struct mem_range_result *range;
+	const char *name;
+	char *input_buf;
+	int ret;
+
+	memcg = get_mem_cgroup_from_current();
+	old_memcg = set_active_memcg(memcg);
+	range = mem_range_result_alloc();
+	if (!range) {
+		pr_err("fail to allocate mem_range_result\n");
+		goto error;
+	}
+
+	input_buf = __vmalloc(image_gz_sz, GFP_KERNEL | __GFP_ACCOUNT);
+	if (!input_buf) {
+		kfree(range);
+		pr_err("fail to allocate input buffer\n");
+		goto error;
+	}
+
+	ret = copy_from_kernel_nofault(input_buf, image_gz_payload, image_gz_sz);
+	if (ret < 0) {
+		kfree(range);
+		vfree(input_buf);
+		pr_err("Error when copying from 0x%p, size:0x%x\n",
+				image_gz_payload, image_gz_sz);
+		goto error;
+	}
+
+	mutex_lock(&output_buf_mutex);
+	decompress_mem_allocator_init(a, CHUNK_SIZE);
+	decompressor = decompress_method(input_buf, image_gz_sz, &name);
+	if (!decompressor) {
+		/* fall through to the cleanup path that releases the mutex */
+		pr_err("Cannot find decompression method\n");
+		ret = -EINVAL;
+	} else {
+		ret = decompressor(input_buf, image_gz_sz, NULL, flush,
+					NULL, NULL, NULL);
+	}
+
+	vfree(input_buf);
+	if (ret == 0) {
+		ret = decompress_mem_allocator_handover(a, range);
+		if (ret)
+			goto fail;
+		range->status = 0;
+		mem_cgroup_tryget(memcg);
+		range->memcg = memcg;
+		set_active_memcg(old_memcg);
+	}
+fail:
+	decompress_mem_allocator_fini(a);
+	mutex_unlock(&output_buf_mutex);
+	if (ret) {
+		kfree(range);
+		range = NULL;
+		pr_err("Decompression error\n");
+	}
+
+error:
+	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
+	return range;
+}
+#endif
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3710,6 +3935,7 @@ BTF_KFUNCS_START(generic_btf_ids)
 BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
 #endif
 #ifdef CONFIG_KEXEC_PE_IMAGE
+BTF_ID_FLAGS(func, bpf_decompress, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_mem_range_result_put, KF_RELEASE | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_copy_to_kernel, KF_TRUSTED_ARGS | KF_SLEEPABLE)
 #endif
-- 
2.49.0


Thread overview: 20+ messages
2025-08-19  1:24 [PATCHv5 00/12] kexec: Use BPF lskel to enable kexec to load PE format boot image Pingfan Liu
2025-08-19  1:24 ` [PATCHv5 01/12] kexec_file: Make kexec_image_load_default global visible Pingfan Liu
2025-08-19  1:24 ` [PATCHv5 02/12] lib/decompress: Keep decompressor when CONFIG_KEEP_COMPRESSOR Pingfan Liu
2025-08-19  1:24 ` [PATCHv5 03/12] bpf: Introduce bpf_copy_to_kernel() to buffer the content from bpf-prog Pingfan Liu
2025-08-19  1:24 ` Pingfan Liu [this message]
2025-08-19  1:24 ` [PATCHv5 05/12] kexec: Introduce kexec_pe_image to parse and load PE file Pingfan Liu
2025-09-01 14:30   ` Philipp Rudo
2025-08-19  1:24 ` [PATCHv5 06/12] kexec: Integrate with the introduced bpf kfuncs Pingfan Liu
2025-09-01 14:30   ` Philipp Rudo
2025-08-19  1:24 ` [PATCHv5 07/12] kexec: Introduce a bpf-prog lskel to parse PE file Pingfan Liu
2025-08-19  1:24 ` [PATCHv5 08/12] kexec: Factor out routine to find a symbol in ELF Pingfan Liu
2025-09-01 14:31   ` Philipp Rudo
2025-08-19  1:24 ` [PATCHv5 09/12] kexec: Integrate bpf light skeleton to load zboot image Pingfan Liu
2025-08-19  1:24 ` [PATCHv5 10/12] arm64/kexec: Add PE image format support Pingfan Liu
2025-08-19 18:23   ` kernel test robot
2025-08-19 18:54   ` kernel test robot
2025-08-20  3:09   ` Pingfan Liu
2025-08-19  1:24 ` [PATCHv5 11/12] tools/kexec: Introduce a bpf-prog to parse zboot image format Pingfan Liu
2025-08-19  1:24 ` [PATCHv5 12/12] tools/kexec: Add a zboot image building tool Pingfan Liu
2025-09-01 14:29 ` [PATCHv5 00/12] kexec: Use BPF lskel to enable kexec to load PE format boot image Philipp Rudo
