[PATCH v2 bpf-next 4/5] libbpf: add ksyscall/kretsyscall sections support for syscall kprobes

BPF List
 help / color / mirror / Atom feed

From: Andrii Nakryiko <andrii@kernel.org>
To: <bpf@vger.kernel.org>, <ast@kernel.org>, <daniel@iogearbox.net>
Cc: <andrii@kernel.org>, <kernel-team@fb.com>,
	Alan Maguire <alan.maguire@oracle.com>
Subject: [PATCH v2 bpf-next 4/5] libbpf: add ksyscall/kretsyscall sections support for syscall kprobes
Date: Thu, 14 Jul 2022 00:07:54 -0700	[thread overview]
Message-ID: <20220714070755.3235561-5-andrii@kernel.org> (raw)
In-Reply-To: <20220714070755.3235561-1-andrii@kernel.org>

Add SEC("ksyscall")/SEC("ksyscall/<syscall_name>") and corresponding
kretsyscall variants (for return kprobes) to allow users to kprobe
syscall functions in kernel. These special sections allow to ignore
complexities and differences between kernel versions and host
architectures when it comes to syscall wrapper and corresponding
__<arch>_sys_<syscall> vs __se_sys_<syscall> differences, depending on
whether host kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER (though libbpf
itself doesn't rely on /proc/config.gz for detecting this, see
BPF_KSYSCALL patch for how it's done internally).

Combined with the use of BPF_KSYSCALL() macro, this allows to just
specify intended syscall name and expected input arguments and leave
dealing with all the variations to libbpf.

In addition to SEC("ksyscall+") and SEC("kretsyscall+") add
bpf_program__attach_ksyscall() API which allows to specify syscall name
at runtime and provide associated BPF cookie value.

At the moment SEC("ksyscall") and bpf_program__attach_ksyscall() do not
handle all the calling convention quirks for mmap(), clone() and compat
syscalls. It also only attaches to "native" syscall interfaces. If host
system supports compat syscalls or defines 32-bit syscalls in 64-bit
kernel, such syscall interfaces won't be attached to by libbpf.

These limitations may or may not change in the future. Therefore it is
recommended to use SEC("kprobe") for these syscalls or if working with
compat and 32-bit interfaces is required.

Tested-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/lib/bpf/libbpf.c          | 117 +++++++++++++++++++++++++++++---
 tools/lib/bpf/libbpf.h          |  46 +++++++++++++
 tools/lib/bpf/libbpf.map        |   1 +
 tools/lib/bpf/libbpf_internal.h |   2 +
 4 files changed, 157 insertions(+), 9 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index eb35a20a33f6..ba96760936f8 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -4670,6 +4670,8 @@ static int probe_kern_btf_enum64(void)
 					     strs, sizeof(strs)));
 }
 
+static int probe_kern_syscall_wrapper(void);
+
 enum kern_feature_result {
 	FEAT_UNKNOWN = 0,
 	FEAT_SUPPORTED = 1,
@@ -4738,6 +4740,9 @@ static struct kern_feature_desc {
 	[FEAT_BTF_ENUM64] = {
 		"BTF_KIND_ENUM64 support", probe_kern_btf_enum64,
 	},
+	[FEAT_SYSCALL_WRAPPER] = {
+		"Kernel using syscall wrapper", probe_kern_syscall_wrapper,
+	},
 };
 
 bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id)
@@ -8421,6 +8426,7 @@ int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log
 
 static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link);
+static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link);
 static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link);
@@ -8441,6 +8447,8 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("uretprobe.s+",		KPROBE, 0, SEC_SLEEPABLE, attach_uprobe),
 	SEC_DEF("kprobe.multi+",	KPROBE,	BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
 	SEC_DEF("kretprobe.multi+",	KPROBE,	BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi),
+	SEC_DEF("ksyscall+",		KPROBE,	0, SEC_NONE, attach_ksyscall),
+	SEC_DEF("kretsyscall+",		KPROBE, 0, SEC_NONE, attach_ksyscall),
 	SEC_DEF("usdt+",		KPROBE,	0, SEC_NONE, attach_usdt),
 	SEC_DEF("tc",			SCHED_CLS, 0, SEC_NONE),
 	SEC_DEF("classifier",		SCHED_CLS, 0, SEC_NONE),
@@ -9797,7 +9805,7 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name,
 {
 	struct perf_event_attr attr = {};
 	char errmsg[STRERR_BUFSIZE];
-	int type, pfd, err;
+	int type, pfd;
 
 	if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS))
 		return -EINVAL;
@@ -9833,14 +9841,7 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name,
 		      pid < 0 ? -1 : pid /* pid */,
 		      pid == -1 ? 0 : -1 /* cpu */,
 		      -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
-	if (pfd < 0) {
-		err = -errno;
-		pr_warn("%s perf_event_open() failed: %s\n",
-			uprobe ? "uprobe" : "kprobe",
-			libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
-		return err;
-	}
-	return pfd;
+	return pfd >= 0 ? pfd : -errno;
 }
 
 static int append_to_file(const char *file, const char *fmt, ...)
@@ -9945,6 +9946,60 @@ static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe,
 	return err;
 }
 
+static const char *arch_specific_syscall_pfx(void)
+{
+#if defined(__x86_64__)
+	return "x64";
+#elif defined(__i386__)
+	return "ia32";
+#elif defined(__s390x__)
+	return "s390x";
+#elif defined(__s390__)
+	return "s390";
+#elif defined(__arm__)
+	return "arm";
+#elif defined(__aarch64__)
+	return "arm64";
+#elif defined(__mips__)
+	return "mips";
+#elif defined(__riscv)
+	return "riscv";
+#else
+	return NULL;
+#endif
+}
+
+static int probe_kern_syscall_wrapper(void)
+{
+	char syscall_name[64];
+	const char *ksys_pfx;
+
+	ksys_pfx = arch_specific_syscall_pfx();
+	if (!ksys_pfx)
+		return 0;
+
+	snprintf(syscall_name, sizeof(syscall_name), "__%s_sys_bpf", ksys_pfx);
+
+	if (determine_kprobe_perf_type() >= 0) {
+		int pfd;
+
+		pfd = perf_event_open_probe(false, false, syscall_name, 0, getpid(), 0);
+		if (pfd >= 0)
+			close(pfd);
+
+		return pfd >= 0 ? 1 : 0;
+	} else { /* legacy mode */
+		char probe_name[128];
+
+		gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), syscall_name, 0);
+		if (add_kprobe_event_legacy(probe_name, false, syscall_name, 0) < 0)
+			return 0;
+
+		(void)remove_kprobe_event_legacy(probe_name, false);
+		return 1;
+	}
+}
+
 struct bpf_link *
 bpf_program__attach_kprobe_opts(const struct bpf_program *prog,
 				const char *func_name,
@@ -10030,6 +10085,29 @@ struct bpf_link *bpf_program__attach_kprobe(const struct bpf_program *prog,
 	return bpf_program__attach_kprobe_opts(prog, func_name, &opts);
 }
 
+struct bpf_link *bpf_program__attach_ksyscall(const struct bpf_program *prog,
+					      const char *syscall_name,
+					      const struct bpf_ksyscall_opts *opts)
+{
+	LIBBPF_OPTS(bpf_kprobe_opts, kprobe_opts);
+	char func_name[128];
+
+	if (!OPTS_VALID(opts, bpf_ksyscall_opts))
+		return libbpf_err_ptr(-EINVAL);
+
+	if (kernel_supports(prog->obj, FEAT_SYSCALL_WRAPPER)) {
+		snprintf(func_name, sizeof(func_name), "__%s_sys_%s",
+			 arch_specific_syscall_pfx(), syscall_name);
+	} else {
+		snprintf(func_name, sizeof(func_name), "__se_sys_%s", syscall_name);
+	}
+
+	kprobe_opts.retprobe = OPTS_GET(opts, retprobe, false);
+	kprobe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0);
+
+	return bpf_program__attach_kprobe_opts(prog, func_name, &kprobe_opts);
+}
+
 /* Adapted from perf/util/string.c */
 static bool glob_match(const char *str, const char *pat)
 {
@@ -10200,6 +10278,27 @@ static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf
 	return libbpf_get_error(*link);
 }
 
+static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link)
+{
+	LIBBPF_OPTS(bpf_ksyscall_opts, opts);
+	const char *syscall_name;
+
+	*link = NULL;
+
+	/* no auto-attach for SEC("ksyscall") and SEC("kretsyscall") */
+	if (strcmp(prog->sec_name, "ksyscall") == 0 || strcmp(prog->sec_name, "kretsyscall") == 0)
+		return 0;
+
+	opts.retprobe = str_has_pfx(prog->sec_name, "kretsyscall/");
+	if (opts.retprobe)
+		syscall_name = prog->sec_name + sizeof("kretsyscall/") - 1;
+	else
+		syscall_name = prog->sec_name + sizeof("ksyscall/") - 1;
+
+	*link = bpf_program__attach_ksyscall(prog, syscall_name, &opts);
+	return *link ? 0 : -errno;
+}
+
 static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link)
 {
 	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts);
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index e4d5353f757b..d10234aaa8df 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -457,6 +457,52 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog,
 				      const char *pattern,
 				      const struct bpf_kprobe_multi_opts *opts);
 
+struct bpf_ksyscall_opts {
+	/* size of this struct, for forward/backward compatiblity */
+	size_t sz;
+	/* custom user-provided value fetchable through bpf_get_attach_cookie() */
+	__u64 bpf_cookie;
+	/* attach as return probe? */
+	bool retprobe;
+	size_t :0;
+};
+#define bpf_ksyscall_opts__last_field retprobe
+
+/**
+ * @brief **bpf_program__attach_ksyscall()** attaches a BPF program
+ * to kernel syscall handler of a specified syscall. Optionally it's possible
+ * to request to install retprobe that will be triggered at syscall exit. It's
+ * also possible to associate BPF cookie (though options).
+ *
+ * Libbpf automatically will determine correct full kernel function name,
+ * which depending on system architecture and kernel version/configuration
+ * could be of the form __<arch>_sys_<syscall> or __se_sys_<syscall>, and will
+ * attach specified program using kprobe/kretprobe mechanism.
+ *
+ * **bpf_program__attach_ksyscall()** is an API counterpart of declarative
+ * **SEC("ksyscall/<syscall>")** annotation of BPF programs.
+ *
+ * At the moment **SEC("ksyscall")** and **bpf_program__attach_ksyscall()** do
+ * not handle all the calling convention quirks for mmap(), clone() and compat
+ * syscalls. It also only attaches to "native" syscall interfaces. If host
+ * system supports compat syscalls or defines 32-bit syscalls in 64-bit
+ * kernel, such syscall interfaces won't be attached to by libbpf.
+ *
+ * These limitations may or may not change in the future. Therefore it is
+ * recommended to use SEC("kprobe") for these syscalls or if working with
+ * compat and 32-bit interfaces is required.
+ *
+ * @param prog BPF program to attach
+ * @param syscall_name Symbolic name of the syscall (e.g., "bpf")
+ * @param opts Additional options (see **struct bpf_ksyscall_opts**)
+ * @return Reference to the newly created BPF link; or NULL is returned on
+ * error, error code is stored in errno
+ */
+LIBBPF_API struct bpf_link *
+bpf_program__attach_ksyscall(const struct bpf_program *prog,
+			     const char *syscall_name,
+			     const struct bpf_ksyscall_opts *opts);
+
 struct bpf_uprobe_opts {
 	/* size of this struct, for forward/backward compatiblity */
 	size_t sz;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 94b589ecfeaa..63c059a02a0d 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -356,6 +356,7 @@ LIBBPF_0.8.0 {
 LIBBPF_1.0.0 {
 	global:
 		bpf_prog_query_opts;
+		bpf_program__attach_ksyscall;
 		btf__add_enum64;
 		btf__add_enum64_value;
 		libbpf_bpf_attach_type_str;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index 9cd7829cbe41..f01dbab49da9 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -352,6 +352,8 @@ enum kern_feature_id {
 	FEAT_BPF_COOKIE,
 	/* BTF_KIND_ENUM64 support and BTF_KIND_ENUM kflag support */
 	FEAT_BTF_ENUM64,
+	/* Kernel uses syscall wrapper (CONFIG_ARCH_HAS_SYSCALL_WRAPPER) */
+	FEAT_SYSCALL_WRAPPER,
 	__FEAT_CNT,
 };
 
-- 
2.30.2

next prev parent reply	other threads:[~2022-07-14  7:08 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-14  7:07 [PATCH v2 bpf-next 0/5] Add SEC("ksyscall") support Andrii Nakryiko
2022-07-14  7:07 ` [PATCH v2 bpf-next 1/5] libbpf: generalize virtual __kconfig externs and use it for USDT Andrii Nakryiko
2022-07-14  7:07 ` [PATCH v2 bpf-next 2/5] selftests/bpf: add test of __weak unknown virtual __kconfig extern Andrii Nakryiko
2022-07-14  7:07 ` [PATCH v2 bpf-next 3/5] libbpf: improve BPF_KPROBE_SYSCALL macro and rename it to BPF_KSYSCALL Andrii Nakryiko
2022-07-14  7:07 ` Andrii Nakryiko [this message]
2022-07-14  7:07 ` [PATCH v2 bpf-next 5/5] selftests/bpf: use BPF_KSYSCALL and SEC("ksyscall") in selftests Andrii Nakryiko
2022-07-15 23:28   ` Ilya Leoshkevich
2022-07-15 23:54     ` Ilya Leoshkevich
2022-07-14 18:31 ` [PATCH v2 bpf-next 0/5] Add SEC("ksyscall") support sdf
2022-07-19 16:50 ` patchwork-bot+netdevbpf

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:eb35a20a33f dfblob:ba96760936f dfblob:e4d5353f757
dfblob:d10234aaa8d dfblob:94b589ecfea dfblob:63c059a02a0
dfblob:9cd7829cbe4 dfblob:f01dbab49da )
 OR (
bs:"[PATCH v2 bpf-next 4/5] libbpf: add ksyscall/kretsyscall sections support for syscall kprobes" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220714070755.3235561-5-andrii@kernel.org \
    --to=andrii@kernel.org \
    --cc=alan.maguire@oracle.com \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=kernel-team@fb.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox