Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next v3 09/10] tools: bpf: sync bpf uapi header
From: Sandipan Das @ 2018-05-22 17:16 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527008646.git.sandipan@linux.vnet.ibm.com>

Syncing the bpf.h uapi header with tools so that struct
bpf_prog_info has the two new fields for passing on the
JITed image lengths of each function in a multi-function
program.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 tools/include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c44105f27da9..8c3109b5d6d3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2206,7 +2206,9 @@ struct bpf_prog_info {
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 nr_jited_ksyms;
+	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
+	__aligned_u64 jited_func_lens;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v3 08/10] bpf: get JITed image lengths of functions via syscall
From: Sandipan Das @ 2018-05-22 17:16 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527008646.git.sandipan@linux.vnet.ibm.com>

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of the JITed image lengths of each function for a
given program to userspace using the bpf system call with
the BPF_OBJ_GET_INFO_BY_FD command.

This can be used by userspace applications like bpftool
to split up the contiguous JITed dump, also obtained via
the system call, into more relatable chunks corresponding
to each function.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c44105f27da9..8c3109b5d6d3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2206,7 +2206,9 @@ struct bpf_prog_info {
 	__u64 netns_dev;
 	__u64 netns_ino;
 	__u32 nr_jited_ksyms;
+	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
+	__aligned_u64 jited_func_lens;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1c4cba91e523..faadbcd90191 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2036,6 +2036,26 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_func_lens;
+	info.nr_jited_func_lens = prog->aux->func_cnt;
+	if (info.nr_jited_func_lens && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u32 __user *user_lens;
+			u32 func_len, i;
+
+			/* copy the JITed image lengths for each function */
+			ulen = min_t(u32, info.nr_jited_func_lens, ulen);
+			user_lens = u64_to_user_ptr(info.jited_func_lens);
+			for (i = 0; i < ulen; i++) {
+				func_len = prog->aux->func[i]->jited_len;
+				if (put_user(func_len, &user_lens[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_func_lens = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v3 07/10] bpf: fix multi-function JITed dump obtained via syscall
From: Sandipan Das @ 2018-05-22 17:16 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527008646.git.sandipan@linux.vnet.ibm.com>

Currently, for multi-function programs, we cannot get the JITed
instructions using the bpf system call's BPF_OBJ_GET_INFO_BY_FD
command. Because of this, userspace tools such as bpftool fail
to identify a multi-function program as being JITed or not.

With the JIT enabled and the test program running, this can be
verified as follows:

  # cat /proc/sys/net/core/bpf_jit_enable
  1

Before applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T11:43:38+0530  uid 0
          xlated 216B  not jited  memlock 65536B
  ...

  # bpftool prog dump jited id 1
  no instructions returned

After applying this patch:

  # bpftool prog list
  1: kprobe  name foo  tag b811aab41a39ad3d  gpl
          loaded_at 2018-05-16T12:13:01+0530  uid 0
          xlated 216B  jited 308B  memlock 65536B
  ...

  # bpftool prog dump jited id 1
     0:   nop
     4:   nop
     8:   mflr    r0
     c:   std     r0,16(r1)
    10:   stdu    r1,-112(r1)
    14:   std     r31,104(r1)
    18:   addi    r31,r1,48
    1c:   li      r3,10
  ...

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 kernel/bpf/syscall.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f0ad4b5f0224..1c4cba91e523 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1970,13 +1970,43 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	 * for offload.
 	 */
 	ulen = info.jited_prog_len;
-	info.jited_prog_len = prog->jited_len;
+	if (prog->aux->func_cnt) {
+		u32 i;
+
+		info.jited_prog_len = 0;
+		for (i = 0; i < prog->aux->func_cnt; i++)
+			info.jited_prog_len += prog->aux->func[i]->jited_len;
+	} else {
+		info.jited_prog_len = prog->jited_len;
+	}
+
 	if (info.jited_prog_len && ulen) {
 		if (bpf_dump_raw_ok()) {
 			uinsns = u64_to_user_ptr(info.jited_prog_insns);
 			ulen = min_t(u32, info.jited_prog_len, ulen);
-			if (copy_to_user(uinsns, prog->bpf_func, ulen))
-				return -EFAULT;
+
+			/* for multi-function programs, copy the JITed
+			 * instructions for all the functions
+			 */
+			if (prog->aux->func_cnt) {
+				u32 len, free, i;
+				u8 *img;
+
+				free = ulen;
+				for (i = 0; i < prog->aux->func_cnt; i++) {
+					len = prog->aux->func[i]->jited_len;
+					img = (u8 *) prog->aux->func[i]->bpf_func;
+					if (len > free)
+						break;
+					if (copy_to_user(uinsns, img, len))
+						return -EFAULT;
+					uinsns += len;
+					free -= len;
+				}
+			} else {
+				if (copy_to_user(uinsns, prog->bpf_func, ulen))
+					return -EFAULT;
+			}
 		} else {
 			info.jited_prog_insns = 0;
 		}
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v3 06/10] tools: bpftool: resolve calls without using imm field
From: Sandipan Das @ 2018-05-22 17:16 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527008646.git.sandipan@linux.vnet.ibm.com>

Currently, we resolve the callee's address for a JITed function
call by using the imm field of the call instruction as an offset
from __bpf_call_base. If bpf_jit_kallsyms is enabled, we further
use this address to get the callee's kernel symbol's name.

For some architectures, such as powerpc64, the imm field is not
large enough to hold this offset. So, instead of assigning this
offset to the imm field, the verifier now assigns the subprog
id. Also, a list of kernel symbol addresses for all the JITed
functions is provided in the program info. We now use the imm
field as an index for this list to lookup a callee's symbol's
address and resolve its name.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v3:
 - Avoid using redundant pointers.
 - Fix indentation.

v2:
 - Order variables from longest to shortest.
 - Make sure that ksyms_ptr and ksyms_len are always initialized.
 - Simplify code.
---
 tools/bpf/bpftool/prog.c          | 24 ++++++++++++++++++++++++
 tools/bpf/bpftool/xlated_dumper.c | 10 +++++++++-
 tools/bpf/bpftool/xlated_dumper.h |  2 ++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 9bdfdf2d3fbe..e05ab58d39e2 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -420,7 +420,9 @@ static int do_show(int argc, char **argv)
 
 static int do_dump(int argc, char **argv)
 {
+	unsigned long *func_ksyms = NULL;
 	struct bpf_prog_info info = {};
+	unsigned int nr_func_ksyms;
 	struct dump_data dd = {};
 	__u32 len = sizeof(info);
 	unsigned int buf_size;
@@ -496,10 +498,22 @@ static int do_dump(int argc, char **argv)
 		return -1;
 	}
 
+	nr_func_ksyms = info.nr_jited_ksyms;
+	if (nr_func_ksyms) {
+		func_ksyms = malloc(nr_func_ksyms * sizeof(__u64));
+		if (!func_ksyms) {
+			p_err("mem alloc failed");
+			close(fd);
+			goto err_free;
+		}
+	}
+
 	memset(&info, 0, sizeof(info));
 
 	*member_ptr = ptr_to_u64(buf);
 	*member_len = buf_size;
+	info.jited_ksyms = ptr_to_u64(func_ksyms);
+	info.nr_jited_ksyms = nr_func_ksyms;
 
 	err = bpf_obj_get_info_by_fd(fd, &info, &len);
 	close(fd);
@@ -513,6 +527,11 @@ static int do_dump(int argc, char **argv)
 		goto err_free;
 	}
 
+	if (info.nr_jited_ksyms > nr_func_ksyms) {
+		p_err("too many addresses returned");
+		goto err_free;
+	}
+
 	if ((member_len == &info.jited_prog_len &&
 	     info.jited_prog_insns == 0) ||
 	    (member_len == &info.xlated_prog_len &&
@@ -558,6 +577,9 @@ static int do_dump(int argc, char **argv)
 			dump_xlated_cfg(buf, *member_len);
 	} else {
 		kernel_syms_load(&dd);
+		dd.nr_jited_ksyms = info.nr_jited_ksyms;
+		dd.jited_ksyms = (__u64 *) info.jited_ksyms;
+
 		if (json_output)
 			dump_xlated_json(&dd, buf, *member_len, opcodes);
 		else
@@ -566,10 +588,12 @@ static int do_dump(int argc, char **argv)
 	}
 
 	free(buf);
+	free(func_ksyms);
 	return 0;
 
 err_free:
 	free(buf);
+	free(func_ksyms);
 	return -1;
 }
 
diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c
index 7a3173b76c16..efdc8fecf2bb 100644
--- a/tools/bpf/bpftool/xlated_dumper.c
+++ b/tools/bpf/bpftool/xlated_dumper.c
@@ -174,7 +174,11 @@ static const char *print_call_pcrel(struct dump_data *dd,
 				    unsigned long address,
 				    const struct bpf_insn *insn)
 {
-	if (sym)
+	if (!dd->nr_jited_ksyms)
+		/* Do not show address for interpreted programs */
+		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
+			"%+d", insn->off);
+	else if (sym)
 		snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
 			 "%+d#%s", insn->off, sym->name);
 	else
@@ -203,6 +207,10 @@ static const char *print_call(void *private_data,
 	unsigned long address = dd->address_call_base + insn->imm;
 	struct kernel_sym *sym;
 
+	if (insn->src_reg == BPF_PSEUDO_CALL &&
+	    (__u32) insn->imm < dd->nr_jited_ksyms)
+		address = dd->jited_ksyms[insn->imm];
+
 	sym = kernel_syms_search(dd, address);
 	if (insn->src_reg == BPF_PSEUDO_CALL)
 		return print_call_pcrel(dd, sym, address, insn);
diff --git a/tools/bpf/bpftool/xlated_dumper.h b/tools/bpf/bpftool/xlated_dumper.h
index b34affa7ef2d..eafbb49c8d0b 100644
--- a/tools/bpf/bpftool/xlated_dumper.h
+++ b/tools/bpf/bpftool/xlated_dumper.h
@@ -49,6 +49,8 @@ struct dump_data {
 	unsigned long address_call_base;
 	struct kernel_sym *sym_mapping;
 	__u32 sym_count;
+	__u64 *jited_ksyms;
+	__u32 nr_jited_ksyms;
 	char scratch_buff[SYM_MAX_NAME + 8];
 };
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v3 05/10] tools: bpf: sync bpf uapi header
From: Sandipan Das @ 2018-05-22 17:16 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527008646.git.sandipan@linux.vnet.ibm.com>

Syncing the bpf.h uapi header with tools so that struct
bpf_prog_info has the two new fields for passing on the
addresses of the kernel symbols corresponding to each
function in a program.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v3:
 - Move new fields to the end of bpf_prog_info to avoid
   breaking userspace.
---
 tools/include/uapi/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 97446bbe2ca5..c44105f27da9 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2205,6 +2205,8 @@ struct bpf_prog_info {
 	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 nr_jited_ksyms;
+	__aligned_u64 jited_ksyms;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v3 04/10] bpf: get kernel symbol addresses via syscall
From: Sandipan Das @ 2018-05-22 17:16 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527008646.git.sandipan@linux.vnet.ibm.com>

This adds new two new fields to struct bpf_prog_info. For
multi-function programs, these fields can be used to pass
a list of kernel symbol addresses for all functions in a
given program to userspace using the bpf system call with
the BPF_OBJ_GET_INFO_BY_FD command.

When bpf_jit_kallsyms is enabled, we can get the address
of the corresponding kernel symbol for a callee function
and resolve the symbol's name. The address is determined
by adding the value of the call instruction's imm field
to __bpf_call_base. This offset gets assigned to the imm
field by the verifier.

For some architectures, such as powerpc64, the imm field
is not large enough to hold this offset.

We resolve this by:

[1] Assigning the subprog id to the imm field of a call
    instruction in the verifier instead of the offset of
    the callee's symbol's address from __bpf_call_base.

[2] Determining the address of a callee's corresponding
    symbol by using the imm field as an index for the
    list of kernel symbol addresses now available from
    the program info.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v3:
 - Copy addresses to jited_ksyms only if bpf_dump_raw_ok()
   is true.
 - Move new fields to the end of bpf_prog_info to avoid
   breaking userspace.
---
 include/uapi/linux/bpf.h |  2 ++
 kernel/bpf/syscall.c     | 25 +++++++++++++++++++++++++
 kernel/bpf/verifier.c    |  7 +------
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97446bbe2ca5..c44105f27da9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2205,6 +2205,8 @@ struct bpf_prog_info {
 	__u32 gpl_compatible:1;
 	__u64 netns_dev;
 	__u64 netns_ino;
+	__u32 nr_jited_ksyms;
+	__aligned_u64 jited_ksyms;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bfcde949c7f8..f0ad4b5f0224 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1933,6 +1933,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
+		info.nr_jited_ksyms = 0;
 		goto done;
 	}
 
@@ -1981,6 +1982,30 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 		}
 	}
 
+	ulen = info.nr_jited_ksyms;
+	info.nr_jited_ksyms = prog->aux->func_cnt;
+	if (info.nr_jited_ksyms && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u64 __user *user_ksyms;
+			ulong ksym_addr;
+			u32 i;
+
+			/* copy the address of the kernel symbol
+			 * corresponding to each function
+			 */
+			ulen = min_t(u32, info.nr_jited_ksyms, ulen);
+			user_ksyms = u64_to_user_ptr(info.jited_ksyms);
+			for (i = 0; i < ulen; i++) {
+				ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
+				ksym_addr &= PAGE_MASK;
+				if (put_user((u64) ksym_addr, &user_ksyms[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_ksyms = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 559cb74ba29e..8c4d9d0fd3ab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5426,17 +5426,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * later look the same as if they were interpreted only.
 	 */
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-		unsigned long addr;
-
 		if (insn->code != (BPF_JMP | BPF_CALL) ||
 		    insn->src_reg != BPF_PSEUDO_CALL)
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
 		subprog = find_subprog(env, i + insn->off + 1);
-		addr  = (unsigned long)func[subprog]->bpf_func;
-		addr &= PAGE_MASK;
-		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
-			    addr - __bpf_call_base;
+		insn->imm = subprog;
 	}
 
 	prog->jited = 1;
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v3 02/10] bpf: powerpc64: pad function address loads with NOPs
From: Sandipan Das @ 2018-05-22 17:16 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527008646.git.sandipan@linux.vnet.ibm.com>

For multi-function programs, loading the address of a callee
function to a register requires emitting instructions whose
count varies from one to five depending on the nature of the
address.

Since we come to know of the callee's address only before the
extra pass, the number of instructions required to load this
address may vary from what was previously generated. This can
make the JITed image grow or shrink.

To avoid this, we should generate a constant five-instruction
when loading function addresses by padding the optimized load
sequence with NOPs.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 1bdb1aff0619..e4582744a31d 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
 
 static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
 {
+	unsigned int i, ctx_idx = ctx->idx;
+
+	/* Load function address into r12 */
+	PPC_LI64(12, func);
+
+	/* For bpf-to-bpf function calls, the callee's address is unknown
+	 * until the last extra pass. As seen above, we use PPC_LI64() to
+	 * load the callee's address, but this may optimize the number of
+	 * instructions required based on the nature of the address.
+	 *
+	 * Since we don't want the number of instructions emitted to change,
+	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
+	 * we always have a five-instruction sequence, which is the maximum
+	 * that PPC_LI64() can emit.
+	 */
+	for (i = ctx->idx - ctx_idx; i < 5; i++)
+		PPC_NOP();
+
 #ifdef PPC64_ELF_ABI_v1
-	/* func points to the function descriptor */
-	PPC_LI64(b2p[TMP_REG_2], func);
-	/* Load actual entry point from function descriptor */
-	PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
-	/* ... and move it to LR */
-	PPC_MTLR(b2p[TMP_REG_1]);
 	/*
 	 * Load TOC from function descriptor at offset 8.
 	 * We can clobber r2 since we get called through a
 	 * function pointer (so caller will save/restore r2)
 	 * and since we don't use a TOC ourself.
 	 */
-	PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
-#else
-	/* We can clobber r12 */
-	PPC_FUNC_ADDR(12, func);
-	PPC_MTLR(12);
+	PPC_BPF_LL(2, 12, 8);
+	/* Load actual entry point from function descriptor */
+	PPC_BPF_LL(12, 12, 0);
 #endif
+
+	PPC_MTLR(12);
 	PPC_BLRL();
 }
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v3 03/10] bpf: powerpc64: add JIT support for multi-function programs
From: Sandipan Das @ 2018-05-22 17:16 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527008646.git.sandipan@linux.vnet.ibm.com>

This adds support for bpf-to-bpf function calls in the powerpc64
JIT compiler. The JIT compiler converts the bpf call instructions
to native branch instructions. After a round of the usual passes,
the start addresses of the JITed images for the callee functions
are known. Finally, to fixup the branch target addresses, we need
to perform an extra pass.

Because of the address range in which JITed images are allocated
on powerpc64, the offsets of the start addresses of these images
from __bpf_call_base are as large as 64 bits. So, for a function
call, we cannot use the imm field of the instruction to determine
the callee's address. Instead, we use the alternative method of
getting it from the list of function addresses in the auxiliary
data of the caller by using the off field as an index.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
v3:
 - Fix memory leak for jit_data when we fail to allocated addrs.
 - Remove unnecessary bpf_jit_binary_lock_ro() call.
---
 arch/powerpc/net/bpf_jit_comp64.c | 76 +++++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index e4582744a31d..f1c95779843b 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -268,7 +268,7 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32
 /* Assemble the body code between the prologue & epilogue */
 static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			      struct codegen_context *ctx,
-			      u32 *addrs)
+			      u32 *addrs, bool extra_pass)
 {
 	const struct bpf_insn *insn = fp->insnsi;
 	int flen = fp->len;
@@ -724,11 +724,25 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 			break;
 
 		/*
-		 * Call kernel helper
+		 * Call kernel helper or bpf function
 		 */
 		case BPF_JMP | BPF_CALL:
 			ctx->seen |= SEEN_FUNC;
-			func = (u8 *) __bpf_call_base + imm;
+
+			/* bpf function call */
+			if (insn[i].src_reg == BPF_PSEUDO_CALL)
+				if (!extra_pass)
+					func = NULL;
+				else if (fp->aux->func && off < fp->aux->func_cnt)
+					/* use the subprog id from the off
+					 * field to lookup the callee address
+					 */
+					func = (u8 *) fp->aux->func[off]->bpf_func;
+				else
+					return -EINVAL;
+			/* kernel helper call */
+			else
+				func = (u8 *) __bpf_call_base + imm;
 
 			bpf_jit_emit_func_call(image, ctx, (u64)func);
 
@@ -876,6 +890,14 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
 	return 0;
 }
 
+struct powerpc64_jit_data {
+	struct bpf_binary_header *header;
+	u32 *addrs;
+	u8 *image;
+	u32 proglen;
+	struct codegen_context ctx;
+};
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 {
 	u32 proglen;
@@ -883,6 +905,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	u8 *image = NULL;
 	u32 *code_base;
 	u32 *addrs;
+	struct powerpc64_jit_data *jit_data;
 	struct codegen_context cgctx;
 	int pass;
 	int flen;
@@ -890,6 +913,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	struct bpf_prog *org_fp = fp;
 	struct bpf_prog *tmp_fp;
 	bool bpf_blinded = false;
+	bool extra_pass = false;
 
 	if (!fp->jit_requested)
 		return org_fp;
@@ -903,11 +927,32 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 		fp = tmp_fp;
 	}
 
+	jit_data = fp->aux->jit_data;
+	if (!jit_data) {
+		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+		if (!jit_data) {
+			fp = org_fp;
+			goto out;
+		}
+		fp->aux->jit_data = jit_data;
+	}
+
 	flen = fp->len;
+	addrs = jit_data->addrs;
+	if (addrs) {
+		cgctx = jit_data->ctx;
+		image = jit_data->image;
+		bpf_hdr = jit_data->header;
+		proglen = jit_data->proglen;
+		alloclen = proglen + FUNCTION_DESCR_SIZE;
+		extra_pass = true;
+		goto skip_init_ctx;
+	}
+
 	addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL);
 	if (addrs == NULL) {
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
 	memset(&cgctx, 0, sizeof(struct codegen_context));
@@ -916,10 +961,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	cgctx.stack_size = round_up(fp->aux->stack_depth, 16);
 
 	/* Scouting faux-generate pass 0 */
-	if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) {
+	if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) {
 		/* We hit something illegal or unsupported. */
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
 	/*
@@ -937,9 +982,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 			bpf_jit_fill_ill_insns);
 	if (!bpf_hdr) {
 		fp = org_fp;
-		goto out;
+		goto out_addrs;
 	}
 
+skip_init_ctx:
 	code_base = (u32 *)(image + FUNCTION_DESCR_SIZE);
 
 	/* Code generation passes 1-2 */
@@ -947,7 +993,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 		/* Now build the prologue, body code & epilogue for real. */
 		cgctx.idx = 0;
 		bpf_jit_build_prologue(code_base, &cgctx);
-		bpf_jit_build_body(fp, code_base, &cgctx, addrs);
+		bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass);
 		bpf_jit_build_epilogue(code_base, &cgctx);
 
 		if (bpf_jit_enable > 1)
@@ -973,10 +1019,20 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	fp->jited_len = alloclen;
 
 	bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));
+	if (!fp->is_func || extra_pass) {
+out_addrs:
+		kfree(addrs);
+		kfree(jit_data);
+		fp->aux->jit_data = NULL;
+	} else {
+		jit_data->addrs = addrs;
+		jit_data->ctx = cgctx;
+		jit_data->proglen = proglen;
+		jit_data->image = image;
+		jit_data->header = bpf_hdr;
+	}
 
 out:
-	kfree(addrs);
-
 	if (bpf_blinded)
 		bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp);
 
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v3 01/10] bpf: support 64-bit offsets for bpf function calls
From: Sandipan Das @ 2018-05-22 17:16 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527008646.git.sandipan@linux.vnet.ibm.com>

The imm field of a bpf instruction is a signed 32-bit integer.
For JITed bpf-to-bpf function calls, it holds the offset of the
start address of the callee's JITed image from __bpf_call_base.

For some architectures, such as powerpc64, this offset may be
as large as 64 bits and cannot be accomodated in the imm field
without truncation.

We resolve this by:

[1] Additionally using the auxillary data of each function to
    keep a list of start addresses of the JITed images for all
    functions determined by the verifier.

[2] Retaining the subprog id inside the off field of the call
    instructions and using it to index into the list mentioned
    above and lookup the callee's address.

To make sure that the existing JIT compilers continue to work
without requiring changes, we keep the imm field as it is.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 kernel/bpf/verifier.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a9e4b1372da6..559cb74ba29e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5383,11 +5383,24 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			    insn->src_reg != BPF_PSEUDO_CALL)
 				continue;
 			subprog = insn->off;
-			insn->off = 0;
 			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 				func[subprog]->bpf_func -
 				__bpf_call_base;
 		}
+
+		/* we use the aux data to keep a list of the start addresses
+		 * of the JITed images for each function in the program
+		 *
+		 * for some architectures, such as powerpc64, the imm field
+		 * might not be large enough to hold the offset of the start
+		 * address of the callee's JITed image from __bpf_call_base
+		 *
+		 * in such cases, we can lookup the start address of a callee
+		 * by using its subprog id, available from the off field of
+		 * the call instruction, as an index for this list
+		 */
+		func[i]->aux->func = func;
+		func[i]->aux->func_cnt = env->subprog_cnt;
 	}
 	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v3 00/10] bpf: enhancements for multi-function programs
From: Sandipan Das @ 2018-05-22 17:16 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski

v3:
 - Change base tree tag to bpf-next.
 - Incorporate review comments from Alexei, Daniel and Jakub.
 - Make sure that the JITed image does not grow or shrink after
   the last pass due to the way the instruction sequence used
   to load a callee's address maybe optimized.
 - Make additional changes to the bpf system call and bpftool to
   make multi-function JITed dumps easier to correlate.

v2:
 - Incorporate review comments from Jakub.

Sandipan Das (10):
  bpf: support 64-bit offsets for bpf function calls
  bpf: powerpc64: pad function address loads with NOPs
  bpf: powerpc64: add JIT support for multi-function programs
  bpf: get kernel symbol addresses via syscall
  tools: bpf: sync bpf uapi header
  tools: bpftool: resolve calls without using imm field
  bpf: fix multi-function JITed dump obtained via syscall
  bpf: get JITed image lengths of functions via syscall
  tools: bpf: sync bpf uapi header
  tools: bpftool: add delimiters to multi-function JITed dumps

 arch/powerpc/net/bpf_jit_comp64.c | 110 ++++++++++++++++++++++++++++++--------
 include/uapi/linux/bpf.h          |   4 ++
 kernel/bpf/syscall.c              |  81 ++++++++++++++++++++++++++--
 kernel/bpf/verifier.c             |  22 +++++---
 tools/bpf/bpftool/prog.c          |  75 +++++++++++++++++++++++++-
 tools/bpf/bpftool/xlated_dumper.c |  14 +++--
 tools/bpf/bpftool/xlated_dumper.h |   3 ++
 tools/include/uapi/linux/bpf.h    |   4 ++
 8 files changed, 278 insertions(+), 35 deletions(-)

-- 
2.14.3

^ permalink raw reply

* Re: [PATCH 30/31] timerfd: convert to ->poll_mask
From: Sergei Shtylyov @ 2018-05-22 16:59 UTC (permalink / raw)
  To: Christoph Hellwig, viro
  Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180522113108.25713-31-hch@lst.de>

Hello!

On 05/22/2018 02:31 PM, Christoph Hellwig wrote:

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/timerfd.c | 22 +++++++++++-----------
>  1 file changed, 11 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/timerfd.c b/fs/timerfd.c
> index cdad49da3ff7..d84a2bee4f82 100644
> --- a/fs/timerfd.c
> +++ b/fs/timerfd.c
> @@ -226,21 +226,20 @@ static int timerfd_release(struct inode *inode, struct file *file)
>  	kfree_rcu(ctx, rcu);
>  	return 0;
>  }
> -
> -static __poll_t timerfd_poll(struct file *file, poll_table *wait)
> +	

   Stray tab detected! :-)

> +static struct wait_queue_head *timerfd_get_poll_head(struct file *file,
> +		__poll_t eventmask)
>  {
>  	struct timerfd_ctx *ctx = file->private_data;
> -	__poll_t events = 0;
> -	unsigned long flags;
>  
> -	poll_wait(file, &ctx->wqh, wait);
> +	return &ctx->wqh;
> +}
>  
> -	spin_lock_irqsave(&ctx->wqh.lock, flags);
> -	if (ctx->ticks)
> -		events |= EPOLLIN;
> -	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> +static __poll_t timerfd_poll_mask(struct file *file, __poll_t eventmask)
> +{
> +	struct timerfd_ctx *ctx = file->private_data;
>  
> -	return events;
> +	return ctx->ticks ? EPOLLIN : 0;
>  }
>  
>  static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
[...]

MBR, Sergei

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply

* Re: [RFC V4 PATCH 7/8] vhost: packed ring support
From: Wei Xu @ 2018-05-22 16:54 UTC (permalink / raw)
  To: Jason Wang
  Cc: mst, kvm, virtualization, netdev, linux-kernel, jfreimann,
	tiwei.bie
In-Reply-To: <1526473941-16199-8-git-send-email-jasowang@redhat.com>

On Wed, May 16, 2018 at 08:32:20PM +0800, Jason Wang wrote:
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/net.c   |   3 +-
>  drivers/vhost/vhost.c | 539 ++++++++++++++++++++++++++++++++++++++++++++++----
>  drivers/vhost/vhost.h |   8 +-
>  3 files changed, 513 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 8304c30..f2a0f5b 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -1358,6 +1382,8 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
>  			break;
>  		}
>  		vq->last_avail_idx = s.num;
> +		if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
> +			vq->avail_wrap_counter = s.num >> 31;
>  		/* Forget the cached index value. */
>  		vq->avail_idx = vq->last_avail_idx;
>  		break;
> @@ -1366,6 +1392,8 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
>  		s.num = vq->last_avail_idx;
>  		if (copy_to_user(argp, &s, sizeof s))
>  			r = -EFAULT;
> +		if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
> +			s.num |= vq->avail_wrap_counter << 31;
>  		break;
>  	case VHOST_SET_VRING_ADDR:
>  		if (copy_from_user(&a, argp, sizeof a)) {

'last_used_idx' also needs to be saved/restored here.

I have figured out the root cause of broken device after reloading
'virtio-net' module, all indices have been reset for a reloading but
'last_used_idx' is not properly reset in this case. This confuses
handle_rx()/tx().

Wei

^ permalink raw reply

* Re: [PATCH net-next 2/2] tcp: do not aggressively quick ack after ECN events
From: Yuchung Cheng @ 2018-05-22 16:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, netdev, Van Jacobson, Neal Cardwell,
	Soheil Hassas Yeganeh, Eric Dumazet
In-Reply-To: <20180521220857.229273-3-edumazet@google.com>

On Mon, May 21, 2018 at 3:08 PM, Eric Dumazet <edumazet@google.com> wrote:
> ECN signals currently forces TCP to enter quickack mode for
> up to 16 (TCP_MAX_QUICKACKS) following incoming packets.
>
> We believe this is not needed, and only sending one immediate ack
> for the current packet should be enough.
>
> This should reduce the extra load noticed in DCTCP environments,
> after congestion events.
>
> This is part 2 of our effort to reduce pure ACK packets.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
Acked-by: Yuchung Cheng <ycheng@google.com>

Thanks for this patch. I am still wondering how much does the "funny
extension" help. but this patch definitely reduce the amount of
unnecessary immediate ACKs on ECN.

>  net/ipv4/tcp_input.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 2e970e9f4e09d966b703af2d14d521a4328eba7e..1191cac72109f2f7e2b688ddbc1d404151d274d6 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -263,7 +263,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
>                  * it is probably a retransmit.
>                  */
>                 if (tp->ecn_flags & TCP_ECN_SEEN)
> -                       tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
> +                       tcp_enter_quickack_mode((struct sock *)tp, 1);
>                 break;
>         case INET_ECN_CE:
>                 if (tcp_ca_needs_ecn((struct sock *)tp))
> @@ -271,7 +271,7 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
>
>                 if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
>                         /* Better not delay acks, sender can have a very low cwnd */
> -                       tcp_enter_quickack_mode((struct sock *)tp, TCP_MAX_QUICKACKS);
> +                       tcp_enter_quickack_mode((struct sock *)tp, 1);
>                         tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
>                 }
>                 tp->ecn_flags |= TCP_ECN_SEEN;
> --
> 2.17.0.441.gb46fe60e1d-goog
>

^ permalink raw reply

* Re: [PATCH net-next v11 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Michael S. Tsirkin @ 2018-05-22 16:52 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Sridhar Samudrala, stephen, davem, netdev, virtualization,
	virtio-dev, jesse.brandeburg, alexander.h.duyck, kubakici,
	jasowang, loseweigh, aaron.f.brown, anjali.singhai
In-Reply-To: <20180522154501.GL2149@nanopsycho>

On Tue, May 22, 2018 at 05:45:01PM +0200, Jiri Pirko wrote:
> Tue, May 22, 2018 at 05:32:30PM CEST, mst@redhat.com wrote:
> >On Tue, May 22, 2018 at 05:13:43PM +0200, Jiri Pirko wrote:
> >> Tue, May 22, 2018 at 03:39:33PM CEST, mst@redhat.com wrote:
> >> >On Tue, May 22, 2018 at 03:26:26PM +0200, Jiri Pirko wrote:
> >> >> Tue, May 22, 2018 at 03:17:37PM CEST, mst@redhat.com wrote:
> >> >> >On Tue, May 22, 2018 at 03:14:22PM +0200, Jiri Pirko wrote:
> >> >> >> Tue, May 22, 2018 at 03:12:40PM CEST, mst@redhat.com wrote:
> >> >> >> >On Tue, May 22, 2018 at 11:08:53AM +0200, Jiri Pirko wrote:
> >> >> >> >> Tue, May 22, 2018 at 11:06:37AM CEST, jiri@resnulli.us wrote:
> >> >> >> >> >Tue, May 22, 2018 at 04:06:18AM CEST, sridhar.samudrala@intel.com wrote:
> >> >> >> >> >>Use the registration/notification framework supported by the generic
> >> >> >> >> >>failover infrastructure.
> >> >> >> >> >>
> >> >> >> >> >>Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> >> >> >> >> >
> >> >> >> >> >In previous patchset versions, the common code did
> >> >> >> >> >netdev_rx_handler_register() and netdev_upper_dev_link() etc
> >> >> >> >> >(netvsc_vf_join()). Now, this is still done in netvsc. Why?
> >> >> >> >> >
> >> >> >> >> >This should be part of the common "failover" code.
> >> >> >> >> >
> >> >> >> >> 
> >> >> >> >> Also note that in the current patchset you use IFF_FAILOVER flag for
> >> >> >> >> master, yet for the slave you use IFF_SLAVE. That is wrong.
> >> >> >> >> IFF_FAILOVER_SLAVE should be used.
> >> >> >> >
> >> >> >> >Or drop IFF_FAILOVER_SLAVE and set both IFF_FAILOVER and IFF_SLAVE?
> >> >> >> 
> >> >> >> No. IFF_SLAVE is for bonding.
> >> >> >
> >> >> >What breaks if we reuse it for failover?
> >> >> 
> >> >> This is exposed to userspace. IFF_SLAVE is expected for bonding slaves.
> >> >> And failover slave is not a bonding slave.
> >> >
> >> >That does not really answer the question.  I'd claim it's sufficiently
> >> >like a bond slave for IFF_SLAVE to make sense.
> >> >
> >> >In fact you will find that netvsc already sets IFF_SLAVE, and so
> >> 
> >> netvsc does the whole failover thing in a wrong way. This patchset is
> >> trying to fix it.
> >
> >Maybe, but we don't need gratuitous changes either, especially if they
> >break userspace.
> 
> What do you mean by the "break"? It was a mistake to reuse IFF_SLAVE at
> the first place, lets fix it. If some userspace depends on that flag, it
> is broken anyway.
> 
> 
> >
> >> >does e.g. the eql driver.
> >> >
> >> >The advantage of using IFF_SLAVE is that userspace knows to skip it.  If
> >> 
> >> The userspace should know how to skip other types of slaves - team,
> >> bridge, ovs, etc.
> >> The "master link" should be the one to look at.
> >> 
> >
> >How should existing userspace know which ones to skip and which one is
> >the master?  Right now userspace seems to assume whatever does not have
> >IFF_SLAVE should be looked at. Are you saying that's not the right thing
> 
> Why do you say so? What do you mean by "looked at"? Certainly not.
> IFLA_MASTER is the attribute that should be looked at, nothing else.
> 
> 
> >to do and userspace should be fixed? What should userspace do in
> >your opinion that will be forward compatible with future kernels?
> >
> >> 
> >> >we don't set IFF_SLAVE existing userspace tries to use the lowerdev.
> >> 
> >> Each master type has a IFF_ master flag and IFF_ slave flag.
> >
> >Could you give some examples please?
> 
> enum netdev_priv_flags {
>         IFF_EBRIDGE                     = 1<<1,
>         IFF_BRIDGE_PORT                 = 1<<9,
>         IFF_OPENVSWITCH                 = 1<<20,
>         IFF_OVS_DATAPATH                = 1<<10,
> 	IFF_L3MDEV_MASTER               = 1<<18,
>         IFF_L3MDEV_SLAVE                = 1<<21,
>         IFF_TEAM                        = 1<<22,
>         IFF_TEAM_PORT                   = 1<<13,
> };

That's not in uapi, is it?  the comment above that says:

These flags are invisible to userspace



> 
> >
> >> In private
> >> flag. I don't see no reason to break this pattern here.
> >
> >Other masters are setup from userspace, this one is set up automatically
> >by kernel. So the bar is higher, we need an interface that existing
> >userspace knows about.  We can't just say "oh if userspace set this up
> >it should know to skip lowerdevs".
> >
> >Otherwise multiple interfaces with same mac tend to confuse userspace.
> 
> No difference, really.
> Regardless who does the setup, and independent userspace deamon should
> react accordingly.

If the deamon does the setup itself, it's reasonable to require that it
learns about new flags each time we add a new driver.  If it doesn't,
then I think it's less reasonable.

-- 
MST

^ permalink raw reply

* [PATCH net] ibmvnic: Only do H_EOI for mobility events
From: Nathan Fontenot @ 2018-05-22 16:21 UTC (permalink / raw)
  To: netdev; +Cc: jallen, tlfalcon

When enabling the sub-CRQ IRQ a previous update sent a H_EOI prior
to the enablement to clear any pending interrupts that may be present
across a partition migration. This fixed a firmware bug where a
migration could erroneously indicate that a H_EOI was pending.

The H_EOI should only be sent when enabling during a mobility
event though. Doing so at other time could wrong and can produce
extra driver output when IRQs are enabled when doing TX completion.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
---
 drivers/net/ethernet/ibm/ibmvnic.c |   15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4bb4646a5f92..62cd3602c633 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -2617,18 +2617,21 @@ static int enable_scrq_irq(struct ibmvnic_adapter *adapter,
 {
 	struct device *dev = &adapter->vdev->dev;
 	unsigned long rc;
-	u64 val;
 
 	if (scrq->hw_irq > 0x100000000ULL) {
 		dev_err(dev, "bad hw_irq = %lx\n", scrq->hw_irq);
 		return 1;
 	}
 
-	val = (0xff000000) | scrq->hw_irq;
-	rc = plpar_hcall_norets(H_EOI, val);
-	if (rc)
-		dev_err(dev, "H_EOI FAILED irq 0x%llx. rc=%ld\n",
-			val, rc);
+	if (adapter->resetting &&
+	    adapter->reset_reason == VNIC_RESET_MOBILITY) {
+		u64 val = (0xff000000) | scrq->hw_irq;
+
+		rc = plpar_hcall_norets(H_EOI, val);
+		if (rc)
+			dev_err(dev, "H_EOI FAILED irq 0x%llx. rc=%ld\n",
+				val, rc);
+	}
 
 	rc = plpar_hcall_norets(H_VIOCTL, adapter->vdev->unit_address,
 				H_ENABLE_VIO_INTERRUPT, scrq->hw_irq, 0, 0);

^ permalink raw reply related

* [PATCH bpf-next v3 7/7] tools/bpftool: add perf subcommand
From: Yonghong Song @ 2018-05-22 16:31 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180522163114.3130198-1-yhs@fb.com>

The new command "bpftool perf [show | list]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information. Documentation is also added.

Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
  kprobe:     trace.py '__x64_sys_nanosleep'
  kretprobe:  trace.py 'r::__x64_sys_nanosleep'
  tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
  uprobe:     trace.py 'p:/home/yhs/a.out:main'

The bpftool command line and result:

  $ bpftool perf
  pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
  pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
  pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
  pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159

  $ bpftool -j perf
  [{"pid":21711,"fd":5,"prog_id":5,"attach_info":"kprobe","func":"__x64_sys_write","offset":0}, \
   {"pid":21765,"fd":5,"prog_id":7,"attach_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
   {"pid":21767,"fd":5,"prog_id":8,"attach_info":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
   {"pid":21800,"fd":5,"prog_id":9,"attach_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}]

  $ bpftool prog
  5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
	  loaded_at 2018-05-15T04:46:37-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 4
  7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
	  loaded_at 2018-05-15T04:48:32-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 7
  8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
	  loaded_at 2018-05-15T04:48:48-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 8
  9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
	  loaded_at 2018-05-15T04:49:52-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 9

  $ ps ax | grep "python ./trace.py"
  21711 pts/0    T      0:03 python ./trace.py __x64_sys_write
  21765 pts/0    S+     0:00 python ./trace.py r::__x64_sys_nanosleep
  21767 pts/2    S+     0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
  21800 pts/3    S+     0:00 python ./trace.py p:/home/yhs/a.out:main
  22374 pts/1    S+     0:00 grep --color=auto python ./trace.py

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 ++++++++
 tools/bpf/bpftool/Documentation/bpftool.rst      |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool        |   9 +
 tools/bpf/bpftool/main.c                         |   3 +-
 tools/bpf/bpftool/main.h                         |   1 +
 tools/bpf/bpftool/perf.c                         | 244 +++++++++++++++++++++++
 6 files changed, 341 insertions(+), 2 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
new file mode 100644
index 0000000..3e65375
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -0,0 +1,81 @@
+================
+bpftool-perf
+================
+-------------------------------------------------------------------------------
+tool for inspection of perf related bpf prog attachments
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **perf** *COMMAND*
+
+	*OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+	*COMMANDS* :=
+	{ **show** | **list** | **help** }
+
+PERF COMMANDS
+=============
+
+|	**bpftool** **perf { show | list }**
+|	**bpftool** **perf help**
+
+DESCRIPTION
+===========
+	**bpftool perf { show | list }**
+		  List all raw_tracepoint, tracepoint, kprobe attachment in the system.
+
+		  Output will start with process id and file descriptor in that process,
+		  followed by bpf program id, attachment information, and attachment point.
+		  The attachment point for raw_tracepoint/tracepoint is the trace probe name.
+		  The attachment point for k[ret]probe is either symbol name and offset,
+		  or a kernel virtual address.
+		  The attachment point for u[ret]probe is the file name and the file offset.
+
+	**bpftool perf help**
+		  Print short help message.
+
+OPTIONS
+=======
+	-h, --help
+		  Print short generic help message (similar to **bpftool help**).
+
+	-v, --version
+		  Print version number (similar to **bpftool version**).
+
+	-j, --json
+		  Generate JSON output. For commands that cannot produce JSON, this
+		  option has no effect.
+
+	-p, --pretty
+		  Generate human-readable JSON output. Implies **-j**.
+
+EXAMPLES
+========
+
+| **# bpftool perf**
+
+::
+
+      pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
+      pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
+      pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
+      pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159
+
+|
+| **# bpftool -j perf**
+
+::
+
+    [{"pid":21711,"fd":5,"prog_id":5,"attach_info":"kprobe","func":"__x64_sys_write","offset":0}, \
+     {"pid":21765,"fd":5,"prog_id":7,"attach_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
+     {"pid":21767,"fd":5,"prog_id":8,"attach_info":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
+     {"pid":21800,"fd":5,"prog_id":9,"attach_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}]
+
+
+SEE ALSO
+========
+	**bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 564cb0d..b6f5d56 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -16,7 +16,7 @@ SYNOPSIS
 
 	**bpftool** **version**
 
-	*OBJECT* := { **map** | **program** | **cgroup** }
+	*OBJECT* := { **map** | **program** | **cgroup** | **perf** }
 
 	*OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** }
 	| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
@@ -30,6 +30,8 @@ SYNOPSIS
 
 	*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
 
+	*PERF-COMMANDS* := { **show** | **list** | **help** }
+
 DESCRIPTION
 ===========
 	*bpftool* allows for inspection and simple modification of BPF objects
@@ -56,3 +58,4 @@ OPTIONS
 SEE ALSO
 ========
 	**bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
+        **bpftool-perf**\ (8)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index b301c9b..7bc198d 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -448,6 +448,15 @@ _bpftool()
                     ;;
             esac
             ;;
+        perf)
+            case $command in
+                *)
+                    [[ $prev == $object ]] && \
+                        COMPREPLY=( $( compgen -W 'help \
+                            show list' -- "$cur" ) )
+                    ;;
+            esac
+            ;;
     esac
 } &&
 complete -F _bpftool bpftool
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 1ec852d..eea7f14 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -87,7 +87,7 @@ static int do_help(int argc, char **argv)
 		"       %s batch file FILE\n"
 		"       %s version\n"
 		"\n"
-		"       OBJECT := { prog | map | cgroup }\n"
+		"       OBJECT := { prog | map | cgroup | perf }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, bin_name, bin_name);
@@ -216,6 +216,7 @@ static const struct cmd cmds[] = {
 	{ "prog",	do_prog },
 	{ "map",	do_map },
 	{ "cgroup",	do_cgroup },
+	{ "perf",	do_perf },
 	{ "version",	do_version },
 	{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 6173cd9..63fdb31 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -119,6 +119,7 @@ int do_prog(int argc, char **arg);
 int do_map(int argc, char **arg);
 int do_event_pipe(int argc, char **argv);
 int do_cgroup(int argc, char **arg);
+int do_perf(int argc, char **arg);
 
 int prog_parse_fd(int *argc, char ***argv);
 int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c
new file mode 100644
index 0000000..4abe372
--- /dev/null
+++ b/tools/bpf/bpftool/perf.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+// Author: Yonghong Song <yhs@fb.com>
+
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ftw.h>
+
+#include <bpf.h>
+
+#include "main.h"
+
+/* 0: undecided, 1: supported, 2: not supported */
+static int perf_query_supported;
+static bool has_perf_query_support(void)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 prog_id, attach_info;
+	char buf[256];
+	int fd;
+
+	if (perf_query_supported)
+		goto out;
+
+	fd = open(bin_name, O_RDONLY);
+	if (fd < 0) {
+		p_err("perf_query_support: %s", strerror(errno));
+		goto out;
+	}
+
+	/* the following query will fail as no bpf attachment,
+	 * the expected errno is ENOTSUPP
+	 */
+	errno = 0;
+	bpf_task_fd_query(getpid(), fd, 0, buf, sizeof(buf), &prog_id,
+			  &attach_info, &probe_offset, &probe_addr);
+
+	if (errno == 524 /* ENOTSUPP */) {
+		perf_query_supported = 1;
+		goto close_fd;
+	}
+
+	perf_query_supported = 2;
+	p_err("perf_query_support: %s", strerror(errno));
+	fprintf(stderr,
+		"HINT: non root or kernel doesn't support TASK_FD_QUERY\n");
+
+close_fd:
+	close(fd);
+out:
+	return perf_query_supported == 1;
+}
+
+static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 attach_info,
+			    char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+	jsonw_start_object(json_wtr);
+	jsonw_int_field(json_wtr, "pid", pid);
+	jsonw_int_field(json_wtr, "fd", fd);
+	jsonw_uint_field(json_wtr, "prog_id", prog_id);
+	switch (attach_info) {
+	case BPF_ATTACH_RAW_TRACEPOINT:
+		jsonw_string_field(json_wtr, "attach_info", "raw_tracepoint");
+		jsonw_string_field(json_wtr, "tracepoint", buf);
+		break;
+	case BPF_ATTACH_TRACEPOINT:
+		jsonw_string_field(json_wtr, "attach_info", "tracepoint");
+		jsonw_string_field(json_wtr, "tracepoint", buf);
+		break;
+	case BPF_ATTACH_KPROBE:
+		jsonw_string_field(json_wtr, "attach_info", "kprobe");
+		if (buf[0] != '\0') {
+			jsonw_string_field(json_wtr, "func", buf);
+			jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		} else {
+			jsonw_lluint_field(json_wtr, "addr", probe_addr);
+		}
+		break;
+	case BPF_ATTACH_KRETPROBE:
+		jsonw_string_field(json_wtr, "attach_info", "kretprobe");
+		if (buf[0] != '\0') {
+			jsonw_string_field(json_wtr, "func", buf);
+			jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		} else {
+			jsonw_lluint_field(json_wtr, "addr", probe_addr);
+		}
+		break;
+	case BPF_ATTACH_UPROBE:
+		jsonw_string_field(json_wtr, "attach_info", "uprobe");
+		jsonw_string_field(json_wtr, "filename", buf);
+		jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		break;
+	case BPF_ATTACH_URETPROBE:
+		jsonw_string_field(json_wtr, "attach_info", "uretprobe");
+		jsonw_string_field(json_wtr, "filename", buf);
+		jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		break;
+	}
+	jsonw_end_object(json_wtr);
+}
+
+static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 attach_info,
+			     char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+	printf("pid %d  fd %d: prog_id %u  ", pid, fd, prog_id);
+	switch (attach_info) {
+	case BPF_ATTACH_RAW_TRACEPOINT:
+		printf("raw_tracepoint  %s\n", buf);
+		break;
+	case BPF_ATTACH_TRACEPOINT:
+		printf("tracepoint  %s\n", buf);
+		break;
+	case BPF_ATTACH_KPROBE:
+		if (buf[0] != '\0')
+			printf("kprobe  func %s  offset %llu\n", buf,
+			       probe_offset);
+		else
+			printf("kprobe  addr %llu\n", probe_addr);
+		break;
+	case BPF_ATTACH_KRETPROBE:
+		if (buf[0] != '\0')
+			printf("kretprobe  func %s  offset %llu\n", buf,
+			       probe_offset);
+		else
+			printf("kretprobe  addr %llu\n", probe_addr);
+		break;
+	case BPF_ATTACH_UPROBE:
+		printf("uprobe  filename %s  offset %llu\n", buf, probe_offset);
+		break;
+	case BPF_ATTACH_URETPROBE:
+		printf("uretprobe  filename %s  offset %llu\n", buf,
+		       probe_offset);
+		break;
+	}
+}
+
+static int show_proc(const char *fpath, const struct stat *sb,
+		     int tflag, struct FTW *ftwbuf)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 prog_id, attach_info;
+	int err, pid = 0, fd = 0;
+	const char *pch;
+	char buf[4096];
+
+	/* prefix always /proc */
+	pch = fpath + 5;
+	if (*pch == '\0')
+		return 0;
+
+	/* pid should be all numbers */
+	pch++;
+	while (isdigit(*pch)) {
+		pid = pid * 10 + *pch - '0';
+		pch++;
+	}
+	if (*pch == '\0')
+		return 0;
+	if (*pch != '/')
+		return FTW_SKIP_SUBTREE;
+
+	/* check /proc/<pid>/fd directory */
+	pch++;
+	if (strncmp(pch, "fd", 2))
+		return FTW_SKIP_SUBTREE;
+	pch += 2;
+	if (*pch == '\0')
+		return 0;
+	if (*pch != '/')
+		return FTW_SKIP_SUBTREE;
+
+	/* check /proc/<pid>/fd/<fd_num> */
+	pch++;
+	while (isdigit(*pch)) {
+		fd = fd * 10 + *pch - '0';
+		pch++;
+	}
+	if (*pch != '\0')
+		return FTW_SKIP_SUBTREE;
+
+	/* query (pid, fd) for potential perf events */
+	err = bpf_task_fd_query(pid, fd, 0, buf, sizeof(buf), &prog_id,
+				&attach_info, &probe_offset, &probe_addr);
+	if (err < 0)
+		return 0;
+
+	if (json_output)
+		print_perf_json(pid, fd, prog_id, attach_info, buf, probe_offset,
+				probe_addr);
+	else
+		print_perf_plain(pid, fd, prog_id, attach_info, buf, probe_offset,
+				 probe_addr);
+
+	return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+	int flags = FTW_ACTIONRETVAL | FTW_PHYS;
+	int err = 0, nopenfd = 16;
+
+	if (!has_perf_query_support())
+		return -1;
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+	if (nftw("/proc", show_proc, nopenfd, flags) == -1) {
+		p_err("%s", strerror(errno));
+		err = -1;
+	}
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	return err;
+}
+
+static int do_help(int argc, char **argv)
+{
+	fprintf(stderr,
+		"Usage: %s %s { show | list | help }\n"
+		"",
+		bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "show",	do_show },
+	{ "list",	do_show },
+	{ "help",	do_help },
+	{ 0 }
+};
+
+int do_perf(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 6/7] tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
From: Yonghong Song @ 2018-05-22 16:31 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180522163114.3130198-1-yhs@fb.com>

The new tests are added to query perf_event information
for raw_tracepoint and tracepoint attachment. For tracepoint,
both syscalls and non-syscalls tracepoints are queries as
they are treated slightly differently inside the kernel.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c | 133 +++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 3ecf733..f7ede03 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1542,6 +1542,137 @@ static void test_get_stack_raw_tp(void)
 	bpf_object__close(obj);
 }
 
+static void test_task_fd_query_rawtp(void)
+{
+	const char *file = "./test_get_stack_rawtp.o";
+	struct perf_event_attr attr = {};
+	__u64 probe_offset, probe_addr;
+	int efd, err, prog_fd, pmu_fd;
+	__u32 prog_id, attach_info;
+	struct bpf_object *obj;
+	__u32 duration = 0;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+		return;
+
+	efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+	if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+		  errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err < 0, "ioctl PERF_EVENT_IOC_ENABLE", "err %d errno %d\n",
+		  err, errno))
+		goto close_prog;
+
+	/* query (getpid(), efd */
+	err = bpf_task_fd_query(getpid(), efd, 0, buf, 256, &prog_id,
+				&attach_info, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_trace_event_query", "err %d errno %d\n", err,
+		  errno))
+		goto close_prog;
+
+	err = (attach_info == BPF_ATTACH_RAW_TRACEPOINT) &&
+	      (strcmp(buf, "sys_enter") == 0);
+	if (CHECK(!err, "check_results", "attach_info %d tp_name %s\n",
+		  attach_info, buf))
+		goto close_prog;
+
+	goto close_prog_noerr;
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp_core(const char *probe_name,
+				       const char *tp_name)
+{
+	const char *file = "./test_tracepoint.o";
+	int err, bytes, efd, prog_fd, pmu_fd;
+	struct perf_event_attr attr = {};
+	__u64 probe_offset, probe_addr;
+	__u32 prog_id, attach_info;
+	struct bpf_object *obj;
+	__u32 duration = 0;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/%s/id", probe_name);
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+		  "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(err, "perf_event_open", "err %d errno %d\n", err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	/* query (getpid(), pmu_fd */
+	err = bpf_task_fd_query(getpid(), pmu_fd, 0, buf, 256, &prog_id,
+				&attach_info, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_trace_event_query", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = (attach_info == BPF_ATTACH_TRACEPOINT) && !strcmp(buf, tp_name);
+	if (CHECK(!err, "check_results", "attach_info %d tp_name %s\n",
+		  attach_info, buf))
+		goto close_pmu;
+
+	close(pmu_fd);
+	goto close_prog_noerr;
+
+close_pmu:
+	close(pmu_fd);
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp(void)
+{
+	test_task_fd_query_tp_core("sched/sched_switch",
+				   "sched_switch");
+	test_task_fd_query_tp_core("syscalls/sys_enter_read",
+				   "sys_enter_read");
+}
+
 int main(void)
 {
 	jit_enabled = is_jit_enabled();
@@ -1561,6 +1692,8 @@ int main(void)
 	test_stacktrace_build_id_nmi();
 	test_stacktrace_map_raw_tp();
 	test_get_stack_raw_tp();
+	test_task_fd_query_rawtp();
+	test_task_fd_query_tp();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 5/7] samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
From: Yonghong Song @ 2018-05-22 16:31 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team

This is mostly to test kprobe/uprobe which needs kernel headers.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 samples/bpf/Makefile             |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 379 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 402 insertions(+)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 62d1aa1..7dc85ed 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -51,6 +51,7 @@ hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o
 xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -160,6 +162,7 @@ always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES		+= $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4		+= -lrt
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 0000000..f4b0a9e
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 0000000..792ef24
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({			\
+	int __ret = !!(condition);			\
+	if (__ret) {					\
+		printf("FAIL: %s:\n", __func__);	\
+		perror("    ");			\
+		return -1;				\
+	}						\
+})
+
+#define CHECK_AND_RET(condition) ({			\
+	int __ret = !!(condition);			\
+	if (__ret)					\
+		return -1;				\
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+	return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+	char buf[256];
+	int fd, ret;
+
+	ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	errno = 0;
+	ret = (int)strtol(buf, NULL, 10);
+	CHECK_PERROR_RET(errno);
+	return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+	char buf[256];
+	int fd, ret;
+
+	ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+	CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+	errno = 0;
+	ret = (int)strtol(buf + strlen("config:"), NULL, 10);
+	CHECK_PERROR_RET(errno);
+	return ret;
+}
+
+static int test_debug_fs_kprobe(int fd_idx, const char *fn_name,
+				__u32 expected_prog_info)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 prog_id, prog_info;
+	char buf[256];
+	int err;
+
+	err = bpf_task_fd_query(getpid(), event_fd[fd_idx], 0, buf, 256,
+				&prog_id, &prog_info, &probe_offset,
+				&probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, for event_fd idx %d, fn_name %s\n",
+		       __func__, fd_idx, fn_name);
+		perror("    :");
+		return -1;
+	}
+	if (strcmp(buf, fn_name) != 0 ||
+	    prog_info != expected_prog_info ||
+	    probe_offset != 0x0 || probe_addr != 0x0) {
+		printf("FAIL: bpf_trace_event_query(event_fd[1]):\n");
+		printf("buf: %s, prog_info: %u, probe_offset: 0x%llx,"
+		       " probe_addr: 0x%llx\n",
+		       buf, prog_info, probe_offset, probe_addr);
+		return -1;
+	}
+	return 0;
+}
+
+static int test_nondebug_fs_kuprobe_common(const char *event_type,
+	const char *name, __u64 offset, __u64 addr, bool is_return,
+	char *buf, int buf_len, __u32 *prog_id, __u32 *prog_info,
+	__u64 *probe_offset, __u64 *probe_addr)
+{
+	int is_return_bit = bpf_get_retprobe_bit(event_type);
+	int type = bpf_find_probe_type(event_type);
+	struct perf_event_attr attr = {};
+	int fd;
+
+	if (type < 0 || is_return_bit < 0) {
+		printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n",
+			__func__, type, is_return_bit);
+		return -1;
+	}
+
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	if (is_return)
+		attr.config |= 1 << is_return_bit;
+
+	if (name) {
+		attr.config1 = ptr_to_u64((void *)name);
+		attr.config2 = offset;
+	} else {
+		attr.config1 = 0;
+		attr.config2 = addr;
+	}
+	attr.size = sizeof(attr);
+	attr.type = type;
+
+	fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
+	CHECK_PERROR_RET(fd < 0);
+
+	CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+	CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+	CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len,
+			 prog_id, prog_info, probe_offset, probe_addr) < 0);
+
+	return 0;
+}
+
+static int test_nondebug_fs_probe(const char *event_type, const char *name,
+				  __u64 offset, __u64 addr, bool is_return,
+				  __u32 expected_prog_info,
+				  __u32 expected_ret_prog_info,
+				  char *buf, int buf_len)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 prog_id, prog_info;
+	int err;
+
+	err = test_nondebug_fs_kuprobe_common(event_type, name,
+					      offset, addr, is_return,
+					      buf, buf_len, &prog_id,
+					      &prog_info, &probe_offset,
+					      &probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, "
+		       "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n",
+		       __func__, name ? name : "", offset, addr, is_return);
+		perror("    :");
+		return -1;
+	}
+	if ((is_return && prog_info != expected_ret_prog_info) ||
+	    (!is_return && prog_info != expected_prog_info)) {
+		printf("FAIL: %s, incorrect prog_info %u\n",
+		       __func__, prog_info);
+		return -1;
+	}
+	if (name) {
+		if (strcmp(name, buf) != 0) {
+			printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+			return -1;
+		}
+		if (probe_offset != offset) {
+			printf("FAIL: %s, incorrect probe_offset 0x%llx\n",
+			       __func__, probe_offset);
+			return -1;
+		}
+	} else {
+		if (buf && buf[0] != '\0') {
+			printf("FAIL: %s, incorrect buf %p\n",
+			       __func__, buf);
+			return -1;
+		}
+
+		if (probe_addr != addr) {
+			printf("FAIL: %s, incorrect probe_addr 0x%llx\n",
+			       __func__, probe_addr);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
+{
+	const char *event_type = "uprobe";
+	struct perf_event_attr attr = {};
+	char buf[256], event_alias[256];
+	__u64 probe_offset, probe_addr;
+	__u32 prog_id, prog_info;
+	int err, res, kfd, efd;
+	ssize_t bytes;
+
+	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events",
+		 event_type);
+	kfd = open(buf, O_WRONLY | O_APPEND, 0);
+	CHECK_PERROR_RET(kfd < 0);
+
+	res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid());
+	CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias));
+
+	res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx",
+		       is_return ? 'r' : 'p', event_type, event_alias,
+		       binary_path, offset);
+	CHECK_PERROR_RET(res < 0 || res >= sizeof(buf));
+	CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0);
+
+	close(kfd);
+	kfd = -1;
+
+	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id",
+		 event_type, event_alias);
+	efd = open(buf, O_RDONLY, 0);
+	CHECK_PERROR_RET(efd < 0);
+
+	bytes = read(efd, buf, sizeof(buf));
+	CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf));
+	close(efd);
+	buf[bytes] = '\0';
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+	CHECK_PERROR_RET(kfd < 0);
+	CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+	CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+
+	err = bpf_task_fd_query(getpid(), kfd, 0, buf, 256,
+				&prog_id, &prog_info, &probe_offset,
+				&probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, binary_path %s\n", __func__, binary_path);
+		perror("    :");
+		return -1;
+	}
+	if ((is_return && prog_info != BPF_ATTACH_URETPROBE) ||
+	    (!is_return && prog_info != BPF_ATTACH_UPROBE)) {
+		printf("FAIL: %s, incorrect prog_info %u\n", __func__,
+		       prog_info);
+		return -1;
+	}
+	if (strcmp(binary_path, buf) != 0) {
+		printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+		return -1;
+	}
+	if (probe_offset != offset) {
+		printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__,
+		       probe_offset);
+		return -1;
+	}
+
+	close(kfd);
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {1024*1024, RLIM_INFINITY};
+	extern char __executable_start;
+	char filename[256], buf[256];
+	__u64 uprobe_file_offset;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (load_kallsyms()) {
+		printf("failed to process /proc/kallsyms\n");
+		return 1;
+	}
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	/* test two functions in the corresponding *_kern.c file */
+	CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request",
+					   BPF_ATTACH_KPROBE));
+	CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion",
+					   BPF_ATTACH_KRETPROBE));
+
+	/* test nondebug fs kprobe */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+					     false, BPF_ATTACH_KPROBE,
+					     BPF_ATTACH_KRETPROBE,
+					     buf, sizeof(buf)));
+#ifdef __x86_64__
+	/* set a kprobe on "bpf_check + 0x5", which is x64 specific */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0,
+					     false, BPF_ATTACH_KPROBE,
+					     BPF_ATTACH_KRETPROBE,
+					     buf, sizeof(buf)));
+#endif
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+					     true, BPF_ATTACH_KPROBE,
+					     BPF_ATTACH_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), false,
+					     BPF_ATTACH_KPROBE,
+					     BPF_ATTACH_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), false,
+					     BPF_ATTACH_KPROBE,
+					     BPF_ATTACH_KRETPROBE,
+					     NULL, 0));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), true,
+					     BPF_ATTACH_KPROBE,
+					     BPF_ATTACH_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), true,
+					     BPF_ATTACH_KPROBE,
+					     BPF_ATTACH_KRETPROBE,
+					     0, 0));
+
+	/* test nondebug fs uprobe */
+	/* the calculation of uprobe file offset is based on gcc 7.3.1 on x64
+	 * and the default linker script, which defines __executable_start as
+	 * the start of the .text section. The calculation could be different
+	 * on different systems with different compilers. The right way is
+	 * to parse the ELF file. We took a shortcut here.
+	 */
+	uprobe_file_offset = (__u64)main - (__u64)&__executable_start;
+	CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+					     uprobe_file_offset, 0x0, false,
+					     BPF_ATTACH_UPROBE,
+					     BPF_ATTACH_URETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+					     uprobe_file_offset, 0x0, true,
+					     BPF_ATTACH_UPROBE,
+					     BPF_ATTACH_URETPROBE,
+					     buf, sizeof(buf)));
+
+	/* test debug fs uprobe */
+	CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+					   false));
+	CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+					   true));
+
+	return 0;
+}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
From: Yonghong Song @ 2018-05-22 16:30 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180522163048.3128924-1-yhs@fb.com>

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/trace_events.h |  16 ++++++
 include/uapi/linux/bpf.h     |  27 ++++++++++
 kernel/bpf/syscall.c         | 124 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c     |  48 +++++++++++++++++
 kernel/trace/trace_kprobe.c  |  29 ++++++++++
 kernel/trace/trace_uprobe.c  |  22 ++++++++
 6 files changed, 266 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3ef..eab806d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+			    u32 *attach_info, const char **buf,
+			    u64 *probe_offset, u64 *probe_addr);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
@@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
 {
 	return NULL;
 }
+static inline int bpf_get_perf_event_info(const struct file *file, u32 *prog_id,
+					  u32 *attach_info, const char **buf,
+					  u64 *probe_offset, u64 *probe_addr)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 enum {
@@ -560,10 +569,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
 #ifdef CONFIG_KPROBE_EVENTS
 extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(const struct perf_event *event,
+			       u32 *attach_info, const char **symbol,
+			       u64 *probe_offset, u64 *probe_addr,
+			       bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
 extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(const struct perf_event *event,
+			       u32 *attach_info, const char **filename,
+			       u64 *probe_offset, bool perf_type_tracepoint);
 #endif
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97446bb..a602150 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
+	BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
 		__u32		btf_log_size;
 		__u32		btf_log_level;
 	};
+
+	struct {
+		int		pid;		/* input: pid */
+		int		fd;		/* input: fd */
+		__u32		flags;		/* input: flags */
+		__u32		buf_len;	/* input: buf len */
+		__aligned_u64	buf;		/* input/output:
+						 *   tp_name for tracepoint
+						 *   symbol for kprobe
+						 *   filename for uprobe
+						 */
+		__u32		prog_id;	/* output: prod_id */
+		__u32		attach_info;	/* output: BPF_ATTACH_* */
+		__u64		probe_offset;	/* output: probe_offset */
+		__u64		probe_addr;	/* output: probe_addr */
+	} task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2458,4 +2475,14 @@ struct bpf_fib_lookup {
 	__u8	dmac[6];     /* ETH_ALEN */
 };
 
+/* used by <task, fd> based query */
+enum {
+	BPF_ATTACH_RAW_TRACEPOINT,	/* tp name */
+	BPF_ATTACH_TRACEPOINT,		/* tp name */
+	BPF_ATTACH_KPROBE,		/* (symbol + offset) or addr */
+	BPF_ATTACH_KRETPROBE,		/* (symbol + offset) or addr */
+	BPF_ATTACH_UPROBE,		/* filename + offset */
+	BPF_ATTACH_URETPROBE,		/* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bfcde94..9356f0e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,7 +18,9 @@
 #include <linux/vmalloc.h>
 #include <linux/mmzone.h>
 #include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
 #include <linux/file.h>
+#include <linux/fs.h>
 #include <linux/license.h>
 #include <linux/filter.h>
 #include <linux/version.h>
@@ -2102,6 +2104,125 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
 	return btf_get_fd_by_id(attr->btf_id);
 }
 
+static int bpf_task_fd_query_copy(const union bpf_attr *attr,
+				    union bpf_attr __user *uattr,
+				    u32 prog_id, u32 attach_info,
+				    const char *buf, u64 probe_offset,
+				    u64 probe_addr)
+{
+	__u64 __user *ubuf;
+	int len;
+
+	ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
+	if (buf) {
+		len = strlen(buf);
+		if (attr->task_fd_query.buf_len < len + 1)
+			return -ENOSPC;
+		if (copy_to_user(ubuf, buf, len + 1))
+			return -EFAULT;
+	} else if (attr->task_fd_query.buf_len) {
+		/* copy '\0' to ubuf */
+		__u8 zero = 0;
+
+		if (copy_to_user(ubuf, &zero, 1))
+			return -EFAULT;
+	}
+
+	if (copy_to_user(&uattr->task_fd_query.prog_id, &prog_id,
+			 sizeof(prog_id)) ||
+	    copy_to_user(&uattr->task_fd_query.attach_info, &attach_info,
+			 sizeof(attach_info)) ||
+	    copy_to_user(&uattr->task_fd_query.probe_offset, &probe_offset,
+			 sizeof(probe_offset)) ||
+	    copy_to_user(&uattr->task_fd_query.probe_addr, &probe_addr,
+			 sizeof(probe_addr)))
+		return -EFAULT;
+
+	return 0;
+}
+
+#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
+
+static int bpf_task_fd_query(const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	pid_t pid = attr->task_fd_query.pid;
+	int fd = attr->task_fd_query.fd;
+	const struct perf_event *event;
+	struct files_struct *files;
+	struct task_struct *task;
+	struct file *file;
+	int err;
+
+	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (attr->task_fd_query.flags != 0)
+		return -EINVAL;
+
+	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+	if (!task)
+		return -ENOENT;
+
+	files = get_files_struct(task);
+	put_task_struct(task);
+	if (!files)
+		return -ENOENT;
+
+	err = 0;
+	spin_lock(&files->file_lock);
+	file = fcheck_files(files, fd);
+	if (!file)
+		err = -EBADF;
+	else
+		get_file(file);
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+
+	if (err)
+		goto out;
+
+	if (file->f_op == &bpf_raw_tp_fops) {
+		struct bpf_raw_tracepoint *raw_tp = file->private_data;
+		struct bpf_raw_event_map *btp = raw_tp->btp;
+
+		if (!raw_tp->prog)
+			err = -ENOENT;
+		else
+			err = bpf_task_fd_query_copy(attr, uattr,
+						     raw_tp->prog->aux->id,
+						     BPF_ATTACH_RAW_TRACEPOINT,
+						     btp->tp->name, 0, 0);
+		goto put_file;
+	}
+
+	event = perf_get_event(file);
+	if (!IS_ERR(event)) {
+		u64 probe_offset, probe_addr;
+		u32 prog_id, attach_info;
+		const char *buf;
+
+		err = bpf_get_perf_event_info(event, &prog_id, &attach_info,
+					      &buf, &probe_offset,
+					      &probe_addr);
+		if (!err)
+			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
+						     attach_info, buf,
+						     probe_offset,
+						     probe_addr);
+		goto put_file;
+	}
+
+	err = -ENOTSUPP;
+put_file:
+	fput(file);
+out:
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2188,6 +2309,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_BTF_GET_FD_BY_ID:
 		err = bpf_btf_get_fd_by_id(&attr);
 		break;
+	case BPF_TASK_FD_QUERY:
+		err = bpf_task_fd_query(&attr, uattr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ce2cbbf..323c80e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,7 @@
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
 #include <linux/kprobes.h>
+#include <linux/syscalls.h>
 #include <linux/error-injection.h>
 
 #include "trace_probe.h"
@@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 	mutex_unlock(&bpf_event_mutex);
 	return err;
 }
+
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+			    u32 *attach_info, const char **buf,
+			    u64 *probe_offset, u64 *probe_addr)
+{
+	bool is_tracepoint, is_syscall_tp;
+	struct bpf_prog *prog;
+	int flags, err = 0;
+
+	prog = event->prog;
+	if (!prog)
+		return -ENOENT;
+
+	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
+	if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
+		return -EOPNOTSUPP;
+
+	*prog_id = prog->aux->id;
+	flags = event->tp_event->flags;
+	is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
+	is_syscall_tp = is_syscall_trace_event(event->tp_event);
+
+	if (is_tracepoint || is_syscall_tp) {
+		*buf = is_tracepoint ? event->tp_event->tp->name
+				     : event->tp_event->name;
+		*attach_info = BPF_ATTACH_TRACEPOINT;
+		*probe_offset = 0x0;
+		*probe_addr = 0x0;
+	} else {
+		/* kprobe/uprobe */
+		err = -EOPNOTSUPP;
+#ifdef CONFIG_KPROBE_EVENTS
+		if (flags & TRACE_EVENT_FL_KPROBE)
+			err = bpf_get_kprobe_info(event, attach_info, buf,
+						  probe_offset, probe_addr,
+						  event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+		if (flags & TRACE_EVENT_FL_UPROBE)
+			err = bpf_get_uprobe_info(event, attach_info, buf,
+						  probe_offset,
+						  event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+	}
+
+	return err;
+}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 02aed76..32e9190 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
+
+int bpf_get_kprobe_info(const struct perf_event *event, u32 *attach_info,
+			const char **symbol, u64 *probe_offset,
+			u64 *probe_addr, bool perf_type_tracepoint)
+{
+	const char *pevent = trace_event_name(event->tp_event);
+	const char *group = event->tp_event->class->system;
+	struct trace_kprobe *tk;
+
+	if (perf_type_tracepoint)
+		tk = find_trace_kprobe(pevent, group);
+	else
+		tk = event->tp_event->data;
+	if (!tk)
+		return -EINVAL;
+
+	*attach_info = trace_kprobe_is_return(tk) ? BPF_ATTACH_KRETPROBE
+						  : BPF_ATTACH_KPROBE;
+	if (tk->symbol) {
+		*symbol = tk->symbol;
+		*probe_offset = tk->rp.kp.offset;
+		*probe_addr = 0;
+	} else {
+		*symbol = NULL;
+		*probe_offset = 0;
+		*probe_addr = (unsigned long)tk->rp.kp.addr;
+	}
+	return 0;
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 /*
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index ac89287..12a3667 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
 {
 	__uprobe_perf_func(tu, func, regs, ucb, dsize);
 }
+
+int bpf_get_uprobe_info(const struct perf_event *event, u32 *attach_info,
+			const char **filename, u64 *probe_offset,
+			bool perf_type_tracepoint)
+{
+	const char *pevent = trace_event_name(event->tp_event);
+	const char *group = event->tp_event->class->system;
+	struct trace_uprobe *tu;
+
+	if (perf_type_tracepoint)
+		tu = find_probe_event(pevent, group);
+	else
+		tu = event->tp_event->data;
+	if (!tu)
+		return -EINVAL;
+
+	*attach_info = is_ret_probe(tu) ? BPF_ATTACH_URETPROBE
+					: BPF_ATTACH_UPROBE;
+	*filename = tu->filename;
+	*probe_offset = tu->offset;
+	return 0;
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 static int
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 0/7] bpf: implement BPF_TASK_FD_QUERY
From: Yonghong Song @ 2018-05-22 16:30 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, this command will return bpf related information
to user space. Right now it only supports tracepoint/kprobe/uprobe
perf event fd's. For such a fd, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Patch #1 adds function perf_get_event() in kernel/events/core.c.
Patch #2 implements the bpf subcommand BPF_TASK_FD_QUERY.
Patch #3 syncs tools bpf.h header and also add bpf_task_fd_query()
in the libbpf library for samples/selftests/bpftool to use.
Patch #4 adds ksym_get_addr() utility function.
Patch #5 add a test in samples/bpf for querying k[ret]probes and
u[ret]probes.
Patch #6 add a test in tools/testing/selftests/bpf for querying
raw_tracepoint and tracepoint.
Patch #7 add a new subcommand "perf" to bpftool.

Changelogs:
  v2 -> v3:
     . made perf_get_event() return perf_event pointer const.
       this was to ensure that event fields are not meddled.
     . detect whether newly BPF_TASK_FD_QUERY is supported or
       not in "bpftool perf" and warn users if it is not.
  v1 -> v2:
     . changed bpf subcommand name from BPF_PERF_EVENT_QUERY
       to BPF_TASK_FD_QUERY.
     . fixed various "bpftool perf" issues and added documentation
       and auto-completion.

Yonghong Song (7):
  perf/core: add perf_get_event() to return perf_event given a struct
    file
  bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
  tools/bpf: sync kernel header bpf.h and add bpf_trace_event_query in
    libbpf
  tools/bpf: add ksym_get_addr() in trace_helpers
  samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
  tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
  tools/bpftool: add perf subcommand

 include/linux/perf_event.h                       |   5 +
 include/linux/trace_events.h                     |  16 +
 include/uapi/linux/bpf.h                         |  27 ++
 kernel/bpf/syscall.c                             | 124 ++++++++
 kernel/events/core.c                             |   8 +
 kernel/trace/bpf_trace.c                         |  48 +++
 kernel/trace/trace_kprobe.c                      |  29 ++
 kernel/trace/trace_uprobe.c                      |  22 ++
 samples/bpf/Makefile                             |   4 +
 samples/bpf/task_fd_query_kern.c                 |  19 ++
 samples/bpf/task_fd_query_user.c                 | 379 +++++++++++++++++++++++
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 +++++
 tools/bpf/bpftool/Documentation/bpftool.rst      |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool        |   9 +
 tools/bpf/bpftool/main.c                         |   3 +-
 tools/bpf/bpftool/main.h                         |   1 +
 tools/bpf/bpftool/perf.c                         | 244 +++++++++++++++
 tools/include/uapi/linux/bpf.h                   |  27 ++
 tools/lib/bpf/bpf.c                              |  24 ++
 tools/lib/bpf/bpf.h                              |   3 +
 tools/testing/selftests/bpf/test_progs.c         | 133 ++++++++
 tools/testing/selftests/bpf/trace_helpers.c      |  12 +
 tools/testing/selftests/bpf/trace_helpers.h      |   1 +
 23 files changed, 1222 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf-next v3 1/7] perf/core: add perf_get_event() to return perf_event given a struct file
From: Yonghong Song @ 2018-05-22 16:30 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180522163048.3128924-1-yhs@fb.com>

A new extern function, perf_get_event(), is added to return a perf event
given a struct file. This function will be used in later patches.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/perf_event.h | 5 +++++
 kernel/events/core.c       | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99e..eec302b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
 extern struct file *perf_event_get(unsigned int fd);
+extern const struct perf_event *perf_get_event(struct file *file);
 extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
@@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
 static inline void perf_event_delayed_put(struct task_struct *task)	{ }
 static inline struct file *perf_event_get(unsigned int fd)	{ return ERR_PTR(-EINVAL); }
+static inline const struct perf_event *perf_get_event(struct file *file)
+{
+	return ERR_PTR(-EINVAL);
+}
 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
 	return ERR_PTR(-EINVAL);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce..6eeab86 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
 	return file;
 }
 
+const struct perf_event *perf_get_event(struct file *file)
+{
+	if (file->f_op != &perf_fops)
+		return ERR_PTR(-EINVAL);
+
+	return file->private_data;
+}
+
 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
 	if (!event)
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 3/7] tools/bpf: sync kernel header bpf.h and add bpf_trace_event_query in libbpf
From: Yonghong Song @ 2018-05-22 16:30 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180522163048.3128924-1-yhs@fb.com>

Sync kernel header bpf.h to tools/include/uapi/linux/bpf.h and
implement bpf_trace_event_query() in libbpf. The test programs
in samples/bpf and tools/testing/selftests/bpf, and later bpftool
will use this libbpf function to query kernel.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/include/uapi/linux/bpf.h | 27 +++++++++++++++++++++++++++
 tools/lib/bpf/bpf.c            | 24 ++++++++++++++++++++++++
 tools/lib/bpf/bpf.h            |  3 +++
 3 files changed, 54 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 97446bb..a602150 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
+	BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
 		__u32		btf_log_size;
 		__u32		btf_log_level;
 	};
+
+	struct {
+		int		pid;		/* input: pid */
+		int		fd;		/* input: fd */
+		__u32		flags;		/* input: flags */
+		__u32		buf_len;	/* input: buf len */
+		__aligned_u64	buf;		/* input/output:
+						 *   tp_name for tracepoint
+						 *   symbol for kprobe
+						 *   filename for uprobe
+						 */
+		__u32		prog_id;	/* output: prod_id */
+		__u32		attach_info;	/* output: BPF_ATTACH_* */
+		__u64		probe_offset;	/* output: probe_offset */
+		__u64		probe_addr;	/* output: probe_addr */
+	} task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2458,4 +2475,14 @@ struct bpf_fib_lookup {
 	__u8	dmac[6];     /* ETH_ALEN */
 };
 
+/* used by <task, fd> based query */
+enum {
+	BPF_ATTACH_RAW_TRACEPOINT,	/* tp name */
+	BPF_ATTACH_TRACEPOINT,		/* tp name */
+	BPF_ATTACH_KPROBE,		/* (symbol + offset) or addr */
+	BPF_ATTACH_KRETPROBE,		/* (symbol + offset) or addr */
+	BPF_ATTACH_UPROBE,		/* filename + offset */
+	BPF_ATTACH_URETPROBE,		/* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 6a8a000..da3f336 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -643,3 +643,27 @@ int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 
 	return fd;
 }
+
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 buf_len,
+		      __u32 *prog_id, __u32 *attach_info,
+		      __u64 *probe_offset, __u64 *probe_addr)
+{
+	union bpf_attr attr = {};
+	int err;
+
+	attr.task_fd_query.pid = pid;
+	attr.task_fd_query.fd = fd;
+	attr.task_fd_query.flags = flags;
+	attr.task_fd_query.buf = ptr_to_u64(buf);
+	attr.task_fd_query.buf_len = buf_len;
+
+	err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr));
+	if (!err) {
+		*prog_id = attr.task_fd_query.prog_id;
+		*attach_info = attr.task_fd_query.attach_info;
+		*probe_offset = attr.task_fd_query.probe_offset;
+		*probe_addr = attr.task_fd_query.probe_addr;
+	}
+
+	return err;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 15bff77..9adfde6 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -107,4 +107,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
 int bpf_raw_tracepoint_open(const char *name, int prog_fd);
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 		 bool do_log);
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 buf_len,
+		      __u32 *prog_id, __u32 *prog_info,
+		      __u64 *probe_offset, __u64 *probe_addr);
 #endif
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 4/7] tools/bpf: add ksym_get_addr() in trace_helpers
From: Yonghong Song @ 2018-05-22 16:30 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180522163048.3128924-1-yhs@fb.com>

Given a kernel function name, ksym_get_addr() will return the kernel
address for this function, or 0 if it cannot find this function name
in /proc/kallsyms. This function will be used later when a kernel
address is used to initiate a kprobe perf event.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/trace_helpers.c | 12 ++++++++++++
 tools/testing/selftests/bpf/trace_helpers.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 8fb4fe8..3868dcb 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -72,6 +72,18 @@ struct ksym *ksym_search(long key)
 	return &syms[0];
 }
 
+long ksym_get_addr(const char *name)
+{
+	int i;
+
+	for (i = 0; i < sym_cnt; i++) {
+		if (strcmp(syms[i].name, name) == 0)
+			return syms[i].addr;
+	}
+
+	return 0;
+}
+
 static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 36d90e3..3b4bcf7 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -11,6 +11,7 @@ struct ksym {
 
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
+long ksym_get_addr(const char *name);
 
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH bpf-next 3/7] bpf: btf: Check array->index_type
From: Martin KaFai Lau @ 2018-05-22 16:20 UTC (permalink / raw)
  To: Yonghong Song; +Cc: netdev, Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <169ab3ed-d03f-eb3f-7d4f-6545c5516bec@fb.com>

On Mon, May 21, 2018 at 02:04:51PM -0700, Yonghong Song wrote:
> 
> 
> On 5/18/18 5:16 PM, Martin KaFai Lau wrote:
> > Instead of ingoring the array->index_type field.  Enforce that
> > it must be an unsigned BTF_KIND_INT.
> > 
> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > ---
> >   kernel/bpf/btf.c | 83 ++++++++++++++++++++++++++++++++++++++++----------------
> >   1 file changed, 59 insertions(+), 24 deletions(-)
> > 
> > diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> > index 536e5981ad8c..b4e48dae2240 100644
> > --- a/kernel/bpf/btf.c
> > +++ b/kernel/bpf/btf.c
> > @@ -444,6 +444,28 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
> >   	return btf->types[type_id];
> >   }
> > +/*
> > + * Regular int is not a bit field and it must be either
> > + * u8/u16/u32/u64.
> > + */
> > +static bool btf_type_int_is_regular(const struct btf_type *t)
> > +{
> > +	u16 nr_bits, nr_bytes;
> > +	u32 int_data;
> > +
> > +	int_data = btf_type_int(t);
> > +	nr_bits = BTF_INT_BITS(int_data);
> > +	nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
> > +	if (BITS_PER_BYTE_MASKED(nr_bits) ||
> > +	    BTF_INT_OFFSET(int_data) ||
> > +	    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
> > +	     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
> > +		return false;
> > +	}
> > +
> > +	return true;
> > +}
> > +
> >   __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
> >   					      const char *fmt, ...)
> >   {
> > @@ -1309,14 +1331,16 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
> >   		return -EINVAL;
> >   	}
> > -	/* We are a little forgiving on array->index_type since
> > -	 * the kernel is not using it.
> > -	 */
> > -	/* Array elem cannot be in type void,
> > -	 * so !array->type is not allowed.
> > +	/* Array elem type and index type cannot be in type void,
> > +	 * so !array->type and !array->index_type are not allowed.
> >   	 */
> >   	if (!array->type || BTF_TYPE_PARENT(array->type)) {
> > -		btf_verifier_log_type(env, t, "Invalid type_id");
> > +		btf_verifier_log_type(env, t, "Invalid elem");
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) {
> > +		btf_verifier_log_type(env, t, "Invalid index");
> >   		return -EINVAL;
> >   	}
> > @@ -1329,11 +1353,35 @@ static int btf_array_resolve(struct btf_verifier_env *env,
> >   			     const struct resolve_vertex *v)
> >   {
> >   	const struct btf_array *array = btf_type_array(v->t);
> > -	const struct btf_type *elem_type;
> > -	u32 elem_type_id = array->type;
> > +	const struct btf_type *elem_type, *index_type;
> > +	u32 elem_type_id, index_type_id;
> >   	struct btf *btf = env->btf;
> >   	u32 elem_size;
> > +	/* Check array->index_type */
> > +	index_type_id = array->index_type;
> > +	index_type = btf_type_by_id(btf, index_type_id);
> > +	if (btf_type_is_void_or_null(index_type)) {
> > +		btf_verifier_log_type(env, v->t, "Invalid index");
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (!env_type_is_resolve_sink(env, index_type) &&
> > +	    !env_type_is_resolved(env, index_type_id))
> > +		return env_stack_push(env, index_type, index_type_id);
> > +
> > +	index_type = btf_type_id_size(btf, &index_type_id, NULL);
> > +	if (!index_type || !btf_type_is_int(index_type) ||
> > +	    /* bit field int is not allowed */
> > +	    !btf_type_int_is_regular(index_type) ||
> > +	    /* unsigned only */
> > +	    BTF_INT_ENCODING(btf_type_int(index_type))) {
> > +		btf_verifier_log_type(env, v->t, "Invalid index");
> > +		return -EINVAL;
> > +	}
> 
> Currently, in uapi/linux/btf.h, we have
> /* Attributes stored in the BTF_INT_ENCODING */
> #define BTF_INT_SIGNED  0x1
> #define BTF_INT_CHAR    0x2
> #define BTF_INT_BOOL    0x4
> #define BTF_INT_VARARGS 0x8
> 
> The BPF_INT_ENCODING value 0 stands for UNSIGNED.
It is a bit field, so getting 0 defined would be confusing.
If BTF_INT_SIGNED bit is not set, then it is not signed.

I think it will help to define them as (1 << x) to make
the bit field nature more obvious.

> Do we want to explicitly document this in uapi/linux/bpf.h?
> 
> > +
> > +	/* Check array->type */
> > +	elem_type_id = array->type;
> >   	elem_type = btf_type_by_id(btf, elem_type_id);
> >   	if (btf_type_is_void_or_null(elem_type)) {
> >   		btf_verifier_log_type(env, v->t,
> > @@ -1351,22 +1399,9 @@ static int btf_array_resolve(struct btf_verifier_env *env,
> >   		return -EINVAL;
> >   	}
> > -	if (btf_type_is_int(elem_type)) {
> > -		int int_type_data = btf_type_int(elem_type);
> > -		u16 nr_bits = BTF_INT_BITS(int_type_data);
> > -		u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
> > -
> > -		/* Put more restriction on array of int.  The int cannot
> > -		 * be a bit field and it must be either u8/u16/u32/u64.
> > -		 */
> > -		if (BITS_PER_BYTE_MASKED(nr_bits) ||
> > -		    BTF_INT_OFFSET(int_type_data) ||
> > -		    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
> > -		     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
> > -			btf_verifier_log_type(env, v->t,
> > -					      "Invalid array of int");
> > -			return -EINVAL;
> > -		}
> > +	if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) {
> > +		btf_verifier_log_type(env, v->t, "Invalid array of int");
> > +		return -EINVAL;
> >   	}
> >   	if (array->nelems && elem_size > U32_MAX / array->nelems) {
> > 

^ permalink raw reply

* [PATCH][V2] net: vxge: fix spelling mistake in macro VXGE_HW_ERR_PRIVILAGED_OPEARATION
From: Colin King @ 2018-05-22 16:18 UTC (permalink / raw)
  To: Jon Mason, David S . Miller, netdev; +Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Rename VXGE_HW_ERR_PRIVILAGED_OPEARATION to VXGE_HW_ERR_PRIVILEGED_OPERATION
to fix spelling mistake.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
V2: PRIVILAGED -> PRIVILEGED, thanks to Edward Cree for spotting that mistake

---
 drivers/net/ethernet/neterion/vxge/vxge-config.c  | 12 ++++++------
 drivers/net/ethernet/neterion/vxge/vxge-config.h  |  2 +-
 drivers/net/ethernet/neterion/vxge/vxge-ethtool.c |  2 +-
 drivers/net/ethernet/neterion/vxge/vxge-main.c    |  4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.c b/drivers/net/ethernet/neterion/vxge/vxge-config.c
index 6223930a8155..8656fcc9f2a0 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-config.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-config.c
@@ -693,7 +693,7 @@ __vxge_hw_device_is_privilaged(u32 host_type, u32 func_id)
 		VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)
 		return VXGE_HW_OK;
 	else
-		return VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+		return VXGE_HW_ERR_PRIVILEGED_OPERATION;
 }
 
 /*
@@ -1920,7 +1920,7 @@ enum vxge_hw_status vxge_hw_device_getpause_data(struct __vxge_hw_device *hldev,
 	}
 
 	if (!(hldev->access_rights & VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)) {
-		status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+		status = VXGE_HW_ERR_PRIVILEGED_OPERATION;
 		goto exit;
 	}
 
@@ -3153,7 +3153,7 @@ vxge_hw_mgmt_reg_read(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_mrpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILEGED_OPERATION;
 			break;
 		}
 		if (offset > sizeof(struct vxge_hw_mrpcim_reg) - 8) {
@@ -3165,7 +3165,7 @@ vxge_hw_mgmt_reg_read(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_srpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_SRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILEGED_OPERATION;
 			break;
 		}
 		if (index > VXGE_HW_TITAN_SRPCIM_REG_SPACES - 1) {
@@ -3279,7 +3279,7 @@ vxge_hw_mgmt_reg_write(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_mrpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILEGED_OPERATION;
 			break;
 		}
 		if (offset > sizeof(struct vxge_hw_mrpcim_reg) - 8) {
@@ -3291,7 +3291,7 @@ vxge_hw_mgmt_reg_write(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_srpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_SRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILEGED_OPERATION;
 			break;
 		}
 		if (index > VXGE_HW_TITAN_SRPCIM_REG_SPACES - 1) {
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.h b/drivers/net/ethernet/neterion/vxge/vxge-config.h
index cfa970417f81..5ebdbfedc269 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-config.h
+++ b/drivers/net/ethernet/neterion/vxge/vxge-config.h
@@ -127,7 +127,7 @@ enum vxge_hw_status {
 	VXGE_HW_ERR_INVALID_TCODE 		  = VXGE_HW_BASE_ERR + 14,
 	VXGE_HW_ERR_INVALID_BLOCK_SIZE		  = VXGE_HW_BASE_ERR + 15,
 	VXGE_HW_ERR_INVALID_STATE		  = VXGE_HW_BASE_ERR + 16,
-	VXGE_HW_ERR_PRIVILAGED_OPEARATION	  = VXGE_HW_BASE_ERR + 17,
+	VXGE_HW_ERR_PRIVILEGED_OPERATION	  = VXGE_HW_BASE_ERR + 17,
 	VXGE_HW_ERR_INVALID_PORT 		  = VXGE_HW_BASE_ERR + 18,
 	VXGE_HW_ERR_FIFO		 	  = VXGE_HW_BASE_ERR + 19,
 	VXGE_HW_ERR_VPATH			  = VXGE_HW_BASE_ERR + 20,
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c b/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
index 0452848d1316..2f1bde500420 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
@@ -276,7 +276,7 @@ static void vxge_get_ethtool_stats(struct net_device *dev,
 	*ptr++ = 0;
 	status = vxge_hw_device_xmac_stats_get(hldev, xmac_stats);
 	if (status != VXGE_HW_OK) {
-		if (status != VXGE_HW_ERR_PRIVILAGED_OPEARATION) {
+		if (status != VXGE_HW_ERR_PRIVILEGED_OPERATION) {
 			vxge_debug_init(VXGE_ERR,
 				"%s : %d Failure in getting xmac stats",
 				__func__, __LINE__);
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index b2299f2b2155..64fa94f8b471 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -3484,11 +3484,11 @@ static int vxge_device_register(struct __vxge_hw_device *hldev,
 				0,
 				&stat);
 
-	if (status == VXGE_HW_ERR_PRIVILAGED_OPEARATION)
+	if (status == VXGE_HW_ERR_PRIVILEGED_OPERATION)
 		vxge_debug_init(
 			vxge_hw_device_trace_level_get(hldev),
 			"%s: device stats clear returns"
-			"VXGE_HW_ERR_PRIVILAGED_OPEARATION", ndev->name);
+			"VXGE_HW_ERR_PRIVILEGED_OPERATION", ndev->name);
 
 	vxge_debug_entryexit(vxge_hw_device_trace_level_get(hldev),
 		"%s: %s:%d  Exiting...",
-- 
2.17.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox