Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next v3 05/10] bpf: btf: Add BPF_BTF_LOAD command
From: Martin KaFai Lau @ 2018-04-16 19:33 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180416193327.477239-1-kafai@fb.com>

This patch adds a BPF_BTF_LOAD command which
1) loads and verifies the BTF (implemented in earlier patches)
2) returns a BTF fd to userspace.  In the next patch, the
   BTF fd can be specified during BPF_MAP_CREATE.

It currently limits to CAP_SYS_ADMIN.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
 include/linux/btf.h      |  4 +++
 include/uapi/linux/bpf.h |  9 +++++++
 kernel/bpf/btf.c         | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c     | 17 ++++++++++++
 4 files changed, 97 insertions(+)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index d8bdab0280ba..a7c7072535ea 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -8,7 +8,11 @@
 
 struct btf;
 struct btf_type;
+union bpf_attr;
 
+void btf_put(struct btf *btf);
+int btf_new_fd(const union bpf_attr *attr);
+struct btf *btf_get_by_fd(int fd);
 /* Figure out the size of a type_id.  If type_id is a modifier
  * (e.g. const), it will be resolved to find out the type with size.
  *
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec89732a8d..c7d75f18521b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@ enum bpf_cmd {
 	BPF_OBJ_GET_INFO_BY_FD,
 	BPF_PROG_QUERY,
 	BPF_RAW_TRACEPOINT_OPEN,
+	BPF_BTF_LOAD,
 };
 
 enum bpf_map_type {
@@ -363,6 +364,14 @@ union bpf_attr {
 		__u64 name;
 		__u32 prog_fd;
 	} raw_tracepoint;
+
+	struct { /* anonymous struct for BPF_BTF_LOAD */
+		__aligned_u64	btf;
+		__aligned_u64	btf_log_buf;
+		__u32		btf_size;
+		__u32		btf_log_size;
+		__u32		btf_log_level;
+	};
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 15bfca419231..45af787ceddf 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7,6 +7,8 @@
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/bpf_verifier.h>
@@ -190,6 +192,7 @@ struct btf {
 	u32 nr_types;
 	u32 types_size;
 	u32 data_size;
+	refcount_t refcnt;
 };
 
 enum verifier_phase {
@@ -607,6 +610,17 @@ static void btf_free(struct btf *btf)
 	kfree(btf);
 }
 
+static void btf_get(struct btf *btf)
+{
+	refcount_inc(&btf->refcnt);
+}
+
+void btf_put(struct btf *btf)
+{
+	if (btf && refcount_dec_and_test(&btf->refcnt))
+		btf_free(btf);
+}
+
 static int env_resolve_init(struct btf_verifier_env *env)
 {
 	struct btf *btf = env->btf;
@@ -1992,6 +2006,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 
 	if (!err) {
 		btf_verifier_env_free(env);
+		btf_get(btf);
 		return btf;
 	}
 
@@ -2009,3 +2024,55 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
 
 	btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
 }
+
+static int btf_release(struct inode *inode, struct file *filp)
+{
+	btf_put(filp->private_data);
+	return 0;
+}
+
+static const struct file_operations btf_fops = {
+	.release	= btf_release,
+};
+
+int btf_new_fd(const union bpf_attr *attr)
+{
+	struct btf *btf;
+	int fd;
+
+	btf = btf_parse(u64_to_user_ptr(attr->btf),
+			attr->btf_size, attr->btf_log_level,
+			u64_to_user_ptr(attr->btf_log_buf),
+			attr->btf_log_size);
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	fd = anon_inode_getfd("btf", &btf_fops, btf,
+			      O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		btf_put(btf);
+
+	return fd;
+}
+
+struct btf *btf_get_by_fd(int fd)
+{
+	struct btf *btf;
+	struct fd f;
+
+	f = fdget(fd);
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &btf_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	btf = f.file->private_data;
+	btf_get(btf);
+	fdput(f);
+
+	return btf;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ca46df19c9a..cd8ebadc66eb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,6 +11,7 @@
  */
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/btf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched/signal.h>
@@ -2023,6 +2024,19 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+
+static int bpf_btf_load(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_LOAD))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_new_fd(attr);
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2103,6 +2117,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_RAW_TRACEPOINT_OPEN:
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
+	case BPF_BTF_LOAD:
+		err = bpf_btf_load(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 02/10] bpf: btf: Validate type reference
From: Martin KaFai Lau @ 2018-04-16 19:33 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180416193327.477239-1-kafai@fb.com>

After collecting all btf_type in the first pass in an earlier patch,
the second pass (in this patch) can validate the reference types
(e.g. the referring type does exist and it does not refer to itself).

While checking the reference type, it also gathers other information (e.g.
the size of an array).  This info will be useful in checking the
struct's members in a later patch.  They will also be useful in doing
pretty print later.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
 include/linux/btf.h |  37 +++
 kernel/bpf/btf.c    | 668 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 704 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/btf.h

diff --git a/include/linux/btf.h b/include/linux/btf.h
new file mode 100644
index 000000000000..f14b60368753
--- /dev/null
+++ b/include/linux/btf.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#ifndef _LINUX_BTF_H
+#define _LINUX_BTF_H 1
+
+#include <linux/types.h>
+
+struct btf;
+struct btf_type;
+
+/* Figure out the size of a type_id.  If type_id is a modifier
+ * (e.g. const), it will be resolved to find out the type with size.
+ *
+ * For example:
+ * In describing "const void *",  type_id is "const" and "const"
+ * refers to "void *".  The return type will be "void *".
+ *
+ * If type_id is a simple "int", then return type will be "int".
+ *
+ * @btf: struct btf object
+ * @type_id: Find out the size of type_id. The type_id of the return
+ *           type is set to *type_id.
+ * @ret_size: It can be NULL.  If not NULL, the size of the return
+ *            type is set to *ret_size.
+ * Return: The btf_type (resolved to another type with size info if needed).
+ *         NULL is returned if type_id itself does not have size info
+ *         (e.g. void) or it cannot be resolved to another type that
+ *         has size info.
+ *         *type_id and *ret_size will not be changed in the
+ *         NULL return case.
+ */
+const struct btf_type *btf_type_id_size(const struct btf *btf,
+					u32 *type_id,
+					u32 *ret_size);
+
+#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 905be2f48a35..6ade409da2f9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -105,6 +105,50 @@
  *
  * In the first pass, it still does some verifications (e.g.
  * checking the name is a valid offset to the string section).
+ *
+ * Pass #2
+ * ~~~~~~~
+ * The main focus is to resolve a btf_type that is referring
+ * to another type.
+ *
+ * We have to ensure the referring type:
+ * 1) does exist in the BTF (i.e. in btf->types[])
+ * 2) does not cause a loop:
+ *	struct A {
+ *		struct B b;
+ *	};
+ *
+ *	struct B {
+ *		struct A a;
+ *	};
+ *
+ * btf_type_needs_resolve() decides if a btf_type needs
+ * to be resolved.
+ *
+ * The needs_resolve type implements the "resolve()" ops which
+ * essentially does a DFS and detects backedge.
+ *
+ * During resolve (or DFS), different C types have different
+ * "RESOLVED" conditions.
+ *
+ * When resolving a BTF_KIND_STRUCT, we need to resolve all its
+ * members because a member is always referring to another
+ * type.  A struct's member can be treated as "RESOLVED" if
+ * it is referring to a BTF_KIND_PTR.  Otherwise, the
+ * following valid C struct would be rejected:
+ *
+ *	struct A {
+ *		int m;
+ *		struct A *a;
+ *	};
+ *
+ * When resolving a BTF_KIND_PTR, it needs to keep resolving if
+ * it is referring to another BTF_KIND_PTR.  Otherwise, we cannot
+ * detect a pointer loop, e.g.:
+ * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR +
+ *                        ^                                         |
+ *                        +-----------------------------------------+
+ *
  */
 
 #define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
@@ -127,12 +171,19 @@
 	     i < btf_type_vlen(struct_type);			\
 	     i++, member++)
 
+#define for_each_member_from(i, from, struct_type, member)		\
+	for (i = from, member = btf_type_member(struct_type) + from;	\
+	     i < btf_type_vlen(struct_type);				\
+	     i++, member++)
+
 struct btf {
 	union {
 		struct btf_header *hdr;
 		void *data;
 	};
 	struct btf_type **types;
+	u32 *resolved_ids;
+	u32 *resolved_sizes;
 	const char *strings;
 	void *nohdr_data;
 	u32 nr_types;
@@ -140,10 +191,42 @@ struct btf {
 	u32 data_size;
 };
 
+enum verifier_phase {
+	CHECK_META,
+	CHECK_TYPE,
+};
+
+struct resolve_vertex {
+	const struct btf_type *t;
+	u32 type_id;
+	u16 next_member;
+};
+
+enum visit_state {
+	NOT_VISITED,
+	VISITED,
+	RESOLVED,
+};
+
+enum resolve_mode {
+	RESOLVE_TBD,	/* To Be Determined */
+	RESOLVE_PTR,	/* Resolving for Pointer */
+	RESOLVE_STRUCT_OR_ARRAY,	/* Resolving for struct/union
+					 * or array
+					 */
+};
+
+#define MAX_RESOLVE_DEPTH 32
+
 struct btf_verifier_env {
 	struct btf *btf;
+	u8 *visit_states;
+	struct resolve_vertex stack[MAX_RESOLVE_DEPTH];
 	struct bpf_verifier_log log;
 	u32 log_type_id;
+	u32 top_stack;
+	enum verifier_phase phase;
+	enum resolve_mode resolve_mode;
 };
 
 static const char * const btf_kind_str[NR_BTF_KINDS] = {
@@ -167,6 +250,8 @@ struct btf_kind_operations {
 	s32 (*check_meta)(struct btf_verifier_env *env,
 			  const struct btf_type *t,
 			  u32 meta_left);
+	int (*resolve)(struct btf_verifier_env *env,
+		       const struct resolve_vertex *v);
 	void (*log_details)(struct btf_verifier_env *env,
 			    const struct btf_type *t);
 };
@@ -174,6 +259,102 @@ struct btf_kind_operations {
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
 static struct btf_type btf_void;
 
+static bool btf_type_is_modifier(const struct btf_type *t)
+{
+	/* Some of them is not strictly a C modifier
+	 * but they are grouped into the same bucket
+	 * for BTF concern:
+	 *   A type (t) that refers to another
+	 *   type through t->type AND its size cannot
+	 *   be determined without following the t->type.
+	 *
+	 * ptr does not fall into this bucket
+	 * because its size is always sizeof(void *).
+	 */
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_CONST:
+	case BTF_KIND_RESTRICT:
+		return true;
+	};
+
+	return false;
+}
+
+static bool btf_type_is_void(const struct btf_type *t)
+{
+	/* void => no type and size info.
+	 * Hence, FWD is also treated as void.
+	 */
+	return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_void_or_null(const struct btf_type *t)
+{
+	return !t || btf_type_is_void(t);
+}
+
+/* union is only a special case of struct:
+ * all its offsetof(member) == 0
+ */
+static bool btf_type_is_struct(const struct btf_type *t)
+{
+	u8 kind = BTF_INFO_KIND(t->info);
+
+	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+}
+
+static bool btf_type_is_array(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
+}
+
+static bool btf_type_is_ptr(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
+}
+
+static bool btf_type_is_int(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
+}
+
+/* What types need to be resolved?
+ *
+ * btf_type_is_modifier() is an obvious one.
+ *
+ * btf_type_is_struct() because its member refers to
+ * another type (through member->type).
+
+ * btf_type_is_array() because its element (array->type)
+ * refers to another type.  Array can be thought of a
+ * special case of struct while array just has the same
+ * member-type repeated by array->nelems of times.
+ */
+static bool btf_type_needs_resolve(const struct btf_type *t)
+{
+	return btf_type_is_modifier(t) ||
+		btf_type_is_ptr(t) ||
+		btf_type_is_struct(t) ||
+		btf_type_is_array(t);
+}
+
+/* t->size can be used */
+static bool btf_type_has_size(const struct btf_type *t)
+{
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_INT:
+	case BTF_KIND_FLOAT:
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION:
+	case BTF_KIND_ENUM:
+		return true;
+	};
+
+	return false;
+}
+
 static const char *btf_int_encoding_str(u8 encoding)
 {
 	if (encoding == 0)
@@ -236,6 +417,14 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
 		return "(invalid-name-offset)";
 }
 
+static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+{
+	if (type_id > btf->nr_types)
+		return NULL;
+
+	return btf->types[type_id];
+}
+
 __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
 					      const char *fmt, ...)
 {
@@ -310,6 +499,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
 	if (!bpf_verifier_log_needed(log))
 		return;
 
+	/* The CHECK_META phase already did a btf dump.
+	 *
+	 * If member is logged again, it must hit an error in
+	 * parsing this member.  It is useful to print out which
+	 * struct this member belongs to.
+	 */
+	if (env->phase != CHECK_META)
+		btf_verifier_log_type(env, struct_type, NULL);
+
 	__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
 			   btf_name_by_offset(btf, member->name),
 			   member->type, member->offset);
@@ -395,15 +593,176 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
 static void btf_free(struct btf *btf)
 {
 	kvfree(btf->types);
+	kvfree(btf->resolved_sizes);
+	kvfree(btf->resolved_ids);
 	kvfree(btf->data);
 	kfree(btf);
 }
 
+static int env_resolve_init(struct btf_verifier_env *env)
+{
+	struct btf *btf = env->btf;
+	u32 nr_types = btf->nr_types;
+	u32 *resolved_sizes = NULL;
+	u32 *resolved_ids = NULL;
+	u8 *visit_states = NULL;
+
+	/* +1 for btf_void */
+	resolved_sizes = kvzalloc((nr_types + 1) * sizeof(*resolved_sizes),
+				  GFP_KERNEL | __GFP_NOWARN);
+	if (!resolved_sizes)
+		goto nomem;
+
+	resolved_ids = kvzalloc((nr_types + 1) * sizeof(*resolved_ids),
+				GFP_KERNEL | __GFP_NOWARN);
+	if (!resolved_ids)
+		goto nomem;
+
+	visit_states = kvzalloc((nr_types + 1) * sizeof(*visit_states),
+				GFP_KERNEL | __GFP_NOWARN);
+	if (!visit_states)
+		goto nomem;
+
+	btf->resolved_sizes = resolved_sizes;
+	btf->resolved_ids = resolved_ids;
+	env->visit_states = visit_states;
+
+	return 0;
+
+nomem:
+	kvfree(resolved_sizes);
+	kvfree(resolved_ids);
+	kvfree(visit_states);
+	return -ENOMEM;
+}
+
 static void btf_verifier_env_free(struct btf_verifier_env *env)
 {
+	kvfree(env->visit_states);
 	kfree(env);
 }
 
+static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
+				     const struct btf_type *next_type)
+{
+	switch (env->resolve_mode) {
+	case RESOLVE_TBD:
+		/* int, enum or void is a sink */
+		return !btf_type_needs_resolve(next_type);
+	case RESOLVE_PTR:
+		/* int, enum, void, struct or array is a sink for ptr */
+		return !btf_type_is_modifier(next_type) &&
+			!btf_type_is_ptr(next_type);
+	case RESOLVE_STRUCT_OR_ARRAY:
+		/* int, enum, void or ptr is a sink for struct and array */
+		return !btf_type_is_modifier(next_type) &&
+			!btf_type_is_array(next_type) &&
+			!btf_type_is_struct(next_type);
+	default:
+		BUG_ON(1);
+	};
+}
+
+static bool env_type_is_resolved(const struct btf_verifier_env *env,
+				 u32 type_id)
+{
+	return env->visit_states[type_id] == RESOLVED;
+}
+
+static int env_stack_push(struct btf_verifier_env *env,
+			  const struct btf_type *t, u32 type_id)
+{
+	struct resolve_vertex *v;
+
+	if (env->top_stack == MAX_RESOLVE_DEPTH)
+		return -E2BIG;
+
+	if (env->visit_states[type_id] != NOT_VISITED)
+		return -EEXIST;
+
+	env->visit_states[type_id] = VISITED;
+
+	v = &env->stack[env->top_stack++];
+	v->t = t;
+	v->type_id = type_id;
+	v->next_member = 0;
+
+	if (env->resolve_mode == RESOLVE_TBD) {
+		if (btf_type_is_ptr(t))
+			env->resolve_mode = RESOLVE_PTR;
+		else if (btf_type_is_struct(t) || btf_type_is_array(t))
+			env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY;
+	}
+
+	return 0;
+}
+
+static void env_stack_set_next_member(struct btf_verifier_env *env,
+				      u16 next_member)
+{
+	env->stack[env->top_stack - 1].next_member = next_member;
+}
+
+static void env_stack_pop_resolved(struct btf_verifier_env *env,
+				   u32 resolved_type_id,
+				   u32 resolved_size)
+{
+	u32 type_id = env->stack[--(env->top_stack)].type_id;
+	struct btf *btf = env->btf;
+
+	btf->resolved_sizes[type_id] = resolved_size;
+	btf->resolved_ids[type_id] = resolved_type_id;
+	env->visit_states[type_id] = RESOLVED;
+}
+
+static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
+{
+	return env->top_stack ? &env->stack[env->top_stack - 1] : NULL;
+}
+
+/* The input param "type_id" must point to a needs_resolve type */
+static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
+						  u32 *type_id)
+{
+	*type_id = btf->resolved_ids[*type_id];
+	return btf_type_by_id(btf, *type_id);
+}
+
+const struct btf_type *btf_type_id_size(const struct btf *btf,
+					u32 *type_id, u32 *ret_size)
+{
+	const struct btf_type *size_type;
+	u32 size_type_id = *type_id;
+	u32 size = 0;
+
+	size_type = btf_type_by_id(btf, size_type_id);
+	if (btf_type_is_void_or_null(size_type))
+		return NULL;
+
+	if (btf_type_has_size(size_type)) {
+		size = size_type->size;
+	} else if (btf_type_is_array(size_type)) {
+		size = btf->resolved_sizes[size_type_id];
+	} else if (btf_type_is_ptr(size_type)) {
+		size = sizeof(void *);
+	} else {
+		if (WARN_ON_ONCE(!btf_type_is_modifier(size_type)))
+			return NULL;
+
+		size = btf->resolved_sizes[size_type_id];
+		size_type_id = btf->resolved_ids[size_type_id];
+		size_type = btf_type_by_id(btf, size_type_id);
+		if (btf_type_is_void(size_type))
+			return NULL;
+	}
+
+	*type_id = size_type_id;
+	if (ret_size)
+		*ret_size = size;
+
+	return size_type;
+}
+
 static int btf_df_check_meta(struct btf_verifier_env *env,
 			     const struct btf_type *t,
 			     u32 meta_left)
@@ -412,6 +771,13 @@ static int btf_df_check_meta(struct btf_verifier_env *env,
 	return -ENOTSUPP;
 }
 
+static int btf_df_resolve(struct btf_verifier_env *env,
+			  const struct resolve_vertex *v)
+{
+	btf_verifier_log_basic(env, v->t, "Unsupported resolve");
+	return -EINVAL;
+}
+
 static void btf_df_log(struct btf_verifier_env *env,
 		       const struct btf_type *t)
 {
@@ -420,6 +786,7 @@ static void btf_df_log(struct btf_verifier_env *env,
 
 static struct btf_kind_operations df_ops = {
 	.check_meta = btf_df_check_meta,
+	.resolve = btf_df_resolve,
 	.log_details = btf_df_log,
 };
 
@@ -487,6 +854,7 @@ static void btf_int_log(struct btf_verifier_env *env,
 
 static const struct btf_kind_operations int_ops = {
 	.check_meta = btf_int_check_meta,
+	.resolve = btf_df_resolve,
 	.log_details = btf_int_log,
 };
 
@@ -509,6 +877,104 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 	return 0;
 }
 
+static int btf_modifier_resolve(struct btf_verifier_env *env,
+				const struct resolve_vertex *v)
+{
+	const struct btf_type *t = v->t;
+	const struct btf_type *next_type;
+	u32 next_type_id = t->type;
+	struct btf *btf = env->btf;
+	u32 next_type_size = 0;
+
+	next_type = btf_type_by_id(btf, next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "typedef void new_void", "const void"...etc */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* Figure out the resolved next_type_id with size.
+	 * They will be stored in the current modifier's
+	 * resolved_ids and resolved_sizes such that it can
+	 * save us a few type-following when we use it later (e.g. in
+	 * pretty print).
+	 */
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+resolved:
+	env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+	return 0;
+}
+
+static int btf_ptr_resolve(struct btf_verifier_env *env,
+			   const struct resolve_vertex *v)
+{
+	const struct btf_type *next_type;
+	const struct btf_type *t = v->t;
+	u32 next_type_id = t->type;
+	struct btf *btf = env->btf;
+	u32 next_type_size = 0;
+
+	next_type = btf_type_by_id(btf, next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "void *" */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
+	 * the modifier may have stopped resolving when it was resolved
+	 * to a ptr (last-resolved-ptr).
+	 *
+	 * We now need to continue from the last-resolved-ptr to
+	 * ensure the last-resolved-ptr will not referring back to
+	 * the currenct ptr (t).
+	 */
+	if (btf_type_is_modifier(next_type)) {
+		const struct btf_type *resolved_type;
+		u32 resolved_type_id;
+
+		resolved_type_id = next_type_id;
+		resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
+
+		if (btf_type_is_ptr(resolved_type) &&
+		    !env_type_is_resolve_sink(env, resolved_type) &&
+		    !env_type_is_resolved(env, resolved_type_id))
+			return env_stack_push(env, resolved_type,
+					      resolved_type_id);
+	}
+
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+resolved:
+	env_stack_pop_resolved(env, next_type_id, 0);
+
+	return 0;
+}
+
 static void btf_ref_type_log(struct btf_verifier_env *env,
 			     const struct btf_type *t)
 {
@@ -517,16 +983,19 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
 
 static struct btf_kind_operations modifier_ops = {
 	.check_meta = btf_ref_type_check_meta,
+	.resolve = btf_modifier_resolve,
 	.log_details = btf_ref_type_log,
 };
 
 static struct btf_kind_operations ptr_ops = {
 	.check_meta = btf_ref_type_check_meta,
+	.resolve = btf_ptr_resolve,
 	.log_details = btf_ref_type_log,
 };
 
 static struct btf_kind_operations fwd_ops = {
 	.check_meta = btf_ref_type_check_meta,
+	.resolve = btf_df_resolve,
 	.log_details = btf_ref_type_log,
 };
 
@@ -565,6 +1034,61 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
 	return meta_needed;
 }
 
+static int btf_array_resolve(struct btf_verifier_env *env,
+			     const struct resolve_vertex *v)
+{
+	const struct btf_array *array = btf_type_array(v->t);
+	const struct btf_type *elem_type;
+	u32 elem_type_id = array->type;
+	struct btf *btf = env->btf;
+	u32 elem_size;
+
+	elem_type = btf_type_by_id(btf, elem_type_id);
+	if (btf_type_is_void_or_null(elem_type)) {
+		btf_verifier_log_type(env, v->t,
+				      "Invalid elem");
+		return -EINVAL;
+	}
+
+	if (!env_type_is_resolve_sink(env, elem_type) &&
+	    !env_type_is_resolved(env, elem_type_id))
+		return env_stack_push(env, elem_type, elem_type_id);
+
+	elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+	if (!elem_type) {
+		btf_verifier_log_type(env, v->t, "Invalid elem");
+		return -EINVAL;
+	}
+
+	if (btf_type_is_int(elem_type)) {
+		int int_type_data = btf_type_int(elem_type);
+		u16 nr_bits = BTF_INT_BITS(int_type_data);
+		u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+
+		/* Put more restriction on array of int.  The int cannot
+		 * be a bit field and it must be either u8/u16/u32/u64.
+		 */
+		if (BITS_PER_BYTE_MASKED(nr_bits) ||
+		    BTF_INT_OFFSET(int_type_data) ||
+		    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+		     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+			btf_verifier_log_type(env, v->t,
+					      "Invalid array of int");
+			return -EINVAL;
+		}
+	}
+
+	if (array->nelems && elem_size > U32_MAX / array->nelems) {
+		btf_verifier_log_type(env, v->t,
+				      "Array size overflows U32_MAX");
+		return -EINVAL;
+	}
+
+	env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems);
+
+	return 0;
+}
+
 static void btf_array_log(struct btf_verifier_env *env,
 			  const struct btf_type *t)
 {
@@ -576,6 +1100,7 @@ static void btf_array_log(struct btf_verifier_env *env,
 
 static struct btf_kind_operations array_ops = {
 	.check_meta = btf_array_check_meta,
+	.resolve = btf_array_resolve,
 	.log_details = btf_array_log,
 };
 
@@ -633,6 +1158,50 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 	return meta_needed;
 }
 
+static int btf_struct_resolve(struct btf_verifier_env *env,
+			      const struct resolve_vertex *v)
+{
+	const struct btf_member *member;
+	u16 i;
+
+	/* Before continue resolving the next_member,
+	 * ensure the last member is indeed resolved to a
+	 * type with size info.
+	 */
+	if (v->next_member) {
+		const struct btf_member *last_member;
+		u16 last_member_type_id;
+
+		last_member = btf_type_member(v->t) + v->next_member - 1;
+		last_member_type_id = last_member->type;
+		if (WARN_ON_ONCE(!env_type_is_resolved(env,
+						       last_member_type_id)))
+			return -EINVAL;
+	}
+
+	for_each_member_from(i, v->next_member, v->t, member) {
+		u32 member_type_id = member->type;
+		const struct btf_type *member_type = btf_type_by_id(env->btf,
+								member_type_id);
+
+		if (btf_type_is_void_or_null(member_type)) {
+			btf_verifier_log_member(env, v->t, member,
+						"Invalid member");
+			return -EINVAL;
+		}
+
+		if (!env_type_is_resolve_sink(env, member_type) &&
+		    !env_type_is_resolved(env, member_type_id)) {
+			env_stack_set_next_member(env, i + 1);
+			return env_stack_push(env, member_type, member_type_id);
+		}
+	}
+
+	env_stack_pop_resolved(env, 0, 0);
+
+	return 0;
+}
+
 static void btf_struct_log(struct btf_verifier_env *env,
 			   const struct btf_type *t)
 {
@@ -641,6 +1210,7 @@ static void btf_struct_log(struct btf_verifier_env *env,
 
 static struct btf_kind_operations struct_ops = {
 	.check_meta = btf_struct_check_meta,
+	.resolve = btf_struct_resolve,
 	.log_details = btf_struct_log,
 };
 
@@ -694,6 +1264,7 @@ static void btf_enum_log(struct btf_verifier_env *env,
 
 static struct btf_kind_operations enum_ops = {
 	.check_meta = btf_enum_check_meta,
+	.resolve = btf_df_resolve,
 	.log_details = btf_enum_log,
 };
 
@@ -776,9 +1347,104 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
 	return 0;
 }
 
+static int btf_resolve(struct btf_verifier_env *env,
+		       const struct btf_type *t, u32 type_id)
+{
+	const struct resolve_vertex *v;
+	int err = 0;
+
+	env->resolve_mode = RESOLVE_TBD;
+	env_stack_push(env, t, type_id);
+	while (!err && (v = env_stack_peak(env))) {
+		env->log_type_id = v->type_id;
+		err = btf_type_ops(v->t)->resolve(env, v);
+	}
+
+	env->log_type_id = type_id;
+	if (err == -E2BIG)
+		btf_verifier_log_type(env, t,
+				      "Exceeded max resolving depth:%u",
+				      MAX_RESOLVE_DEPTH);
+	else if (err == -EEXIST)
+		btf_verifier_log_type(env, t, "Loop detected");
+
+	return err;
+}
+
+static bool btf_resolve_valid(struct btf_verifier_env *env,
+			      const struct btf_type *t,
+			      u32 type_id)
+{
+	struct btf *btf = env->btf;
+
+	if (!env_type_is_resolved(env, type_id))
+		return false;
+
+	if (btf_type_is_struct(t))
+		return !btf->resolved_ids[type_id] &&
+			!btf->resolved_sizes[type_id];
+
+	if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) {
+		t = btf_type_id_resolve(btf, &type_id);
+		return t && !btf_type_is_modifier(t);
+	}
+
+	if (btf_type_is_array(t)) {
+		const struct btf_array *array = btf_type_array(t);
+		const struct btf_type *elem_type;
+		u32 elem_type_id = array->type;
+		u32 elem_size;
+
+		elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+		return elem_type && !btf_type_is_modifier(elem_type) &&
+			(array->nelems * elem_size ==
+			 btf->resolved_sizes[type_id]);
+	}
+
+	return false;
+}
+
+static int btf_check_all_types(struct btf_verifier_env *env)
+{
+	struct btf *btf = env->btf;
+	u32 type_id;
+	int err;
+
+	err = env_resolve_init(env);
+	if (err)
+		return err;
+
+	env->phase++;
+	for (type_id = 1; type_id <= btf->nr_types; type_id++) {
+		const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+		env->log_type_id = type_id;
+		if (btf_type_needs_resolve(t) &&
+		    !env_type_is_resolved(env, type_id)) {
+			err = btf_resolve(env, t, type_id);
+			if (err)
+				return err;
+		}
+
+		if (btf_type_needs_resolve(t) &&
+		    !btf_resolve_valid(env, t, type_id)) {
+			btf_verifier_log_type(env, t, "Invalid resolve state");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int btf_parse_type_sec(struct btf_verifier_env *env)
 {
-	return btf_check_all_metas(env);
+	int err;
+
+	err = btf_check_all_metas(env);
+	if (err)
+		return err;
+
+	return btf_check_all_types(env);
 }
 
 static int btf_parse_str_sec(struct btf_verifier_env *env)
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 00/10] BTF: BPF Type Format
From: Martin KaFai Lau @ 2018-04-16 19:33 UTC (permalink / raw)
  To: netdev
  Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team,
	Arnaldo Carvalho de Melo

This patch introduces BPF Type Format (BTF).

BTF (BPF Type Format) is the meta data format which describes
the data types of BPF program/map.  Hence, it basically focus
on the C programming language which the modern BPF is primary
using.  The first use case is to provide a generic pretty print
capability for a BPF map.

A modified pahole (Cc: Arnaldo) that can convert dwarf to BTF is here:
https://github.com/iamkafai/pahole/tree/btf

Please see individual patch for details.

v3:
- Rebase to bpf-next
- Fix sparse warning (by adding static)
- Add BTF header logging: btf_verifier_log_hdr()
- Fix the alignment test on btf->type_off
- Add tests for the BTF header
- Lower the max BTF size to 16MB.  It should be enough
  for some time.  We could raise it later if it would
  be needed.

v2:
- Use kvfree where needed in patch 1 and 2
- Also consider BTF_INT_OFFSET() in the btf_int_check_meta()
  in patch 1
- Fix an incorrect goto target in map_create() during
  the btf-error-path in patch 7
- re-org some local vars to keep the rev xmas tree in btf.c

Martin KaFai Lau (10):
  bpf: btf: Introduce BPF Type Format (BTF)
  bpf: btf: Validate type reference
  bpf: btf: Check members of struct/union
  bpf: btf: Add pretty print capability for data with BTF type info
  bpf: btf: Add BPF_BTF_LOAD command
  bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd
  bpf: btf: Add pretty print support to the basic arraymap
  bpf: btf: Sync bpf.h and btf.h to tools/
  bpf: btf: Add BTF support to libbpf
  bpf: btf: Add BTF tests

 include/linux/bpf.h                          |   20 +-
 include/linux/btf.h                          |   48 +
 include/uapi/linux/bpf.h                     |   12 +
 include/uapi/linux/btf.h                     |  132 ++
 kernel/bpf/Makefile                          |    1 +
 kernel/bpf/arraymap.c                        |   50 +
 kernel/bpf/btf.c                             | 2093 ++++++++++++++++++++++++++
 kernel/bpf/inode.c                           |  146 +-
 kernel/bpf/syscall.c                         |   51 +-
 tools/include/uapi/linux/bpf.h               |   13 +
 tools/include/uapi/linux/btf.h               |  132 ++
 tools/lib/bpf/Build                          |    2 +-
 tools/lib/bpf/bpf.c                          |   92 +-
 tools/lib/bpf/bpf.h                          |   16 +
 tools/lib/bpf/btf.c                          |  377 +++++
 tools/lib/bpf/btf.h                          |   22 +
 tools/lib/bpf/libbpf.c                       |  148 +-
 tools/lib/bpf/libbpf.h                       |    3 +
 tools/testing/selftests/bpf/Makefile         |   26 +-
 tools/testing/selftests/bpf/test_btf.c       | 1669 ++++++++++++++++++++
 tools/testing/selftests/bpf/test_btf_haskv.c |   48 +
 tools/testing/selftests/bpf/test_btf_nokv.c  |   43 +
 22 files changed, 5103 insertions(+), 41 deletions(-)
 create mode 100644 include/linux/btf.h
 create mode 100644 include/uapi/linux/btf.h
 create mode 100644 kernel/bpf/btf.c
 create mode 100644 tools/include/uapi/linux/btf.h
 create mode 100644 tools/lib/bpf/btf.c
 create mode 100644 tools/lib/bpf/btf.h
 create mode 100644 tools/testing/selftests/bpf/test_btf.c
 create mode 100644 tools/testing/selftests/bpf/test_btf_haskv.c
 create mode 100644 tools/testing/selftests/bpf/test_btf_nokv.c

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf-next v3 10/10] bpf: btf: Add BTF tests
From: Martin KaFai Lau @ 2018-04-16 19:33 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180416193327.477239-1-kafai@fb.com>

This patch tests the BTF loading, map_create with BTF
and the changes in libbpf.

-r: Raw tests that test raw crafted BTF data
-f: Test LLVM compiled bpf prog with BTF data
-g: Test BPF_OBJ_GET_INFO_BY_FD for btf_fd
-p: Test pretty print

The tools/testing/selftests/bpf/Makefile will probe
for BTF support in llc and pahole before generating
debug info (-g) and convert them to BTF.  You can supply
the BTF supported binary through the following make variables:
LLC, BTF_PAHOLE and LLVM_OBJCOPY.

LLC: The lastest llc with -mattr=dwarfris support for the bpf target.
     It is only in the master of the llvm repo for now.
BTF_PAHOLE: The modified pahole with BTF support:
	    https://github.com/iamkafai/pahole/tree/btf
	    To add a BTF section: "pahole -J bpf_prog.o"
LLVM_OBJCOPY: Any llvm-objcopy should do

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
 tools/testing/selftests/bpf/Makefile         |   26 +-
 tools/testing/selftests/bpf/test_btf.c       | 1669 ++++++++++++++++++++++++++
 tools/testing/selftests/bpf/test_btf_haskv.c |   48 +
 tools/testing/selftests/bpf/test_btf_nokv.c  |   43 +
 4 files changed, 1783 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_btf.c
 create mode 100644 tools/testing/selftests/bpf/test_btf_haskv.c
 create mode 100644 tools/testing/selftests/bpf/test_btf_nokv.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 0a315ddabbf4..8d954926de91 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,14 +24,15 @@ urandom_read: urandom_read.c
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
-	test_sock test_sock_addr
+	test_sock test_sock_addr test_btf
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
-	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o
+	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o \
+	test_btf_haskv.o test_btf_nokv.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -66,6 +67,8 @@ $(BPFOBJ): force
 
 CLANG ?= clang
 LLC   ?= llc
+LLVM_OBJCOPY ?= llvm-objcopy
+BTF_PAHOLE ?= pahole
 
 PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
 
@@ -83,9 +86,26 @@ CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
 $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline
 $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline
 
+BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help |& grep dwarfris)
+BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help |& grep BTF)
+BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --version |& grep LLVM)
+
+ifneq ($(BTF_LLC_PROBE),)
+ifneq ($(BTF_PAHOLE_PROBE),)
+ifneq ($(BTF_OBJCOPY_PROBE),)
+	CLANG_FLAGS += -g
+	LLC_FLAGS += -mattr=dwarfris
+	DWARF2BTF = y
+endif
+endif
+endif
+
 $(OUTPUT)/%.o: %.c
 	$(CLANG) $(CLANG_FLAGS) \
 		 -O2 -target bpf -emit-llvm -c $< -o - |      \
-	$(LLC) -march=bpf -mcpu=$(CPU) -filetype=obj -o $@
+	$(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@
+ifeq ($(DWARF2BTF),y)
+	$(BTF_PAHOLE) -J $@
+endif
 
 EXTRA_CLEAN := $(TEST_CUSTOM_PROGS)
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
new file mode 100644
index 000000000000..7b39b1f712a1
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -0,0 +1,1669 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <bpf/bpf.h>
+#include <sys/resource.h>
+#include <libelf.h>
+#include <gelf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <bpf/libbpf.h>
+#include <bpf/btf.h>
+
+#include "bpf_rlimit.h"
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define __printf(a, b)	__attribute__((format(printf, a, b)))
+
+__printf(1, 2)
+static int __base_pr(const char *format, ...)
+{
+	va_list args;
+	int err;
+
+	va_start(args, format);
+	err = vfprintf(stderr, format, args);
+	va_end(args);
+	return err;
+}
+
+#define BTF_INFO_ENC(kind, root, vlen)			\
+	((!!(root) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
+
+#define BTF_TYPE_ENC(name, info, size_or_type)	\
+	(name), (info), (size_or_type)
+
+#define BTF_INT_ENC(encoding, bits_offset, nr_bits)	\
+	((encoding) << 24 | (bits_offset) << 16 | (nr_bits))
+#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz)	\
+	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz),	\
+	BTF_INT_ENC(encoding, bits_offset, bits)
+
+#define BTF_ARRAY_ENC(type, index_type, nr_elems)	\
+	(type), (index_type), (nr_elems)
+#define BTF_TYPE_ARRAY_ENC(type, index_type, nr_elems) \
+	BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 0), \
+	BTF_ARRAY_ENC(type, index_type, nr_elems)
+
+#define BTF_MEMBER_ENC(name, type, bits_offset)	\
+	(name), (type), (bits_offset)
+#define BTF_ENUM_ENC(name, val) (name), (val)
+
+#define BTF_TYPEDEF_ENC(name, type) \
+	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0), type)
+
+#define BTF_PTR_ENC(name, type) \
+	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), type)
+
+#define BTF_END_RAW 0xdeadbeef
+#define NAME_TBD 0xdeadb33f
+
+#define MAX_NR_RAW_TYPES 1024
+#define BTF_LOG_BUF_SIZE 65535
+
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+static struct args {
+	unsigned int raw_test_num;
+	unsigned int file_test_num;
+	unsigned int get_info_test_num;
+	bool raw_test;
+	bool file_test;
+	bool get_info_test;
+	bool pprint_test;
+	bool always_log;
+} args;
+
+static char btf_log_buf[BTF_LOG_BUF_SIZE];
+
+static struct btf_header hdr_tmpl = {
+	.magic = BTF_MAGIC,
+	.version = BTF_VERSION,
+};
+
+struct btf_raw_test {
+	const char *descr;
+	const char *str_sec;
+	const char *map_name;
+	__u32 raw_types[MAX_NR_RAW_TYPES];
+	__u32 str_sec_size;
+	enum bpf_map_type map_type;
+	__u32 key_size;
+	__u32 value_size;
+	__u32 key_id;
+	__u32 value_id;
+	__u32 max_entries;
+	bool btf_load_err;
+	bool map_create_err;
+	int type_off_delta;
+	int str_off_delta;
+	int str_len_delta;
+};
+
+static struct btf_raw_test raw_tests[] = {
+/* enum E {
+ *     E0,
+ *     E1,
+ * };
+ *
+ * struct A {
+ *	int m;
+ *	unsigned long long n;
+ *	char o;
+ *	[3 bytes hole]
+ *	int p[8];
+ *	int q[4][8];
+ *	enum E r;
+ * };
+ */
+{
+	.descr = "struct test #1",
+	.raw_types = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+		/* unsigned long long */
+		BTF_TYPE_INT_ENC(0, 0, 0, 64, 8),		/* [2] */
+		/* char */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1),	/* [3] */
+		/* int[8] */
+		BTF_TYPE_ARRAY_ENC(1, 1, 8),			/* [4] */
+		/* struct A { */				/* [5] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 6), 180),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m;		*/
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* unsigned long long n;*/
+		BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o;		*/
+		BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8]		*/
+		BTF_MEMBER_ENC(NAME_TBD, 6, 384),/* int q[4][8]		*/
+		BTF_MEMBER_ENC(NAME_TBD, 7, 1408), /* enum E r		*/
+		/* } */
+		/* int[4][8] */
+		BTF_TYPE_ARRAY_ENC(4, 1, 4),			/* [6] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), sizeof(int)),
+		BTF_ENUM_ENC(NAME_TBD, 0),
+		BTF_ENUM_ENC(NAME_TBD, 1),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0n\0o\0p\0q\0r\0E\0E0\0E1",
+	.str_sec_size = sizeof("\0A\0m\0n\0o\0p\0q\0r\0E\0E0\0E1"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "struct_test1_map",
+	.key_size = sizeof(int),
+	.value_size = 180,
+	.key_id = 1,
+	.value_id = 5,
+	.max_entries = 4,
+},
+
+/* typedef struct b Struct_B;
+ *
+ * struct A {
+ *     int m;
+ *     struct b n[4];
+ *     const Struct_B o[4];
+ * };
+ *
+ * struct B {
+ *     int m;
+ *     int n;
+ * };
+ */
+{
+	.descr = "struct test #2",
+	.raw_types = {
+		/* int */					/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* struct b [4] */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(4, 1, 4),
+
+		/* struct A { */				/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 3), 68),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m;		*/
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* struct B n[4]	*/
+		BTF_MEMBER_ENC(NAME_TBD, 8, 288),/* const Struct_B o[4];*/
+		/* } */
+
+		/* struct B { */				/* [4] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
+		BTF_MEMBER_ENC(NAME_TBD, 1, 32),/* int n; */
+		/* } */
+
+		/* const int */					/* [5] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1),
+		/* typedef struct b Struct_B */	/* [6] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0), 4),
+		/* const Struct_B */				/* [7] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 6),
+		/* const Struct_B [4] */			/* [8] */
+		BTF_TYPE_ARRAY_ENC(7, 1, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0n\0o\0B\0m\0n\0Struct_B",
+	.str_sec_size = sizeof("\0A\0m\0n\0o\0B\0m\0n\0Struct_B"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "struct_test2_map",
+	.key_size = sizeof(int),
+	.value_size = 68,
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+},
+
+/* Test member exceeds the size of struct.
+ *
+ * struct A {
+ *     int m;
+ *     int n;
+ * };
+ */
+{
+	.descr = "size check test #1",
+	.raw_types = {
+		/* int */					/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* struct A { */				/* [2] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 2 -  1),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* int n; */
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0n",
+	.str_sec_size = sizeof("\0A\0m\0n"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "size_check1_map",
+	.key_size = sizeof(int),
+	.value_size = 1,
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* Test member exeeds the size of struct
+ *
+ * struct A {
+ *     int m;
+ *     int n[2];
+ * };
+ */
+{
+	.descr = "size check test #2",
+	.raw_types = {
+		/* int */					/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)),
+		/* int[2] */					/* [2] */
+		BTF_TYPE_ARRAY_ENC(1, 1, 2),
+		/* struct A { */				/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 3 - 1),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* int n[2]; */
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0n",
+	.str_sec_size = sizeof("\0A\0m\0n"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "size_check2_map",
+	.key_size = sizeof(int),
+	.value_size = 1,
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+	.btf_load_err = true,
+
+},
+
+/* Test member exeeds the size of struct
+ *
+ * struct A {
+ *     int m;
+ *     void *n;
+ * };
+ */
+{
+	.descr = "size check test #3",
+	.raw_types = {
+		/* int */					/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)),
+		/* void* */					/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0),
+		/* struct A { */				/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) + sizeof(void *) - 1),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* void *n; */
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0n",
+	.str_sec_size = sizeof("\0A\0m\0n"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "size_check3_map",
+	.key_size = sizeof(int),
+	.value_size = 1,
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* Test member exceeds the size of struct
+ *
+ * enum E {
+ *     E0,
+ *     E1,
+ * };
+ *
+ * struct A {
+ *     int m;
+ *     enum E n;
+ * };
+ */
+{
+	.descr = "size check test #4",
+	.raw_types = {
+		/* int */			/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)),
+		/* enum E { */			/* [2] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), sizeof(int)),
+		BTF_ENUM_ENC(NAME_TBD, 0),
+		BTF_ENUM_ENC(NAME_TBD, 1),
+		/* } */
+		/* struct A { */		/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 2 - 1),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int m; */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* enum E n; */
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0E\0E0\0E1\0A\0m\0n",
+	.str_sec_size = sizeof("\0E\0E0\0E1\0A\0m\0n"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "size_check4_map",
+	.key_size = sizeof(int),
+	.value_size = 1,
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* typedef const void * const_void_ptr;
+ * struct A {
+ *	const_void_ptr m;
+ * };
+ */
+{
+	.descr = "void test #1",
+	.raw_types = {
+		/* int */		/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* const void */	/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+		/* const void* */	/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
+		/* typedef const void * const_void_ptr */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
+		/* struct A { */	/* [4] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+		/* const_void_ptr m; */
+		BTF_MEMBER_ENC(NAME_TBD, 3, 0),
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0const_void_ptr\0A\0m",
+	.str_sec_size = sizeof("\0const_void_ptr\0A\0m"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "void_test1_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *),
+	.key_id = 1,
+	.value_id = 4,
+	.max_entries = 4,
+},
+
+/* struct A {
+ *     const void m;
+ * };
+ */
+{
+	.descr = "void test #2",
+	.raw_types = {
+		/* int */		/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* const void */	/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+		/* struct A { */	/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 8),
+		/* const void m; */
+		BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+		/* } */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m",
+	.str_sec_size = sizeof("\0A\0m"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "void_test2_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *),
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* typedef const void * const_void_ptr;
+ * const_void_ptr[4]
+ */
+{
+	.descr = "void test #3",
+	.raw_types = {
+		/* int */		/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* const void */	/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+		/* const void* */	/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
+		/* typedef const void * const_void_ptr */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
+		/* const_void_ptr[4] */	/* [4] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0const_void_ptr",
+	.str_sec_size = sizeof("\0const_void_ptr"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "void_test3_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *) * 4,
+	.key_id = 1,
+	.value_id = 4,
+	.max_entries = 4,
+},
+
+/* const void[4]  */
+{
+	.descr = "void test #4",
+	.raw_types = {
+		/* int */		/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* const void */	/* [2] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+		/* const void[4] */	/* [3] */
+		BTF_TYPE_ARRAY_ENC(2, 1, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m",
+	.str_sec_size = sizeof("\0A\0m"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "void_test4_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *) * 4,
+	.key_id = 1,
+	.value_id = 3,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* Array_A  <------------------+
+ *     elem_type == Array_B    |
+ *                    |        |
+ *                    |        |
+ * Array_B  <-------- +        |
+ *      elem_type == Array A --+
+ */
+{
+	.descr = "loop test #1",
+	.raw_types = {
+		/* int */			/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* Array_A */			/* [2] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 8),
+		/* Array_B */			/* [3] */
+		BTF_TYPE_ARRAY_ENC(2, 1, 8),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test1_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(sizeof(int) * 8),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* typedef is _before_ the BTF type of Array_A and Array_B
+ *
+ * typedef Array_B int_array;
+ *
+ * Array_A  <------------------+
+ *     elem_type == int_array  |
+ *                    |        |
+ *                    |        |
+ * Array_B  <-------- +        |
+ *      elem_type == Array_A --+
+ */
+{
+	.descr = "loop test #2",
+	.raw_types = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+		/* typedef Array_B int_array */
+		BTF_TYPEDEF_ENC(1, 4),				/* [2] */
+		/* Array_A */
+		BTF_TYPE_ARRAY_ENC(2, 1, 8),			/* [3] */
+		/* Array_B */
+		BTF_TYPE_ARRAY_ENC(3, 1, 8),			/* [4] */
+
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int_array\0",
+	.str_sec_size = sizeof("\0int_array"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test2_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(sizeof(int) * 8),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* Array_A  <------------------+
+ *     elem_type == Array_B    |
+ *                    |        |
+ *                    |        |
+ * Array_B  <-------- +        |
+ *      elem_type == Array_A --+
+ */
+{
+	.descr = "loop test #3",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* Array_A */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 8),
+		/* Array_B */				/* [3] */
+		BTF_TYPE_ARRAY_ENC(2, 1, 8),
+
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test3_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(sizeof(int) * 8),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* typedef is _between_ the BTF type of Array_A and Array_B
+ *
+ * typedef Array_B int_array;
+ *
+ * Array_A  <------------------+
+ *     elem_type == int_array  |
+ *                    |        |
+ *                    |        |
+ * Array_B  <-------- +        |
+ *      elem_type == Array_A --+
+ */
+{
+	.descr = "loop test #4",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* Array_A */				/* [2] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 8),
+		/* typedef Array_B int_array */		/* [3] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 4),
+		/* Array_B */				/* [4] */
+		BTF_TYPE_ARRAY_ENC(2, 1, 8),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int_array\0",
+	.str_sec_size = sizeof("\0int_array"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test4_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(sizeof(int) * 8),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* typedef struct B Struct_B
+ *
+ * struct A {
+ *     int x;
+ *     Struct_B y;
+ * };
+ *
+ * struct B {
+ *     int x;
+ *     struct A y;
+ * };
+ */
+{
+	.descr = "loop test #5",
+	.raw_types = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+		/* struct A */					/* [2] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int x;	*/
+		BTF_MEMBER_ENC(NAME_TBD, 3, 32),/* Struct_B y;	*/
+		/* typedef struct B Struct_B */
+		BTF_TYPEDEF_ENC(NAME_TBD, 4),			/* [3] */
+		/* struct B */					/* [4] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int x;	*/
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* struct A y;	*/
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0x\0y\0Struct_B\0B\0x\0y",
+	.str_sec_size = sizeof("\0A\0x\0y\0Struct_B\0B\0x\0y"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test5_map",
+	.key_size = sizeof(int),
+	.value_size = 8,
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+/* struct A {
+ *     int x;
+ *     struct A array_a[4];
+ * };
+ */
+{
+	.descr = "loop test #6",
+	.raw_types = {
+		/* int */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+		BTF_TYPE_ARRAY_ENC(3, 1, 4),			/* [2] */
+		/* struct A */					/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+		BTF_MEMBER_ENC(NAME_TBD, 1, 0),	/* int x;		*/
+		BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* struct A array_a[4];	*/
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0x\0y",
+	.str_sec_size = sizeof("\0A\0x\0y"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test6_map",
+	.key_size = sizeof(int),
+	.value_size = 8,
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+{
+	.descr = "loop test #7",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* struct A { */			/* [2] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+		/*     const void *m;	*/
+		BTF_MEMBER_ENC(NAME_TBD, 3, 0),
+		/* CONST type_id=3	*/		/* [3] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4),
+		/* PTR type_id=2	*/		/* [4] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m",
+	.str_sec_size = sizeof("\0A\0m"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test7_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+{
+	.descr = "loop test #8",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		/* struct A { */			/* [2] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+		/*     const void *m;	*/
+		BTF_MEMBER_ENC(NAME_TBD, 4, 0),
+		/* struct B { */			/* [3] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+		/*     const void *n;	*/
+		BTF_MEMBER_ENC(NAME_TBD, 6, 0),
+		/* CONST type_id=5	*/		/* [4] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 5),
+		/* PTR type_id=6	*/		/* [5] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 6),
+		/* CONST type_id=7	*/		/* [6] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 7),
+		/* PTR type_id=4	*/		/* [7] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0A\0m\0B\0n",
+	.str_sec_size = sizeof("\0A\0m\0B\0n"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "loop_test8_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(void *),
+	.key_id = 1,
+	.value_id = 2,
+	.max_entries = 4,
+	.btf_load_err = true,
+},
+
+{
+	.descr = "type_off == str_off",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.type_off_delta = sizeof(struct btf_type) + sizeof(int) + sizeof("\0int"),
+},
+
+{
+	.descr = "Unaligned type_off",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.type_off_delta = 1,
+},
+
+{
+	.descr = "str_off beyonds btf size",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.str_off_delta = sizeof("\0int") + 1,
+},
+
+{
+	.descr = "str_len beyonds btf size",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.str_len_delta = 1,
+},
+
+{
+	.descr = "String section does not end with null",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.str_len_delta = -1,
+},
+
+{
+	.descr = "Empty string section",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "\0int",
+	.str_sec_size = sizeof("\0int"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "hdr_test_map",
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.key_id = 1,
+	.value_id = 1,
+	.max_entries = 4,
+	.btf_load_err = true,
+	.str_len_delta = 0 - (int)sizeof("\0int"),
+},
+
+}; /* struct btf_raw_test raw_tests[] */
+
+static const char *get_next_str(const char *start, const char *end)
+{
+	return start < end - 1 ? start + 1 : NULL;
+}
+
+static int get_type_sec_size(const __u32 *raw_types)
+{
+	int i;
+
+	for (i = MAX_NR_RAW_TYPES - 1;
+	     i >= 0 && raw_types[i] != BTF_END_RAW;
+	     i--)
+		;
+
+	return i < 0 ? i : i * sizeof(raw_types[0]);
+}
+
+static void *btf_raw_create(const struct btf_header *hdr,
+			    const __u32 *raw_types,
+			    const char *str,
+			    unsigned int str_sec_size,
+			    unsigned int *btf_size)
+{
+	const char *next_str = str, *end_str = str + str_sec_size;
+	unsigned int size_needed, offset;
+	struct btf_header *ret_hdr;
+	int i, type_sec_size;
+	uint32_t *ret_types;
+	void *raw_btf;
+
+	type_sec_size = get_type_sec_size(raw_types);
+	if (type_sec_size < 0) {
+		fprintf(stderr, "Cannot get nr_raw_types\n");
+		return NULL;
+	}
+
+	size_needed = sizeof(*hdr) + type_sec_size + str_sec_size;
+	raw_btf = malloc(size_needed);
+	if (!raw_btf) {
+		fprintf(stderr, "Cannot allocate memory for raw_btf\n");
+		return NULL;
+	}
+
+	/* Copy header */
+	memcpy(raw_btf, hdr, sizeof(*hdr));
+	offset = sizeof(*hdr);
+
+	/* Copy type section */
+	ret_types = raw_btf + offset;
+	for (i = 0; i < type_sec_size / sizeof(raw_types[0]); i++) {
+		if (raw_types[i] == NAME_TBD) {
+			next_str = get_next_str(next_str, end_str);
+			if (!next_str) {
+				fprintf(stderr, "Error in getting next_str\n");
+				free(raw_btf);
+				return NULL;
+			}
+			ret_types[i] = next_str - str;
+			next_str += strlen(next_str);
+		} else {
+			ret_types[i] = raw_types[i];
+		}
+	}
+	offset += type_sec_size;
+
+	/* Copy string section */
+	memcpy(raw_btf + offset, str, str_sec_size);
+
+	ret_hdr = (struct btf_header *)raw_btf;
+	ret_hdr->str_off = type_sec_size;
+	ret_hdr->str_len = str_sec_size;
+
+	*btf_size = size_needed;
+
+	return raw_btf;
+}
+
+static int do_test_raw(unsigned int test_num)
+{
+	struct btf_raw_test *test = &raw_tests[test_num - 1];
+	struct bpf_create_map_attr create_attr = {};
+	int map_fd = -1, btf_fd = -1;
+	unsigned int raw_btf_size;
+	struct btf_header *hdr;
+	void *raw_btf;
+	int err;
+
+	fprintf(stderr, "BTF raw test[%u] (%s): ", test_num, test->descr);
+	raw_btf = btf_raw_create(&hdr_tmpl,
+				 test->raw_types,
+				 test->str_sec,
+				 test->str_sec_size,
+				 &raw_btf_size);
+
+	if (!raw_btf)
+		return -1;
+
+	hdr = raw_btf;
+
+	hdr->type_off = (int)hdr->type_off + test->type_off_delta;
+	hdr->str_off = (int)hdr->str_off + test->str_off_delta;
+	hdr->str_len = (int)hdr->str_len + test->str_len_delta;
+
+	*btf_log_buf = '\0';
+	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+			      btf_log_buf, BTF_LOG_BUF_SIZE,
+			      args.always_log);
+	free(raw_btf);
+
+	err = ((btf_fd == -1) != test->btf_load_err);
+	if (err)
+		fprintf(stderr, "btf_load_err:%d btf_fd:%d\n",
+			test->btf_load_err, btf_fd);
+
+	if (err || btf_fd == -1)
+		goto done;
+
+	create_attr.name = test->map_name;
+	create_attr.map_type = test->map_type;
+	create_attr.key_size = test->key_size;
+	create_attr.value_size = test->value_size;
+	create_attr.max_entries = test->max_entries;
+	create_attr.btf_fd = btf_fd;
+	create_attr.btf_key_id = test->key_id;
+	create_attr.btf_value_id = test->value_id;
+
+	map_fd = bpf_create_map_xattr(&create_attr);
+
+	err = ((map_fd == -1) != test->map_create_err);
+	if (err)
+		fprintf(stderr, "map_create_err:%d map_fd:%d\n",
+			test->map_create_err, map_fd);
+
+done:
+	if (!err)
+		fprintf(stderr, "OK\n");
+
+	if (*btf_log_buf && (err || args.always_log))
+		fprintf(stderr, "%s\n", btf_log_buf);
+
+	if (btf_fd != -1)
+		close(btf_fd);
+	if (map_fd != -1)
+		close(map_fd);
+
+	return err;
+}
+
+static int test_raw(void)
+{
+	unsigned int i;
+	int err = 0;
+
+	if (args.raw_test_num)
+		return do_test_raw(args.raw_test_num);
+
+	for (i = 1; i <= ARRAY_SIZE(raw_tests); i++)
+		err |= do_test_raw(i);
+
+	return err;
+}
+
+struct btf_get_info_test {
+	const char *descr;
+	const char *str_sec;
+	__u32 raw_types[MAX_NR_RAW_TYPES];
+	__u32 str_sec_size;
+	int info_size_delta;
+};
+
+const struct btf_get_info_test get_info_tests[] = {
+{
+	.descr = "== raw_btf_size+1",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.info_size_delta = 1,
+},
+{
+	.descr = "== raw_btf_size-3",
+	.raw_types = {
+		/* int */				/* [1] */
+		BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+		BTF_END_RAW,
+	},
+	.str_sec = "",
+	.str_sec_size = sizeof(""),
+	.info_size_delta = -3,
+},
+};
+
+static int do_test_get_info(unsigned int test_num)
+{
+	const struct btf_get_info_test *test = &get_info_tests[test_num - 1];
+	unsigned int raw_btf_size, user_btf_size, expected_nbytes;
+	uint8_t *raw_btf = NULL, *user_btf = NULL;
+	int btf_fd = -1, err;
+
+	fprintf(stderr, "BTF GET_INFO_BY_ID test[%u] (%s): ",
+		test_num, test->descr);
+
+	raw_btf = btf_raw_create(&hdr_tmpl,
+				 test->raw_types,
+				 test->str_sec,
+				 test->str_sec_size,
+				 &raw_btf_size);
+
+	if (!raw_btf)
+		return -1;
+
+	*btf_log_buf = '\0';
+
+	user_btf = malloc(raw_btf_size);
+	if (!user_btf) {
+		fprintf(stderr, "Cannot allocate memory for user_btf\n");
+		err = -1;
+		goto done;
+	}
+
+	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+			      btf_log_buf, BTF_LOG_BUF_SIZE,
+			      args.always_log);
+	if (btf_fd == -1) {
+		fprintf(stderr, "bpf_load_btf:%s(%d)\n",
+			strerror(errno), errno);
+		err = -1;
+		goto done;
+	}
+
+	user_btf_size = (int)raw_btf_size + test->info_size_delta;
+	expected_nbytes = min(raw_btf_size, user_btf_size);
+	if (raw_btf_size > expected_nbytes)
+		memset(user_btf + expected_nbytes, 0xff,
+		       raw_btf_size - expected_nbytes);
+
+	err = bpf_obj_get_info_by_fd(btf_fd, user_btf, &user_btf_size);
+	if (err || user_btf_size != raw_btf_size ||
+	    memcmp(raw_btf, user_btf, expected_nbytes)) {
+		fprintf(stderr,
+			"err:%d(errno:%d) raw_btf_size:%u user_btf_size:%u expected_nbytes:%u memcmp:%d\n",
+			err, errno,
+			raw_btf_size, user_btf_size, expected_nbytes,
+			memcmp(raw_btf, user_btf, expected_nbytes));
+		err = -1;
+		goto done;
+	}
+
+	while (expected_nbytes < raw_btf_size) {
+		fprintf(stderr, "%u...", expected_nbytes);
+		if (user_btf[expected_nbytes++] != 0xff) {
+			fprintf(stderr, "!= 0xff\n");
+			err = -1;
+			goto done;
+		}
+	}
+
+	fprintf(stderr, "OK\n");
+
+done:
+	if (*btf_log_buf && (err || args.always_log))
+		fprintf(stderr, "%s\n", btf_log_buf);
+
+	free(raw_btf);
+	free(user_btf);
+
+	if (btf_fd != -1)
+		close(btf_fd);
+
+	return err;
+}
+
+static int test_get_info(void)
+{
+	unsigned int i;
+	int err = 0;
+
+	if (args.get_info_test_num)
+		return do_test_get_info(args.get_info_test_num);
+
+	for (i = 1; i <= ARRAY_SIZE(get_info_tests); i++)
+		err |= do_test_get_info(i);
+
+	return err;
+}
+
+struct btf_file_test {
+	const char *file;
+	bool btf_kv_notfound;
+};
+
+static struct btf_file_test file_tests[] = {
+{
+	.file = "test_btf_haskv.o",
+},
+{
+	.file = "test_btf_nokv.o",
+	.btf_kv_notfound = true,
+},
+};
+
+static int file_has_btf_elf(const char *fn)
+{
+	Elf_Scn *scn = NULL;
+	GElf_Ehdr ehdr;
+	int elf_fd;
+	Elf *elf;
+	int ret;
+
+	if (elf_version(EV_CURRENT) == EV_NONE) {
+		fprintf(stderr, "Failed to init libelf\n");
+		return -1;
+	}
+
+	elf_fd = open(fn, O_RDONLY);
+	if (elf_fd == -1) {
+		fprintf(stderr, "Cannot open file %s: %s(%d)\n",
+			fn, strerror(errno), errno);
+		return -1;
+	}
+
+	elf = elf_begin(elf_fd, ELF_C_READ, NULL);
+	if (!elf) {
+		fprintf(stderr, "Failed to read ELF from %s. %s\n", fn,
+			elf_errmsg(elf_errno()));
+		ret = -1;
+		goto done;
+	}
+
+	if (!gelf_getehdr(elf, &ehdr)) {
+		fprintf(stderr, "Failed to get EHDR from %s\n", fn);
+		ret = -1;
+		goto done;
+	}
+
+	while ((scn = elf_nextscn(elf, scn))) {
+		const char *sh_name;
+		GElf_Shdr sh;
+
+		if (gelf_getshdr(scn, &sh) != &sh) {
+			fprintf(stderr,
+				"Failed to get section header from %s\n", fn);
+			ret = -1;
+			goto done;
+		}
+
+		sh_name = elf_strptr(elf, ehdr.e_shstrndx, sh.sh_name);
+		if (!strcmp(sh_name, BTF_ELF_SEC)) {
+			ret = 1;
+			goto done;
+		}
+	}
+
+	ret = 0;
+
+done:
+	close(elf_fd);
+	elf_end(elf);
+	return ret;
+}
+
+static int do_test_file(unsigned int test_num)
+{
+	const struct btf_file_test *test = &file_tests[test_num - 1];
+	struct bpf_object *obj = NULL;
+	struct bpf_program *prog;
+	struct bpf_map *map;
+	int err;
+
+	fprintf(stderr, "BTF libbpf test[%u] (%s): ", test_num,
+		test->file);
+
+	err = file_has_btf_elf(test->file);
+	if (err == -1)
+		return err;
+
+	if (err == 0) {
+		fprintf(stderr, "SKIP. No ELF %s found\n", BTF_ELF_SEC);
+		return 0;
+	}
+
+	obj = bpf_object__open(test->file);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	err = bpf_object__btf_fd(obj);
+	if (err == -1) {
+		fprintf(stderr, "bpf_object__btf_fd: -1\n");
+		goto done;
+	}
+
+	prog = bpf_program__next(NULL, obj);
+	if (!prog) {
+		fprintf(stderr, "Cannot find bpf_prog\n");
+		err = -1;
+		goto done;
+	}
+
+	bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
+	err = bpf_object__load(obj);
+	if (err < 0) {
+		fprintf(stderr, "bpf_object__load: %d\n", err);
+		goto done;
+	}
+
+	map = bpf_object__find_map_by_name(obj, "btf_map");
+	if (!map) {
+		fprintf(stderr, "btf_map not found\n");
+		err = -1;
+		goto done;
+	}
+
+	err = (bpf_map__btf_key_id(map) == 0 || bpf_map__btf_value_id(map) == 0)
+		!= test->btf_kv_notfound;
+	if (err) {
+		fprintf(stderr,
+			"btf_kv_notfound:%u btf_key_id:%u btf_value_id:%u\n",
+			test->btf_kv_notfound,
+			bpf_map__btf_key_id(map),
+			bpf_map__btf_value_id(map));
+		goto done;
+	}
+
+	fprintf(stderr, "OK\n");
+
+done:
+	bpf_object__close(obj);
+	return err;
+}
+
+static int test_file(void)
+{
+	unsigned int i;
+	int err = 0;
+
+	if (args.file_test_num)
+		return do_test_file(args.file_test_num);
+
+	for (i = 1; i <= ARRAY_SIZE(file_tests); i++)
+		err |= do_test_file(i);
+
+	return err;
+}
+
+const char *pprint_enum_str[] = {
+	"ENUM_ZERO",
+	"ENUM_ONE",
+	"ENUM_TWO",
+	"ENUM_THREE",
+};
+
+struct pprint_mapv {
+	uint32_t ui32;
+	uint16_t ui16;
+	/* 2 bytes hole */
+	int32_t si32;
+	uint32_t unused_bits2a:2,
+		bits28:28,
+		unused_bits2b:2;
+	union {
+		uint64_t ui64;
+		uint8_t ui8a[8];
+	};
+	enum {
+		ENUM_ZERO,
+		ENUM_ONE,
+		ENUM_TWO,
+		ENUM_THREE,
+	} aenum;
+};
+
+static struct btf_raw_test pprint_test = {
+	.descr = "BTF pretty print test #1",
+	.raw_types = {
+		/* unsighed char */			/* [1] */
+		BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1),
+		/* unsigned short */			/* [2] */
+		BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2),
+		/* unsigned int */			/* [3] */
+		BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4),
+		/* int */				/* [4] */
+		BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+		/* unsigned long long */		/* [5] */
+		BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 64, 8),
+		/* 2 bits */				/* [6] */
+		BTF_TYPE_INT_ENC(0, 0, 0, 2, 2),
+		/* 28 bits */				/* [7] */
+		BTF_TYPE_INT_ENC(0, 0, 0, 28, 4),
+		/* uint8_t[8] */			/* [8] */
+		BTF_TYPE_ARRAY_ENC(9, 3, 8),
+		/* typedef unsigned char uint8_t */	/* [9] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 1),
+		/* typedef unsigned short uint16_t */	/* [10] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 2),
+		/* typedef unsigned int uint32_t */	/* [11] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 3),
+		/* typedef int int32_t */		/* [12] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 4),
+		/* typedef unsigned long long uint64_t *//* [13] */
+		BTF_TYPEDEF_ENC(NAME_TBD, 5),
+		/* union (anon) */			/* [14] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_UNION, 0, 2), 8),
+		BTF_MEMBER_ENC(NAME_TBD, 13, 0),/* uint64_t ui64; */
+		BTF_MEMBER_ENC(NAME_TBD, 8, 0),	/* uint8_t ui8a[8]; */
+		/* enum (anon) */			/* [15] */
+		BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 4), 4),
+		BTF_ENUM_ENC(NAME_TBD, 0),
+		BTF_ENUM_ENC(NAME_TBD, 1),
+		BTF_ENUM_ENC(NAME_TBD, 2),
+		BTF_ENUM_ENC(NAME_TBD, 3),
+		/* struct pprint_mapv */		/* [16] */
+		BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 8), 28),
+		BTF_MEMBER_ENC(NAME_TBD, 11, 0),	/* uint32_t ui32 */
+		BTF_MEMBER_ENC(NAME_TBD, 10, 32),	/* uint16_t ui16 */
+		BTF_MEMBER_ENC(NAME_TBD, 12, 64),	/* int32_t si32 */
+		BTF_MEMBER_ENC(NAME_TBD, 6, 96),	/* unused_bits2a */
+		BTF_MEMBER_ENC(NAME_TBD, 7, 98),	/* bits28 */
+		BTF_MEMBER_ENC(NAME_TBD, 6, 126),	/* unused_bits2b */
+		BTF_MEMBER_ENC(0, 14, 128),		/* union (anon) */
+		BTF_MEMBER_ENC(NAME_TBD, 15, 192),	/* aenum */
+		BTF_END_RAW,
+	},
+	.str_sec = "\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum",
+	.str_sec_size = sizeof("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum"),
+	.map_type = BPF_MAP_TYPE_ARRAY,
+	.map_name = "pprint_test",
+	.key_size = sizeof(unsigned int),
+	.value_size = sizeof(struct pprint_mapv),
+	.key_id = 3,	/* unsigned int */
+	.value_id = 16,	/* struct pprint_mapv */
+	.max_entries = 128 * 1024,
+};
+
+static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i)
+{
+	v->ui32 = i;
+	v->si32 = -i;
+	v->unused_bits2a = 3;
+	v->bits28 = i;
+	v->unused_bits2b = 3;
+	v->ui64 = i;
+	v->aenum = i & 0x03;
+}
+
+static int test_pprint(void)
+{
+	const struct btf_raw_test *test = &pprint_test;
+	struct bpf_create_map_attr create_attr = {};
+	int map_fd = -1, btf_fd = -1;
+	struct pprint_mapv mapv = {};
+	unsigned int raw_btf_size;
+	char expected_line[255];
+	FILE *pin_file = NULL;
+	char pin_path[255];
+	size_t line_len = 0;
+	char *line = NULL;
+	unsigned int key;
+	uint8_t *raw_btf;
+	ssize_t nread;
+	int err;
+
+	fprintf(stderr, "%s......", test->descr);
+	raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types,
+				 test->str_sec, test->str_sec_size,
+				 &raw_btf_size);
+
+	if (!raw_btf)
+		return -1;
+
+	*btf_log_buf = '\0';
+	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+			      btf_log_buf, BTF_LOG_BUF_SIZE,
+			      args.always_log);
+	free(raw_btf);
+
+	if (btf_fd == -1) {
+		err = -1;
+		fprintf(stderr, "bpf_load_btf: %s(%d)\n",
+			strerror(errno), errno);
+		goto done;
+	}
+
+	create_attr.name = test->map_name;
+	create_attr.map_type = test->map_type;
+	create_attr.key_size = test->key_size;
+	create_attr.value_size = test->value_size;
+	create_attr.max_entries = test->max_entries;
+	create_attr.btf_fd = btf_fd;
+	create_attr.btf_key_id = test->key_id;
+	create_attr.btf_value_id = test->value_id;
+
+	map_fd = bpf_create_map_xattr(&create_attr);
+	if (map_fd == -1) {
+		err = -1;
+		fprintf(stderr, "bpf_creat_map_btf: %s(%d)\n",
+			strerror(errno), errno);
+		goto done;
+	}
+
+	if (snprintf(pin_path, sizeof(pin_path), "%s/%s",
+		     "/sys/fs/bpf", test->map_name) == sizeof(pin_path)) {
+		err = -1;
+		fprintf(stderr, "pin_path is too long\n");
+		goto done;
+	}
+
+	err = bpf_obj_pin(map_fd, pin_path);
+	if (err) {
+		fprintf(stderr, "Cannot pin to %s. %s(%d).\n", pin_path,
+			strerror(errno), errno);
+		goto done;
+	}
+
+	for (key = 0; key < test->max_entries; key++) {
+		set_pprint_mapv(&mapv, key);
+		bpf_map_update_elem(map_fd, &key, &mapv, 0);
+	}
+
+	pin_file = fopen(pin_path, "r");
+	if (!pin_file) {
+		err = -1;
+		fprintf(stderr, "fopen(%s): %s(%d)\n", pin_path,
+			strerror(errno), errno);
+		goto done;
+	}
+
+	/* Skip lines start with '#' */
+	while ((nread = getline(&line, &line_len, pin_file)) > 0 &&
+	       *line == '#')
+		;
+
+	if (nread <= 0) {
+		err = -1;
+		fprintf(stderr, "Unexpected EOF\n");
+		goto done;
+	}
+
+	key = 0;
+	do {
+		ssize_t nexpected_line;
+
+		set_pprint_mapv(&mapv, key);
+		nexpected_line = snprintf(expected_line, sizeof(expected_line),
+					  "%u: {%u,0,%d,0x%x,0x%x,0x%x,{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n",
+					  key,
+					  mapv.ui32, mapv.si32,
+					  mapv.unused_bits2a, mapv.bits28, mapv.unused_bits2b,
+					  mapv.ui64,
+					  mapv.ui8a[0], mapv.ui8a[1], mapv.ui8a[2], mapv.ui8a[3],
+					  mapv.ui8a[4], mapv.ui8a[5], mapv.ui8a[6], mapv.ui8a[7],
+					  pprint_enum_str[mapv.aenum]);
+
+		if (nexpected_line == sizeof(expected_line)) {
+			err = -1;
+			fprintf(stderr, "expected_line is too long\n");
+			goto done;
+		}
+
+		if (strcmp(expected_line, line)) {
+			err = -1;
+			fprintf(stderr, "unexpected pprint output\n");
+			fprintf(stderr, "expected: %s", expected_line);
+			fprintf(stderr, "    read: %s", line);
+			goto done;
+		}
+
+		nread = getline(&line, &line_len, pin_file);
+	} while (++key < test->max_entries && nread > 0);
+
+	if (key < test->max_entries) {
+		err = -1;
+		fprintf(stderr, "Unexpected EOF\n");
+		goto done;
+	}
+
+	if (nread > 0) {
+		err = -1;
+		fprintf(stderr, "Unexpected extra pprint output: %s\n", line);
+		goto done;
+	}
+
+	err = 0;
+
+done:
+	if (!err)
+		fprintf(stderr, "OK\n");
+	if (*btf_log_buf && (err || args.always_log))
+		fprintf(stderr, "%s\n", btf_log_buf);
+	if (btf_fd != -1)
+		close(btf_fd);
+	if (map_fd != -1)
+		close(map_fd);
+	if (pin_file)
+		fclose(pin_file);
+	unlink(pin_path);
+	free(line);
+
+	return err;
+}
+
+static void usage(const char *cmd)
+{
+	fprintf(stderr, "Usage: %s [-l] [[-r test_num (1 - %zu)] | [-g test_num (1 - %zu)] | [-f test_num (1 - %zu)] | [-p]]\n",
+		cmd, ARRAY_SIZE(raw_tests), ARRAY_SIZE(get_info_tests),
+		ARRAY_SIZE(file_tests));
+}
+
+static int parse_args(int argc, char **argv)
+{
+	const char *optstr = "lpf:r:g:";
+	int opt;
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+		switch (opt) {
+		case 'l':
+			args.always_log = true;
+			break;
+		case 'f':
+			args.file_test_num = atoi(optarg);
+			args.file_test = true;
+			break;
+		case 'r':
+			args.raw_test_num = atoi(optarg);
+			args.raw_test = true;
+			break;
+		case 'g':
+			args.get_info_test_num = atoi(optarg);
+			args.get_info_test = true;
+			break;
+		case 'p':
+			args.pprint_test = true;
+			break;
+		case 'h':
+			usage(argv[0]);
+			exit(0);
+		default:
+				usage(argv[0]);
+				return -1;
+		}
+	}
+
+	if (args.raw_test_num &&
+	    (args.raw_test_num < 1 ||
+	     args.raw_test_num > ARRAY_SIZE(raw_tests))) {
+		fprintf(stderr, "BTF raw test number must be [1 - %zu]\n",
+			ARRAY_SIZE(raw_tests));
+		return -1;
+	}
+
+	if (args.file_test_num &&
+	    (args.file_test_num < 1 ||
+	     args.file_test_num > ARRAY_SIZE(file_tests))) {
+		fprintf(stderr, "BTF file test number must be [1 - %zu]\n",
+			ARRAY_SIZE(file_tests));
+		return -1;
+	}
+
+	if (args.get_info_test_num &&
+	    (args.get_info_test_num < 1 ||
+	     args.get_info_test_num > ARRAY_SIZE(get_info_tests))) {
+		fprintf(stderr, "BTF get info test number must be [1 - %zu]\n",
+			ARRAY_SIZE(get_info_tests));
+		return -1;
+	}
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	int err = 0;
+
+	err = parse_args(argc, argv);
+	if (err)
+		return err;
+
+	if (args.always_log)
+		libbpf_set_print(__base_pr, __base_pr, __base_pr);
+
+	if (args.raw_test)
+		err |= test_raw();
+
+	if (args.get_info_test)
+		err |= test_get_info();
+
+	if (args.file_test)
+		err |= test_file();
+
+	if (args.pprint_test)
+		err |= test_pprint();
+
+	if (args.raw_test || args.get_info_test || args.file_test ||
+	    args.pprint_test)
+		return err;
+
+	err |= test_raw();
+	err |= test_get_info();
+	err |= test_file();
+
+	return err;
+}
diff --git a/tools/testing/selftests/bpf/test_btf_haskv.c b/tools/testing/selftests/bpf/test_btf_haskv.c
new file mode 100644
index 000000000000..8c7ca096ecf2
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_btf_haskv.c
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+int _version SEC("version") = 1;
+
+struct ipv_counts {
+	unsigned int v4;
+	unsigned int v6;
+};
+
+typedef int btf_map_key;
+typedef struct ipv_counts btf_map_value;
+btf_map_key dumm_key;
+btf_map_value dummy_value;
+
+struct bpf_map_def SEC("maps") btf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct ipv_counts),
+	.max_entries = 4,
+};
+
+struct dummy_tracepoint_args {
+	unsigned long long pad;
+	struct sock *sock;
+};
+
+SEC("dummy_tracepoint")
+int _dummy_tracepoint(struct dummy_tracepoint_args *arg)
+{
+	struct ipv_counts *counts;
+	int key = 0;
+
+	if (!arg->sock)
+		return 0;
+
+	counts = bpf_map_lookup_elem(&btf_map, &key);
+	if (!counts)
+		return 0;
+
+	counts->v6++;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_btf_nokv.c b/tools/testing/selftests/bpf/test_btf_nokv.c
new file mode 100644
index 000000000000..0ed8e088eebf
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_btf_nokv.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+int _version SEC("version") = 1;
+
+struct ipv_counts {
+	unsigned int v4;
+	unsigned int v6;
+};
+
+struct bpf_map_def SEC("maps") btf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct ipv_counts),
+	.max_entries = 4,
+};
+
+struct dummy_tracepoint_args {
+	unsigned long long pad;
+	struct sock *sock;
+};
+
+SEC("dummy_tracepoint")
+int _dummy_tracepoint(struct dummy_tracepoint_args *arg)
+{
+	struct ipv_counts *counts;
+	int key = 0;
+
+	if (!arg->sock)
+		return 0;
+
+	counts = bpf_map_lookup_elem(&btf_map, &key);
+	if (!counts)
+		return 0;
+
+	counts->v6++;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 04/10] bpf: btf: Add pretty print capability for data with BTF type info
From: Martin KaFai Lau @ 2018-04-16 19:33 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180416193327.477239-1-kafai@fb.com>

This patch adds pretty print capability for data with BTF type info.
The current usage is to allow pretty print for a BPF map.

The next few patches will allow a read() on a pinned map with BTF
type info for its key and value.

This patch uses the seq_printf() infra.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
 include/linux/btf.h |   2 +
 kernel/bpf/btf.c    | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 201 insertions(+)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index f14b60368753..d8bdab0280ba 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -33,5 +33,7 @@ struct btf_type;
 const struct btf_type *btf_type_id_size(const struct btf *btf,
 					u32 *type_id,
 					u32 *ret_size);
+void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
+		       struct seq_file *m);
 
 #endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3110f3cc7d0a..15bfca419231 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3,6 +3,7 @@
 
 #include <uapi/linux/btf.h>
 #include <uapi/linux/types.h>
+#include <linux/seq_file.h>
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
@@ -258,6 +259,9 @@ struct btf_kind_operations {
 			    const struct btf_type *member_type);
 	void (*log_details)(struct btf_verifier_env *env,
 			    const struct btf_type *t);
+	void (*seq_show)(const struct btf *btf, const struct btf_type *t,
+			 u32 type_id, void *data, u8 bits_offsets,
+			 struct seq_file *m);
 };
 
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
@@ -798,11 +802,19 @@ static void btf_df_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "Unsupported log_details");
 }
 
+static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t,
+			    u32 type_id, void *data, u8 bits_offsets,
+			    struct seq_file *m)
+{
+	seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
+}
+
 static struct btf_kind_operations df_ops = {
 	.check_meta = btf_df_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
 	.log_details = btf_df_log,
+	.seq_show = btf_df_seq_show,
 };
 
 static int btf_int_check_member(struct btf_verifier_env *env,
@@ -905,11 +917,96 @@ static void btf_int_log(struct btf_verifier_env *env,
 			 btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
 }
 
+static void btf_int_bits_seq_show(const struct btf *btf,
+				  const struct btf_type *t,
+				  void *data, u8 bits_offset,
+				  struct seq_file *m)
+{
+	u32 int_data = btf_type_int(t);
+	u16 nr_bits = BTF_INT_BITS(int_data);
+	u16 total_bits_offset;
+	u16 nr_copy_bytes;
+	u16 nr_copy_bits;
+	u8 nr_upper_bits;
+	union {
+		u64 u64_num;
+		u8  u8_nums[8];
+	} print_num;
+
+	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
+	data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
+	bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
+	nr_copy_bits = nr_bits + bits_offset;
+	nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
+
+	print_num.u64_num = 0;
+	memcpy(&print_num.u64_num, data, nr_copy_bytes);
+
+	/* Ditch the higher order bits */
+	nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits);
+	if (nr_upper_bits) {
+		/* We need to mask out some bits of the upper byte. */
+		u8 mask = (1 << nr_upper_bits) - 1;
+
+		print_num.u8_nums[nr_copy_bytes - 1] &= mask;
+	}
+
+	print_num.u64_num >>= bits_offset;
+
+	seq_printf(m, "0x%llx", print_num.u64_num);
+}
+
+static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
+			     u32 type_id, void *data, u8 bits_offset,
+			     struct seq_file *m)
+{
+	u32 int_data = btf_type_int(t);
+	u8 encoding = BTF_INT_ENCODING(int_data);
+	bool sign = encoding & BTF_INT_SIGNED;
+	u32 nr_bits = BTF_INT_BITS(int_data);
+
+	if (bits_offset || BTF_INT_OFFSET(int_data) ||
+	    BITS_PER_BYTE_MASKED(nr_bits)) {
+		btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+		return;
+	}
+
+	switch (nr_bits) {
+	case 64:
+		if (sign)
+			seq_printf(m, "%lld", *(s64 *)data);
+		else
+			seq_printf(m, "%llu", *(u64 *)data);
+		break;
+	case 32:
+		if (sign)
+			seq_printf(m, "%d", *(s32 *)data);
+		else
+			seq_printf(m, "%u", *(u32 *)data);
+		break;
+	case 16:
+		if (sign)
+			seq_printf(m, "%d", *(s16 *)data);
+		else
+			seq_printf(m, "%u", *(u16 *)data);
+		break;
+	case 8:
+		if (sign)
+			seq_printf(m, "%d", *(s8 *)data);
+		else
+			seq_printf(m, "%u", *(u8 *)data);
+		break;
+	default:
+		btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+	}
+}
+
 static const struct btf_kind_operations int_ops = {
 	.check_meta = btf_int_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_int_check_member,
 	.log_details = btf_int_log,
+	.seq_show = btf_int_seq_show,
 };
 
 static int btf_modifier_check_member(struct btf_verifier_env *env,
@@ -1080,6 +1177,24 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
 	return 0;
 }
 
+static void btf_modifier_seq_show(const struct btf *btf,
+				  const struct btf_type *t,
+				  u32 type_id, void *data,
+				  u8 bits_offset, struct seq_file *m)
+{
+	t = btf_type_id_resolve(btf, &type_id);
+
+	btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+}
+
+static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
+			     u32 type_id, void *data, u8 bits_offset,
+			     struct seq_file *m)
+{
+	/* It is a hashed value */
+	seq_printf(m, "%p", *(void **)data);
+}
+
 static void btf_ref_type_log(struct btf_verifier_env *env,
 			     const struct btf_type *t)
 {
@@ -1091,6 +1206,7 @@ static struct btf_kind_operations modifier_ops = {
 	.resolve = btf_modifier_resolve,
 	.check_member = btf_modifier_check_member,
 	.log_details = btf_ref_type_log,
+	.seq_show = btf_modifier_seq_show,
 };
 
 static struct btf_kind_operations ptr_ops = {
@@ -1098,6 +1214,7 @@ static struct btf_kind_operations ptr_ops = {
 	.resolve = btf_ptr_resolve,
 	.check_member = btf_ptr_check_member,
 	.log_details = btf_ref_type_log,
+	.seq_show = btf_ptr_seq_show,
 };
 
 static struct btf_kind_operations fwd_ops = {
@@ -1105,6 +1222,7 @@ static struct btf_kind_operations fwd_ops = {
 	.resolve = btf_df_resolve,
 	.check_member = btf_df_check_member,
 	.log_details = btf_ref_type_log,
+	.seq_show = btf_df_seq_show,
 };
 
 static int btf_array_check_member(struct btf_verifier_env *env,
@@ -1235,11 +1353,36 @@ static void btf_array_log(struct btf_verifier_env *env,
 			 array->type, array->index_type, array->nelems);
 }
 
+static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t,
+			       u32 type_id, void *data, u8 bits_offset,
+			       struct seq_file *m)
+{
+	const struct btf_array *array = btf_type_array(t);
+	const struct btf_kind_operations *elem_ops;
+	const struct btf_type *elem_type;
+	u32 i, elem_size, elem_type_id;
+
+	elem_type_id = array->type;
+	elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+	elem_ops = btf_type_ops(elem_type);
+	seq_puts(m, "[");
+	for (i = 0; i < array->nelems; i++) {
+		if (i)
+			seq_puts(m, ",");
+
+		elem_ops->seq_show(btf, elem_type, elem_type_id, data,
+				   bits_offset, m);
+		data += elem_size;
+	}
+	seq_puts(m, "]");
+}
+
 static struct btf_kind_operations array_ops = {
 	.check_meta = btf_array_check_meta,
 	.resolve = btf_array_resolve,
 	.check_member = btf_array_check_member,
 	.log_details = btf_array_log,
+	.seq_show = btf_array_seq_show,
 };
 
 static int btf_struct_check_member(struct btf_verifier_env *env,
@@ -1387,11 +1530,39 @@ static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
+static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
+				u32 type_id, void *data, u8 bits_offset,
+				struct seq_file *m)
+{
+	const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ",";
+	const struct btf_member *member;
+	u32 i;
+
+	seq_puts(m, "{");
+	for_each_member(i, t, member) {
+		const struct btf_type *member_type = btf_type_by_id(btf,
+								member->type);
+		u32 member_offset = member->offset;
+		u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+		u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+		const struct btf_kind_operations *ops;
+
+		if (i)
+			seq_puts(m, seq);
+
+		ops = btf_type_ops(member_type);
+		ops->seq_show(btf, member_type, member->type,
+			      data + bytes_offset, bits8_offset, m);
+	}
+	seq_puts(m, "}");
+}
+
 static struct btf_kind_operations struct_ops = {
 	.check_meta = btf_struct_check_meta,
 	.resolve = btf_struct_resolve,
 	.check_member = btf_struct_check_member,
 	.log_details = btf_struct_log,
+	.seq_show = btf_struct_seq_show,
 };
 
 static int btf_enum_check_member(struct btf_verifier_env *env,
@@ -1467,11 +1638,31 @@ static void btf_enum_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
+static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
+			      u32 type_id, void *data, u8 bits_offset,
+			      struct seq_file *m)
+{
+	const struct btf_enum *enums = btf_type_enum(t);
+	u32 i, nr_enums = btf_type_vlen(t);
+	int v = *(int *)data;
+
+	for (i = 0; i < nr_enums; i++) {
+		if (v == enums[i].val) {
+			seq_printf(m, "%s",
+				   btf_name_by_offset(btf, enums[i].name));
+			return;
+		}
+	}
+
+	seq_printf(m, "%d", v);
+}
+
 static struct btf_kind_operations enum_ops = {
 	.check_meta = btf_enum_check_meta,
 	.resolve = btf_df_resolve,
 	.check_member = btf_enum_check_member,
 	.log_details = btf_enum_log,
+	.seq_show = btf_enum_seq_show,
 };
 
 static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
@@ -1810,3 +2001,11 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
 		btf_free(btf);
 	return ERR_PTR(err);
 }
+
+void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
+		       struct seq_file *m)
+{
+	const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+	btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
+}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 08/10] bpf: btf: Sync bpf.h and btf.h to tools/
From: Martin KaFai Lau @ 2018-04-16 19:33 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180416193327.477239-1-kafai@fb.com>

This patch sync up the bpf.h and btf.h to tools/

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
 tools/include/uapi/linux/bpf.h |  13 ++++
 tools/include/uapi/linux/btf.h | 132 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100644 tools/include/uapi/linux/btf.h

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9d07465023a2..2c010a6b25e7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@ enum bpf_cmd {
 	BPF_OBJ_GET_INFO_BY_FD,
 	BPF_PROG_QUERY,
 	BPF_RAW_TRACEPOINT_OPEN,
+	BPF_BTF_LOAD,
 };
 
 enum bpf_map_type {
@@ -279,6 +280,9 @@ union bpf_attr {
 					 */
 		char	map_name[BPF_OBJ_NAME_LEN];
 		__u32	map_ifindex;	/* ifindex of netdev to create on */
+		__u32	btf_fd;		/* fd pointing to a BTF type data */
+		__u32	btf_key_id;	/* BTF type_id of the key */
+		__u32	btf_value_id;	/* BTF type_id of the value */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -363,6 +367,14 @@ union bpf_attr {
 		__u64 name;
 		__u32 prog_fd;
 	} raw_tracepoint;
+
+	struct { /* anonymous struct for BPF_BTF_LOAD */
+		__aligned_u64	btf;
+		__aligned_u64	btf_log_buf;
+		__u32		btf_size;
+		__u32		btf_log_size;
+		__u32		btf_log_level;
+	};
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
@@ -864,6 +876,7 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
+#define BPF_F_SEQ_NUMBER		(1ULL << 3)
 
 /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
  * BPF_FUNC_perf_event_read_value flags.
diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h
new file mode 100644
index 000000000000..30f23ad78eff
--- /dev/null
+++ b/tools/include/uapi/linux/btf.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2018 Facebook */
+#ifndef _UAPI__LINUX_BTF_H__
+#define _UAPI__LINUX_BTF_H__
+
+#include <linux/types.h>
+
+#define BTF_MAGIC	0xeB9F
+#define BTF_MAGIC_SWAP	0x9FeB
+#define BTF_VERSION	1
+#define BTF_FLAGS_COMPR	0x01
+
+struct btf_header {
+	__u16	magic;
+	__u8	version;
+	__u8	flags;
+
+	__u32	parent_label;
+	__u32	parent_name;
+
+	/* All offsets are in bytes relative to the end of this header */
+	__u32	label_off;	/* offset of label section	*/
+	__u32	object_off;	/* offset of data object section*/
+	__u32	func_off;	/* offset of function section	*/
+	__u32	type_off;	/* offset of type section	*/
+	__u32	str_off;	/* offset of string section	*/
+	__u32	str_len;	/* length of string section	*/
+};
+
+/* Max # of type identifier */
+#define BTF_MAX_TYPE	0x7fffffff
+/* Max offset into the string section */
+#define BTF_MAX_NAME_OFFSET	0x7fffffff
+/* Max # of struct/union/enum members or func args */
+#define BTF_MAX_VLEN	0xffff
+
+/* The type id is referring to a parent BTF */
+#define BTF_TYPE_PARENT(id)	(((id) >> 31) & 0x1)
+#define BTF_TYPE_ID(id)		((id) & BTF_MAX_TYPE)
+
+/* String is in the ELF string section */
+#define BTF_STR_TBL_ELF_ID(ref)	(((ref) >> 31) & 0x1)
+#define BTF_STR_OFFSET(ref)	((ref) & BTF_MAX_NAME_OFFSET)
+
+struct btf_type {
+	__u32 name;
+	/* "info" bits arrangement
+	 * bits  0-15: vlen (e.g. # of struct's members)
+	 * bits 16-23: unused
+	 * bits 24-28: kind (e.g. int, ptr, array...etc)
+	 * bits 29-30: unused
+	 * bits    31: root
+	 */
+	__u32 info;
+	/* "size" is used by INT, ENUM, STRUCT and UNION.
+	 * "size" tells the size of the type it is describing.
+	 *
+	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT.
+	 * "type" is a type_id referring to another type.
+	 */
+	union {
+		__u32 size;
+		__u32 type;
+	};
+};
+
+#define BTF_INFO_KIND(info)	(((info) >> 24) & 0x1f)
+#define BTF_INFO_ISROOT(info)	(!!(((info) >> 24) & 0x80))
+#define BTF_INFO_VLEN(info)	((info) & 0xffff)
+
+#define BTF_KIND_UNKN		0	/* Unknown	*/
+#define BTF_KIND_INT		1	/* Integer	*/
+#define BTF_KIND_FLOAT		2	/* Float	*/
+#define BTF_KIND_PTR		3	/* Pointer	*/
+#define BTF_KIND_ARRAY		4	/* Array	*/
+#define BTF_KIND_FUNC		5	/* Function	*/
+#define BTF_KIND_STRUCT		6	/* Struct	*/
+#define BTF_KIND_UNION		7	/* Union	*/
+#define BTF_KIND_ENUM		8	/* Enumeration	*/
+#define BTF_KIND_FWD		9	/* Forward	*/
+#define BTF_KIND_TYPEDEF	10	/* Typedef	*/
+#define BTF_KIND_VOLATILE	11	/* Volatile	*/
+#define BTF_KIND_CONST		12	/* Const	*/
+#define BTF_KIND_RESTRICT	13	/* Restrict	*/
+#define BTF_KIND_MAX		13
+#define NR_BTF_KINDS		14
+
+/* For some specific BTF_KIND, "struct btf_type" is immediately
+ * followed by extra data.
+ */
+
+/* BTF_KIND_INT is followed by a u32 and the following
+ * is the 32 bits arrangement:
+ */
+#define BTF_INT_ENCODING(VAL)	(((VAL) & 0xff000000) >> 24)
+#define BTF_INT_OFFSET(VAL)	(((VAL  & 0x00ff0000)) >> 16)
+#define BTF_INT_BITS(VAL)	((VAL)  & 0x0000ffff)
+
+/* Attributes stored in the BTF_INT_ENCODING */
+#define BTF_INT_SIGNED	0x1
+#define BTF_INT_CHAR	0x2
+#define BTF_INT_BOOL	0x4
+#define BTF_INT_VARARGS	0x8
+
+/* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
+ * The exact number of btf_enum is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_enum {
+	__u32	name;
+	__s32	val;
+};
+
+/* BTF_KIND_ARRAY is followed by one "struct btf_array" */
+struct btf_array {
+	__u32	type;
+	__u32	index_type;
+	__u32	nelems;
+};
+
+/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed
+ * by multiple "struct btf_member".  The exact number
+ * of btf_member is stored in the vlen (of the info in
+ * "struct btf_type").
+ */
+struct btf_member {
+	__u32	name;
+	__u32	type;
+	__u32	offset;	/* offset in bits */
+};
+
+#endif /* _UAPI__LINUX_BTF_H__ */
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 03/10] bpf: btf: Check members of struct/union
From: Martin KaFai Lau @ 2018-04-16 19:33 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180416193327.477239-1-kafai@fb.com>

This patch checks a few things of struct's members:

1) It has a valid size (e.g. a "const void" is invalid)
2) A member's size (+ its member's offset) does not exceed
   the containing struct's size.
3) The member's offset satisfies the alignment requirement

The above can only be done after the needs_resolve member's type
is resolved.  Hence, the above is done together in
btf_struct_resolve().

Each possible member's type (e.g. int, enum, modifier...) implements
the check_member() ops which will be called from btf_struct_resolve().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
 kernel/bpf/btf.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 206 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 6ade409da2f9..3110f3cc7d0a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -252,6 +252,10 @@ struct btf_kind_operations {
 			  u32 meta_left);
 	int (*resolve)(struct btf_verifier_env *env,
 		       const struct resolve_vertex *v);
+	int (*check_member)(struct btf_verifier_env *env,
+			    const struct btf_type *struct_type,
+			    const struct btf_member *member,
+			    const struct btf_type *member_type);
 	void (*log_details)(struct btf_verifier_env *env,
 			    const struct btf_type *t);
 };
@@ -771,6 +775,16 @@ static int btf_df_check_meta(struct btf_verifier_env *env,
 	return -ENOTSUPP;
 }
 
+static int btf_df_check_member(struct btf_verifier_env *env,
+			       const struct btf_type *struct_type,
+			       const struct btf_member *member,
+			       const struct btf_type *member_type)
+{
+	btf_verifier_log_basic(env, struct_type,
+			       "Unsupported check_member");
+	return -EINVAL;
+}
+
 static int btf_df_resolve(struct btf_verifier_env *env,
 			  const struct resolve_vertex *v)
 {
@@ -787,9 +801,48 @@ static void btf_df_log(struct btf_verifier_env *env,
 static struct btf_kind_operations df_ops = {
 	.check_meta = btf_df_check_meta,
 	.resolve = btf_df_resolve,
+	.check_member = btf_df_check_member,
 	.log_details = btf_df_log,
 };
 
+static int btf_int_check_member(struct btf_verifier_env *env,
+				const struct btf_type *struct_type,
+				const struct btf_member *member,
+				const struct btf_type *member_type)
+{
+	u32 int_data = btf_type_int(member_type);
+	u32 struct_bits_off = member->offset;
+	u32 struct_size = struct_type->size;
+	u32 nr_copy_bits;
+	u32 bytes_offset;
+
+	if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"bits_offset exceeds U32_MAX");
+		return -EINVAL;
+	}
+
+	struct_bits_off += BTF_INT_OFFSET(int_data);
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	nr_copy_bits = BTF_INT_BITS(int_data) +
+		BITS_PER_BYTE_MASKED(struct_bits_off);
+
+	if (nr_copy_bits > BITS_PER_U64) {
+		btf_verifier_log_member(env, struct_type, member,
+					"nr_copy_bits exceeds 64");
+		return -EINVAL;
+	}
+
+	if (struct_size < bytes_offset ||
+	    struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_int_check_meta(struct btf_verifier_env *env,
 			      const struct btf_type *t,
 			      u32 meta_left)
@@ -855,9 +908,61 @@ static void btf_int_log(struct btf_verifier_env *env,
 static const struct btf_kind_operations int_ops = {
 	.check_meta = btf_int_check_meta,
 	.resolve = btf_df_resolve,
+	.check_member = btf_int_check_member,
 	.log_details = btf_int_log,
 };
 
+static int btf_modifier_check_member(struct btf_verifier_env *env,
+				     const struct btf_type *struct_type,
+				     const struct btf_member *member,
+				     const struct btf_type *member_type)
+{
+	const struct btf_type *resolved_type;
+	u32 resolved_type_id = member->type;
+	struct btf_member resolved_member;
+	struct btf *btf = env->btf;
+
+	resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+	if (!resolved_type) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member");
+		return -EINVAL;
+	}
+
+	resolved_member = *member;
+	resolved_member.type = resolved_type_id;
+
+	return btf_type_ops(resolved_type)->check_member(env, struct_type,
+							 &resolved_member,
+							 resolved_type);
+}
+
+static int btf_ptr_check_member(struct btf_verifier_env *env,
+				const struct btf_type *struct_type,
+				const struct btf_member *member,
+				const struct btf_type *member_type)
+{
+	u32 struct_size, struct_bits_off, bytes_offset;
+
+	struct_size = struct_type->size;
+	struct_bits_off = member->offset;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	if (struct_size - bytes_offset < sizeof(void *)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int btf_ref_type_check_meta(struct btf_verifier_env *env,
 				   const struct btf_type *t,
 				   u32 meta_left)
@@ -984,21 +1089,53 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
 static struct btf_kind_operations modifier_ops = {
 	.check_meta = btf_ref_type_check_meta,
 	.resolve = btf_modifier_resolve,
+	.check_member = btf_modifier_check_member,
 	.log_details = btf_ref_type_log,
 };
 
 static struct btf_kind_operations ptr_ops = {
 	.check_meta = btf_ref_type_check_meta,
 	.resolve = btf_ptr_resolve,
+	.check_member = btf_ptr_check_member,
 	.log_details = btf_ref_type_log,
 };
 
 static struct btf_kind_operations fwd_ops = {
 	.check_meta = btf_ref_type_check_meta,
 	.resolve = btf_df_resolve,
+	.check_member = btf_df_check_member,
 	.log_details = btf_ref_type_log,
 };
 
+static int btf_array_check_member(struct btf_verifier_env *env,
+				  const struct btf_type *struct_type,
+				  const struct btf_member *member,
+				  const struct btf_type *member_type)
+{
+	u32 struct_bits_off = member->offset;
+	u32 struct_size, bytes_offset;
+	u32 array_type_id, array_size;
+	struct btf *btf = env->btf;
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	array_type_id = member->type;
+	btf_type_id_size(btf, &array_type_id, &array_size);
+	struct_size = struct_type->size;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	if (struct_size - bytes_offset < array_size) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_array_check_meta(struct btf_verifier_env *env,
 				const struct btf_type *t,
 				u32 meta_left)
@@ -1101,9 +1238,35 @@ static void btf_array_log(struct btf_verifier_env *env,
 static struct btf_kind_operations array_ops = {
 	.check_meta = btf_array_check_meta,
 	.resolve = btf_array_resolve,
+	.check_member = btf_array_check_member,
 	.log_details = btf_array_log,
 };
 
+static int btf_struct_check_member(struct btf_verifier_env *env,
+				   const struct btf_type *struct_type,
+				   const struct btf_member *member,
+				   const struct btf_type *member_type)
+{
+	u32 struct_bits_off = member->offset;
+	u32 struct_size, bytes_offset;
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	struct_size = struct_type->size;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	if (struct_size - bytes_offset < member_type->size) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_struct_check_meta(struct btf_verifier_env *env,
 				 const struct btf_type *t,
 				 u32 meta_left)
@@ -1162,6 +1325,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 			      const struct resolve_vertex *v)
 {
 	const struct btf_member *member;
+	int err;
 	u16 i;
 
 	/* Before continue resolving the next_member,
@@ -1169,6 +1333,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 	 * type with size info.
 	 */
 	if (v->next_member) {
+		const struct btf_type *last_member_type;
 		const struct btf_member *last_member;
 		u16 last_member_type_id;
 
@@ -1177,6 +1342,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 		if (WARN_ON_ONCE(!env_type_is_resolved(env,
 						       last_member_type_id)))
 			return -EINVAL;
+
+		last_member_type = btf_type_by_id(env->btf,
+						  last_member_type_id);
+		err = btf_type_ops(last_member_type)->check_member(env, v->t,
+							last_member,
+							last_member_type);
+		if (err)
+			return err;
 	}
 
 	for_each_member_from(i, v->next_member, v->t, member) {
@@ -1195,6 +1368,12 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 			env_stack_set_next_member(env, i + 1);
 			return env_stack_push(env, member_type, member_type_id);
 		}
+
+		err = btf_type_ops(member_type)->check_member(env, v->t,
+							      member,
+							      member_type);
+		if (err)
+			return err;
 	}
 
 	env_stack_pop_resolved(env, 0, 0);
@@ -1211,9 +1390,35 @@ static void btf_struct_log(struct btf_verifier_env *env,
 static struct btf_kind_operations struct_ops = {
 	.check_meta = btf_struct_check_meta,
 	.resolve = btf_struct_resolve,
+	.check_member = btf_struct_check_member,
 	.log_details = btf_struct_log,
 };
 
+static int btf_enum_check_member(struct btf_verifier_env *env,
+				 const struct btf_type *struct_type,
+				 const struct btf_member *member,
+				 const struct btf_type *member_type)
+{
+	u32 struct_bits_off = member->offset;
+	u32 struct_size, bytes_offset;
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	struct_size = struct_type->size;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	if (struct_size - bytes_offset < sizeof(int)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static s32 btf_enum_check_meta(struct btf_verifier_env *env,
 			       const struct btf_type *t,
 			       u32 meta_left)
@@ -1265,6 +1470,7 @@ static void btf_enum_log(struct btf_verifier_env *env,
 static struct btf_kind_operations enum_ops = {
 	.check_meta = btf_enum_check_meta,
 	.resolve = btf_df_resolve,
+	.check_member = btf_enum_check_member,
 	.log_details = btf_enum_log,
 };
 
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 07/10] bpf: btf: Add pretty print support to the basic arraymap
From: Martin KaFai Lau @ 2018-04-16 19:33 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180416193327.477239-1-kafai@fb.com>

This patch adds pretty print support to the basic arraymap.
Support for other bpf maps can be added later.

This patch adds new attrs to the BPF_MAP_CREATE command to allow
specifying the btf_fd, btf_key_id and btf_value_id.  The
BPF_MAP_CREATE can then associate the btf to the map if
the creating map supports BTF.

A BTF supported map needs to implement two new map ops,
map_seq_show_elem() and map_check_btf().  This patch has
implemented these new map ops for the basic arraymap.

It also adds file_operations to the pinned map
such that the pinned map can be opened and read.

Here is a sample output when reading a pinned arraymap
with the following map's value:

struct map_value {
	int count_a;
	int count_b;
};

cat /sys/fs/bpf/pinned_array_map:

0: {1,2}
1: {3,4}
2: {5,6}
...

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
 include/linux/bpf.h      |  20 ++++++-
 include/uapi/linux/bpf.h |   3 +
 kernel/bpf/arraymap.c    |  50 ++++++++++++++++
 kernel/bpf/inode.c       | 146 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c     |  32 ++++++++++-
 5 files changed, 244 insertions(+), 7 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95a7abd0ee92..ee5275e7d4df 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -22,6 +22,8 @@ struct perf_event;
 struct bpf_prog;
 struct bpf_map;
 struct sock;
+struct seq_file;
+struct btf;
 
 /* map is generic key/value storage optionally accesible by eBPF programs */
 struct bpf_map_ops {
@@ -43,10 +45,14 @@ struct bpf_map_ops {
 	void (*map_fd_put_ptr)(void *ptr);
 	u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
 	u32 (*map_fd_sys_lookup_elem)(void *ptr);
+	void (*map_seq_show_elem)(struct bpf_map *map, void *key,
+				  struct seq_file *m);
+	int (*map_check_btf)(const struct bpf_map *map, const struct btf *btf,
+			     u32 key_type_id, u32 value_type_id);
 };
 
 struct bpf_map {
-	/* 1st cacheline with read-mostly members of which some
+	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
 	 */
 	const struct bpf_map_ops *ops ____cacheline_aligned;
@@ -62,10 +68,13 @@ struct bpf_map {
 	u32 pages;
 	u32 id;
 	int numa_node;
+	u32 btf_key_id;
+	u32 btf_value_id;
+	struct btf *btf;
 	bool unpriv_array;
-	/* 7 bytes hole */
+	/* 55 bytes hole */
 
-	/* 2nd cacheline with misc members to avoid false sharing
+	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
 	struct user_struct *user ____cacheline_aligned;
@@ -100,6 +109,11 @@ static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
 	return container_of(map, struct bpf_offloaded_map, map);
 }
 
+static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
+{
+	return map->ops->map_seq_show_elem && map->ops->map_check_btf;
+}
+
 extern const struct bpf_map_ops bpf_map_offload_ops;
 
 /* function argument constraints */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c7d75f18521b..2c010a6b25e7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -280,6 +280,9 @@ union bpf_attr {
 					 */
 		char	map_name[BPF_OBJ_NAME_LEN];
 		__u32	map_ifindex;	/* ifindex of netdev to create on */
+		__u32	btf_fd;		/* fd pointing to a BTF type data */
+		__u32	btf_key_id;	/* BTF type_id of the key */
+		__u32	btf_value_id;	/* BTF type_id of the value */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 14750e7c5ee4..02a189339381 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,11 +11,13 @@
  * General Public License for more details.
  */
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/filter.h>
 #include <linux/perf_event.h>
+#include <uapi/linux/btf.h>
 
 #include "map_in_map.h"
 
@@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map)
 	bpf_map_area_free(array);
 }
 
+static void array_map_seq_show_elem(struct bpf_map *map, void *key,
+				    struct seq_file *m)
+{
+	void *value;
+
+	rcu_read_lock();
+
+	value = array_map_lookup_elem(map, key);
+	if (!value) {
+		rcu_read_unlock();
+		return;
+	}
+
+	seq_printf(m, "%u: ", *(u32 *)key);
+	btf_type_seq_show(map->btf, map->btf_value_id, value, m);
+	seq_puts(m, "\n");
+
+	rcu_read_unlock();
+}
+
+static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+			       u32 btf_key_id, u32 btf_value_id)
+{
+	const struct btf_type *key_type, *value_type;
+	u32 key_size, value_size;
+	u32 int_data;
+
+	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+		return -EINVAL;
+
+	int_data = *(u32 *)(key_type + 1);
+	/* bpf array can only take a u32 key.  This check makes
+	 * sure that the btf matches the attr used during map_create.
+	 */
+	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
+	    BTF_INT_OFFSET(int_data))
+		return -EINVAL;
+
+	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
+	if (!value_type || value_size > map->value_size)
+		return -EINVAL;
+
+	return 0;
+}
+
 const struct bpf_map_ops array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = {
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
 	.map_gen_lookup = array_map_gen_lookup,
+	.map_seq_show_elem = array_map_seq_show_elem,
+	.map_check_btf = array_map_check_btf,
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index bf6da59ae0d0..e3eec943e43c 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,8 +150,144 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return 0;
 }
 
+struct map_iter {
+	void *key;
+	bool done;
+};
+
+static struct map_iter *map_iter(struct seq_file *m)
+{
+	return m->private;
+}
+
+static struct bpf_map *seq_file_to_map(struct seq_file *m)
+{
+	return file_inode(m->file)->i_private;
+}
+
+static void map_iter_free(struct map_iter *iter)
+{
+	if (iter) {
+		kfree(iter->key);
+		kfree(iter);
+	}
+}
+
+static struct map_iter *map_iter_alloc(struct bpf_map *map)
+{
+	struct map_iter *iter;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN);
+	if (!iter)
+		goto error;
+
+	iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!iter->key)
+		goto error;
+
+	return iter;
+
+error:
+	map_iter_free(iter);
+	return NULL;
+}
+
+static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct bpf_map *map = seq_file_to_map(m);
+	void *key = map_iter(m)->key;
+
+	if (map_iter(m)->done)
+		return NULL;
+
+	if (unlikely(v == SEQ_START_TOKEN))
+		goto done;
+
+	if (map->ops->map_get_next_key(map, key, key)) {
+		map_iter(m)->done = true;
+		return NULL;
+	}
+
+done:
+	++(*pos);
+	return key;
+}
+
+static void *map_seq_start(struct seq_file *m, loff_t *pos)
+{
+	if (map_iter(m)->done)
+		return NULL;
+
+	return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
+}
+
+static void map_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static int map_seq_show(struct seq_file *m, void *v)
+{
+	struct bpf_map *map = seq_file_to_map(m);
+	void *key = map_iter(m)->key;
+
+	if (unlikely(v == SEQ_START_TOKEN)) {
+		seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
+		seq_puts(m, "# WARNING!! The output format will change\n");
+	} else {
+		map->ops->map_seq_show_elem(map, key, m);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations bpffs_map_seq_ops = {
+	.start	= map_seq_start,
+	.next	= map_seq_next,
+	.show	= map_seq_show,
+	.stop	= map_seq_stop,
+};
+
+static int bpffs_map_open(struct inode *inode, struct file *file)
+{
+	struct bpf_map *map = inode->i_private;
+	struct map_iter *iter;
+	struct seq_file *m;
+	int err;
+
+	iter = map_iter_alloc(map);
+	if (!iter)
+		return -ENOMEM;
+
+	err = seq_open(file, &bpffs_map_seq_ops);
+	if (err) {
+		map_iter_free(iter);
+		return err;
+	}
+
+	m = file->private_data;
+	m->private = iter;
+
+	return 0;
+}
+
+static int bpffs_map_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	map_iter_free(map_iter(m));
+
+	return seq_release(inode, file);
+}
+
+static const struct file_operations bpffs_map_fops = {
+	.open		= bpffs_map_open,
+	.read		= seq_read,
+	.release	= bpffs_map_release,
+};
+
 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
-			 const struct inode_operations *iops)
+			 const struct inode_operations *iops,
+			 const struct file_operations *fops)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
@@ -159,6 +295,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 		return PTR_ERR(inode);
 
 	inode->i_op = iops;
+	inode->i_fop = fops;
 	inode->i_private = raw;
 
 	bpf_dentry_finalize(dentry, inode, dir);
@@ -167,12 +304,15 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 
 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
 {
-	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL);
 }
 
 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 {
-	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
+	struct bpf_map *map = arg;
+
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
+			     map->btf ? &bpffs_map_fops : NULL);
 }
 
 static struct dentry *
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0a4924a0a8da..fe23dc5a3ec4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -27,6 +27,7 @@
 #include <linux/cred.h>
 #include <linux/timekeeping.h>
 #include <linux/ctype.h>
+#include <linux/btf.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -251,6 +252,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 	bpf_map_uncharge_memlock(map);
 	security_bpf_map_free(map);
+	btf_put(map->btf);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -416,7 +418,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
+#define BPF_MAP_CREATE_LAST_FIELD btf_value_id
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -450,6 +452,33 @@ static int map_create(union bpf_attr *attr)
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
+	if (bpf_map_support_seq_show(map) &&
+	    (attr->btf_key_id || attr->btf_value_id)) {
+		struct btf *btf;
+
+		if (!attr->btf_key_id || !attr->btf_value_id) {
+			err = -EINVAL;
+			goto free_map_nouncharge;
+		}
+
+		btf = btf_get_by_fd(attr->btf_fd);
+		if (IS_ERR(btf)) {
+			err = PTR_ERR(btf);
+			goto free_map_nouncharge;
+		}
+
+		err = map->ops->map_check_btf(map, btf, attr->btf_key_id,
+					      attr->btf_value_id);
+		if (err) {
+			btf_put(btf);
+			goto free_map_nouncharge;
+		}
+
+		map->btf = btf;
+		map->btf_key_id = attr->btf_key_id;
+		map->btf_value_id = attr->btf_value_id;
+	}
+
 	err = security_bpf_map_alloc(map);
 	if (err)
 		goto free_map_nouncharge;
@@ -482,6 +511,7 @@ static int map_create(union bpf_attr *attr)
 free_map_sec:
 	security_bpf_map_free(map);
 free_map_nouncharge:
+	btf_put(map->btf);
 	map->ops->map_free(map);
 	return err;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v3 06/10] bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd
From: Martin KaFai Lau @ 2018-04-16 19:33 UTC (permalink / raw)
  To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180416193327.477239-1-kafai@fb.com>

This patch adds BPF_OBJ_GET_INFO_BY_FD support to BTF fd.
The original BTF data, which was used to create the BTF fd during
the earlier BPF_BTF_LOAD call, will be returned.

The userspace is expected to allocate buffer
to info.info and the buffer size is set to info.info_len before
calling BPF_OBJ_GET_INFO_BY_FD.

The original BTF data is copied to the userspace buffer (info.info).
Only upto the user's specified info.info_len will be copied.

The original BTF data size is set to info.info_len.  The userspace
needs to check if it is bigger than its allocated buffer size.
If it is, the userspace should realloc with the kernel-returned
info.info_len and call the BPF_OBJ_GET_INFO_BY_FD again.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
 include/linux/btf.h  |  5 +++++
 kernel/bpf/btf.c     | 17 ++++++++++++++++-
 kernel/bpf/syscall.c |  2 ++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index a7c7072535ea..a966dc6d61ee 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -10,9 +10,14 @@ struct btf;
 struct btf_type;
 union bpf_attr;
 
+extern const struct file_operations btf_fops;
+
 void btf_put(struct btf *btf);
 int btf_new_fd(const union bpf_attr *attr);
 struct btf *btf_get_by_fd(int fd);
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr);
 /* Figure out the size of a type_id.  If type_id is a modifier
  * (e.g. const), it will be resolved to find out the type with size.
  *
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 45af787ceddf..0b98d32158bd 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2031,7 +2031,7 @@ static int btf_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static const struct file_operations btf_fops = {
+const struct file_operations btf_fops = {
 	.release	= btf_release,
 };
 
@@ -2076,3 +2076,18 @@ struct btf *btf_get_by_fd(int fd)
 
 	return btf;
 }
+
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	void __user *udata = u64_to_user_ptr(attr->info.info);
+	u32 copy_len = min_t(u32, btf->data_size,
+			     attr->info.info_len);
+
+	if (copy_to_user(udata, btf->data, copy_len) ||
+	    put_user(btf->data_size, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cd8ebadc66eb..0a4924a0a8da 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2017,6 +2017,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	else if (f.file->f_op == &bpf_map_fops)
 		err = bpf_map_get_info_by_fd(f.file->private_data, attr,
 					     uattr);
+	else if (f.file->f_op == &btf_fops)
+		err = btf_get_info_by_fd(f.file->private_data, attr, uattr);
 	else
 		err = -EINVAL;
 
-- 
2.9.5

^ permalink raw reply related

* [PATCH net-next 0/2] PCI: add two more values for PCIe Max_Read_Request_Size and initially use them in r8169 network driver
From: Heiner Kallweit @ 2018-04-16 19:35 UTC (permalink / raw)
  To: Bjorn Helgaas, David Miller, Bjorn Helgaas,
	Realtek linux nic maintainers
  Cc: linux-pci@vger.kernel.org, netdev@vger.kernel.org

In r8169 network driver I stumbled across a magic number translating
to PCI MRRS size 4K. The PCI core is still missing constants for
values 2K and 4K (as defined in PCI standard).

So let's add these two constants and use the 4K constant in r8169.

Second patch depends on the first one, therefore both patches
preferrably should go through either PCI or netdev tree.

Heiner Kallweit (2):
  PCI: add two more values for PCIe Max_Read_Request_Size
  r8169: replace magic numbers with PCI MRRS constant

 drivers/net/ethernet/realtek/r8169.c | 39 ++++++++++++++--------------
 include/uapi/linux/pci_regs.h        |  2 ++
 2 files changed, 21 insertions(+), 20 deletions(-)

-- 
2.17.0

^ permalink raw reply

* [PATCH net-next 1/2] PCI: Add two more values for PCIe Max_Read_Request_Size
From: Heiner Kallweit @ 2018-04-16 19:37 UTC (permalink / raw)
  To: Bjorn Helgaas, David Miller, Bjorn Helgaas,
	Realtek linux nic maintainers
  Cc: linux-pci@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <00f1255f-acf0-4760-c20f-0b78bda44645@gmail.com>

This patch adds missing values for the max read request size.
E.g. network driver r8169 uses a value of 4K.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 include/uapi/linux/pci_regs.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 0c79eac5..699257fb 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -506,6 +506,8 @@
 #define  PCI_EXP_DEVCTL_READRQ_256B  0x1000 /* 256 Bytes */
 #define  PCI_EXP_DEVCTL_READRQ_512B  0x2000 /* 512 Bytes */
 #define  PCI_EXP_DEVCTL_READRQ_1024B 0x3000 /* 1024 Bytes */
+#define  PCI_EXP_DEVCTL_READRQ_2048B 0x4000 /* 2048 Bytes */
+#define  PCI_EXP_DEVCTL_READRQ_4096B 0x5000 /* 4096 Bytes */
 #define  PCI_EXP_DEVCTL_BCR_FLR 0x8000  /* Bridge Configuration Retry / FLR */
 #define PCI_EXP_DEVSTA		10	/* Device Status */
 #define  PCI_EXP_DEVSTA_CED	0x0001	/* Correctable Error Detected */
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 2/2] r8169: replace magic numbers with PCI MRRS constant
From: Heiner Kallweit @ 2018-04-16 19:38 UTC (permalink / raw)
  To: Bjorn Helgaas, David Miller, Bjorn Helgaas,
	Realtek linux nic maintainers
  Cc: linux-pci@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <00f1255f-acf0-4760-c20f-0b78bda44645@gmail.com>

Replace magic number "0x5 << MAX_READ_REQUEST_SHIFT" with the
appropriate constant as defined in PCI core.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/ethernet/realtek/r8169.c | 39 ++++++++++++++--------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 604ae783..7d4e6890 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -84,7 +84,6 @@
    The RTL chips use a 64 element hash table based on the Ethernet CRC. */
 static const int multicast_filter_limit = 32;
 
-#define MAX_READ_REQUEST_SHIFT	12
 #define TX_DMA_BURST	7	/* Maximum PCI burst, '7' is unlimited */
 #define InterFrameGap	0x03	/* 3 means InterFrameGap = the shortest one */
 
@@ -5125,7 +5124,7 @@ static void r8168c_hw_jumbo_disable(struct rtl8169_private *tp)
 {
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Jumbo_En0);
 	RTL_W8(tp, Config4, RTL_R8(tp, Config4) & ~Jumbo_En1);
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 }
 
 static void r8168dp_hw_jumbo_enable(struct rtl8169_private *tp)
@@ -5151,7 +5150,7 @@ static void r8168e_hw_jumbo_disable(struct rtl8169_private *tp)
 	RTL_W8(tp, MaxTxPacketSize, 0x0c);
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Jumbo_En0);
 	RTL_W8(tp, Config4, RTL_R8(tp, Config4) & ~0x01);
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 }
 
 static void r8168b_0_hw_jumbo_enable(struct rtl8169_private *tp)
@@ -5163,7 +5162,7 @@ static void r8168b_0_hw_jumbo_enable(struct rtl8169_private *tp)
 static void r8168b_0_hw_jumbo_disable(struct rtl8169_private *tp)
 {
 	rtl_tx_performance_tweak(tp,
-		(0x5 << MAX_READ_REQUEST_SHIFT) | PCI_EXP_DEVCTL_NOSNOOP_EN);
+		PCI_EXP_DEVCTL_READRQ_4096B | PCI_EXP_DEVCTL_NOSNOOP_EN);
 }
 
 static void r8168b_1_hw_jumbo_enable(struct rtl8169_private *tp)
@@ -5736,7 +5735,7 @@ static void rtl_hw_start_8168bb(struct rtl8169_private *tp)
 	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN) {
-		rtl_tx_performance_tweak(tp, (0x5 << MAX_READ_REQUEST_SHIFT) |
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B |
 					 PCI_EXP_DEVCTL_NOSNOOP_EN);
 	}
 }
@@ -5757,7 +5756,7 @@ static void __rtl_hw_start_8168cp(struct rtl8169_private *tp)
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_disable_clock_request(tp);
 
@@ -5788,7 +5787,7 @@ static void rtl_hw_start_8168cp_2(struct rtl8169_private *tp)
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
 }
@@ -5805,7 +5804,7 @@ static void rtl_hw_start_8168cp_3(struct rtl8169_private *tp)
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
 }
@@ -5862,7 +5861,7 @@ static void rtl_hw_start_8168d(struct rtl8169_private *tp)
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W16(tp, CPlusCmd, RTL_R16(tp, CPlusCmd) & ~R8168_CPCMD_QUIRK_MASK);
 }
@@ -5872,7 +5871,7 @@ static void rtl_hw_start_8168dp(struct rtl8169_private *tp)
 	rtl_csi_access_enable_1(tp);
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
@@ -5889,7 +5888,7 @@ static void rtl_hw_start_8168d_4(struct rtl8169_private *tp)
 
 	rtl_csi_access_enable_1(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
@@ -5921,7 +5920,7 @@ static void rtl_hw_start_8168e_1(struct rtl8169_private *tp)
 	rtl_ephy_init(tp, e_info_8168e_1, ARRAY_SIZE(e_info_8168e_1));
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W8(tp, MaxTxPacketSize, TxPacketMax);
 
@@ -5946,7 +5945,7 @@ static void rtl_hw_start_8168e_2(struct rtl8169_private *tp)
 	rtl_ephy_init(tp, e_info_8168e_2, ARRAY_SIZE(e_info_8168e_2));
 
 	if (tp->dev->mtu <= ETH_DATA_LEN)
-		rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+		rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_eri_write(tp, 0xc0, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
 	rtl_eri_write(tp, 0xb8, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
@@ -5976,7 +5975,7 @@ static void rtl_hw_start_8168f(struct rtl8169_private *tp)
 {
 	rtl_csi_access_enable_2(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_eri_write(tp, 0xc0, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
 	rtl_eri_write(tp, 0xb8, ERIAR_MASK_0011, 0x0000, ERIAR_EXGMAC);
@@ -6047,7 +6046,7 @@ static void rtl_hw_start_8168g(struct rtl8169_private *tp)
 
 	rtl_csi_access_enable_1(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x00, 0x01, ERIAR_EXGMAC);
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x01, 0x00, ERIAR_EXGMAC);
@@ -6147,7 +6146,7 @@ static void rtl_hw_start_8168h_1(struct rtl8169_private *tp)
 
 	rtl_csi_access_enable_1(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x00, 0x01, ERIAR_EXGMAC);
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x01, 0x00, ERIAR_EXGMAC);
@@ -6229,7 +6228,7 @@ static void rtl_hw_start_8168ep(struct rtl8169_private *tp)
 
 	rtl_csi_access_enable_1(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x00, 0x01, ERIAR_EXGMAC);
 	rtl_w0w1_eri(tp, 0xdc, ERIAR_MASK_0001, 0x01, 0x00, ERIAR_EXGMAC);
@@ -6495,7 +6494,7 @@ static void rtl_hw_start_8102e_1(struct rtl8169_private *tp)
 
 	RTL_W8(tp, DBG_REG, FIX_NAK_1);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W8(tp, Config1,
 	       LEDS1 | LEDS0 | Speed_down | MEMMAP | IOMAP | VPD | PMEnable);
@@ -6512,7 +6511,7 @@ static void rtl_hw_start_8102e_2(struct rtl8169_private *tp)
 {
 	rtl_csi_access_enable_2(tp);
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	RTL_W8(tp, Config1, MEMMAP | IOMAP | VPD | PMEnable);
 	RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~Beacon_en);
@@ -6575,7 +6574,7 @@ static void rtl_hw_start_8402(struct rtl8169_private *tp)
 
 	rtl_ephy_init(tp, e_info_8402, ARRAY_SIZE(e_info_8402));
 
-	rtl_tx_performance_tweak(tp, 0x5 << MAX_READ_REQUEST_SHIFT);
+	rtl_tx_performance_tweak(tp, PCI_EXP_DEVCTL_READRQ_4096B);
 
 	rtl_eri_write(tp, 0xc8, ERIAR_MASK_1111, 0x00000002, ERIAR_EXGMAC);
 	rtl_eri_write(tp, 0xe8, ERIAR_MASK_1111, 0x00000006, ERIAR_EXGMAC);
-- 
2.17.0

^ permalink raw reply related

* Re: net: hang in unregister_netdevice: waiting for lo to become free
From: Dan Streetman @ 2018-04-16 19:42 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Tommi Rantala, Neil Horman, Xin Long, David Ahern,
	Daniel Borkmann, Cong Wang, David Miller, Eric Dumazet,
	Willem de Bruijn, Jakub Kicinski, Rasmus Villemoes, netdev, LKML,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, syzkaller, Dan Streetman,
	Eric W. Biederman, Alexey Kodanev
In-Reply-To: <CACT4Y+a2TyMWfs+ZQ=gJMbpXzGoYOWLEDBJAjVY4cJTkYecV2A@mail.gmail.com>

On Mon, Apr 16, 2018 at 3:35 AM, Dmitry Vyukov <dvyukov@google.com> wrote:
> On Fri, Apr 13, 2018 at 5:54 PM, Dmitry Vyukov <dvyukov@google.com> wrote:
>> On Fri, Apr 13, 2018 at 2:43 PM, Dan Streetman <ddstreet@ieee.org> wrote:
>>> On Thu, Apr 12, 2018 at 8:15 AM, Dmitry Vyukov <dvyukov@google.com> wrote:
>>>> On Wed, Feb 21, 2018 at 3:53 PM, Tommi Rantala
>>>> <tommi.t.rantala@nokia.com> wrote:
>>>>> On 20.02.2018 18:26, Neil Horman wrote:
>>>>>>
>>>>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:
>>>>>>>
>>>>>>> On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
>>>>>>> <tommi.t.rantala@nokia.com> wrote:
>>>>>>>>
>>>>>>>> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>>>>>>>>
>>>>>>>>> Is this meant to be fixed already? I am still seeing this on the
>>>>>>>>> latest upstream tree.
>>>>>>>>>
>>>>>>>>
>>>>>>>> These two commits are in v4.16-rc1:
>>>>>>>>
>>>>>>>> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
>>>>>>>> Author: Tommi Rantala <tommi.t.rantala@nokia.com>
>>>>>>>> Date:   Mon Feb 5 21:48:14 2018 +0200
>>>>>>>>
>>>>>>>>      sctp: fix dst refcnt leak in sctp_v4_get_dst
>>>>>>>> ...
>>>>>>>>      Fixes: 410f03831 ("sctp: add routing output fallback")
>>>>>>>>      Fixes: 0ca50d12f ("sctp: fix src address selection if using
>>>>>>>> secondary
>>>>>>>> addresses")
>>>>>>>>
>>>>>>>>
>>>>>>>> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
>>>>>>>> Author: Alexey Kodanev <alexey.kodanev@oracle.com>
>>>>>>>> Date:   Mon Feb 5 15:10:35 2018 +0300
>>>>>>>>
>>>>>>>>      sctp: fix dst refcnt leak in sctp_v6_get_dst()
>>>>>>>> ...
>>>>>>>>      Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
>>>>>>>> secondary
>>>>>>>> addresses for ipv6")
>>>>>>>>
>>>>>>>>
>>>>>>>> I guess we missed something if it's still reproducible.
>>>>>>>>
>>>>>>>> I can check it later this week, unless someone else beat me to it.
>>>>>>>
>>>>>>>
>>>>>>> Hi Tommi,
>>>>>>>
>>>>>>> Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
>>>>>>> another one then. But I am still seeing these:
>>>>>>>
>>>>>>> [   58.799130] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>> [   60.847138] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>> [   62.895093] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>> [   64.943103] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>>
>>>>>>> on upstream tree pulled ~12 hours ago.
>>>>>>>
>>>>>> Can you write a systemtap script to probe dev_hold, and dev_put, printing
>>>>>> out a
>>>>>> backtrace if the device name matches "lo".  That should tell us
>>>>>> definitively if
>>>>>> the problem is in the same location or not
>>>>>
>>>>>
>>>>> Hi Dmitry, I tested with the reproducer and the kernel .config file that you
>>>>> sent in the first email in this thread:
>>>>>
>>>>> With 4.16-rc2 unable to reproduce.
>>>>>
>>>>> With 4.15-rc9 bug reproducible, and I get "unregister_netdevice: waiting for
>>>>> lo to become free. Usage count = 3"
>>>>>
>>>>> With 4.15-rc9 and Alexey's "sctp: fix dst refcnt leak in sctp_v6_get_dst()"
>>>>> cherry-picked on top, unable to reproduce.
>>>>>
>>>>>
>>>>> Is syzkaller doing something else now to trigger the bug...?
>>>>> Can you still trigger the bug with the same reproducer?
>>>>
>>>> Hi Neil, Tommi,
>>>>
>>>> Reviving this old thread about "unregister_netdevice: waiting for lo
>>>> to become free. Usage count = 3" hangs.
>>>> I still did not have time to deep dive into what happens there (too
>>>> many bugs coming from syzbot). But this still actively happens and I
>>>> suspect accounts to a significant portion of various hang reports,
>>>> which are quite unpleasant.
>>>>
>>>> One idea that could make it all simpler:
>>>>
>>>> Is this wait loop in netdev_wait_allrefs() supposed to wait for any
>>>> prolonged periods of time under any non-buggy conditions? E.g. more
>>>> than 1-2 minutes?
>>>> If it only supposed to wait briefly for things that already supposed
>>>> to be shutting down, and we add a WARNING there after some timeout,
>>>> then syzbot will report all info how/when it happens, hopefully
>>>> extracting reproducers, and all the nice things.
>>>> But this WARNING should not have any false positives under any
>>>> realistic conditions (e.g. waiting for arrival of remote packets with
>>>> large timeouts).
>>>>
>>>> Looking at some task hung reports, it seems that this code holds some
>>>> mutexes, takes workqueue thread and prevents any progress with
>>>> destruction of other devices (and net namespace creation/destruction),
>>>> so I guess it should not wait for any indefinite periods of time?
>>>
>>> I'm working on this currently:
>>> https://bugs.launchpad.net/ubuntu/zesty/+source/linux/+bug/1711407
>>>
>>> I added a summary of what I've found to be the cause (or at least, one
>>> possible cause) of this:
>>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407/comments/72
>>>
>>> I'm working on a patch to work around the main side-effect of this,
>>> which is hanging while holding the global net mutex.  Hangs will still
>>> happen (e.g. if a dst leaks) but should not affect anything else,
>>> other than a leak of the dst and its net namespace.
>>>
>>> Fixing the dst leaks is important too, of course, but a dst leak (or
>>> other cause) shouldn't break the entire system.
>>
>> Leaking some memory is definitely better than hanging the system.
>>
>> So I've made syzkaller to recognize "unregister_netdevice: waiting for
>> (.*) to become free" as a kernel bug:
>> https://github.com/google/syzkaller/commit/7a67784ca8bdc3b26cce2f0ec9a40d2dd9ec9396
>> Unfortunately it does not make it catch these bugs because creating a
>> net namespace per test is too damn slow, so namespaces are reused for
>> lots of tests and when/if it's eventually destroyed it's already too
>> late to find root cause.
>>
>> But I've run a one-off experiment with prompt net namespace
>> destruction and syzkaller was able to easily extract a C reproducer:
>> https://gist.githubusercontent.com/dvyukov/d571e8fff24e127ca48a8c4790d42bfa/raw/52050e93ba9afbb5126b9d7bb39b7e71a82af016/gistfile1.txt
>>
>> On upstream 16e205cf42da1f497b10a4a24f563e6c0d574eec with this config:
>> https://gist.githubusercontent.com/dvyukov/9663c57443adb21f2795b92ef0829d62/raw/bbea0652e23746096dd56855a28f6c681aebcdee/gistfile1.txt
>>
>> this gives me:
>>
>> [   83.183198] unregister_netdevice: waiting for lo to become free.
>> Usage count = 9
>> [   85.231202] unregister_netdevice: waiting for lo to become free.
>> Usage count = 9
>> ...
>> [  523.511205] unregister_netdevice: waiting for lo to become free.
>> Usage count = 9
>> ...
>>
>> This is generated from this syzkaller program:
>>
>> r0 = socket$inet6(0xa, 0x1, 0x84)
>> setsockopt$inet6_IPV6_XFRM_POLICY(r0, 0x29, 0x23,
>> &(0x7f0000000380)={{{@in6=@remote={0xfe, 0x80, [], 0xbb},
>> @in=@dev={0xac, 0x14, 0x14}, 0x0, 0x0, 0x0, 0x0, 0xa}, {}, {}, 0x0,
>> 0x0, 0x1}, {{@in=@local={0xac, 0x14, 0x14, 0xaa}, 0x0, 0x32}, 0x0,
>> @in=@local={0xac, 0x14, 0x14, 0xaa}, 0x3504}}, 0xe8)
>> bind$inet6(r0, &(0x7f0000000000)={0xa, 0x4e20}, 0x1c)
>> connect$inet(r0, &(0x7f0000000040)={0x2, 0x4e20, @dev={0xac, 0x14,
>> 0x14, 0xd}}, 0x10)
>> syz_emit_ethernet(0x3e, &(0x7f00000001c0)={@local={[0xaa, 0xaa, 0xaa,
>> 0xaa, 0xaa], 0xaa}, @dev={[0xaa, 0xaa, 0xaa, 0xaa, 0xaa]}, [],
>> {@ipv6={0x86dd, {0x0, 0x6, "50a09c", 0x8, 0xffffff11, 0x0,
>> @remote={0xfe, 0x80, [], 0xbb}, @local={0xfe, 0x80, [], 0xaa}, {[],
>> @udp={0x0, 0x4e20, 0x8}}}}}}, &(0x7f0000000040))
>>
>> So this seems to be related to IPv6 and/or xfrm and is potentially
>> caused by external packets (that syz_emit_ethernet call).
>
>
>
> Here is another repro which seems to be a different bug (note that it
> requires fault injection):
>
> https://gist.githubusercontent.com/dvyukov/1c56623016cc4c24a69d433c5114ad5b/raw/530478f571b195193101b912aa646948528baa8e/gistfile1.txt
>
> Dan, do you mind taking a look at them? Fixing these should eliminate
> root causes of these hangs/leaks.

Yep I will look at them, thanks for the reproducers.

^ permalink raw reply

* Re: [PATCH net-next 07/21] net/ipv6: Save route type in rt6_info
From: David Ahern @ 2018-04-16 19:50 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, idosch, roopa, eric.dumazet, weiwan, kafai, yoshfuji
In-Reply-To: <20180416.152033.1080883507779037027.davem@davemloft.net>

On 4/16/18 1:20 PM, David Miller wrote:
> From: David Ahern <dsahern@gmail.com>
> Date: Mon, 16 Apr 2018 08:22:41 -0700
> 
>> @@ -2394,6 +2395,7 @@ static void addrconf_add_mroute(struct net_device *dev)
>>  		.fc_ifindex = dev->ifindex,
>>  		.fc_dst_len = 8,
>>  		.fc_flags = RTF_UP,
>> +		.fc_type = RTN_UNICAST,
>>  		.fc_nlinfo.nl_net = dev_net(dev),
>>  	};
>>  
> 
> Multicast route is of type RTN_UNICAST?
> 
> All of these cases where passing in a zero initialized value of
> fc_type up until this patch.  Perhaps you should discuss that in your
> commit message a little bit.

Added this to the commit message:

fc_type is set in current users based on the algorithm in rt6_fill_node:
  - rt6i_flags contains RTF_LOCAL: fc_type = RTN_LOCAL
  - rt6i_flags contains RTF_ANYCAST: fc_type = RTN_ANYCAST
  - else fc_type = RTN_UNICAST

Similarly, fib6_type is set in the rt6_info templates based on the
RTF_REJECT section of rt6_fill_node converting dst.error to RTN type.

#####

Will send a v2 later this week.

^ permalink raw reply

* Re: [PATCH net-next 07/21] net/ipv6: Save route type in rt6_info
From: David Miller @ 2018-04-16 19:55 UTC (permalink / raw)
  To: dsahern; +Cc: netdev, idosch, roopa, eric.dumazet, weiwan, kafai, yoshfuji
In-Reply-To: <c2ff3c7e-a113-9eab-e25f-cf432b9e241b@gmail.com>

From: David Ahern <dsahern@gmail.com>
Date: Mon, 16 Apr 2018 13:50:07 -0600

> Added this to the commit message:
> 
> fc_type is set in current users based on the algorithm in rt6_fill_node:
>   - rt6i_flags contains RTF_LOCAL: fc_type = RTN_LOCAL
>   - rt6i_flags contains RTF_ANYCAST: fc_type = RTN_ANYCAST
>   - else fc_type = RTN_UNICAST
> 
> Similarly, fib6_type is set in the rt6_info templates based on the
> RTF_REJECT section of rt6_fill_node converting dst.error to RTN type.
> 
> #####
> 
> Will send a v2 later this week.

Looks great.

^ permalink raw reply

* Re: [PATCH bpf-next v3 00/10] BTF: BPF Type Format
From: Arnaldo Carvalho de Melo @ 2018-04-16 20:22 UTC (permalink / raw)
  To: Martin KaFai Lau; +Cc: netdev, Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180416193327.477239-1-kafai@fb.com>

Em Mon, Apr 16, 2018 at 12:33:17PM -0700, Martin KaFai Lau escreveu:
> This patch introduces BPF Type Format (BTF).
> 
> BTF (BPF Type Format) is the meta data format which describes
> the data types of BPF program/map.  Hence, it basically focus
> on the C programming language which the modern BPF is primary
> using.  The first use case is to provide a generic pretty print
> capability for a BPF map.
> 
> A modified pahole (Cc: Arnaldo) that can convert dwarf to BTF is here:
> https://github.com/iamkafai/pahole/tree/btf

Thanks for CCing me, no changes since when you posted the pahole
patches, I gave it a quick look, seems sane, will try to merge and push
a new pahole version out so that distros can pick it, at least fedora
will 8-)

- Arnaldo
 
> Please see individual patch for details.
> 
> v3:
> - Rebase to bpf-next
> - Fix sparse warning (by adding static)
> - Add BTF header logging: btf_verifier_log_hdr()
> - Fix the alignment test on btf->type_off
> - Add tests for the BTF header
> - Lower the max BTF size to 16MB.  It should be enough
>   for some time.  We could raise it later if it would
>   be needed.
> 
> v2:
> - Use kvfree where needed in patch 1 and 2
> - Also consider BTF_INT_OFFSET() in the btf_int_check_meta()
>   in patch 1
> - Fix an incorrect goto target in map_create() during
>   the btf-error-path in patch 7
> - re-org some local vars to keep the rev xmas tree in btf.c
> 
> Martin KaFai Lau (10):
>   bpf: btf: Introduce BPF Type Format (BTF)
>   bpf: btf: Validate type reference
>   bpf: btf: Check members of struct/union
>   bpf: btf: Add pretty print capability for data with BTF type info
>   bpf: btf: Add BPF_BTF_LOAD command
>   bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd
>   bpf: btf: Add pretty print support to the basic arraymap
>   bpf: btf: Sync bpf.h and btf.h to tools/
>   bpf: btf: Add BTF support to libbpf
>   bpf: btf: Add BTF tests
> 
>  include/linux/bpf.h                          |   20 +-
>  include/linux/btf.h                          |   48 +
>  include/uapi/linux/bpf.h                     |   12 +
>  include/uapi/linux/btf.h                     |  132 ++
>  kernel/bpf/Makefile                          |    1 +
>  kernel/bpf/arraymap.c                        |   50 +
>  kernel/bpf/btf.c                             | 2093 ++++++++++++++++++++++++++
>  kernel/bpf/inode.c                           |  146 +-
>  kernel/bpf/syscall.c                         |   51 +-
>  tools/include/uapi/linux/bpf.h               |   13 +
>  tools/include/uapi/linux/btf.h               |  132 ++
>  tools/lib/bpf/Build                          |    2 +-
>  tools/lib/bpf/bpf.c                          |   92 +-
>  tools/lib/bpf/bpf.h                          |   16 +
>  tools/lib/bpf/btf.c                          |  377 +++++
>  tools/lib/bpf/btf.h                          |   22 +
>  tools/lib/bpf/libbpf.c                       |  148 +-
>  tools/lib/bpf/libbpf.h                       |    3 +
>  tools/testing/selftests/bpf/Makefile         |   26 +-
>  tools/testing/selftests/bpf/test_btf.c       | 1669 ++++++++++++++++++++
>  tools/testing/selftests/bpf/test_btf_haskv.c |   48 +
>  tools/testing/selftests/bpf/test_btf_nokv.c  |   43 +
>  22 files changed, 5103 insertions(+), 41 deletions(-)
>  create mode 100644 include/linux/btf.h
>  create mode 100644 include/uapi/linux/btf.h
>  create mode 100644 kernel/bpf/btf.c
>  create mode 100644 tools/include/uapi/linux/btf.h
>  create mode 100644 tools/lib/bpf/btf.c
>  create mode 100644 tools/lib/bpf/btf.h
>  create mode 100644 tools/testing/selftests/bpf/test_btf.c
>  create mode 100644 tools/testing/selftests/bpf/test_btf_haskv.c
>  create mode 100644 tools/testing/selftests/bpf/test_btf_nokv.c
> 
> -- 
> 2.9.5

^ permalink raw reply

* [PATCH net-next 0/5] fib rules extack support and selftest
From: Roopa Prabhu @ 2018-04-16 20:41 UTC (permalink / raw)
  To: davem; +Cc: netdev, dsa

From: Roopa Prabhu <roopa@cumulusnetworks.com>

The first patch refactors some fib rule common add del netlink
processing code. fibrule selftest uses route get, so this
patch extends route get with a few matches.

Roopa Prabhu (5):
  fib_rules: move common handling of newrule delrule msgs into
    fib_nl2rule
  net: fib_rules: add extack support
  ipv4: support sport, dport and ip protocol in RTM_GETROUTE
  ipv6: support sport, dport and ip protocol in RTM_GETROUTE
  selftests: net: initial fib rule tests

 include/net/fib_rules.h                       |   3 +-
 include/uapi/linux/rtnetlink.h                |   3 +
 net/core/fib_rules.c                          | 469 ++++++++++++--------------
 net/ipv4/fib_rules.c                          |   7 +-
 net/ipv4/route.c                              |  75 +++-
 net/ipv6/fib6_rules.c                         |   7 +-
 net/ipv6/route.c                              |  29 ++
 tools/testing/selftests/net/Makefile          |   2 +-
 tools/testing/selftests/net/fib_rule_tests.sh | 208 ++++++++++++
 9 files changed, 535 insertions(+), 268 deletions(-)
 create mode 100755 tools/testing/selftests/net/fib_rule_tests.sh

-- 
2.1.4

^ permalink raw reply

* [PATCH net-next 1/5] fib_rules: move common handling of newrule delrule msgs into fib_nl2rule
From: Roopa Prabhu @ 2018-04-16 20:41 UTC (permalink / raw)
  To: davem; +Cc: netdev, dsa
In-Reply-To: <1523911298-8965-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This reduces code duplication in the fib rule add and del paths.
Get rid of validate_rulemsg. This became obvious when adding duplicate
extack support in fib newrule/delrule error paths.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 net/core/fib_rules.c | 436 +++++++++++++++++++++++----------------------------
 1 file changed, 192 insertions(+), 244 deletions(-)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 33958f8..6780110 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -387,206 +387,185 @@ unsigned int fib_rules_seq_read(struct net *net, int family)
 }
 EXPORT_SYMBOL_GPL(fib_rules_seq_read);
 
-static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
-			    struct fib_rules_ops *ops)
-{
-	int err = -EINVAL;
-
-	if (frh->src_len)
-		if (tb[FRA_SRC] == NULL ||
-		    frh->src_len > (ops->addr_size * 8) ||
-		    nla_len(tb[FRA_SRC]) != ops->addr_size)
-			goto errout;
-
-	if (frh->dst_len)
-		if (tb[FRA_DST] == NULL ||
-		    frh->dst_len > (ops->addr_size * 8) ||
-		    nla_len(tb[FRA_DST]) != ops->addr_size)
-			goto errout;
-
-	err = 0;
-errout:
-	return err;
-}
-
-static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
-		       struct nlattr **tb, struct fib_rule *rule)
+static struct fib_rule *rule_find(struct fib_rules_ops *ops,
+				  struct fib_rule_hdr *frh,
+				  struct nlattr **tb,
+				  struct fib_rule *rule,
+				  bool user_priority)
 {
 	struct fib_rule *r;
 
 	list_for_each_entry(r, &ops->rules_list, list) {
-		if (r->action != rule->action)
+		if (rule->action && r->action != rule->action)
 			continue;
 
-		if (r->table != rule->table)
+		if (rule->table && r->table != rule->table)
 			continue;
 
-		if (r->pref != rule->pref)
+		if (user_priority && r->pref != rule->pref)
 			continue;
 
-		if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+		if (rule->iifname[0] &&
+		    memcmp(r->iifname, rule->iifname, IFNAMSIZ))
 			continue;
 
-		if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+		if (rule->oifname[0] &&
+		    memcmp(r->oifname, rule->oifname, IFNAMSIZ))
 			continue;
 
-		if (r->mark != rule->mark)
+		if (rule->mark && r->mark != rule->mark)
 			continue;
 
-		if (r->mark_mask != rule->mark_mask)
+		if (rule->mark_mask && r->mark_mask != rule->mark_mask)
 			continue;
 
-		if (r->tun_id != rule->tun_id)
+		if (rule->tun_id && r->tun_id != rule->tun_id)
 			continue;
 
 		if (r->fr_net != rule->fr_net)
 			continue;
 
-		if (r->l3mdev != rule->l3mdev)
+		if (rule->l3mdev && r->l3mdev != rule->l3mdev)
 			continue;
 
-		if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
-		    !uid_eq(r->uid_range.end, rule->uid_range.end))
+		if (uid_range_set(&rule->uid_range) &&
+		    (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+		    !uid_eq(r->uid_range.end, rule->uid_range.end)))
 			continue;
 
-		if (r->ip_proto != rule->ip_proto)
+		if (rule->ip_proto && r->ip_proto != rule->ip_proto)
 			continue;
 
-		if (!fib_rule_port_range_compare(&r->sport_range,
+		if (fib_rule_port_range_set(&rule->sport_range) &&
+		    !fib_rule_port_range_compare(&r->sport_range,
 						 &rule->sport_range))
 			continue;
 
-		if (!fib_rule_port_range_compare(&r->dport_range,
+		if (fib_rule_port_range_set(&rule->dport_range) &&
+		    !fib_rule_port_range_compare(&r->dport_range,
 						 &rule->dport_range))
 			continue;
 
 		if (!ops->compare(r, frh, tb))
 			continue;
-		return 1;
+		return r;
 	}
-	return 0;
+
+	return NULL;
 }
 
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
-		   struct netlink_ext_ack *extack)
+static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
+		       struct netlink_ext_ack *extack,
+		       struct fib_rules_ops *ops,
+		       struct nlattr *tb[],
+		       struct fib_rule **rule,
+		       bool *user_priority)
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_rule_hdr *frh = nlmsg_data(nlh);
-	struct fib_rules_ops *ops = NULL;
-	struct fib_rule *rule, *r, *last = NULL;
-	struct nlattr *tb[FRA_MAX+1];
-	int err = -EINVAL, unresolved = 0;
-
-	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
-		goto errout;
-
-	ops = lookup_rules_ops(net, frh->family);
-	if (ops == NULL) {
-		err = -EAFNOSUPPORT;
-		goto errout;
-	}
+	struct fib_rule *nlrule = NULL;
+	int err = -EINVAL;
 
-	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
-	if (err < 0)
-		goto errout;
+	if (frh->src_len)
+		if (!tb[FRA_SRC] ||
+		    frh->src_len > (ops->addr_size * 8) ||
+		    nla_len(tb[FRA_SRC]) != ops->addr_size)
+			goto errout;
 
-	err = validate_rulemsg(frh, tb, ops);
-	if (err < 0)
-		goto errout;
+	if (frh->dst_len)
+		if (!tb[FRA_DST] ||
+		    frh->dst_len > (ops->addr_size * 8) ||
+		    nla_len(tb[FRA_DST]) != ops->addr_size)
+			goto errout;
 
-	rule = kzalloc(ops->rule_size, GFP_KERNEL);
-	if (rule == NULL) {
+	nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
+	if (!nlrule) {
 		err = -ENOMEM;
 		goto errout;
 	}
-	refcount_set(&rule->refcnt, 1);
-	rule->fr_net = net;
+	refcount_set(&nlrule->refcnt, 1);
+	nlrule->fr_net = net;
 
-	rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
-	                              : fib_default_rule_pref(ops);
+	if (tb[FRA_PRIORITY]) {
+		nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
+		*user_priority = true;
+	} else {
+		nlrule->pref = fib_default_rule_pref(ops);
+	}
 
-	rule->proto = tb[FRA_PROTOCOL] ?
+	nlrule->proto = tb[FRA_PROTOCOL] ?
 		nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
 
 	if (tb[FRA_IIFNAME]) {
 		struct net_device *dev;
 
-		rule->iifindex = -1;
-		nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
-		dev = __dev_get_by_name(net, rule->iifname);
+		nlrule->iifindex = -1;
+		nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, nlrule->iifname);
 		if (dev)
-			rule->iifindex = dev->ifindex;
+			nlrule->iifindex = dev->ifindex;
 	}
 
 	if (tb[FRA_OIFNAME]) {
 		struct net_device *dev;
 
-		rule->oifindex = -1;
-		nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
-		dev = __dev_get_by_name(net, rule->oifname);
+		nlrule->oifindex = -1;
+		nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
+		dev = __dev_get_by_name(net, nlrule->oifname);
 		if (dev)
-			rule->oifindex = dev->ifindex;
+			nlrule->oifindex = dev->ifindex;
 	}
 
 	if (tb[FRA_FWMARK]) {
-		rule->mark = nla_get_u32(tb[FRA_FWMARK]);
-		if (rule->mark)
+		nlrule->mark = nla_get_u32(tb[FRA_FWMARK]);
+		if (nlrule->mark)
 			/* compatibility: if the mark value is non-zero all bits
 			 * are compared unless a mask is explicitly specified.
 			 */
-			rule->mark_mask = 0xFFFFFFFF;
+			nlrule->mark_mask = 0xFFFFFFFF;
 	}
 
 	if (tb[FRA_FWMASK])
-		rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
+		nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
 
 	if (tb[FRA_TUN_ID])
-		rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
+		nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
 
 	err = -EINVAL;
 	if (tb[FRA_L3MDEV]) {
 #ifdef CONFIG_NET_L3_MASTER_DEV
-		rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
-		if (rule->l3mdev != 1)
+		nlrule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
+		if (nlrule->l3mdev != 1)
 #endif
 			goto errout_free;
 	}
 
-	rule->action = frh->action;
-	rule->flags = frh->flags;
-	rule->table = frh_get_table(frh, tb);
+	nlrule->action = frh->action;
+	nlrule->flags = frh->flags;
+	nlrule->table = frh_get_table(frh, tb);
 	if (tb[FRA_SUPPRESS_PREFIXLEN])
-		rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
+		nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
 	else
-		rule->suppress_prefixlen = -1;
+		nlrule->suppress_prefixlen = -1;
 
 	if (tb[FRA_SUPPRESS_IFGROUP])
-		rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
+		nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
 	else
-		rule->suppress_ifgroup = -1;
+		nlrule->suppress_ifgroup = -1;
 
 	if (tb[FRA_GOTO]) {
-		if (rule->action != FR_ACT_GOTO)
+		if (nlrule->action != FR_ACT_GOTO)
 			goto errout_free;
 
-		rule->target = nla_get_u32(tb[FRA_GOTO]);
+		nlrule->target = nla_get_u32(tb[FRA_GOTO]);
 		/* Backward jumps are prohibited to avoid endless loops */
-		if (rule->target <= rule->pref)
+		if (nlrule->target <= nlrule->pref)
 			goto errout_free;
-
-		list_for_each_entry(r, &ops->rules_list, list) {
-			if (r->pref == rule->target) {
-				RCU_INIT_POINTER(rule->ctarget, r);
-				break;
-			}
-		}
-
-		if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
-			unresolved = 1;
-	} else if (rule->action == FR_ACT_GOTO)
+	} else if (nlrule->action == FR_ACT_GOTO) {
 		goto errout_free;
+	}
 
-	if (rule->l3mdev && rule->table)
+	if (nlrule->l3mdev && nlrule->table)
 		goto errout_free;
 
 	if (tb[FRA_UID_RANGE]) {
@@ -595,34 +574,72 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 			goto errout_free;
 		}
 
-		rule->uid_range = nla_get_kuid_range(tb);
+		nlrule->uid_range = nla_get_kuid_range(tb);
 
-		if (!uid_range_set(&rule->uid_range) ||
-		    !uid_lte(rule->uid_range.start, rule->uid_range.end))
+		if (!uid_range_set(&nlrule->uid_range) ||
+		    !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end))
 			goto errout_free;
 	} else {
-		rule->uid_range = fib_kuid_range_unset;
+		nlrule->uid_range = fib_kuid_range_unset;
 	}
 
 	if (tb[FRA_IP_PROTO])
-		rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+		nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
 
 	if (tb[FRA_SPORT_RANGE]) {
 		err = nla_get_port_range(tb[FRA_SPORT_RANGE],
-					 &rule->sport_range);
+					 &nlrule->sport_range);
 		if (err)
 			goto errout_free;
 	}
 
 	if (tb[FRA_DPORT_RANGE]) {
 		err = nla_get_port_range(tb[FRA_DPORT_RANGE],
-					 &rule->dport_range);
+					 &nlrule->dport_range);
 		if (err)
 			goto errout_free;
 	}
 
+	*rule = nlrule;
+
+	return 0;
+
+errout_free:
+	kfree(nlrule);
+errout:
+	return err;
+}
+
+int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+		   struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_rule_hdr *frh = nlmsg_data(nlh);
+	struct fib_rules_ops *ops = NULL;
+	struct fib_rule *rule = NULL, *r, *last = NULL;
+	struct nlattr *tb[FRA_MAX + 1];
+	int err = -EINVAL, unresolved = 0;
+	bool user_priority = false;
+
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+		goto errout;
+
+	ops = lookup_rules_ops(net, frh->family);
+	if (!ops) {
+		err = -EAFNOSUPPORT;
+		goto errout;
+	}
+
+	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+	if (err < 0)
+		goto errout;
+
+	err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
+	if (err)
+		goto errout;
+
 	if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
-	    rule_exists(ops, frh, tb, rule)) {
+	    rule_find(ops, frh, tb, rule, user_priority)) {
 		err = -EEXIST;
 		goto errout_free;
 	}
@@ -637,6 +654,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto errout_free;
 
 	list_for_each_entry(r, &ops->rules_list, list) {
+		if (r->pref == rule->target) {
+			RCU_INIT_POINTER(rule->ctarget, r);
+			break;
+		}
+	}
+
+	if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
+		unresolved = 1;
+
+	list_for_each_entry(r, &ops->rules_list, list) {
 		if (r->pref > rule->pref)
 			break;
 		last = r;
@@ -690,13 +717,11 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_rule_hdr *frh = nlmsg_data(nlh);
-	struct fib_rule_port_range sprange = {0, 0};
-	struct fib_rule_port_range dprange = {0, 0};
 	struct fib_rules_ops *ops = NULL;
-	struct fib_rule *rule, *r;
+	struct fib_rule *rule = NULL, *r, *nlrule = NULL;
 	struct nlattr *tb[FRA_MAX+1];
-	struct fib_kuid_range range;
 	int err = -EINVAL;
+	bool user_priority = false;
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
 		goto errout;
@@ -711,150 +736,73 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto errout;
 
-	err = validate_rulemsg(frh, tb, ops);
-	if (err < 0)
+	err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
+	if (err)
 		goto errout;
 
-	if (tb[FRA_UID_RANGE]) {
-		range = nla_get_kuid_range(tb);
-		if (!uid_range_set(&range)) {
-			err = -EINVAL;
-			goto errout;
-		}
-	} else {
-		range = fib_kuid_range_unset;
+	rule = rule_find(ops, frh, tb, nlrule, user_priority);
+	if (!rule) {
+		err = -ENOENT;
+		goto errout;
 	}
 
-	if (tb[FRA_SPORT_RANGE]) {
-		err = nla_get_port_range(tb[FRA_SPORT_RANGE],
-					 &sprange);
-		if (err)
-			goto errout;
+	if (rule->flags & FIB_RULE_PERMANENT) {
+		err = -EPERM;
+		goto errout;
 	}
 
-	if (tb[FRA_DPORT_RANGE]) {
-		err = nla_get_port_range(tb[FRA_DPORT_RANGE],
-					 &dprange);
+	if (ops->delete) {
+		err = ops->delete(rule);
 		if (err)
 			goto errout;
 	}
 
-	list_for_each_entry(rule, &ops->rules_list, list) {
-		if (tb[FRA_PROTOCOL] &&
-		    (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
-			continue;
-
-		if (frh->action && (frh->action != rule->action))
-			continue;
-
-		if (frh_get_table(frh, tb) &&
-		    (frh_get_table(frh, tb) != rule->table))
-			continue;
-
-		if (tb[FRA_PRIORITY] &&
-		    (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
-			continue;
-
-		if (tb[FRA_IIFNAME] &&
-		    nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
-			continue;
-
-		if (tb[FRA_OIFNAME] &&
-		    nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
-			continue;
-
-		if (tb[FRA_FWMARK] &&
-		    (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
-			continue;
-
-		if (tb[FRA_FWMASK] &&
-		    (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
-			continue;
-
-		if (tb[FRA_TUN_ID] &&
-		    (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID])))
-			continue;
-
-		if (tb[FRA_L3MDEV] &&
-		    (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV])))
-			continue;
-
-		if (uid_range_set(&range) &&
-		    (!uid_eq(rule->uid_range.start, range.start) ||
-		     !uid_eq(rule->uid_range.end, range.end)))
-			continue;
-
-		if (tb[FRA_IP_PROTO] &&
-		    (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO])))
-			continue;
-
-		if (fib_rule_port_range_set(&sprange) &&
-		    !fib_rule_port_range_compare(&rule->sport_range, &sprange))
-			continue;
-
-		if (fib_rule_port_range_set(&dprange) &&
-		    !fib_rule_port_range_compare(&rule->dport_range, &dprange))
-			continue;
-
-		if (!ops->compare(rule, frh, tb))
-			continue;
-
-		if (rule->flags & FIB_RULE_PERMANENT) {
-			err = -EPERM;
-			goto errout;
-		}
-
-		if (ops->delete) {
-			err = ops->delete(rule);
-			if (err)
-				goto errout;
-		}
-
-		if (rule->tun_id)
-			ip_tunnel_unneed_metadata();
+	if (rule->tun_id)
+		ip_tunnel_unneed_metadata();
 
-		list_del_rcu(&rule->list);
+	list_del_rcu(&rule->list);
 
-		if (rule->action == FR_ACT_GOTO) {
-			ops->nr_goto_rules--;
-			if (rtnl_dereference(rule->ctarget) == NULL)
-				ops->unresolved_rules--;
-		}
+	if (rule->action == FR_ACT_GOTO) {
+		ops->nr_goto_rules--;
+		if (rtnl_dereference(rule->ctarget) == NULL)
+			ops->unresolved_rules--;
+	}
 
-		/*
-		 * Check if this rule is a target to any of them. If so,
-		 * adjust to the next one with the same preference or
-		 * disable them. As this operation is eventually very
-		 * expensive, it is only performed if goto rules, except
-		 * current if it is goto rule, have actually been added.
-		 */
-		if (ops->nr_goto_rules > 0) {
-			struct fib_rule *n;
-
-			n = list_next_entry(rule, list);
-			if (&n->list == &ops->rules_list || n->pref != rule->pref)
-				n = NULL;
-			list_for_each_entry(r, &ops->rules_list, list) {
-				if (rtnl_dereference(r->ctarget) != rule)
-					continue;
-				rcu_assign_pointer(r->ctarget, n);
-				if (!n)
-					ops->unresolved_rules++;
-			}
+	/*
+	 * Check if this rule is a target to any of them. If so,
+	 * adjust to the next one with the same preference or
+	 * disable them. As this operation is eventually very
+	 * expensive, it is only performed if goto rules, except
+	 * current if it is goto rule, have actually been added.
+	 */
+	if (ops->nr_goto_rules > 0) {
+		struct fib_rule *n;
+
+		n = list_next_entry(rule, list);
+		if (&n->list == &ops->rules_list || n->pref != rule->pref)
+			n = NULL;
+		list_for_each_entry(r, &ops->rules_list, list) {
+			if (rtnl_dereference(r->ctarget) != rule)
+				continue;
+			rcu_assign_pointer(r->ctarget, n);
+			if (!n)
+				ops->unresolved_rules++;
 		}
-
-		call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
-					NULL);
-		notify_rule_change(RTM_DELRULE, rule, ops, nlh,
-				   NETLINK_CB(skb).portid);
-		fib_rule_put(rule);
-		flush_route_cache(ops);
-		rules_ops_put(ops);
-		return 0;
 	}
 
-	err = -ENOENT;
+	call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
+				NULL);
+	notify_rule_change(RTM_DELRULE, rule, ops, nlh,
+			   NETLINK_CB(skb).portid);
+	fib_rule_put(rule);
+	flush_route_cache(ops);
+	rules_ops_put(ops);
+	kfree(nlrule);
+	return 0;
+
 errout:
+	if (nlrule)
+		kfree(nlrule);
 	rules_ops_put(ops);
 	return err;
 }
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next 3/5] ipv4: support sport, dport and ip protocol in RTM_GETROUTE
From: Roopa Prabhu @ 2018-04-16 20:41 UTC (permalink / raw)
  To: davem; +Cc: netdev, dsa
In-Reply-To: <1523911298-8965-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This is a followup to fib rules sport, dport and ip proto
match support. Having them supported in getroute
makes it easier to test fib rule lookups. Used by fib rule
self tests.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 include/uapi/linux/rtnetlink.h |  3 ++
 net/ipv4/route.c               | 75 +++++++++++++++++++++++++++++++++---------
 2 files changed, 63 insertions(+), 15 deletions(-)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 9b15005..7947252 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -327,6 +327,9 @@ enum rtattr_type_t {
 	RTA_PAD,
 	RTA_UID,
 	RTA_TTL_PROPAGATE,
+	RTA_SPORT,
+	RTA_DPORT,
+	RTA_IP_PROTO,
 	__RTA_MAX
 };
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ccb25d8..ae55711 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2663,6 +2663,18 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
 		goto nla_put_failure;
 
+	if (fl4->fl4_sport &&
+	    nla_put_u16(skb, RTA_SPORT, ntohs(fl4->fl4_sport)))
+		goto nla_put_failure;
+
+	if (fl4->fl4_dport &&
+	    nla_put_u16(skb, RTA_DPORT, ntohs(fl4->fl4_dport)))
+		goto nla_put_failure;
+
+	if (fl4->flowi4_proto &&
+	    nla_put_u8(skb, RTA_IP_PROTO, fl4->flowi4_proto))
+		goto nla_put_failure;
+
 	error = rt->dst.error;
 
 	if (rt_is_input_route(rt)) {
@@ -2695,35 +2707,48 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 	return -EMSGSIZE;
 }
 
+static int nla_get_port(struct nlattr *attr, __be16 *port)
+{
+	int p = nla_get_u16(attr);
+
+	if (p <= 0 || p >= 0xffff)
+		return -EINVAL;
+
+	*port = htons(p);
+	return 0;
+}
+
 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(in_skb->sk);
-	struct rtmsg *rtm;
 	struct nlattr *tb[RTA_MAX+1];
+	u32 table_id = RT_TABLE_MAIN;
+	__be16 sport = 0, dport = 0;
 	struct fib_result res = {};
 	struct rtable *rt = NULL;
+	struct sk_buff *skb;
+	struct rtmsg *rtm;
 	struct flowi4 fl4;
+	u8 ip_proto = 0;
 	__be32 dst = 0;
 	__be32 src = 0;
+	kuid_t uid;
 	u32 iif;
 	int err;
 	int mark;
-	struct sk_buff *skb;
-	u32 table_id = RT_TABLE_MAIN;
-	kuid_t uid;
 
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
 			  extack);
 	if (err < 0)
-		goto errout;
+		return err;
 
 	rtm = nlmsg_data(nlh);
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
 	if (!skb) {
 		err = -ENOBUFS;
-		goto errout;
+		return err;
 	}
 
 	/* Reserve room for dummy headers, this skb can pass
@@ -2740,6 +2765,20 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
 	else
 		uid = (iif ? INVALID_UID : current_uid());
+	if (tb[RTA_SPORT]) {
+		err = nla_get_port(tb[RTA_SPORT], &sport);
+		if (err)
+			goto errout_free;
+	}
+
+	if (tb[RTA_DPORT]) {
+		err = nla_get_port(tb[RTA_DPORT], &dport);
+		if (err)
+			goto errout_free;
+	}
+
+	if (tb[RTA_IP_PROTO])
+		ip_proto = nla_get_u8(tb[RTA_IP_PROTO]);
 
 	/* Bugfix: need to give ip_route_input enough of an IP header to
 	 * not gag.
@@ -2757,6 +2796,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
 	fl4.flowi4_mark = mark;
 	fl4.flowi4_uid = uid;
+	if (sport)
+		fl4.fl4_sport = sport;
+	if (dport)
+		fl4.fl4_dport = dport;
+	if (ip_proto)
+		fl4.flowi4_proto = ip_proto;
 
 	rcu_read_lock();
 
@@ -2766,7 +2811,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		dev = dev_get_by_index_rcu(net, iif);
 		if (!dev) {
 			err = -ENODEV;
-			goto errout_free;
+			goto errout_rcu;
 		}
 
 		skb->protocol	= htons(ETH_P_IP);
@@ -2789,7 +2834,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	}
 
 	if (err)
-		goto errout_free;
+		goto errout_rcu;
 
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
@@ -2802,7 +2847,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 			err = fib_props[res.type].error;
 			if (!err)
 				err = -EHOSTUNREACH;
-			goto errout_free;
+			goto errout_rcu;
 		}
 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
@@ -2813,18 +2858,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
 	}
 	if (err < 0)
-		goto errout_free;
+		goto errout_rcu;
 
 	rcu_read_unlock();
 
-	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-errout:
-	return err;
+	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
 
 errout_free:
-	rcu_read_unlock();
 	kfree_skb(skb);
-	goto errout;
+	return err;
+errout_rcu:
+	rcu_read_unlock();
+	goto errout_free;
 }
 
 void ip_rt_multicast_event(struct in_device *in_dev)
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next 2/5] net: fib_rules: add extack support
From: Roopa Prabhu @ 2018-04-16 20:41 UTC (permalink / raw)
  To: davem; +Cc: netdev, dsa
In-Reply-To: <1523911298-8965-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 include/net/fib_rules.h |  3 ++-
 net/core/fib_rules.c    | 55 +++++++++++++++++++++++++++++++++++++------------
 net/decnet/dn_rules.c   |  7 +++++--
 net/ipv4/fib_rules.c    |  7 +++++--
 net/ipv4/ipmr.c         |  3 ++-
 net/ipv6/fib6_rules.c   |  7 +++++--
 net/ipv6/ip6mr.c        |  3 ++-
 7 files changed, 63 insertions(+), 22 deletions(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index e5cfcfc..b473df5 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -75,7 +75,8 @@ struct fib_rules_ops {
 	int			(*configure)(struct fib_rule *,
 					     struct sk_buff *,
 					     struct fib_rule_hdr *,
-					     struct nlattr **);
+					     struct nlattr **,
+					     struct netlink_ext_ack *);
 	int			(*delete)(struct fib_rule *);
 	int			(*compare)(struct fib_rule *,
 					   struct fib_rule_hdr *,
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 6780110..ebd9351 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -469,14 +469,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (frh->src_len)
 		if (!tb[FRA_SRC] ||
 		    frh->src_len > (ops->addr_size * 8) ||
-		    nla_len(tb[FRA_SRC]) != ops->addr_size)
+		    nla_len(tb[FRA_SRC]) != ops->addr_size) {
+			NL_SET_ERR_MSG(extack, "Invalid source address");
 			goto errout;
+	}
 
 	if (frh->dst_len)
 		if (!tb[FRA_DST] ||
 		    frh->dst_len > (ops->addr_size * 8) ||
-		    nla_len(tb[FRA_DST]) != ops->addr_size)
+		    nla_len(tb[FRA_DST]) != ops->addr_size) {
+			NL_SET_ERR_MSG(extack, "Invalid dst address");
 			goto errout;
+	}
 
 	nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
 	if (!nlrule) {
@@ -537,6 +541,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		nlrule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]);
 		if (nlrule->l3mdev != 1)
 #endif
+			NL_SET_ERR_MSG(extack, "Invalid l3mdev");
 			goto errout_free;
 	}
 
@@ -554,31 +559,41 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		nlrule->suppress_ifgroup = -1;
 
 	if (tb[FRA_GOTO]) {
-		if (nlrule->action != FR_ACT_GOTO)
+		if (nlrule->action != FR_ACT_GOTO) {
+			NL_SET_ERR_MSG(extack, "Unexpected goto");
 			goto errout_free;
+		}
 
 		nlrule->target = nla_get_u32(tb[FRA_GOTO]);
 		/* Backward jumps are prohibited to avoid endless loops */
-		if (nlrule->target <= nlrule->pref)
+		if (nlrule->target <= nlrule->pref) {
+			NL_SET_ERR_MSG(extack, "Backward goto not supported");
 			goto errout_free;
+		}
 	} else if (nlrule->action == FR_ACT_GOTO) {
+		NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
 		goto errout_free;
 	}
 
-	if (nlrule->l3mdev && nlrule->table)
+	if (nlrule->l3mdev && nlrule->table) {
+		NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");
 		goto errout_free;
+	}
 
 	if (tb[FRA_UID_RANGE]) {
 		if (current_user_ns() != net->user_ns) {
 			err = -EPERM;
+			NL_SET_ERR_MSG(extack, "No permission to set uid");
 			goto errout_free;
 		}
 
 		nlrule->uid_range = nla_get_kuid_range(tb);
 
 		if (!uid_range_set(&nlrule->uid_range) ||
-		    !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end))
+		    !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) {
+			NL_SET_ERR_MSG(extack, "Invalid uid range");
 			goto errout_free;
+		}
 	} else {
 		nlrule->uid_range = fib_kuid_range_unset;
 	}
@@ -589,15 +604,19 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (tb[FRA_SPORT_RANGE]) {
 		err = nla_get_port_range(tb[FRA_SPORT_RANGE],
 					 &nlrule->sport_range);
-		if (err)
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Invalid sport range");
 			goto errout_free;
+		}
 	}
 
 	if (tb[FRA_DPORT_RANGE]) {
 		err = nla_get_port_range(tb[FRA_DPORT_RANGE],
 					 &nlrule->dport_range);
-		if (err)
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Invalid dport range");
 			goto errout_free;
+		}
 	}
 
 	*rule = nlrule;
@@ -621,18 +640,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err = -EINVAL, unresolved = 0;
 	bool user_priority = false;
 
-	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+		NL_SET_ERR_MSG(extack, "Invalid msg length");
 		goto errout;
+	}
 
 	ops = lookup_rules_ops(net, frh->family);
 	if (!ops) {
 		err = -EAFNOSUPPORT;
+		NL_SET_ERR_MSG(extack, "Rule family not supported");
 		goto errout;
 	}
 
 	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
-	if (err < 0)
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Error parsing msg");
 		goto errout;
+	}
 
 	err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
 	if (err)
@@ -644,7 +668,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto errout_free;
 	}
 
-	err = ops->configure(rule, skb, frh, tb);
+	err = ops->configure(rule, skb, frh, tb, extack);
 	if (err < 0)
 		goto errout_free;
 
@@ -723,18 +747,23 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err = -EINVAL;
 	bool user_priority = false;
 
-	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
+	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+		NL_SET_ERR_MSG(extack, "Invalid msg length");
 		goto errout;
+	}
 
 	ops = lookup_rules_ops(net, frh->family);
 	if (ops == NULL) {
 		err = -EAFNOSUPPORT;
+		NL_SET_ERR_MSG(extack, "Rule family not supported");
 		goto errout;
 	}
 
 	err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
-	if (err < 0)
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Error parsing msg");
 		goto errout;
+	}
 
 	err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
 	if (err)
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index c795c3f..7223669 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -121,13 +121,16 @@ static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 
 static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 				 struct fib_rule_hdr *frh,
-				 struct nlattr **tb)
+				 struct nlattr **tb,
+				 struct netlink_ext_ack *extack)
 {
 	int err = -EINVAL;
 	struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
 
-	if (frh->tos)
+	if (frh->tos) {
+		NL_SET_ERR_MSG(extack, "Invalid tos value");
 		goto  errout;
+	}
 
 	if (rule->table == RT_TABLE_UNSPEC) {
 		if (rule->action == FR_ACT_TO_TBL) {
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 737d11b..f8eb78d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -213,14 +213,17 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
 
 static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 			       struct fib_rule_hdr *frh,
-			       struct nlattr **tb)
+			       struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	int err = -EINVAL;
 	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
 
-	if (frh->tos & ~IPTOS_TOS_MASK)
+	if (frh->tos & ~IPTOS_TOS_MASK) {
+		NL_SET_ERR_MSG(extack, "Invalid tos");
 		goto errout;
+	}
 
 	/* split local/main if they are not already split */
 	err = fib_unmerge(net);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2fb4de3..38e092e 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -201,7 +201,8 @@ static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
 };
 
 static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
-			       struct fib_rule_hdr *frh, struct nlattr **tb)
+			       struct fib_rule_hdr *frh, struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
 {
 	return 0;
 }
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index df113c7..6547fc6 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -245,15 +245,18 @@ static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
 
 static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 			       struct fib_rule_hdr *frh,
-			       struct nlattr **tb)
+			       struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
 {
 	int err = -EINVAL;
 	struct net *net = sock_net(skb->sk);
 	struct fib6_rule *rule6 = (struct fib6_rule *) rule;
 
 	if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
-		if (rule->table == RT6_TABLE_UNSPEC)
+		if (rule->table == RT6_TABLE_UNSPEC) {
+			NL_SET_ERR_MSG(extack, "Invalid table");
 			goto errout;
+		}
 
 		if (fib6_new_table(net, rule->table) == NULL) {
 			err = -ENOBUFS;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 298fd8b..20a419e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -180,7 +180,8 @@ static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
 };
 
 static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
-				struct fib_rule_hdr *frh, struct nlattr **tb)
+				struct fib_rule_hdr *frh, struct nlattr **tb,
+				struct netlink_ext_ack *extack)
 {
 	return 0;
 }
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next 4/5] ipv6: support sport, dport and ip protocol in RTM_GETROUTE
From: Roopa Prabhu @ 2018-04-16 20:41 UTC (permalink / raw)
  To: davem; +Cc: netdev, dsa
In-Reply-To: <1523911298-8965-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This is a followup to fib6 rules sport, dport and ip proto
match support. Having them supported in getroute
makes it easier to test fib6 rule lookups. Used by fib6 rule
self tests.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 net/ipv6/route.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 49b954d..5086a80 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3986,6 +3986,9 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_EXPIRES]		= { .type = NLA_U32 },
 	[RTA_UID]		= { .type = NLA_U32 },
 	[RTA_MARK]		= { .type = NLA_U32 },
+	[RTA_IP_PROTO]		= { .type = NLA_U8 },
+	[RTA_SPORT]		= { .type = NLA_U16 },
+	[RTA_DPORT]		= { .type = NLA_U16 },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4658,6 +4661,17 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 		     NLM_F_MULTI);
 }
 
+static int nla_get_port(struct nlattr *attr, __be16 *port)
+{
+	int p = nla_get_u16(attr);
+
+	if (p <= 0 || p >= 0xffff)
+		return -EINVAL;
+
+	*port = htons(p);
+	return 0;
+}
+
 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 			      struct netlink_ext_ack *extack)
 {
@@ -4711,6 +4725,21 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	else
 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
 
+	if (tb[RTA_SPORT]) {
+		err = nla_get_port(tb[RTA_SPORT], &fl6.fl6_sport);
+		if (err)
+			goto errout;
+	}
+
+	if (tb[RTA_DPORT]) {
+		err = nla_get_port(tb[RTA_DPORT], &fl6.fl6_dport);
+		if (err)
+			goto errout;
+	}
+
+	if (tb[RTA_IP_PROTO])
+		fl6.flowi6_proto = nla_get_u8(tb[RTA_IP_PROTO]);
+
 	if (iif) {
 		struct net_device *dev;
 		int flags = 0;
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next 5/5] selftests: net: initial fib rule tests
From: Roopa Prabhu @ 2018-04-16 20:41 UTC (permalink / raw)
  To: davem; +Cc: netdev, dsa
In-Reply-To: <1523911298-8965-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This adds a first set of tests for fib rule match/action for
ipv4 and ipv6. Initial tests only cover action table.
can be extended to cover other actions in the future.
Uses ip route get to validate the rule lookup.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 tools/testing/selftests/net/Makefile          |   2 +-
 tools/testing/selftests/net/fib_rule_tests.sh | 208 ++++++++++++++++++++++++++
 2 files changed, 209 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/net/fib_rule_tests.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 785fc18..f02ab70 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -5,7 +5,7 @@ CFLAGS =  -Wall -Wl,--no-as-needed -O2 -g
 CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
-TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh
+TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh fib_rule_tests.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
 TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh
new file mode 100755
index 0000000..b28fbc1
--- /dev/null
+++ b/tools/testing/selftests/net/fib_rule_tests.sh
@@ -0,0 +1,208 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking IPv4 and IPv6 FIB rules API
+
+ret=0
+
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+
+RTABLE=100
+GW_IP4=192.51.100.2
+SRC_IP=192.51.100.3
+GW_IP6=2001:db8:1::2
+SRC_IP6=2001:db8:1::3
+
+DEV_ADDR=192.51.100.1
+DEV=dummy0
+
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+
+	if [ ${rc} -eq ${expected} ]; then
+		printf "        %-60s  [ OK ]\n" "${msg}"
+	else
+		ret=1
+		printf "        %-60s  [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+		echo
+			echo "hit enter to continue, 'q' to quit"
+			read a
+			[ "$a" = "q" ] && exit 1
+		fi
+	fi
+}
+
+setup()
+{
+	set -e
+	ip netns add testns
+	ip -netns testns link set dev lo up
+
+	ip -netns testns link add dummy0 type dummy
+	ip -netns testns link set dev dummy0 up
+	ip -netns testns address add 198.51.100.1/24 dev dummy0
+	ip -netns testns -6 address add 2001:db8:1::1/64 dev dummy0
+
+	set +e
+}
+
+cleanup()
+{
+	ip -netns testns link del dev dummy0 &> /dev/null
+	ip netns del testns
+}
+
+fib_check_iproute_support()
+{
+    ip rule help 2>&1 | grep -q $1
+    if [ $? -ne 0 ]; then
+	    echo "SKIP: iproute2 get too old, missing $1 match"
+        return 1
+    fi
+
+    ip route get help 2>&1 | grep -q $2
+    if [ $? -ne 0 ]; then
+	    echo "SKIP: iproute2 get too old, missing $2 match"
+        return 1
+    fi
+
+    return 0
+}
+
+fib_rule6_del()
+{
+    ip -netns testns -6 rule del $1
+	log_test $? 0 "rule6 del $1"
+}
+
+fib_rule6_del_by_pref()
+{
+	pref=$(ip -netns testns -6 rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+	ip -netns testns -6 rule del pref $pref
+}
+
+fib_rule6_test_match_n_redirect()
+{
+    match="$1"
+    getmatch="$2"
+
+	ip -netns testns -6 rule add $match table $RTABLE
+	ip -netns testns -6 route get $GW_IP6 $getmatch | grep -q "table $RTABLE"
+	log_test $? 0 "rule6 check: $1"
+
+	fib_rule6_del_by_pref "$match"
+	log_test $? 0 "rule6 del by pref: $match"
+}
+
+fib_rule6_test()
+{
+	# setup the pbr redirect route
+	ip -netns testns -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink
+	# test oif match
+	match="oif $DEV"
+	fib_rule6_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+	match="from $SRC_IP6 iif $DEV"
+	fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+    match="tos 0x10"
+    fib_rule6_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+    match="fwmark 0x64"
+    getmatch="mark 0x64"
+    fib_rule6_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+    fib_check_iproute_support "uidrange" "uid"
+    if [ $? -eq 0 ]; then
+        match="uidrange 100-100"
+        getmatch="uid 100"
+        fib_rule6_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+    fi
+
+    fib_check_iproute_support "sport" "sport"
+    if [ $? -eq 0 ]; then
+        match="sport 666 dport 777"
+        fib_rule6_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+    fi
+}
+
+fib_rule4_del()
+{
+    ip -netns testns rule del $1
+	log_test $? 0 "del $1"
+}
+
+fib_rule4_del_by_pref()
+{
+	pref=$(ip -netns testns rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+	ip -netns testns rule del pref $pref
+}
+
+fib_rule4_test_match_n_redirect()
+{
+	match="$1"
+    getmatch="$2"
+
+	ip -netns testns rule add $match table $RTABLE
+	ip -netns testns route get $GW_IP4 $getmatch | grep -q "table $RTABLE"
+	log_test $? 0 "rule4 check: $1"
+
+	fib_rule4_del_by_pref "$match"
+	log_test $? 0 "rule4 del by pref: $match"
+}
+
+fib_rule4_test()
+{
+	# setup the pbr redirect route
+	ip -netns testns route add table $RTABLE default via $GW_IP4 dev $DEV onlink
+
+	# test oif match
+	match="oif $DEV"
+	fib_rule4_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+	match="from $SRC_IP iif $DEV"
+	fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+    match="tos 0x10"
+    fib_rule4_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+    match="fwmark 0x64"
+    getmatch="mark 0x64"
+    fib_rule4_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+    fib_check_iproute_support "uidrange" "uid"
+    if [ $? -eq 0 ]; then
+        match="uidrange 100-100"
+        getmatch="uid 100"
+        fib_rule4_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+    fi
+
+    fib_check_iproute_support "sport" "sport"
+    if [ $? -eq 0 ]; then
+        match="sport 666 dport 777"
+        fib_rule4_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+    fi
+}
+
+if [ "$(id -u)" -ne 0 ];then
+	echo "SKIP: Need root privileges"
+	exit 0
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+	echo "SKIP: Could not run test without ip tool"
+	exit 0
+fi
+
+# start clean
+cleanup &> /dev/null
+setup
+fib_rule4_test
+fib_rule6_test
+cleanup
+
+exit $ret
-- 
2.1.4

^ permalink raw reply related

* Re: tcp hang when socket fills up ?
From: Marcelo Ricardo Leitner @ 2018-04-16 20:43 UTC (permalink / raw)
  To: Dominique Martinet; +Cc: Eric Dumazet, Michal Kubecek, netdev, Florian Westphal
In-Reply-To: <20180414015515.GA24798@nautica>

On Sat, Apr 14, 2018 at 03:55:15AM +0200, Dominique Martinet wrote:
> 16:49:26.724457 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 25134, win 646, options [nop,nop,TS val 1617129467 ecr 1313937599], length 0
> 16:49:26.726081 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 26508, win 669, options [nop,nop,TS val 1617129471 ecr 1313937599], length 0
> 16:49:26.726391 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 27882, win 691, options [nop,nop,TS val 1617129471 ecr 1313937601], length 0
> 16:49:26.726962 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 29256, win 714, options [nop,nop,TS val 1617129472 ecr 1313937601], length 0
> 16:49:26.727614 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 30630, win 737, options [nop,nop,TS val 1617129473 ecr 1313937602], length 0
> 16:49:26.728084 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 32004, win 759, options [nop,nop,TS val 1617129473 ecr 1313937602], length 0

[A] (packet above)

> 16:49:26.728507 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 59484:60858, ack 4190, win 307, options [nop,nop,TS val 1313937635 ecr 1617129473], length 1374
> 16:49:26.728511 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 60858:62232, ack 4190, win 307, options [nop,nop,TS val 1313937635 ecr 1617129473], length 1374
> 16:49:26.729531 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 33378, win 782, options [nop,nop,TS val 1617129475 ecr 1313937607], length 0
> 16:49:26.730002 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 34752, win 805, options [nop,nop,TS val 1617129475 ecr 1313937607], length 0
> 16:49:26.730340 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 62232:63606, ack 4190, win 307, options [nop,nop,TS val 1313937636 ecr 1617129473], length 1374
> 16:49:26.730344 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 63606:64980, ack 4190, win 307, options [nop,nop,TS val 1313937636 ecr 1617129473], length 1374
> 16:49:26.731398 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 64980:66354, ack 4190, win 307, options [nop,nop,TS val 1313937638 ecr 1617129473], length 1374
> 16:49:26.731402 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 66354:67728, ack 4190, win 307, options [nop,nop,TS val 1313937638 ecr 1617129473], length 1374
> 16:49:26.731634 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 36126, win 827, options [nop,nop,TS val 1617129476 ecr 1313937607], length 0
> 16:49:26.732955 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 67728:69102, ack 4190, win 307, options [nop,nop,TS val 1313937639 ecr 1617129473], length 1374
> 16:49:26.732963 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 69102:70476, ack 4190, win 307, options [nop,nop,TS val 1313937639 ecr 1617129473], length 1374
> 16:49:26.733956 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 37500, win 850, options [nop,nop,TS val 1617129476 ecr 1313937607], length 0
> 16:49:26.734242 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 38874, win 872, options [nop,nop,TS val 1617129477 ecr 1313937608], length 0
> 16:49:26.734653 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 40248, win 895, options [nop,nop,TS val 1617129478 ecr 1313937608], length 0
> 16:49:26.735042 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 70476:71850, ack 4190, win 307, options [nop,nop,TS val 1313937641 ecr 1617129473], length 1374
> 16:49:26.735046 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 71850:73224, ack 4190, win 307, options [nop,nop,TS val 1313937641 ecr 1617129473], length 1374
> 16:49:26.735334 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 41622, win 918, options [nop,nop,TS val 1617129478 ecr 1313937609], length 0
> 16:49:26.736005 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 42996, win 940, options [nop,nop,TS val 1617129478 ecr 1313937609], length 0
> 16:49:26.736402 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 73224:74598, ack 4190, win 307, options [nop,nop,TS val 1313937643 ecr 1617129473], length 1374
> 16:49:26.736408 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 74598:75972, ack 4190, win 307, options [nop,nop,TS val 1313937643 ecr 1617129473], length 1374
> 16:49:26.738561 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 44370, win 963, options [nop,nop,TS val 1617129482 ecr 1313937616], length 0
> 16:49:26.739539 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 45744, win 986, options [nop,nop,TS val 1617129482 ecr 1313937616], length 0
> 16:49:26.739882 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 47118, win 1008, options [nop,nop,TS val 1617129484 ecr 1313937617], length 0
> 16:49:26.740255 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 48492, win 1031, options [nop,nop,TS val 1617129484 ecr 1313937617], length 0
> 16:49:26.746756 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 49866, win 1053, options [nop,nop,TS val 1617129493 ecr 1313937627], length 0
> 16:49:26.747923 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 51240, win 1076, options [nop,nop,TS val 1617129494 ecr 1313937627], length 0
> 16:49:26.749083 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 52614, win 1099, options [nop,nop,TS val 1617129495 ecr 1313937629], length 0
> 16:49:26.750171 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 53988, win 1121, options [nop,nop,TS val 1617129496 ecr 1313937629], length 0
> 16:49:26.750808 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 55362, win 1144, options [nop,nop,TS val 1617129497 ecr 1313937629], length 0
> 16:49:26.754648 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 56736, win 1167, options [nop,nop,TS val 1617129500 ecr 1313937629], length 0
> 16:49:26.755985 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 58110, win 1189, options [nop,nop,TS val 1617129501 ecr 1313937630], length 0
> 16:49:26.758513 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 59484, win 1212, options [nop,nop,TS val 1617129502 ecr 1313937630], length 0
> 16:49:26.759096 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 60858, win 1234, options [nop,nop,TS val 1617129503 ecr 1313937635], length 0
> 16:49:26.759421 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 62232, win 1257, options [nop,nop,TS val 1617129503 ecr 1313937635], length 0
> 16:49:26.759755 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 63606, win 1280, options [nop,nop,TS val 1617129504 ecr 1313937636], length 0
> 16:49:26.760653 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 64980, win 1302, options [nop,nop,TS val 1617129505 ecr 1313937636], length 0
> 16:49:26.761453 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 66354, win 1325, options [nop,nop,TS val 1617129506 ecr 1313937638], length 0
> 16:49:26.762199 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 67728, win 1348, options [nop,nop,TS val 1617129507 ecr 1313937638], length 0
> 16:49:26.763547 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 67728, win 1348, options [nop,nop,TS val 1617129507 ecr 1313937638], length 36
> 16:49:26.763553 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 70476, win 1393, options [nop,nop,TS val 1617129508 ecr 1313937639], length 0
> 16:49:26.764298 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 73224, win 1438, options [nop,nop,TS val 1617129509 ecr 1313937641], length 0
> 16:49:26.764676 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 75972, win 1444, options [nop,nop,TS val 1617129510 ecr 1313937643], length 0
> 16:49:26.807754 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 75972:77346, ack 4190, win 307, options [nop,nop,TS val 1313937714 ecr 1617129473], length 1374
> 16:49:26.876467 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617129620 ecr 1313937714], length 0
> 16:49:27.048760 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 32004:33378, ack 4190, win 307, options [nop,nop,TS val 1313937955 ecr 1617129473], length 1374

Probably bogus retrans for conntrack, but OUTPUT is not filtered.

> 16:49:27.051791 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617129762 ecr 1313937714], length 36
> 16:49:27.076444 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617129822 ecr 1313937714,nop,nop,sack 1 {32004:33378}], length 0

I guess the issue lies with the dup-sack here.

AFAIR Disabling timestamps also disables SACK, which would avoid the
issue as conntrack will not see these as they won't happen anymore.

Then it gets locked into a loopy behavior:

> 16:49:27.371182 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617130018 ecr 1313937714], length 36

> 16:49:27.519862 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 32004:33378, ack 4190, win 307, options [nop,nop,TS val 1313938426 ecr 1617129473], length 1374
> 16:49:27.547662 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617130293 ecr 1313937714,nop,nop,sack 1 {32004:33378}], length 0
> 16:49:27.883372 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617130530 ecr 1313937714], length 36

Cycle 1

> 16:49:28.511861 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 32004:33378, ack 4190, win 307, options [nop,nop,TS val 1313939418 ecr 1617129473], length 1374
> 16:49:28.538891 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617131285 ecr 1313937714,nop,nop,sack 1 {32004:33378}], length 0
> 16:49:28.907197 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617131554 ecr 1313937714], length 36

Cycle 2

> 16:49:30.431864 IP <server local ip>.13317 > <client public ip>.31872: Flags [.], seq 32004:33378, ack 4190, win 307, options [nop,nop,TS val 1313941338 ecr 1617129473], length 1374
> 16:49:30.459127 IP <client public ip>.31872 > <server local ip>.13317: Flags [.], ack 77346, win 1444, options [nop,nop,TS val 1617133204 ecr 1313937714,nop,nop,sack 1 {32004:33378}], length 0
> 16:49:30.955388 IP <client public ip>.31872 > <server local ip>.13317: Flags [P.], seq 4190:4226, ack 77346, win 1444, options [nop,nop,TS val 1617133602 ecr 1313937714], length 36

Cycle 3...

Based on server RTO.

But I can't tell why packet [A] and subsequent packets from client
never made to the server's socket.

^ permalink raw reply

* Re: [RFC net-next 1/2] net: net-porcfs: Reduce rcu lock critical section
From: Saeed Mahameed @ 2018-04-16 20:50 UTC (permalink / raw)
  To: eric.dumazet@gmail.com, davem@davemloft.net; +Cc: netdev@vger.kernel.org
In-Reply-To: <1523560344.3587.40.camel@mellanox.com>

On Thu, 2018-04-12 at 19:12 +0000, Saeed Mahameed wrote:
> On Wed, 2018-04-11 at 19:59 -0700, Eric Dumazet wrote:
> > 
> > On 04/11/2018 04:47 PM, Saeed Mahameed wrote:
> > > 
> > > Well if we allow devices to access HW counters via FW command
> > > interfaces in ndo_get_stats and by testing mlx5 where we query up
> > > to 5
> > > hw registers, it could take 100us, still this is way smaller than
> > > 10sec
> > >  :) and it is really a nice rate to fetch HW stats on demand.
> > 
> > If hardware stats are slower than software ones, maybe it is time
> > to
> > use software stats,
> > instead of changing the whole stack ?
> > 
> 
> We already have SW stats for [rx/tx]_[packets/bytes] but for
> [rx/tx]_[error/drop] etc .. they can only be grabbed from HW.
> 
> We don't want to report only partial counters to get_stats ndo just
> to
> avoid sleeping.
> 
> > There are very few devices drivers having issues like that.
> > 

Dave, Eric, I would like to know whether it is ok to have this change
in the kernel (make get_stats ndo "might sleep")? I really didn't get
any convincing feedback to not do this change.

For smart nics where a lot of statistics computations and
driver/Hardware communication go through Firmware, there is not much we
can do in the driver in order to provide accurate and "fresh"
statistics other than talking directly to Firmware through command
interfaces that "might sleep". Currently this is done through a
background thread caching hardware statistics 5 times a second which is
wasteful.

In order to complete this work I need to take care of netlink, ovs and
bonding (similar to what i did in this patch).

Plus I might need to introduce a new dev hold/put API that guarantees
the netdev is not uninitialized while it is being briefly held, this is
needed since the current device chain locks guarantees that by design.
(Should be a relatively small change).

Go/No Go?

Thanks,
Saeed.



^ permalink raw reply

* [PATCH net-next] selftest: tc_flower: add testcase for 'ip_flags'
From: Davide Caratti @ 2018-04-16 20:59 UTC (permalink / raw)
  To: Jiri Pirko, David Ahern, David S. Miller; +Cc: netdev, Pieter Jansen van Vuuren

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 .../testing/selftests/net/forwarding/tc_flower.sh  | 70 ++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/tc_flower.sh b/tools/testing/selftests/net/forwarding/tc_flower.sh
index 032b882adfc0..0c54059f1875 100755
--- a/tools/testing/selftests/net/forwarding/tc_flower.sh
+++ b/tools/testing/selftests/net/forwarding/tc_flower.sh
@@ -149,6 +149,74 @@ match_src_ip_test()
 	log_test "src_ip match ($tcflags)"
 }
 
+match_ip_flags_test()
+{
+	RET=0
+
+	tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+		$tcflags ip_flags frag action continue
+	tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+		$tcflags ip_flags firstfrag action continue
+	tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+		$tcflags ip_flags nofirstfrag action continue
+	tc filter add dev $h2 ingress protocol ip pref 4 handle 104 flower \
+		$tcflags ip_flags nofrag action drop
+
+	$MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip "frag=0" -q
+
+	tc_check_packets "dev $h2 ingress" 101 1
+	check_fail $? "Matched on wrong frag filter (nofrag)"
+
+	tc_check_packets "dev $h2 ingress" 102 1
+	check_fail $? "Matched on wrong firstfrag filter (nofrag)"
+
+	tc_check_packets "dev $h2 ingress" 103 1
+	check_err $? "Did not match on nofirstfrag filter (nofrag) "
+
+	tc_check_packets "dev $h2 ingress" 104 1
+	check_err $? "Did not match on nofrag filter (nofrag)"
+
+	$MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip "frag=0,mf" -q
+
+	tc_check_packets "dev $h2 ingress" 101 1
+	check_err $? "Did not match on frag filter (1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 102 1
+	check_err $? "Did not match fistfrag filter (1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 103 1
+	check_err $? "Matched on wrong nofirstfrag filter (1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 104 1
+	check_err $? "Match on wrong nofrag filter (1stfrag)"
+
+	$MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip "frag=256,mf" -q
+	$MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip "frag=256" -q
+
+	tc_check_packets "dev $h2 ingress" 101 3
+	check_err $? "Did not match on frag filter (no1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 102 1
+	check_err $? "Matched on wrong firstfrag filter (no1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 103 3
+	check_err $? "Did not match on nofirstfrag filter (no1stfrag)"
+
+	tc_check_packets "dev $h2 ingress" 104 1
+	check_err $? "Matched on nofrag filter (no1stfrag)"
+
+	tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+	tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+	tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+	tc filter del dev $h2 ingress protocol ip pref 4 handle 104 flower
+
+	log_test "ip_flags match ($tcflags)"
+}
+
 setup_prepare()
 {
 	h1=${NETIFS[p1]}
@@ -181,6 +249,7 @@ match_dst_mac_test
 match_src_mac_test
 match_dst_ip_test
 match_src_ip_test
+match_ip_flags_test
 
 tc_offload_check
 if [[ $? -ne 0 ]]; then
@@ -191,6 +260,7 @@ else
 	match_src_mac_test
 	match_dst_ip_test
 	match_src_ip_test
+	match_ip_flags_test
 fi
 
 exit $EXIT_STATUS
-- 
2.14.3

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox