Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next v3 4/9] bpf: Introduce mem, size argument pair support for kfunc
From: Kumar Kartikeya Dwivedi @ 2021-12-10 13:02 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	Maxim Mikityanskiy, Florian Westphal, Jesper Dangaard Brouer,
	Toke Høiland-Jørgensen, netdev, netfilter-devel
In-Reply-To: <20211210130230.4128676-1-memxor@gmail.com>

BPF helpers can associate two adjacent arguments together to pass memory
of certain size, using ARG_PTR_TO_MEM and ARG_CONST_SIZE arguments.
Since we don't use bpf_func_proto for kfunc, we need to leverage BTF to
implement similar support.

The ARG_CONST_SIZE processing for helpers is refactored into a common
check_mem_size_reg helper that is shared with kfunc as well. kfunc
ptr_to_mem support follows logic similar to global functions, where
verification is done as if pointer is not null, even when it may be
null.

This leads to a simple to follow rule for writing kfunc: always check
the argument pointer for NULL, except when it is PTR_TO_CTX.

Currently, we require the size argument to be prefixed with "len__" in
the parameter name. This information is then recorded in kernel BTF and
verified during function argument checking. In the future we can use BTF
tagging instead, and modify the kernel function definitions. This will
be a purely kernel-side change.

This allows us to have some form of backwards compatibility for
structures that are passed in to the kernel function with their size,
and allow variable length structures to be passed in if they are
accompanied by a size parameter.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 include/linux/bpf_verifier.h |   2 +
 kernel/bpf/btf.c             |  41 +++++++++++-
 kernel/bpf/verifier.c        | 124 ++++++++++++++++++++++-------------
 3 files changed, 119 insertions(+), 48 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 182b16a91084..b80fe5bf2a02 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -507,6 +507,8 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt);
 
 int check_ctx_reg(struct bpf_verifier_env *env,
 		  const struct bpf_reg_state *reg, int regno);
+int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+			     u32 regno);
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 		   u32 regno, u32 mem_size);
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 63b22ff73550..df9a3f77fc4a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5618,6 +5618,25 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
 	return true;
 }
 
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+				  const struct btf_param *arg,
+				  const struct bpf_reg_state *reg)
+{
+	const struct btf_type *t;
+	const char *param_name;
+
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+		return false;
+
+	/* In the future, this can be ported to use BTF tagging */
+	param_name = btf_name_by_offset(btf, arg->name_off);
+	if (strncmp(param_name, "len__", sizeof("len__") - 1))
+		return false;
+
+	return true;
+}
+
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
@@ -5729,16 +5748,32 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			u32 type_size;
 
 			if (is_kfunc) {
+				bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
+
 				/* Permit pointer to mem, but only when argument
 				 * type is pointer to scalar, or struct composed
 				 * (recursively) of scalars.
+				 * When arg_mem_size is true, the pointer can be
+				 * void *.
 				 */
-				if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
+				if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
+				    (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
 					bpf_log(log,
-						"arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
-						i, btf_type_str(ref_t), ref_tname);
+						"arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+						i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
 					return -EINVAL;
 				}
+
+				/* Check for mem, len pair */
+				if (arg_mem_size) {
+					if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
+						bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n",
+							i, i + 1);
+						return -EINVAL;
+					}
+					i++;
+					continue;
+				}
 			}
 
 			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1126b75fe650..074a78a0efa4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4780,6 +4780,62 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	}
 }
 
+static int check_mem_size_reg(struct bpf_verifier_env *env,
+			      struct bpf_reg_state *reg, u32 regno,
+			      bool zero_size_allowed,
+			      struct bpf_call_arg_meta *meta)
+{
+	int err;
+
+	/* This is used to refine r0 return value bounds for helpers
+	 * that enforce this value as an upper bound on return values.
+	 * See do_refine_retval_range() for helpers that can refine
+	 * the return value. C type of helper is u32 so we pull register
+	 * bound from umax_value however, if negative verifier errors
+	 * out. Only upper bounds can be learned because retval is an
+	 * int type and negative retvals are allowed.
+	 */
+	if (meta)
+		meta->msize_max_value = reg->umax_value;
+
+	/* The register is SCALAR_VALUE; the access check
+	 * happens using its boundaries.
+	 */
+	if (!tnum_is_const(reg->var_off))
+		/* For unprivileged variable accesses, disable raw
+		 * mode so that the program is required to
+		 * initialize all the memory that the helper could
+		 * just partially fill up.
+		 */
+		meta = NULL;
+
+	if (reg->smin_value < 0) {
+		verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+			regno);
+		return -EACCES;
+	}
+
+	if (reg->umin_value == 0) {
+		err = check_helper_mem_access(env, regno - 1, 0,
+					      zero_size_allowed,
+					      meta);
+		if (err)
+			return err;
+	}
+
+	if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+		verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+			regno);
+		return -EACCES;
+	}
+	err = check_helper_mem_access(env, regno - 1,
+				      reg->umax_value,
+				      zero_size_allowed, meta);
+	if (!err)
+		err = mark_chain_precision(env, regno);
+	return err;
+}
+
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 		   u32 regno, u32 mem_size)
 {
@@ -4803,6 +4859,28 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 	return check_helper_mem_access(env, regno, mem_size, true, NULL);
 }
 
+int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+			     u32 regno)
+{
+	struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
+	bool may_be_null = reg_type_may_be_null(mem_reg->type);
+	struct bpf_reg_state saved_reg;
+	int err;
+
+	WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+
+	if (may_be_null) {
+		saved_reg = *mem_reg;
+		mark_ptr_not_null_reg(mem_reg);
+	}
+
+	err = check_mem_size_reg(env, reg, regno, true, NULL);
+
+	if (may_be_null)
+		*mem_reg = saved_reg;
+	return err;
+}
+
 /* Implementation details:
  * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
  * Two bpf_map_lookups (even with the same key) will have different reg->id.
@@ -5316,51 +5394,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
-		/* This is used to refine r0 return value bounds for helpers
-		 * that enforce this value as an upper bound on return values.
-		 * See do_refine_retval_range() for helpers that can refine
-		 * the return value. C type of helper is u32 so we pull register
-		 * bound from umax_value however, if negative verifier errors
-		 * out. Only upper bounds can be learned because retval is an
-		 * int type and negative retvals are allowed.
-		 */
-		meta->msize_max_value = reg->umax_value;
-
-		/* The register is SCALAR_VALUE; the access check
-		 * happens using its boundaries.
-		 */
-		if (!tnum_is_const(reg->var_off))
-			/* For unprivileged variable accesses, disable raw
-			 * mode so that the program is required to
-			 * initialize all the memory that the helper could
-			 * just partially fill up.
-			 */
-			meta = NULL;
-
-		if (reg->smin_value < 0) {
-			verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
-				regno);
-			return -EACCES;
-		}
-
-		if (reg->umin_value == 0) {
-			err = check_helper_mem_access(env, regno - 1, 0,
-						      zero_size_allowed,
-						      meta);
-			if (err)
-				return err;
-		}
-
-		if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
-			verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
-				regno);
-			return -EACCES;
-		}
-		err = check_helper_mem_access(env, regno - 1,
-					      reg->umax_value,
-					      zero_size_allowed, meta);
-		if (!err)
-			err = mark_chain_precision(env, regno);
+		err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
 	} else if (arg_type_is_alloc_size(arg_type)) {
 		if (!tnum_is_const(reg->var_off)) {
 			verbose(env, "R%d is not a known constant'\n",
-- 
2.34.1


^ permalink raw reply related

* [PATCH bpf-next v3 3/9] bpf: Extend kfunc with PTR_TO_CTX, PTR_TO_MEM argument support
From: Kumar Kartikeya Dwivedi @ 2021-12-10 13:02 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	Maxim Mikityanskiy, Florian Westphal, Jesper Dangaard Brouer,
	Toke Høiland-Jørgensen, netdev, netfilter-devel
In-Reply-To: <20211210130230.4128676-1-memxor@gmail.com>

Allow passing PTR_TO_CTX, if the kfunc expects a matching struct type,
and punt to PTR_TO_MEM block if reg->type does not fall in one of
PTR_TO_BTF_ID or PTR_TO_SOCK* types. This will be used by future commits
to get access to XDP and TC PTR_TO_CTX, and pass various data (flags,
tuple, netns_id, etc.) encoded in opts struct as a pointer to the kfunc.

For PTR_TO_MEM support, arguments are currently limited to pointer to
scalar, or pointer to struct composed of scalars. This is done so that
unsafe scenarios (like passing PTR_TO_MEM where PTR_TO_BTF_ID of
in-kernel valid structure is expected, which may have pointers) are
avoided. kfunc argument checking is based on the passed in register type
and limited argument type matching, hence this limitation is imposed. In
the future, support for PTR_TO_MEM for kfunc can be extended to serve
other usecases. struct may have maximum 8 nested structs, all
recursively composed of scalars or struct with scalars.

Future commits will add negative tests that check whether these
restrictions imposed for kfunc arguments are duly rejected by BPF
verifier or not.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 kernel/bpf/btf.c | 96 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 75 insertions(+), 21 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 450f9e37ceca..63b22ff73550 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5575,12 +5575,56 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
 #endif
 };
 
+/* Returns true if struct is composed of scalars, 8 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
+					const struct btf *btf,
+					const struct btf_type *t, int rec)
+{
+	const struct btf_type *member_type;
+	const struct btf_member *member;
+	u16 i;
+
+	if (rec == 8) {
+		bpf_log(log, "max struct nesting depth 8 exceeded\n");
+		return false;
+	}
+
+	if (!btf_type_is_struct(t))
+		return false;
+
+	for_each_member(i, t, member) {
+		const struct btf_array *array;
+
+		member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+
+		if (btf_type_is_struct(member_type)) {
+			if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
+				return false;
+			continue;
+		}
+		if (btf_type_is_array(member_type)) {
+			array = btf_type_array(member_type);
+			/* FAM */
+			if (!array->nelems)
+				return false;
+			member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+			if (!btf_type_is_scalar(member_type))
+				return false;
+			continue;
+		}
+		if (!btf_type_is_scalar(member_type))
+			return false;
+	}
+	return true;
+}
+
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
 				    bool ptr_to_mem_ok)
 {
 	struct bpf_verifier_log *log = &env->log;
+	bool is_kfunc = btf_is_kernel(btf);
 	const char *func_name, *ref_tname;
 	const struct btf_type *t, *ref_t;
 	const struct btf_param *args;
@@ -5633,7 +5677,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 
 		ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-		if (btf_is_kernel(btf)) {
+		if (btf_get_prog_ctx_type(log, btf, t,
+					  env->prog->type, i)) {
+			/* If function expects ctx type in BTF check that caller
+			 * is passing PTR_TO_CTX.
+			 */
+			if (reg->type != PTR_TO_CTX) {
+				bpf_log(log,
+					"arg#%d expected pointer to ctx, but got %s\n",
+					i, btf_type_str(t));
+				return -EINVAL;
+			}
+			if (check_ctx_reg(env, reg, regno))
+				return -EINVAL;
+		} else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) {
 			const struct btf_type *reg_ref_t;
 			const struct btf *reg_btf;
 			const char *reg_ref_tname;
@@ -5649,14 +5706,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			if (reg->type == PTR_TO_BTF_ID) {
 				reg_btf = reg->btf;
 				reg_ref_id = reg->btf_id;
-			} else if (reg2btf_ids[reg->type]) {
+			} else {
 				reg_btf = btf_vmlinux;
 				reg_ref_id = *reg2btf_ids[reg->type];
-			} else {
-				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
-					func_name, i,
-					btf_type_str(ref_t), ref_tname, regno);
-				return -EINVAL;
 			}
 
 			reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
@@ -5672,23 +5724,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 					reg_ref_tname);
 				return -EINVAL;
 			}
-		} else if (btf_get_prog_ctx_type(log, btf, t,
-						 env->prog->type, i)) {
-			/* If function expects ctx type in BTF check that caller
-			 * is passing PTR_TO_CTX.
-			 */
-			if (reg->type != PTR_TO_CTX) {
-				bpf_log(log,
-					"arg#%d expected pointer to ctx, but got %s\n",
-					i, btf_type_str(t));
-				return -EINVAL;
-			}
-			if (check_ctx_reg(env, reg, regno))
-				return -EINVAL;
 		} else if (ptr_to_mem_ok) {
 			const struct btf_type *resolve_ret;
 			u32 type_size;
 
+			if (is_kfunc) {
+				/* Permit pointer to mem, but only when argument
+				 * type is pointer to scalar, or struct composed
+				 * (recursively) of scalars.
+				 */
+				if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
+					bpf_log(log,
+						"arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
+						i, btf_type_str(ref_t), ref_tname);
+					return -EINVAL;
+				}
+			}
+
 			resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
 			if (IS_ERR(resolve_ret)) {
 				bpf_log(log,
@@ -5701,6 +5753,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			if (check_mem_reg(env, reg, regno, type_size))
 				return -EINVAL;
 		} else {
+			bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
+				is_kfunc ? "kernel " : "", func_name, func_id);
 			return -EINVAL;
 		}
 	}
@@ -5750,7 +5804,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
 			      const struct btf *btf, u32 func_id,
 			      struct bpf_reg_state *regs)
 {
-	return btf_check_func_arg_match(env, btf, func_id, regs, false);
+	return btf_check_func_arg_match(env, btf, func_id, regs, true);
 }
 
 /* Convert BTF of a function into bpf_reg_state if possible
-- 
2.34.1


^ permalink raw reply related

* [PATCH bpf-next v3 2/9] bpf: Remove DEFINE_KFUNC_BTF_ID_SET
From: Kumar Kartikeya Dwivedi @ 2021-12-10 13:02 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	Maxim Mikityanskiy, Florian Westphal, Jesper Dangaard Brouer,
	Toke Høiland-Jørgensen, netdev, netfilter-devel
In-Reply-To: <20211210130230.4128676-1-memxor@gmail.com>

The only reason to keep it was to initialize list head, but future
commits will introduce more members that need to be set, which is more
convenient to do using designated initializer.

Hence, remove the macro, convert users, and initialize list head inside
register_kfunc_btf_id_set.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 include/linux/btf.h                                   | 4 ----
 kernel/bpf/btf.c                                      | 1 +
 net/ipv4/tcp_bbr.c                                    | 5 ++++-
 net/ipv4/tcp_cubic.c                                  | 5 ++++-
 net/ipv4/tcp_dctcp.c                                  | 5 ++++-
 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c | 5 ++++-
 6 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index d96e2859382e..3730e845d266 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -361,10 +361,6 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist,
 }
 #endif
 
-#define DEFINE_KFUNC_BTF_ID_SET(set, name)                                     \
-	struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set),     \
-					 THIS_MODULE }
-
 extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list;
 extern struct kfunc_btf_id_list prog_test_kfunc_list;
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c9413d13ca91..450f9e37ceca 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6375,6 +6375,7 @@ struct kfunc_btf_id_list {
 void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
 			       struct kfunc_btf_id_set *s)
 {
+	INIT_LIST_HEAD(&s->list);
 	mutex_lock(&l->mutex);
 	list_add(&s->list, &l->list);
 	mutex_unlock(&l->mutex);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index ec5550089b4d..280dada5d1ae 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1169,7 +1169,10 @@ BTF_ID(func, bbr_set_state)
 #endif
 BTF_SET_END(tcp_bbr_kfunc_ids)
 
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_bbr_kfunc_ids, tcp_bbr_kfunc_btf_set);
+static struct kfunc_btf_id_set tcp_bbr_kfunc_btf_set = {
+	.owner = THIS_MODULE,
+	.set   = &tcp_bbr_kfunc_ids,
+};
 
 static int __init bbr_register(void)
 {
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 5e9d9c51164c..70384a8040c5 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -497,7 +497,10 @@ BTF_ID(func, cubictcp_acked)
 #endif
 BTF_SET_END(tcp_cubic_kfunc_ids)
 
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_cubic_kfunc_ids, tcp_cubic_kfunc_btf_set);
+static struct kfunc_btf_id_set tcp_cubic_kfunc_btf_set = {
+	.owner = THIS_MODULE,
+	.set   = &tcp_cubic_kfunc_ids,
+};
 
 static int __init cubictcp_register(void)
 {
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 0d7ab3cc7b61..ac2a47eb89d8 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -251,7 +251,10 @@ BTF_ID(func, dctcp_state)
 #endif
 BTF_SET_END(tcp_dctcp_kfunc_ids)
 
-static DEFINE_KFUNC_BTF_ID_SET(&tcp_dctcp_kfunc_ids, tcp_dctcp_kfunc_btf_set);
+static struct kfunc_btf_id_set tcp_dctcp_kfunc_btf_set = {
+	.owner = THIS_MODULE,
+	.set   = &tcp_dctcp_kfunc_ids,
+};
 
 static int __init dctcp_register(void)
 {
diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 5d52ea2768df..a437086e1860 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -93,7 +93,10 @@ BTF_SET_START(bpf_testmod_kfunc_ids)
 BTF_ID(func, bpf_testmod_test_mod_kfunc)
 BTF_SET_END(bpf_testmod_kfunc_ids)
 
-static DEFINE_KFUNC_BTF_ID_SET(&bpf_testmod_kfunc_ids, bpf_testmod_kfunc_btf_set);
+static struct kfunc_btf_id_set bpf_testmod_kfunc_btf_set = {
+	.owner = THIS_MODULE,
+	.set   = &bpf_testmod_kfunc_ids,
+};
 
 static int bpf_testmod_init(void)
 {
-- 
2.34.1


^ permalink raw reply related

* [PATCH bpf-next v3 1/9] bpf: Refactor bpf_check_mod_kfunc_call
From: Kumar Kartikeya Dwivedi @ 2021-12-10 13:02 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	Maxim Mikityanskiy, Florian Westphal, Jesper Dangaard Brouer,
	Toke Høiland-Jørgensen, netdev, netfilter-devel
In-Reply-To: <20211210130230.4128676-1-memxor@gmail.com>

Future commits adding more callbacks will implement the same pattern of
matching module owner of kfunc_btf_id_set, and then operating on more
sets inside the struct.

While the btf_id_set for check_kfunc_call wouldn't have been NULL so
far, future commits introduce sets that are optional, hence the common
code also checks whether the pointer is valid.

Note that we must continue search on owner match and btf_id_set_contains
returning false, since more entries may have same owner (which can be
NULL for built-in modules). To clarify this case, a comment is added, so
that future commits don't regress the search.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 include/linux/btf.h | 12 +++++++++++-
 kernel/bpf/btf.c    | 27 +++++++++++++++++++--------
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/include/linux/btf.h b/include/linux/btf.h
index acef6ef28768..d96e2859382e 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -320,9 +320,19 @@ static inline const char *btf_name_by_offset(const struct btf *btf,
 }
 #endif
 
+enum kfunc_btf_id_set_types {
+	BTF_SET_CHECK,
+	__BTF_SET_MAX,
+};
+
 struct kfunc_btf_id_set {
 	struct list_head list;
-	struct btf_id_set *set;
+	union {
+		struct btf_id_set *sets[__BTF_SET_MAX];
+		struct {
+			struct btf_id_set *set;
+		};
+	};
 	struct module *owner;
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 27b7de538697..c9413d13ca91 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6390,22 +6390,33 @@ void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
 }
 EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
 
-bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
-			      struct module *owner)
+/* Caller must hold reference to module 'owner' */
+static bool kfunc_btf_id_set_contains(struct kfunc_btf_id_list *klist,
+				      u32 kfunc_id, struct module *owner,
+				      enum kfunc_btf_id_set_types type)
 {
-	struct kfunc_btf_id_set *s;
+	struct kfunc_btf_id_set *s = NULL;
+	bool ret = false;
 
-	if (!owner)
+	if (type >= __BTF_SET_MAX)
 		return false;
 	mutex_lock(&klist->mutex);
 	list_for_each_entry(s, &klist->list, list) {
-		if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
-			mutex_unlock(&klist->mutex);
-			return true;
+		if (s->owner == owner && s->sets[type] &&
+		    btf_id_set_contains(s->sets[type], kfunc_id)) {
+			ret = true;
+			break;
 		}
+		/* continue search, since multiple sets may have same owner */
 	}
 	mutex_unlock(&klist->mutex);
-	return false;
+	return ret;
+}
+
+bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
+			      struct module *owner)
+{
+	return kfunc_btf_id_set_contains(klist, kfunc_id, owner, BTF_SET_CHECK);
 }
 
 #endif
-- 
2.34.1


^ permalink raw reply related

* [PATCH bpf-next v3 0/9] Introduce unstable CT lookup helpers
From: Kumar Kartikeya Dwivedi @ 2021-12-10 13:02 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	Maxim Mikityanskiy, Florian Westphal, Jesper Dangaard Brouer,
	Toke Høiland-Jørgensen, netdev, netfilter-devel

This series adds unstable conntrack lookup helpers using BPF kfunc support.  The
patch adding the lookup helper is based off of Maxim's recent patch to aid in
rebasing their series on top of this, all adjusted to work with module kfuncs [0].

  [0]: https://lore.kernel.org/bpf/20211019144655.3483197-8-maximmi@nvidia.com

To enable returning a reference to struct nf_conn, the verifier is extended to
support reference tracking for PTR_TO_BTF_ID, and kfunc is extended with support
for working as acquire/release functions, similar to existing BPF helpers. kfunc
returning pointer (limited to PTR_TO_BTF_ID in the kernel) can also return a
PTR_TO_BTF_ID_OR_NULL now, typically needed when acquiring a resource can fail.
kfunc can also receive PTR_TO_CTX and PTR_TO_MEM (with some limitations) as
arguments now. There is also support for passing a mem, len pair as argument
to kfunc now. In such cases, passing pointer to unsized type (void) is also
permitted.

Please see individual commits for details.

Note 1: Patch 1 in this series makes the same change as b12f03104324 ("bpf: Fix
bpf_check_mod_kfunc_call for built-in modules") in bpf tree, so there will be a
conflict if patch 1 is applied against that commit. I incorporated the same diff
change so that testing this set is possible (tests in patch 9 rely on it), but
before applying this, I'll rebase and resend, after bpf tree is merged into
bpf-next.

Note 2: BPF CI needs to add the following to config to test the set. I did
update the selftests config in patch 9, but not sure if that is enough.

	CONFIG_NETFILTER=y
	CONFIG_NF_DEFRAG_IPV4=y
	CONFIG_NF_DEFRAG_IPV6=y
	CONFIG_NF_CONNTRACK=y

Changelog:
----------
v2 -> v3:
v2: https://lore.kernel.org/bpf/20211209170929.3485242-1-memxor@gmail.com

 * Fix build error for !CONFIG_BPF_SYSCALL (Patchwork)

RFC v1 -> v2:
v1: https://lore.kernel.org/bpf/20211030144609.263572-1-memxor@gmail.com

 * Limit PTR_TO_MEM support to pointer to scalar, or struct with scalars (Alexei)
 * Use btf_id_set for checking acquire, release, ret type null (Alexei)
 * Introduce opts struct for CT helpers, move int err parameter to it
 * Add l4proto as parameter to CT helper's opts, remove separate tcp/udp helpers
 * Add support for mem, len argument pair to kfunc
 * Allow void * as pointer type for mem, len argument pair
 * Extend selftests to cover new additions to kfuncs
 * Copy ref_obj_id to PTR_TO_BTF_ID dst_reg on btf_struct_access, test it
 * Fix other misc nits, bugs, and expand commit messages

Kumar Kartikeya Dwivedi (9):
  bpf: Refactor bpf_check_mod_kfunc_call
  bpf: Remove DEFINE_KFUNC_BTF_ID_SET
  bpf: Extend kfunc with PTR_TO_CTX, PTR_TO_MEM argument support
  bpf: Introduce mem, size argument pair support for kfunc
  bpf: Add reference tracking support to kfunc
  bpf: Track provenance for pointers formed from referenced
    PTR_TO_BTF_ID
  net/netfilter: Add unstable CT lookup helpers for XDP and TC-BPF
  selftests/bpf: Extend kfunc selftests
  selftests/bpf: Add test for unstable CT lookup API

 include/linux/bpf.h                           |  27 +-
 include/linux/bpf_verifier.h                  |  12 +
 include/linux/btf.h                           |  45 +++-
 kernel/bpf/btf.c                              | 218 ++++++++++++---
 kernel/bpf/verifier.c                         | 232 +++++++++++-----
 net/bpf/test_run.c                            | 147 ++++++++++
 net/core/filter.c                             |  27 ++
 net/core/net_namespace.c                      |   1 +
 net/ipv4/tcp_bbr.c                            |   5 +-
 net/ipv4/tcp_cubic.c                          |   5 +-
 net/ipv4/tcp_dctcp.c                          |   5 +-
 net/netfilter/nf_conntrack_core.c             | 252 ++++++++++++++++++
 .../selftests/bpf/bpf_testmod/bpf_testmod.c   |   5 +-
 tools/testing/selftests/bpf/config            |   4 +
 .../testing/selftests/bpf/prog_tests/bpf_nf.c |  48 ++++
 .../selftests/bpf/prog_tests/kfunc_call.c     |  28 ++
 .../selftests/bpf/progs/kfunc_call_test.c     |  52 +++-
 .../bpf/progs/kfunc_call_test_fail1.c         |  16 ++
 .../bpf/progs/kfunc_call_test_fail2.c         |  16 ++
 .../bpf/progs/kfunc_call_test_fail3.c         |  16 ++
 .../bpf/progs/kfunc_call_test_fail4.c         |  16 ++
 .../bpf/progs/kfunc_call_test_fail5.c         |  16 ++
 .../bpf/progs/kfunc_call_test_fail6.c         |  16 ++
 .../bpf/progs/kfunc_call_test_fail7.c         |  24 ++
 .../bpf/progs/kfunc_call_test_fail8.c         |  22 ++
 .../testing/selftests/bpf/progs/test_bpf_nf.c | 113 ++++++++
 26 files changed, 1258 insertions(+), 110 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_nf.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_fail1.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_fail2.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_fail3.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_fail4.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_fail5.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_fail6.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_fail7.c
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_fail8.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_bpf_nf.c

-- 
2.34.1


^ permalink raw reply

* Re: [RFC net-next 2/2] ipv6: ioam: Support for Buffer occupancy data field
From: Justin Iurman @ 2021-12-10 12:57 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, davem, dsahern, yoshfuji, linux-mm, cl, penberg, rientjes,
	iamjoonsoo kim, akpm, vbabka, Roopa Prabhu, Nikolay Aleksandrov,
	Andrew Lunn, Stephen Hemminger, Vladimir Oltean, Florian Fainelli,
	Florian Westphal, Paolo Abeni
In-Reply-To: <20211209163828.223815bd@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com>

>> Indeed, would be a better fit. I didn't know about this one, thanks for
>> that. It's a shame it can't be used in this context, though. But, at the
>> end of the day, we're left with nothing regarding buffer occupancy. So
>> I'm wondering if "something" is not better than "nothing" in this case.
>> And, for that, we're back to my previous answer on why I agree and
>> disagree with what you said about its utility.
> 
> I think we're on the same page, the main problem is I've not seen
> anyone use the skbuff_head_cache occupancy as a signal in practice.

Indeed.

> I'm adding a bunch of people to the CC list, hopefully someone has
> an opinion one way or the other.

+1, thanks Jakub.

> Lore link to the full thread, FWIW:
> 
> https://lore.kernel.org/all/20211206211758.19057-1-justin.iurman@uliege.be/

^ permalink raw reply

* linux-next: manual merge of the bpf-next tree with the netdev tree
From: broonie @ 2021-12-10 12:43 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, Andrii Nakryiko, bpf,
	Networking
  Cc: Kumar Kartikeya Dwivedi, Linux Kernel Mailing List,
	Linux Next Mailing List

Hi all,

Today's linux-next merge of the bpf-next tree got a conflict in:

  tools/lib/bpf/libbpf.c

between commit:

  ba05fd36b8512 ("libbpf: Perform map fd cleanup for gen_loader in case of error")

from the netdev tree and commit:

  fa5e5cc04e443 ("libbpf: Deprecate bpf_object__load_xattr()")

from the bpf-next tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

diff --cc tools/lib/bpf/libbpf.c
index f6faa33c80fa7,18d95c6a89fe3..0000000000000
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@@ -7263,7 -7475,7 +7475,7 @@@ static int bpf_object_load(struct bpf_o
  	}
  
  	if (obj->gen_loader)
- 		bpf_gen__init(obj->gen_loader, attr->log_level, obj->nr_programs, obj->nr_maps);
 -		bpf_gen__init(obj->gen_loader, extra_log_level);
++		bpf_gen__init(obj->gen_loader, extra_log_level, obj->nr_programs, obj->nr_maps);
  
  	err = bpf_object__probe_loading(obj);
  	err = err ? : bpf_object__load_vmlinux_btf(obj, false);

^ permalink raw reply

* Re: [PATCH 1/8] perf/kprobe: Add support to create multiple probes
From: Jiri Olsa @ 2021-12-10 12:42 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Arnaldo Carvalho de Melo, Peter Zijlstra, Masami Hiramatsu,
	Steven Rostedt, Networking, bpf, lkml, Ingo Molnar, Mark Rutland,
	Martin KaFai Lau, Alexander Shishkin, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Ravi Bangoria
In-Reply-To: <YbC4EXS3pyCbh7/i@krava>

On Wed, Dec 08, 2021 at 02:50:09PM +0100, Jiri Olsa wrote:
> On Mon, Dec 06, 2021 at 07:15:58PM -0800, Andrii Nakryiko wrote:
> > On Wed, Dec 1, 2021 at 1:32 PM Jiri Olsa <jolsa@redhat.com> wrote:
> > >
> > > On Tue, Nov 30, 2021 at 10:53:58PM -0800, Andrii Nakryiko wrote:
> > > > On Wed, Nov 24, 2021 at 12:41 AM Jiri Olsa <jolsa@redhat.com> wrote:
> > > > >
> > > > > Adding support to create multiple probes within single perf event.
> > > > > This way we can associate single bpf program with multiple kprobes,
> > > > > because bpf program gets associated with the perf event.
> > > > >
> > > > > The perf_event_attr is not extended, current fields for kprobe
> > > > > attachment are used for multi attachment.
> > > >
> > > > I'm a bit concerned with complicating perf_event_attr further to
> > > > support this multi-attach. For BPF, at least, we now have
> > > > bpf_perf_link and corresponding BPF_LINK_CREATE command in bpf()
> > > > syscall which allows much simpler and cleaner API to do this. Libbpf
> > > > will actually pick bpf_link-based attachment if kernel supports it. I
> > > > think we should better do bpf_link-based approach from the get go.
> > > >
> > > > Another thing I'd like you to keep in mind and think about is BPF
> > > > cookie. Currently kprobe/uprobe/tracepoint allow to associate
> > > > arbitrary user-provided u64 value which will be accessible from BPF
> > > > program with bpf_get_attach_cookie(). With multi-attach kprobes this
> > > > because extremely crucial feature to support, otherwise it's both
> > > > expensive, inconvenient and complicated to be able to distinguish
> > > > between different instances of the same multi-attach kprobe
> > > > invocation. So with that, what would be the interface to specify these
> > > > BPF cookies for this multi-attach kprobe, if we are going through
> > > > perf_event_attr. Probably picking yet another unused field and
> > > > union-izing it with a pointer. It will work, but makes the interface
> > > > even more overloaded. While for LINK_CREATE we can just add another
> > > > pointer to a u64[] with the same size as number of kfunc names and
> > > > offsets.
> > >
> > > I'm not sure we could bypass perf event easily.. perhaps introduce
> > > BPF_PROG_TYPE_RAW_KPROBE as we did for tracepoints or just new
> > > type for multi kprobe attachment like BPF_PROG_TYPE_MULTI_KPROBE
> > > that might be that way we'd have full control over the API
> > 
> > Sure, new type works.
> > 
> > >
> > > >
> > > > But other than that, I'm super happy that you are working on these
> > > > complicated multi-attach capabilities! It would be great to benchmark
> > > > one-by-one attachment vs multi-attach to the same set of kprobes once
> > > > you arrive at the final implementation.
> > >
> > > I have the change for bpftrace to use this and even though there's
> > > some speed up, it's not as substantial as for trampolines
> > >
> > > looks like we 'only' got rid of the multiple perf syscall overheads,
> > > compared to rcu syncs timeouts like we eliminated for trampolines
> > 
> > if it's just eliminating a pretty small overhead of multiple syscalls,
> > then it would be quite disappointing to add a bunch of complexity just
> > for that.
> 
> I meant it's not as huge save as for trampolines, but I expect some
> noticeable speedup, I'll make more becnhmarks with current patchset

so with this approach there's noticable speedup, but it's not the
'instant attachment speed' as for trampolines

as a base I used bpftrace with change that allows to reuse bpf program
for multiple kprobes

bpftrace standard attach of 672 kprobes:

  Performance counter stats for './src/bpftrace -vv -e kprobe:kvm* { @[kstack] += 1; }  i:ms:10 { printf("KRAVA\n"); exit() }':

      70.548897815 seconds time elapsed

       0.909996000 seconds user
      50.622834000 seconds sys


bpftrace using interface from this patchset attach of 673 kprobes:

  Performance counter stats for './src/bpftrace -vv -e kprobe:kvm* { @[kstack] += 1; }  i:ms:10 { printf("KRAVA\n"); exit() }':

      36.947586803 seconds time elapsed

       0.272585000 seconds user
      30.900831000 seconds sys


so it's noticeable, but I wonder it's not enough ;-)

jirka

> 
> > Are there any reasons we can't use the same low-level ftrace
> > batch attach API to speed this up considerably? I assume it's only
> > possible if kprobe is attached at the beginning of the function (not
> > sure how kretprobe is treated here), so we can either say that this
> > new kprobe prog type can only be attached at the beginning of each
> > function and enforce that (probably would be totally reasonable
> > assumption as that's what's happening most frequently in practice).
> > Worst case, should be possible to split all requested attach targets
> > into two groups, one fast at function entry and all the rest.
> > 
> > Am I too far off on this one? There might be some more complications
> > that I don't see.
> 
> I'd need to check more on kprobes internals, but.. ;-)
> 
> the new ftrace interface is special for 'direct' trampolines and
> I think that although kprobes can use ftrace for attaching, they
> use it in a different way
> 
> also this current 'multi attach' approach is on top of current kprobe
> interface, if we wanted to use the new ftrace API we'd need to add new
> kprobe interface and change the kprobe attaching to use it (for cases
> it's attached at the function entry)
> 
> jirka
> 
> > 
> > >
> > > I'll make full benchmarks once we have some final solution
> > >
> > > jirka
> > >
> > 


^ permalink raw reply

* Re: [PATCH net-next] xfrm: add net device refcount tracker to struct xfrm_state_offload
From: Steffen Klassert @ 2021-12-10 12:34 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, Jakub Kicinski, netdev, Eric Dumazet
In-Reply-To: <20211209154451.4184050-1-eric.dumazet@gmail.com>

On Thu, Dec 09, 2021 at 07:44:51AM -0800, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Steffen Klassert <steffen.klassert@secunet.com>

As the refcount tracking infrastructure is not yet in ipsec-next:

Acked-by: Steffen Klassert <steffen.klassert@secunet.com>

Thanks!

^ permalink raw reply

* Re: [PATCH net] xfrm: fix a small bug in frm_sa_len()
From: Steffen Klassert @ 2021-12-10 12:33 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, Jakub Kicinski, netdev, Eric Dumazet
In-Reply-To: <20211208202019.3423010-1-eric.dumazet@gmail.com>

On Wed, Dec 08, 2021 at 12:20:19PM -0800, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> copy_user_offload() will actually push a struct struct xfrm_user_offload,
> which is different than (struct xfrm_state *)->xso
> (struct xfrm_state_offload)
> 
> Fixes: d77e38e612a01 ("xfrm: Add an IPsec hardware offloading API")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Steffen Klassert <steffen.klassert@secunet.com>

Applied to the ipsec tree, thanks Eric!

Note: I fixed a typo in the subject 'frm_sa_len -> xfrm_sa_len'

^ permalink raw reply

* Re: [PATCH] bpf: return EOPNOTSUPP when JIT is needed and not possible
From: Thadeu Lima de Souza Cascardo @ 2021-12-10 12:24 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Daniel Borkmann, Ido Schimmel, John Fastabend, bpf, netdev, ast,
	linux-kernel
In-Reply-To: <20211209182349.038ac2b8@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com>

On Thu, Dec 09, 2021 at 06:23:49PM -0800, Jakub Kicinski wrote:
> On Fri, 10 Dec 2021 00:03:40 +0100 Daniel Borkmann wrote:
> > > Similar issue was discussed in the past. See:
> > > https://lore.kernel.org/netdev/20191204.125135.750458923752225025.davem@davemloft.net/  
> > 
> > With regards to ENOTSUPP exposure, if the consensus is that we should fix all
> > occurences over to EOPNOTSUPP even if they've been exposed for quite some time
> > (Jakub?), 
> 
> Did you mean me? :) In case you did - I think we should avoid it 
> for new code but changing existing now seems risky. Alexei and Andrii
> would know best but quick search of code bases at work reveals some
> scripts looking for ENOTSUPP.
> 
> Thadeu, what motivated the change?
> 
> If we're getting those changes fixes based on checkpatch output maybe 
> there is a way to mute the checkpatch warnings when it's not run on a 
> diff?
> 

It was not checkpatch that motivated me.

I was looking into the following commits as we hit a failed test.

be08815c5d3b ("bpf: add also cbpf long jump test cases with heavy expansion")
050fad7c4534 ("bpf: fix truncated jump targets on heavy expansions") 

Then, I realized that if given the right number of BPF_LDX | BPF_B | BPF_MSH
instructions, it will pass the bpf_convert_filter stage, but fail at blinding.
And if you have CONFIG_BPF_JIT_ALWAYS_ON, setting the filter will fail with
ENOTSUPP, which should not be sent to userspace.

I noticed other ENOTSUPP, but they seemed to be returned by helpers, and I was
not sure this would be relayed to userspace. So, I went for fixing the observed
case.

I will see if any of the tests I can run is broken by this change and submit it
again with the tests fixed as well.

Cascardo.

> > we could give this patch a try maybe via bpf-next and see if anyone complains.
> > 
> > Thadeu, I think you also need to fix up BPF selftests as test_verifier, to mention
> > one example (there are also bunch of others under tools/testing/selftests/), is
> > checking for ENOTSUPP specifically..

^ permalink raw reply

* Re: [PATCH net-next 2/2] net: stmmac: add tc flower filter for EtherType matching
From: Sebastian Andrzej Siewior @ 2021-12-10 11:57 UTC (permalink / raw)
  To: Kurt Kanzenbach
  Cc: Ong Boon Leong, Giuseppe Cavallaro, Alexandre Torgue, Jose Abreu,
	David S . Miller, Jakub Kicinski, Maxime Coquelin,
	alexandre.torgue, netdev, linux-stm32, linux-arm-kernel
In-Reply-To: <87fsr0zs77.fsf@kurt>

On 2021-12-10 11:10:04 [+0100], Kurt Kanzenbach wrote:
> > +	if (match.mask->n_proto) {
> > +		__be16 etype = ntohs(match.key->n_proto);
> 
> n_proto is be16. The ntohs() call will produce an u16.

While at it, could we be please remove that __force in
ETHER_TYPE_FULL_MASK and use cpu_to_be16() macro?

> Thanks,
> Kurt

Sebastian

^ permalink raw reply

* Re: [PATCH net-next 0/2] net: stmmac: add EthType Rx Frame steering
From: Vladimir Oltean @ 2021-12-10 11:57 UTC (permalink / raw)
  To: Ong Boon Leong, David S . Miller, Jakub Kicinski
  Cc: Giuseppe Cavallaro, Alexandre Torgue, Jose Abreu, Maxime Coquelin,
	alexandre.torgue, Kurt Kanzenbach, netdev, linux-stm32,
	linux-arm-kernel
In-Reply-To: <20211209151631.138326-1-boon.leong.ong@intel.com>

Hi David, Jakub,

On Thu, Dec 09, 2021 at 11:16:29PM +0800, Ong Boon Leong wrote:
> Hi,
> 
> Patch 1/2: Fixes issue in tc filter delete flower for VLAN priority
>            steering. Patch has been sent to 'net' ML. Link as follow:
> 
> https://patchwork.kernel.org/project/netdevbpf/patch/20211209130335.81114-1-boon.leong.ong@intel.com/
> 
> Patch 2/2: Patch to add LLDP and IEEE1588 EtherType RX frame steering
>            in tc flower that is implemented on-top of patch 1/2.
> 
> Below are the test steps for checking out the newly added feature:-
> 
> # Setup traffic class and ingress filter
> $ IFDEVNAME=eth0
> $ tc qdisc add dev $IFDEVNAME ingress
> $ tc qdisc add dev $IFDEVNAME root mqprio num_tc 8 \
>      map 0 1 2 3 4 5 6 7 0 0 0 0 0 0 0 0 \
>      queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 hw 0
> 
> # Add two VLAN priority based RX Frame Steering
> $ tc filter add dev $IFDEVNAME parent ffff: protocol 802.1Q \
>      flower vlan_prio 1 hw_tc 1
> $ tc filter add dev $IFDEVNAME parent ffff: protocol 802.1Q \
>      flower vlan_prio 2 hw_tc 2
> 
> # For LLDP
> $ tc filter add dev $IFDEVNAME parent ffff: protocol 0x88cc \
>      flower hw_tc 5
> 
> # For PTP
> $ tc filter add dev $IFDEVNAME parent ffff: protocol 0x88f7 \
>      flower hw_tc 6
> 
> # Show the ingress tc filters
> $ tc filter show dev $IFDEVNAME ingress
> 
> filter parent ffff: protocol ptp pref 49149 flower chain 0
> filter parent ffff: protocol ptp pref 49149 flower chain 0 handle 0x1 hw_tc 6
>   eth_type 88f7
>   in_hw in_hw_count 1
> filter parent ffff: protocol LLDP pref 49150 flower chain 0
> filter parent ffff: protocol LLDP pref 49150 flower chain 0 handle 0x1 hw_tc 5
>   eth_type 88cc
>   in_hw in_hw_count 1
> filter parent ffff: protocol 802.1Q pref 49151 flower chain 0
> filter parent ffff: protocol 802.1Q pref 49151 flower chain 0 handle 0x1 hw_tc 2
>   vlan_prio 2
>   in_hw in_hw_count 1
> filter parent ffff: protocol 802.1Q pref 49152 flower chain 0
> filter parent ffff: protocol 802.1Q pref 49152 flower chain 0 handle 0x1 hw_tc 1
>   vlan_prio 1
>   in_hw in_hw_count 1
> 
> # Delete tc filters
> $ tc filter del dev $IFDEVNAME parent ffff: pref 49149
> $ tc filter del dev $IFDEVNAME parent ffff: pref 49150
> $ tc filter del dev $IFDEVNAME parent ffff: pref 49151
> $ tc filter del dev $IFDEVNAME parent ffff: pref 49152
> 
> Thanks,
> BL
> 
> Ong Boon Leong (2):
>   net: stmmac: fix tc flower deletion for VLAN priority Rx steering
>   net: stmmac: add tc flower filter for EtherType matching
> 
>  drivers/net/ethernet/stmicro/stmmac/stmmac.h  |  20 ++
>  .../net/ethernet/stmicro/stmmac/stmmac_tc.c   | 189 +++++++++++++++++-
>  2 files changed, 205 insertions(+), 4 deletions(-)
> 
> -- 
> 2.25.1
> 

Is it the canonical approach to perform flow steering via tc-flower hw_tc,
as opposed to ethtool --config-nfc? My understanding from reading the
documentation is that tc-flower hw_tc only selects the hardware traffic
class for a packet, and that this has to do with prioritization
(although the concept in itself is a bit ill-defined as far as I
understand it, how does it relate to things like offloaded skbedit priority?).
But selecting a traffic class, in itself, doesn't (directly or
necessarily) select a ring per se, as ethtool does? Just like ethtool
doesn't select packet priority, just RX queue. When the RX queue
priority is configurable (see the "snps,priority" device tree property
in stmmac_mtl_setup) and more RX queues have the same priority, I'm not
sure what hw_tc is supposed to do in terms of RX queue selection?

^ permalink raw reply

* Re: [PATCH net] sch_cake: do not call cake_destroy() from cake_init()
From: Toke Høiland-Jørgensen @ 2021-12-10 11:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, netdev, syzbot
In-Reply-To: <CANn89iJRu_uHi__pYr-y5p3Gw_FzmvCEgnYoBa4EGiXRNzxuPw@mail.gmail.com>

Eric Dumazet <edumazet@google.com> writes:

> On Fri, Dec 10, 2021 at 3:02 AM Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>>
>> Eric Dumazet <eric.dumazet@gmail.com> writes:
>>
>> > From: Eric Dumazet <edumazet@google.com>
>> >
>> > qdiscs are not supposed to call their own destroy() method
>> > from init(), because core stack already does that.
>> >
>> > syzbot was able to trigger use after free:
>
>> >
>> > Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc")
>> > Signed-off-by: Eric Dumazet <edumazet@google.com>
>> > Reported-by: syzbot <syzkaller@googlegroups.com>
>> > Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
>>
>> Oops, thanks for the fix! I'm a little puzzled with the patch has my
>> S-o-b, though? It should probably be replaced by:
>>
>> Acked-by: Toke Høiland-Jørgensen <toke@toke.dk>
>
> Right, user error from my side, I copied it from your commit changelog
> and forgot to s/Signed-off-by/Cc/

Ah, right, makes sense; no worries :)

-Toke

^ permalink raw reply

* Re: [PATCH V6 2/5] x86/hyper-v: Add hyperv Isolation VM check in the cc_platform_has()
From: Tianyu Lan @ 2021-12-10 11:26 UTC (permalink / raw)
  To: Michael Kelley (LINUX), KY Srinivasan, Haiyang Zhang,
	Stephen Hemminger, wei.liu@kernel.org, Dexuan Cui,
	tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
	dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com,
	davem@davemloft.net, kuba@kernel.org, jejb@linux.ibm.com,
	martin.petersen@oracle.com, arnd@arndb.de, hch@infradead.org,
	m.szyprowski@samsung.com, robin.murphy@arm.com, Tianyu Lan,
	thomas.lendacky@amd.com
  Cc: iommu@lists.linux-foundation.org, linux-arch@vger.kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-scsi@vger.kernel.org, netdev@vger.kernel.org, vkuznets,
	brijesh.singh@amd.com, konrad.wilk@oracle.com, hch@lst.de,
	joro@8bytes.org, parri.andrea@gmail.com, dave.hansen@intel.com
In-Reply-To: <MWHPR21MB1593F014EC440F5DEDCFDDFFD7709@MWHPR21MB1593.namprd21.prod.outlook.com>

On 12/10/2021 4:38 AM, Michael Kelley (LINUX) wrote:
> From: Tianyu Lan <ltykernel@gmail.com> Sent: Monday, December 6, 2021 11:56 PM
>>
>> Hyper-V provides Isolation VM which has memory encrypt support. Add
>> hyperv_cc_platform_has() and return true for check of GUEST_MEM_ENCRYPT
>> attribute.
>>
>> Signed-off-by: Tianyu Lan <Tianyu.Lan@microsoft.com>
>> ---
>> Change since v3:
>> 	* Change code style of checking GUEST_MEM attribute in the
>> 	  hyperv_cc_platform_has().
>> ---
>>   arch/x86/kernel/cc_platform.c | 8 ++++++++
>>   1 file changed, 8 insertions(+)
>>
>> diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c
>> index 03bb2f343ddb..47db88c275d5 100644
>> --- a/arch/x86/kernel/cc_platform.c
>> +++ b/arch/x86/kernel/cc_platform.c
>> @@ -11,6 +11,7 @@
>>   #include <linux/cc_platform.h>
>>   #include <linux/mem_encrypt.h>
>>
>> +#include <asm/mshyperv.h>
>>   #include <asm/processor.h>
>>
>>   static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr)
>> @@ -58,9 +59,16 @@ static bool amd_cc_platform_has(enum cc_attr attr)
>>   #endif
>>   }
>>
>> +static bool hyperv_cc_platform_has(enum cc_attr attr)
>> +{
>> +	return attr == CC_ATTR_GUEST_MEM_ENCRYPT;
>> +}
>>
>>   bool cc_platform_has(enum cc_attr attr)
>>   {
>> +	if (hv_is_isolation_supported())
>> +		return hyperv_cc_platform_has(attr);
>> +
>>   	if (sme_me_mask)
>>   		return amd_cc_platform_has(attr);
>>
> 
> Throughout Linux kernel code, there are about 20 calls to cc_platform_has()
> with CC_ATTR_GUEST_MEM_ENCRYPT as the argument.  The original code
> (from v1 of this patch set) only dealt with the call in sev_setup_arch().   But
> with this patch, all the other calls that previously returned "false" will now
> return "true" in a Hyper-V Isolated VM.  I didn't try to analyze all these other
> calls, so I think there's an open question about whether this is the behavior
> we want.
> 

CC_ATTR_GUEST_MEM_ENCRYPT is for SEV support so far. Hyper-V Isolation
VM is based on SEV or software memory encrypt. Most checks can be 
reused. The difference is that SEV code use encrypt bit in the page
table to encrypt and decrypt memory while Hyper-V uses vTOM. But the sev
memory encrypt mask "sme_me_mask" is unset in the Hyper-V Isolation VM
where claims sev and sme are unsupported. The rest of checks for mem enc
bit are still safe. So reuse CC_ATTR_GUEST_MEM_ENCRYPT for Hyper-V.



^ permalink raw reply

* [PATCH] samples: bpf: fix tracex2 due to empty sys_write count argument
From: Daniel T. Lee @ 2021-12-10 11:19 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, Andrii Nakryiko; +Cc: bpf, netdev

Currently from syscall entry, argument can't be fetched correctly as a
result of register cleanup.

    commit 6b8cf5cc9965 ("x86/entry/64/compat: Clear registers for compat syscalls, to reduce speculation attack surface")

For example in upper commit, registers are cleaned prior to syscall.
To be more specific, sys_write syscall has count size as a third argument.
But this can't be fetched from __x64_sys_enter/__s390x_sys_enter due to
register cleanup. (e.g. [x86] xorl %r8d, %r8d / [s390x] xgr %r7, %r7)

This commit fix this problem by modifying the trace event to ksys_write
instead of sys_write syscall entry.

    # Wrong example of 'write()' syscall argument fetching
    # ./tracex2
    ...
    pid 50909 cmd dd uid 0
           syscall write() stats
     byte_size       : count     distribution
       1 -> 1        : 4968837  |************************************* |

    # Successful example of 'write()' syscall argument fetching
    # (dd's write bytes at a time defaults to 512)
    # ./tracex2
    ...
    pid 3095 cmd dd uid 0
           syscall write() stats
     byte_size       : count     distribution
    ...
     256 -> 511      : 0        |                                      |
     512 -> 1023     : 4968844  |************************************* |

Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
---
 samples/bpf/tracex2_kern.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
index 5bc696bac27d..96dff3bea227 100644
--- a/samples/bpf/tracex2_kern.c
+++ b/samples/bpf/tracex2_kern.c
@@ -78,7 +78,7 @@ struct {
 	__uint(max_entries, 1024);
 } my_hist_map SEC(".maps");

-SEC("kprobe/" SYSCALL(sys_write))
+SEC("kprobe/ksys_write")
 int bpf_prog3(struct pt_regs *ctx)
 {
 	long write_size = PT_REGS_PARM3(ctx);
-- 
2.32.0

^ permalink raw reply related

* Re: [syzbot] general protection fault in inet_csk_accept
From: Eric Dumazet @ 2021-12-10 11:13 UTC (permalink / raw)
  To: syzbot, Paolo Abeni, Florian Westphal
  Cc: davem, dsahern, kuba, linux-kernel, netdev, syzkaller-bugs,
	yoshfuji
In-Reply-To: <0000000000004c679505d2c8c1d4@google.com>

On Fri, Dec 10, 2021 at 3:09 AM syzbot
<syzbot+e4d843bb96a9431e6331@syzkaller.appspotmail.com> wrote:
>
> Hello,
>
> syzbot found the following issue on:
>
> HEAD commit:    2a987e65025e Merge tag 'perf-tools-fixes-for-v5.16-2021-12..
> git tree:       upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=166f73adb00000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=221ffc09e39ebbd1
> dashboard link: https://syzkaller.appspot.com/bug?extid=e4d843bb96a9431e6331
> compiler:       gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2
> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=16280ae5b00000
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1000fdc5b00000
>
> The issue was bisected to:
>

Note to MPTCP maintainers, I think this issue is MPTCP one, and the
bisection result
shown here seems not relevant.

The C repro is however correct, I trigger an immediate crash.


> commit 7f700334be9aeb91d5d86ef9ad2d901b9b453e9b
> Author: Eric Dumazet <edumazet@google.com>
> Date:   Mon Mar 29 18:39:51 2021 +0000
>
>     ip6_gre: proper dev_{hold|put} in ndo_[un]init methods
>
> bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=117fe575b00000
> final oops:     https://syzkaller.appspot.com/x/report.txt?x=137fe575b00000
> console output: https://syzkaller.appspot.com/x/log.txt?x=157fe575b00000
>
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+e4d843bb96a9431e6331@syzkaller.appspotmail.com
> Fixes: 7f700334be9a ("ip6_gre: proper dev_{hold|put} in ndo_[un]init methods")
>
> general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN
> KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f]
> CPU: 1 PID: 6550 Comm: syz-executor122 Not tainted 5.16.0-rc4-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> RIP: 0010:__lock_acquire+0xd7d/0x54a0 kernel/locking/lockdep.c:4897
> Code: 0f 0e 41 be 01 00 00 00 0f 86 c8 00 00 00 89 05 69 cc 0f 0e e9 bd 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 da 48 c1 ea 03 <80> 3c 02 00 0f 85 f3 2f 00 00 48 81 3b 20 75 17 8f 0f 84 52 f3 ff
> RSP: 0018:ffffc90001f2f818 EFLAGS: 00010016
> RAX: dffffc0000000000 RBX: 0000000000000018 RCX: 0000000000000000
> RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000001
> RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001
> R10: 0000000000000000 R11: 000000000000000a R12: 0000000000000000
> R13: ffff88801b98d700 R14: 0000000000000000 R15: 0000000000000001
> FS:  00007f177cd3d700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007f177cd1b268 CR3: 000000001dd55000 CR4: 0000000000350ee0
> Call Trace:
>  <TASK>
>  lock_acquire kernel/locking/lockdep.c:5637 [inline]
>  lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5602
>  __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
>  _raw_spin_lock_irqsave+0x39/0x50 kernel/locking/spinlock.c:162
>  finish_wait+0xc0/0x270 kernel/sched/wait.c:400
>  inet_csk_wait_for_connect net/ipv4/inet_connection_sock.c:464 [inline]
>  inet_csk_accept+0x7de/0x9d0 net/ipv4/inet_connection_sock.c:497
>  mptcp_accept+0xe5/0x500 net/mptcp/protocol.c:2865
>  inet_accept+0xe4/0x7b0 net/ipv4/af_inet.c:739
>  mptcp_stream_accept+0x2e7/0x10e0 net/mptcp/protocol.c:3345
>  do_accept+0x382/0x510 net/socket.c:1773
>  __sys_accept4_file+0x7e/0xe0 net/socket.c:1816
>  __sys_accept4+0xb0/0x100 net/socket.c:1846
>  __do_sys_accept net/socket.c:1864 [inline]
>  __se_sys_accept net/socket.c:1861 [inline]
>  __x64_sys_accept+0x71/0xb0 net/socket.c:1861
>  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> RIP: 0033:0x7f177cd8b8e9
> Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 b1 14 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:00007f177cd3d308 EFLAGS: 00000246 ORIG_RAX: 000000000000002b
> RAX: ffffffffffffffda RBX: 00007f177ce13408 RCX: 00007f177cd8b8e9
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003
> RBP: 00007f177ce13400 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000246 R12: 00007f177ce1340c
> R13: 00007f177cde1004 R14: 6d705f706374706d R15: 0000000000022000
>  </TASK>
> Modules linked in:
> ---[ end trace 77ed64e4985d56c9 ]---
> RIP: 0010:__lock_acquire+0xd7d/0x54a0 kernel/locking/lockdep.c:4897
> Code: 0f 0e 41 be 01 00 00 00 0f 86 c8 00 00 00 89 05 69 cc 0f 0e e9 bd 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 da 48 c1 ea 03 <80> 3c 02 00 0f 85 f3 2f 00 00 48 81 3b 20 75 17 8f 0f 84 52 f3 ff
> RSP: 0018:ffffc90001f2f818 EFLAGS: 00010016
> RAX: dffffc0000000000 RBX: 0000000000000018 RCX: 0000000000000000
> RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000001
> RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001
> R10: 0000000000000000 R11: 000000000000000a R12: 0000000000000000
> R13: ffff88801b98d700 R14: 0000000000000000 R15: 0000000000000001
> FS:  00007f177cd3d700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007f177cd1b268 CR3: 000000001dd55000 CR4: 0000000000350ee0
> ----------------
> Code disassembly (best guess):
>    0:   0f 0e                   femms
>    2:   41 be 01 00 00 00       mov    $0x1,%r14d
>    8:   0f 86 c8 00 00 00       jbe    0xd6
>    e:   89 05 69 cc 0f 0e       mov    %eax,0xe0fcc69(%rip)        # 0xe0fcc7d
>   14:   e9 bd 00 00 00          jmpq   0xd6
>   19:   48 b8 00 00 00 00 00    movabs $0xdffffc0000000000,%rax
>   20:   fc ff df
>   23:   48 89 da                mov    %rbx,%rdx
>   26:   48 c1 ea 03             shr    $0x3,%rdx
> * 2a:   80 3c 02 00             cmpb   $0x0,(%rdx,%rax,1) <-- trapping instruction
>   2e:   0f 85 f3 2f 00 00       jne    0x3027
>   34:   48 81 3b 20 75 17 8f    cmpq   $0xffffffff8f177520,(%rbx)
>   3b:   0f                      .byte 0xf
>   3c:   84 52 f3                test   %dl,-0xd(%rdx)
>   3f:   ff                      .byte 0xff
>
>
> ---
> This report is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
>
> syzbot will keep track of this issue. See:
> https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
> For information about bisection process see: https://goo.gl/tpsmEJ#bisection
> syzbot can test patches for this issue, for details see:
> https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [PATCH v4 net-next 2/9] i40e: respect metadata on XSK Rx to skb
From: Alexander Lobakin @ 2021-12-10 11:08 UTC (permalink / raw)
  To: Tony Nguyen
  Cc: Alexander Lobakin, jbrouer@redhat.com, songliubraving@fb.com,
	hawk@kernel.org, kafai@fb.com, andrii@kernel.org,
	davem@davemloft.net, Maciej Fijalkowski, john.fastabend@gmail.com,
	Jesse Brandeburg, kpsingh@kernel.org, ast@kernel.org,
	linux-kernel@vger.kernel.org, brouer@redhat.com, yhs@fb.com,
	Jith Joseph, intel-wired-lan@lists.osuosl.org, kuba@kernel.org,
	bjorn@kernel.org, daniel@iogearbox.net, netdev@vger.kernel.org,
	Michal Swiatkowski, bpf@vger.kernel.org
In-Reply-To: <11db2426b85eb8cedd5e2d66d6399143cb382b49.camel@intel.com>

From: Tony Nguyen <anthony.l.nguyen@intel.com>
Date: Thu, 9 Dec 2021 18:50:07 +0000

> On Thu, 2021-12-09 at 18:38 +0100, Alexander Lobakin wrote:
> > From: Jesper Dangaard Brouer <jbrouer@redhat.com>
> > Date: Thu, 9 Dec 2021 09:27:37 +0100
> > 
> > > On 08/12/2021 15.06, Alexander Lobakin wrote:
> > > > For now, if the XDP prog returns XDP_PASS on XSK, the metadata
> > > > will
> > > > be lost as it doesn't get copied to the skb.
> > > 
> > > I have an urge to add a newline here, when reading this, as IMHO it
> > > is a 
> > > paragraph with the problem statement.
> > > 
> > > > Copy it along with the frame headers. Account its size on skb
> > > > allocation, and when copying just treat it as a part of the frame
> > > > and do a pull after to "move" it to the "reserved" zone.
> > > 
> > > Also newline here, as next paragraph are some extra details, you
> > > felt a 
> > > need to explain to the reader.
> > > 
> > > > net_prefetch() xdp->data_meta and align the copy size to speed-up
> > > > memcpy() a little and better match i40e_costruct_skb().
> > >                                       ^^^^^^xx^^^^^^^^^
> > > 
> > > commit messages.
> > 
> > Oh gosh, I thought I don't have attention deficit. Thanks, maybe
> > Tony will fix it for me or I could send a follow-up (or resend if
> > needed, I saw those were already applied to dev-queue).
> 
> If there's no need for follow-ups beyond this change, I'll fix it up.

The rest is fine, thank you!

> Thanks,
> Tony
> 
> > > --Jesper
> > 
> > Al

Al

^ permalink raw reply

* [syzbot] general protection fault in inet_csk_accept
From: syzbot @ 2021-12-10 11:09 UTC (permalink / raw)
  To: davem, dsahern, edumazet, kuba, linux-kernel, netdev,
	syzkaller-bugs, yoshfuji

Hello,

syzbot found the following issue on:

HEAD commit:    2a987e65025e Merge tag 'perf-tools-fixes-for-v5.16-2021-12..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=166f73adb00000
kernel config:  https://syzkaller.appspot.com/x/.config?x=221ffc09e39ebbd1
dashboard link: https://syzkaller.appspot.com/bug?extid=e4d843bb96a9431e6331
compiler:       gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=16280ae5b00000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1000fdc5b00000

The issue was bisected to:

commit 7f700334be9aeb91d5d86ef9ad2d901b9b453e9b
Author: Eric Dumazet <edumazet@google.com>
Date:   Mon Mar 29 18:39:51 2021 +0000

    ip6_gre: proper dev_{hold|put} in ndo_[un]init methods

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=117fe575b00000
final oops:     https://syzkaller.appspot.com/x/report.txt?x=137fe575b00000
console output: https://syzkaller.appspot.com/x/log.txt?x=157fe575b00000

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+e4d843bb96a9431e6331@syzkaller.appspotmail.com
Fixes: 7f700334be9a ("ip6_gre: proper dev_{hold|put} in ndo_[un]init methods")

general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f]
CPU: 1 PID: 6550 Comm: syz-executor122 Not tainted 5.16.0-rc4-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:__lock_acquire+0xd7d/0x54a0 kernel/locking/lockdep.c:4897
Code: 0f 0e 41 be 01 00 00 00 0f 86 c8 00 00 00 89 05 69 cc 0f 0e e9 bd 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 da 48 c1 ea 03 <80> 3c 02 00 0f 85 f3 2f 00 00 48 81 3b 20 75 17 8f 0f 84 52 f3 ff
RSP: 0018:ffffc90001f2f818 EFLAGS: 00010016
RAX: dffffc0000000000 RBX: 0000000000000018 RCX: 0000000000000000
RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000001
RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001
R10: 0000000000000000 R11: 000000000000000a R12: 0000000000000000
R13: ffff88801b98d700 R14: 0000000000000000 R15: 0000000000000001
FS:  00007f177cd3d700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f177cd1b268 CR3: 000000001dd55000 CR4: 0000000000350ee0
Call Trace:
 <TASK>
 lock_acquire kernel/locking/lockdep.c:5637 [inline]
 lock_acquire+0x1ab/0x510 kernel/locking/lockdep.c:5602
 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
 _raw_spin_lock_irqsave+0x39/0x50 kernel/locking/spinlock.c:162
 finish_wait+0xc0/0x270 kernel/sched/wait.c:400
 inet_csk_wait_for_connect net/ipv4/inet_connection_sock.c:464 [inline]
 inet_csk_accept+0x7de/0x9d0 net/ipv4/inet_connection_sock.c:497
 mptcp_accept+0xe5/0x500 net/mptcp/protocol.c:2865
 inet_accept+0xe4/0x7b0 net/ipv4/af_inet.c:739
 mptcp_stream_accept+0x2e7/0x10e0 net/mptcp/protocol.c:3345
 do_accept+0x382/0x510 net/socket.c:1773
 __sys_accept4_file+0x7e/0xe0 net/socket.c:1816
 __sys_accept4+0xb0/0x100 net/socket.c:1846
 __do_sys_accept net/socket.c:1864 [inline]
 __se_sys_accept net/socket.c:1861 [inline]
 __x64_sys_accept+0x71/0xb0 net/socket.c:1861
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f177cd8b8e9
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 b1 14 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f177cd3d308 EFLAGS: 00000246 ORIG_RAX: 000000000000002b
RAX: ffffffffffffffda RBX: 00007f177ce13408 RCX: 00007f177cd8b8e9
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003
RBP: 00007f177ce13400 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f177ce1340c
R13: 00007f177cde1004 R14: 6d705f706374706d R15: 0000000000022000
 </TASK>
Modules linked in:
---[ end trace 77ed64e4985d56c9 ]---
RIP: 0010:__lock_acquire+0xd7d/0x54a0 kernel/locking/lockdep.c:4897
Code: 0f 0e 41 be 01 00 00 00 0f 86 c8 00 00 00 89 05 69 cc 0f 0e e9 bd 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 da 48 c1 ea 03 <80> 3c 02 00 0f 85 f3 2f 00 00 48 81 3b 20 75 17 8f 0f 84 52 f3 ff
RSP: 0018:ffffc90001f2f818 EFLAGS: 00010016
RAX: dffffc0000000000 RBX: 0000000000000018 RCX: 0000000000000000
RDX: 0000000000000003 RSI: 0000000000000000 RDI: 0000000000000001
RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001
R10: 0000000000000000 R11: 000000000000000a R12: 0000000000000000
R13: ffff88801b98d700 R14: 0000000000000000 R15: 0000000000000001
FS:  00007f177cd3d700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f177cd1b268 CR3: 000000001dd55000 CR4: 0000000000350ee0
----------------
Code disassembly (best guess):
   0:	0f 0e                	femms
   2:	41 be 01 00 00 00    	mov    $0x1,%r14d
   8:	0f 86 c8 00 00 00    	jbe    0xd6
   e:	89 05 69 cc 0f 0e    	mov    %eax,0xe0fcc69(%rip)        # 0xe0fcc7d
  14:	e9 bd 00 00 00       	jmpq   0xd6
  19:	48 b8 00 00 00 00 00 	movabs $0xdffffc0000000000,%rax
  20:	fc ff df
  23:	48 89 da             	mov    %rbx,%rdx
  26:	48 c1 ea 03          	shr    $0x3,%rdx
* 2a:	80 3c 02 00          	cmpb   $0x0,(%rdx,%rax,1) <-- trapping instruction
  2e:	0f 85 f3 2f 00 00    	jne    0x3027
  34:	48 81 3b 20 75 17 8f 	cmpq   $0xffffffff8f177520,(%rbx)
  3b:	0f                   	.byte 0xf
  3c:	84 52 f3             	test   %dl,-0xd(%rdx)
  3f:	ff                   	.byte 0xff


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
For information about bisection process see: https://goo.gl/tpsmEJ#bisection
syzbot can test patches for this issue, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [PATCH net] sch_cake: do not call cake_destroy() from cake_init()
From: Toke Høiland-Jørgensen @ 2021-12-10 11:02 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski
  Cc: netdev, Eric Dumazet, Eric Dumazet, syzbot
In-Reply-To: <20211210081536.451881-1-eric.dumazet@gmail.com>

Eric Dumazet <eric.dumazet@gmail.com> writes:

> From: Eric Dumazet <edumazet@google.com>
>
> qdiscs are not supposed to call their own destroy() method
> from init(), because core stack already does that.
>
> syzbot was able to trigger use after free:
>
> DEBUG_LOCKS_WARN_ON(lock->magic != lock)
> WARNING: CPU: 0 PID: 21902 at kernel/locking/mutex.c:586 __mutex_lock_common kernel/locking/mutex.c:586 [inline]
> WARNING: CPU: 0 PID: 21902 at kernel/locking/mutex.c:586 __mutex_lock+0x9ec/0x12f0 kernel/locking/mutex.c:740
> Modules linked in:
> CPU: 0 PID: 21902 Comm: syz-executor189 Not tainted 5.16.0-rc4-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> RIP: 0010:__mutex_lock_common kernel/locking/mutex.c:586 [inline]
> RIP: 0010:__mutex_lock+0x9ec/0x12f0 kernel/locking/mutex.c:740
> Code: 08 84 d2 0f 85 19 08 00 00 8b 05 97 38 4b 04 85 c0 0f 85 27 f7 ff ff 48 c7 c6 20 00 ac 89 48 c7 c7 a0 fe ab 89 e8 bf 76 ba ff <0f> 0b e9 0d f7 ff ff 48 8b 44 24 40 48 8d b8 c8 08 00 00 48 89 f8
> RSP: 0018:ffffc9000627f290 EFLAGS: 00010282
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> RDX: ffff88802315d700 RSI: ffffffff815f1db8 RDI: fffff52000c4fe44
> RBP: ffff88818f28e000 R08: 0000000000000000 R09: 0000000000000000
> R10: ffffffff815ebb5e R11: 0000000000000000 R12: 0000000000000000
> R13: dffffc0000000000 R14: ffffc9000627f458 R15: 0000000093c30000
> FS:  0000555556abc400(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007fda689c3303 CR3: 000000001cfbb000 CR4: 0000000000350ef0
> Call Trace:
>  <TASK>
>  tcf_chain0_head_change_cb_del+0x2e/0x3d0 net/sched/cls_api.c:810
>  tcf_block_put_ext net/sched/cls_api.c:1381 [inline]
>  tcf_block_put_ext net/sched/cls_api.c:1376 [inline]
>  tcf_block_put+0xbc/0x130 net/sched/cls_api.c:1394
>  cake_destroy+0x3f/0x80 net/sched/sch_cake.c:2695
>  qdisc_create.constprop.0+0x9da/0x10f0 net/sched/sch_api.c:1293
>  tc_modify_qdisc+0x4c5/0x1980 net/sched/sch_api.c:1660
>  rtnetlink_rcv_msg+0x413/0xb80 net/core/rtnetlink.c:5571
>  netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2496
>  netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline]
>  netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1345
>  netlink_sendmsg+0x904/0xdf0 net/netlink/af_netlink.c:1921
>  sock_sendmsg_nosec net/socket.c:704 [inline]
>  sock_sendmsg+0xcf/0x120 net/socket.c:724
>  ____sys_sendmsg+0x6e8/0x810 net/socket.c:2409
>  ___sys_sendmsg+0xf3/0x170 net/socket.c:2463
>  __sys_sendmsg+0xe5/0x1b0 net/socket.c:2492
>  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
>  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> RIP: 0033:0x7f1bb06badb9
> Code: Unable to access opcode bytes at RIP 0x7f1bb06bad8f.
> RSP: 002b:00007fff3012a658 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
> RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f1bb06badb9
> RDX: 0000000000000000 RSI: 00000000200007c0 RDI: 0000000000000003
> RBP: 0000000000000000 R08: 0000000000000003 R09: 0000000000000003
> R10: 0000000000000003 R11: 0000000000000246 R12: 00007fff3012a688
> R13: 00007fff3012a6a0 R14: 00007fff3012a6e0 R15: 00000000000013c2
>  </TASK>
>
> Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>
> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>

Oops, thanks for the fix! I'm a little puzzled with the patch has my
S-o-b, though? It should probably be replaced by:

Acked-by: Toke Høiland-Jørgensen <toke@toke.dk>

^ permalink raw reply

* Re: [PATCH net] sch_cake: do not call cake_destroy() from cake_init()
From: Eric Dumazet @ 2021-12-10 11:07 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Eric Dumazet, David S . Miller, Jakub Kicinski, netdev, syzbot
In-Reply-To: <87ilvwwwmm.fsf@toke.dk>

On Fri, Dec 10, 2021 at 3:02 AM Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>
> Eric Dumazet <eric.dumazet@gmail.com> writes:
>
> > From: Eric Dumazet <edumazet@google.com>
> >
> > qdiscs are not supposed to call their own destroy() method
> > from init(), because core stack already does that.
> >
> > syzbot was able to trigger use after free:

> >
> > Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc")
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > Reported-by: syzbot <syzkaller@googlegroups.com>
> > Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
>
> Oops, thanks for the fix! I'm a little puzzled with the patch has my
> S-o-b, though? It should probably be replaced by:
>
> Acked-by: Toke Høiland-Jørgensen <toke@toke.dk>

Right, user error from my side, I copied it from your commit changelog
and forgot to s/Signed-off-by/Cc/

Thanks.

^ permalink raw reply

* [PATCH v4 7/7] dt-bindings: net: phy: Add 10-baseT1L 2.4 Vpp
From: alexandru.tachici @ 2021-12-10 11:05 UTC (permalink / raw)
  To: andrew
  Cc: o.rempel, alexandru.tachici, davem, devicetree, hkallweit1, kuba,
	linux-kernel, linux, netdev, robh+dt
In-Reply-To: <20211210110509.20970-1-alexandru.tachici@analog.com>

From: Alexandru Tachici <alexandru.tachici@analog.com>

Add a tristate property to advertise desired transmit level.

If the device supports the 2.4 Vpp operating mode for 10BASE-T1L,
as defined in 802.3gc, and the 2.4 Vpp transmit voltage operation
is desired, property should be set to 1. This property is used
to select whether Auto-Negotiation advertises a request to
operate the 10BASE-T1L PHY in increased transmit level mode.

If property is set to 1, the PHY shall advertise a request
to operate the 10BASE-T1L PHY in increased transmit level mode.
If property is set to zero, the PHY shall not advertise
a request to operate the 10BASE-T1L PHY in increased transmit level mode.

Signed-off-by: Alexandru Tachici <alexandru.tachici@analog.com>
---
 Documentation/devicetree/bindings/net/ethernet-phy.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
index 2766fe45bb98..c1615ea63b1f 100644
--- a/Documentation/devicetree/bindings/net/ethernet-phy.yaml
+++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
@@ -77,6 +77,15 @@ properties:
     description:
       Maximum PHY supported speed in Mbits / seconds.

+  phy-10base-t1l-2.4vpp:
+    description: |
+      tristate, request/disable 2.4 Vpp operating mode. The values are:
+      0: Disable 2.4 Vpp operating mode.
+      1: Request 2.4 Vpp operating mode from link partner.
+      Absence of this property will leave configuration to default values.
+    $ref: "/schemas/types.yaml#/definitions/uint32"
+    enum: [0, 1]
+
   broken-turn-around:
     $ref: /schemas/types.yaml#/definitions/flag
     description:
-- 
2.25.1

^ permalink raw reply related

* [PATCH v4 2/7] net: phy: Add 10-BaseT1L registers
From: alexandru.tachici @ 2021-12-10 11:05 UTC (permalink / raw)
  To: andrew
  Cc: o.rempel, alexandru.tachici, davem, devicetree, hkallweit1, kuba,
	linux-kernel, linux, netdev, robh+dt
In-Reply-To: <20211210110509.20970-1-alexandru.tachici@analog.com>

From: Alexandru Tachici <alexandru.tachici@analog.com>

The 802.3gc specification defines the 10-BaseT1L link
mode for ethernet trafic on twisted wire pair.

PMA status register can be used to detect if the phy supports
2.4 V TX level and PCS control register can be used to
enable/disable PCS level loopback.

Signed-off-by: Alexandru Tachici <alexandru.tachici@analog.com>
---
 include/uapi/linux/mdio.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h
index c54e6eae5366..0b2eba36dd7c 100644
--- a/include/uapi/linux/mdio.h
+++ b/include/uapi/linux/mdio.h
@@ -67,6 +67,9 @@
 #define MDIO_PCS_10GBRT_STAT2	33	/* 10GBASE-R/-T PCS status 2 */
 #define MDIO_AN_10GBT_CTRL	32	/* 10GBASE-T auto-negotiation control */
 #define MDIO_AN_10GBT_STAT	33	/* 10GBASE-T auto-negotiation status */
+#define MDIO_B10L_PMA_CTRL	2294	/* 10BASE-T1L PMA control */
+#define MDIO_PMA_10T1L_STAT	2295	/* 10BASE-T1L PMA status */
+#define MDIO_PCS_10T1L_CTRL	2278	/* 10BASE-T1L PCS control */
 
 /* LASI (Link Alarm Status Interrupt) registers, defined by XENPAK MSA. */
 #define MDIO_PMA_LASI_RXCTRL	0x9000	/* RX_ALARM control */
@@ -268,6 +271,28 @@
 #define MDIO_AN_10GBT_STAT_MS		0x4000	/* Master/slave config */
 #define MDIO_AN_10GBT_STAT_MSFLT	0x8000	/* Master/slave config fault */
 
+/* 10BASE-T1L PMA control */
+#define MDIO_PMA_10T1L_CTRL_LB_EN	0x0001	/* Enable loopback mode */
+#define MDIO_PMA_10T1L_CTRL_EEE_EN	0x0400	/* Enable EEE mode */
+#define MDIO_PMA_10T1L_CTRL_LOW_POWER	0x0800	/* Low-power mode */
+#define MDIO_PMA_10T1L_CTRL_2V4_EN	0x1000	/* Enable 2.4 Vpp operating mode */
+#define MDIO_PMA_10T1L_CTRL_TX_DIS	0x4000	/* Transmit disable */
+#define MDIO_PMA_10T1L_CTRL_PMA_RST	0x8000	/* MA reset */
+
+/* 10BASE-T1L PMA status register. */
+#define MDIO_PMA_10T1L_STAT_LINK	0x0001	/* PMA receive link up */
+#define MDIO_PMA_10T1L_STAT_FAULT	0x0002	/* Fault condition detected */
+#define MDIO_PMA_10T1L_STAT_POLARITY	0x0004	/* Receive polarity is reversed */
+#define MDIO_PMA_10T1L_STAT_RECV_FAULT	0x0200	/* Able to detect fault on receive path */
+#define MDIO_PMA_10T1L_STAT_EEE		0x0400	/* PHY has EEE ability */
+#define MDIO_PMA_10T1L_STAT_LOW_POWER	0x0800	/* PMA has low-power ability */
+#define MDIO_PMA_10T1L_STAT_2V4_ABLE	0x1000	/* PHY has 2.4 Vpp operating mode ability */
+#define MDIO_PMA_10T1L_STAT_LB_ABLE	0x2000	/* PHY has loopback ability */
+
+/* 10BASE-T1L PCS control register. */
+#define MDIO_PCS_10T1L_CTRL_LB		0x4000	/* Enable PCS level loopback mode */
+#define MDIO_PCS_10T1L_CTRL_RESET	0x8000	/* PCS reset */
+
 /* EEE Supported/Advertisement/LP Advertisement registers.
  *
  * EEE capability Register (3.20), Advertisement (7.60) and
-- 
2.25.1


^ permalink raw reply related

* [PATCH v4 6/7] net: phy: adin1100: Add SQI support
From: alexandru.tachici @ 2021-12-10 11:05 UTC (permalink / raw)
  To: andrew
  Cc: o.rempel, alexandru.tachici, davem, devicetree, hkallweit1, kuba,
	linux-kernel, linux, netdev, robh+dt
In-Reply-To: <20211210110509.20970-1-alexandru.tachici@analog.com>

From: Alexandru Tachici <alexandru.tachici@analog.com>

Determine the SQI from MSE using a predefined table
for the 10BASE-T1L.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Alexandru Tachici <alexandru.tachici@analog.com>
---
 drivers/net/phy/adin1100.c | 52 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/drivers/net/phy/adin1100.c b/drivers/net/phy/adin1100.c
index 23d1ae61e0ef..9cbe8b92dc6f 100644
--- a/drivers/net/phy/adin1100.c
+++ b/drivers/net/phy/adin1100.c
@@ -33,6 +33,26 @@
 #define   ADIN_CRSM_SFT_PD_RDY			BIT(1)
 #define   ADIN_CRSM_SYS_RDY			BIT(0)
 
+#define ADIN_MSE_VAL				0x830B
+
+#define ADIN_SQI_MAX	7
+
+struct adin_mse_sqi_range {
+	u16 start;
+	u16 end;
+};
+
+static const struct adin_mse_sqi_range adin_mse_sqi_map[] = {
+	{ 0x0A74, 0xFFFF },
+	{ 0x084E, 0x0A74 },
+	{ 0x0698, 0x084E },
+	{ 0x053D, 0x0698 },
+	{ 0x0429, 0x053D },
+	{ 0x034E, 0x0429 },
+	{ 0x02A0, 0x034E },
+	{ 0x0000, 0x02A0 },
+};
+
 /**
  * struct adin_priv - ADIN PHY driver private data
  * @tx_level_2v4_able:		set if the PHY supports 2.4V TX levels (10BASE-T1L)
@@ -206,6 +226,36 @@ static int adin_get_features(struct phy_device *phydev)
 	return genphy_c45_pma_read_abilities(phydev);
 }
 
+static int adin_get_sqi(struct phy_device *phydev)
+{
+	u16 mse_val;
+	int sqi;
+	int ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_STAT1);
+	if (ret < 0)
+		return ret;
+	else if (!(ret & MDIO_STAT1_LSTATUS))
+		return 0;
+
+	ret = phy_read_mmd(phydev, MDIO_STAT1, ADIN_MSE_VAL);
+	if (ret < 0)
+		return ret;
+
+	mse_val = 0xFFFF & ret;
+	for (sqi = 0; sqi < ARRAY_SIZE(adin_mse_sqi_map); sqi++) {
+		if (mse_val >= adin_mse_sqi_map[sqi].start && mse_val <= adin_mse_sqi_map[sqi].end)
+			return sqi;
+	}
+
+	return -EINVAL;
+}
+
+static int adin_get_sqi_max(struct phy_device *phydev)
+{
+	return ADIN_SQI_MAX;
+}
+
 static int adin_probe(struct phy_device *phydev)
 {
 	struct device *dev = &phydev->mdio.dev;
@@ -232,6 +282,8 @@ static struct phy_driver adin_driver[] = {
 		.set_loopback		= adin_set_loopback,
 		.suspend		= adin_suspend,
 		.resume			= adin_resume,
+		.get_sqi		= adin_get_sqi,
+		.get_sqi_max		= adin_get_sqi_max,
 	},
 };
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH v4 0/7] net: phy: adin1100: Add initial support for ADIN1100 industrial PHY
From: alexandru.tachici @ 2021-12-10 11:05 UTC (permalink / raw)
  To: andrew
  Cc: o.rempel, alexandru.tachici, davem, devicetree, hkallweit1, kuba,
	linux-kernel, linux, netdev, robh+dt

From: Alexandru Tachici <alexandru.tachici@analog.com>

The ADIN1100 is a low power single port 10BASE-T1L transceiver designed for
industrial Ethernet applications and is compliant with the IEEE 802.3cg
Ethernet standard for long reach 10 Mb/s Single Pair Ethernet.

The ADIN1100 uses Auto-Negotiation capability in accordance
with IEEE 802.3 Clause 98, providing a mechanism for
exchanging information between PHYs to allow link partners to
agree to a common mode of operation.

The concluded operating mode is the transmit amplitude mode and
master/slave preference common across the two devices.

Both device and LP advertise their ability and request for
increased transmit at:
- BASE-T1 autonegotiation advertisement register [47:32]\
Clause 45.2.7.21 of Standard 802.3
- BIT(13) - 10BASE-T1L High Level Transmit Operating Mode Ability
- BIT(12) - 10BASE-T1L High Level Transmit Operating Mode Request

For 2.4 Vpp (high level transmit) operation, both devices need
to have the High Level Transmit Operating Mode Ability bit set,
and only one of them needs to have the High Level Transmit
Operating Mode Request bit set. Otherwise 1.0 Vpp transmit level
will be used.

Settings for eth1:
	Supported ports: [ TP	 MII ]
	Supported link modes:   10baseT1L/Full
	Supported pause frame use: Symmetric Receive-only
	Supports auto-negotiation: Yes
	Supported FEC modes: Not reported
	Advertised link modes:  10baseT1L/Full
	Advertised pause frame use: No
	Advertised auto-negotiation: Yes
	Advertised FEC modes: Not reported
	Link partner advertised link modes:  10baseT1L/Full
	Link partner advertised pause frame use: No
	Link partner advertised auto-negotiation: Yes
	Link partner advertised FEC modes: Not reported
	Speed: 10Mb/s
	Duplex: Full
	Auto-negotiation: on
	master-slave cfg: preferred slave
	master-slave status: slave
	Port: Twisted Pair
	PHYAD: 0
	Transceiver: external
	MDI-X: Unknown
	Link detected: yes
	SQI: 7/7

1. Add basic support for ADIN1100.

Alexandru Ardelean (1):
  net: phy: adin1100: Add initial support for ADIN1100 industrial PHY

1. Added 10baset-T1L link modes.

2. Added 10-BasetT1L registers.

3. Added Base-T1 auto-negotiation registers. For Base-T1 these
registers decide master/slave status and TX voltage of the
device and link partner.

4. Added 10BASE-T1L support in phy-c45.c. Now genphy functions will call
Base-T1 functions where registers don't match, like the auto-negotiation ones.

5. Convert MSE to SQI using a predefined table and allow user access
through ethtool.

6. DT bindings for the 2.4 Vpp transmit mode.

Alexandru Ardelean (1):
  net: phy: adin1100: Add initial support for ADIN1100 industrial PHY

Alexandru Tachici (6):
  ethtool: Add 10base-T1L link mode entry
  net: phy: Add 10-BaseT1L registers
  net: phy: Add BaseT1 auto-negotiation registers
  net: phy: Add 10BASE-T1L support in phy-c45
  net: phy: adin1100: Add SQI support
  dt-bindings: net: phy: Add 10-baseT1L 2.4 Vpp

Changelog: V3 -> V4:
	- fixed kernel-doc errors
	- ETHTOOL_LINK_MODE_10baseT1L_Full_BIT of phydev->supported is now set inside
	in genphy_c45_pma_read_abilities() call if device supports 10BASE-T1L
	- fix 802.3 reg defines comments (kept documentation wording instead)
	- fix 0x0010 advertise master preference (T4) (instead of 0x0080)
	- added genphy_c45_baset1_read_lpa to phy-c45.c, will get called from genphy_c45_read_lpa,
	if the phy supports BASE-T1 advertisement register
	- added genphy_c45_baset1_read_link to phy-c45.c, will get called from genphy_c45_read_link,
	if the phy supports BASE-T1 registers
	- replaced adin_read_lpa from adin1100.c with genphy_c45_read_lpa
	- added support for BASE-T1 master/slave status and advertising in phy-c45.c
	- dropped yaml file (no need for it) no vendor specific properties to be added in the DT
	- moved most of the BASE-T1 specific code from adin1100.c to gen-phy-c45
	- changed an-10base-t1l-2.4vpp property name to phy-10base-t1l-2.4vpp
	- in adin1100.c, when auto-negotiation is disabled, if increased transmit property is set
	in DT (phy-10base-t1l-2.4vpp = <1>) force Tx PHY level to 2.4 vpp otherwise force
	to 1.0 vpp.
	- added 10BASE-T1L PMA control mdio.h

 .../devicetree/bindings/net/ethernet-phy.yaml |   9 +
 drivers/net/phy/Kconfig                       |   7 +
 drivers/net/phy/Makefile                      |   1 +
 drivers/net/phy/adin1100.c                    | 299 ++++++++++++++++++
 drivers/net/phy/phy-c45.c                     | 283 ++++++++++++++++-
 drivers/net/phy/phy-core.c                    |   3 +-
 include/linux/mdio.h                          |  70 ++++
 include/uapi/linux/ethtool.h                  |   1 +
 include/uapi/linux/mdio.h                     |  75 +++++
 net/ethtool/common.c                          |   3 +
 10 files changed, 744 insertions(+), 7 deletions(-)
 create mode 100644 drivers/net/phy/adin1100.c

--
2.25.1

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox