Netdev List
 help / color / mirror / Atom feed
* [RFC bpf-next 07/11] bpf: Add helper to retrieve socket in BPF
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

This patch adds a new BPF helper function, sk_lookup() which allows BPF
programs to find out if there is a socket listening on this host, and
returns a socket pointer which the BPF program can then access to
determine, for instance, whether to forward or drop traffic. sk_lookup()
takes a reference on the socket, so when a BPF program makes use of this
function, it must subsequently pass the returned pointer into the newly
added sk_release() to return the reference.

By way of example, the following pseudocode would filter inbound
connections at XDP if there is no corresponding service listening for
the traffic:

  struct bpf_sock_tuple tuple;
  struct bpf_sock_ops *sk;

  populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
  sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
  if (!sk) {
    // Couldn't find a socket listening for this traffic. Drop.
    return TC_ACT_SHOT;
  }
  bpf_sk_release(sk, 0);
  return TC_ACT_OK;

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 include/uapi/linux/bpf.h                  |  39 +++++++++++-
 kernel/bpf/verifier.c                     |   8 ++-
 net/core/filter.c                         | 102 ++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            |  40 +++++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |   7 ++
 5 files changed, 193 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d615c777b573..29f38838dbca 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1828,6 +1828,25 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
+ * 	Decription
+ * 		Look for socket matching 'tuple'. The return value must be checked,
+ * 		and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ *  int bpf_sk_release(sock, flags)
+ * 	Description
+ * 		Release the reference held by 'sock'.
+ * 		@sock: Pointer reference to release. Must be found via bpf_sk_lookup().
+ * 		@flags: flags value
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1898,7 +1917,9 @@ union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),	\
+	FN(sk_lookup),			\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2060,6 +2081,22 @@ struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} saddr;
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} daddr;
+	__be16 sport;
+	__be16 dport;
+	__u32 dst_if;
+	__u8 family;
+	__u8 proto;
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92b9a5dc465a..579012c483e4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
  * passes through a NULL-check conditional. For the branch wherein the state is
  * changed to CONST_IMM, the verifier releases the reference.
+ *
+ * For each helper function that allocates a reference, such as bpf_sk_lookup(),
+ * there is a corresponding release function, such as bpf_sk_release(). When
+ * a reference type passes into the release function, the verifier also releases
+ * the reference. If any unchecked or unreleased reference remains at the end of
+ * the program, the verifier rejects it.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -277,7 +283,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type)
  */
 static bool is_release_function(enum bpf_func_id func_id)
 {
-	return false;
+	return func_id == BPF_FUNC_sk_release;
 }
 
 /* string representation of 'enum bpf_reg_type' */
diff --git a/net/core/filter.c b/net/core/filter.c
index 4c35152fb3a8..751c255d17d3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -58,8 +58,12 @@
 #include <net/busy_poll.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
+#include <net/udp.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/net_namespace.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+struct sock *
+sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
+	int dst_if = (int)tuple->dst_if;
+	struct in6_addr *src6;
+	struct in6_addr *dst6;
+
+	if (tuple->family == AF_INET6) {
+		src6 = (struct in6_addr *)&tuple->saddr.ipv6;
+		dst6 = (struct in6_addr *)&tuple->daddr.ipv6;
+	} else if (tuple->family != AF_INET) {
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	if (tuple->proto == IPPROTO_TCP) {
+		if (tuple->family == AF_INET)
+			return inet_lookup(net, &tcp_hashinfo, NULL, 0,
+					   tuple->saddr.ipv4, tuple->sport,
+					   tuple->daddr.ipv4, tuple->dport,
+					   dst_if);
+		else
+			return inet6_lookup(net, &tcp_hashinfo, NULL, 0,
+					    src6, tuple->sport,
+					    dst6, tuple->dport, dst_if);
+	} else if (tuple->proto == IPPROTO_UDP) {
+		if (tuple->family == AF_INET)
+			return udp4_lib_lookup(net, tuple->saddr.ipv4,
+					       tuple->sport, tuple->daddr.ipv4,
+					       tuple->dport, dst_if);
+		else
+			return udp6_lib_lookup(net, src6, tuple->sport,
+					       dst6, tuple->dport, dst_if);
+	} else {
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	return NULL;
+}
+
+BPF_CALL_5(bpf_sk_lookup, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+	struct net *caller_net = dev_net(skb->dev);
+	struct sock *sk = NULL;
+	struct net *net;
+
+	/* XXX: Perform verification-time checking of tuple size? */
+	if (unlikely(len != sizeof(struct bpf_sock_tuple) || flags))
+		goto out;
+
+	net = get_net_ns_by_id(caller_net, netns_id);
+	if (unlikely(!net))
+		goto out;
+
+	sk = sk_lookup(net, tuple);
+	put_net(net);
+	if (IS_ERR_OR_NULL(sk))
+		sk = NULL;
+	else
+		sk = sk_to_full_sk(sk);
+out:
+	return (unsigned long) sk;
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_proto = {
+	.func		= bpf_sk_lookup,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
+{
+	sock_gen_put(sk);
+	if (unlikely(flags))
+		return -EINVAL;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_release_proto = {
+	.func		= bpf_sk_release,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_SOCKET,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -4181,6 +4275,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
+	case BPF_FUNC_sk_lookup:
+		return &bpf_sk_lookup_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4292,6 +4390,10 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_uid_proto;
 	case BPF_FUNC_sk_redirect_map:
 		return &bpf_sk_redirect_map_proto;
+	case BPF_FUNC_sk_lookup:
+		return &bpf_sk_lookup_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index fff51c187d1e..29f38838dbca 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -117,6 +117,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
 };
 
 enum bpf_prog_type {
@@ -1827,6 +1828,25 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
+ * 	Decription
+ * 		Look for socket matching 'tuple'. The return value must be checked,
+ * 		and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ *  int bpf_sk_release(sock, flags)
+ * 	Description
+ * 		Release the reference held by 'sock'.
+ * 		@sock: Pointer reference to release. Must be found via bpf_sk_lookup().
+ * 		@flags: flags value
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1897,7 +1917,9 @@ union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),	\
+	FN(sk_lookup),			\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2059,6 +2081,22 @@ struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} saddr;
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} daddr;
+	__be16 sport;
+	__be16 dport;
+	__u32 dst_if;
+	__u8 family;
+	__u8 proto;
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 265f8e0e8ada..4dc311ea0c16 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -103,6 +103,13 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 	(void *) BPF_FUNC_skb_get_xfrm_state;
 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
 	(void *) BPF_FUNC_get_stack;
+static struct bpf_sock *(*bpf_sk_lookup)(void *ctx,
+					 struct bpf_sock_tuple *tuple,
+					 int size, unsigned int netns_id,
+					 unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup;
+static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) =
+	(void *) BPF_FUNC_sk_release;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 06/11] bpf: Add reference tracking to verifier
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

Allow helper functions to acquire a reference and return it into a
register. Specific pointer types such as the PTR_TO_SOCKET will
implicitly represent such a reference. The verifier must ensure that
these references are released exactly once in each path through the
program.

To achieve this, this commit assigns an id to the pointer and tracks it
in the 'bpf_func_state', then when the function or program exits,
verifies that all of the acquired references have been freed. When the
pointer is passed to a function that frees the reference, it is removed
from the 'bpf_func_state` and all existing copies of the pointer in
registers are marked invalid.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 include/linux/bpf_verifier.h |  18 ++-
 kernel/bpf/verifier.c        | 295 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 292 insertions(+), 21 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 9dcd87f1d322..8dbee360b3ec 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -104,6 +104,11 @@ struct bpf_stack_state {
 	u8 slot_type[BPF_REG_SIZE];
 };
 
+struct bpf_reference_state {
+	int id;
+	int insn_idx; /* allocation insn */
+};
+
 /* state of the program:
  * type of all registers and stack info
  */
@@ -122,7 +127,9 @@ struct bpf_func_state {
 	 */
 	u32 subprogno;
 
-	/* should be second to last. See copy_func_state() */
+	/* The following fields should be last. See copy_func_state() */
+	int acquired_refs;
+	struct bpf_reference_state *refs;
 	int allocated_stack;
 	struct bpf_stack_state *stack;
 };
@@ -218,11 +225,16 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 
-static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state *cur = env->cur_state;
 
-	return cur->frame[cur->curframe]->regs;
+	return cur->frame[cur->curframe];
+}
+
+static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+{
+	return cur_func(env)->regs;
 }
 
 int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f426ebf2b6bf..92b9a5dc465a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,5 +1,6 @@
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -140,6 +141,18 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  *
  * After the call R0 is set to return type of the function and registers R1-R5
  * are set to NOT_INIT to indicate that they are no longer readable.
+ *
+ * The following reference types represent a potential reference to a kernel
+ * resource which, after first being allocated, must be checked and freed by
+ * the BPF program:
+ * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET
+ *
+ * When the verifier sees a helper call return a reference type, it allocates a
+ * pointer id for the reference and stores it in the current function state.
+ * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into
+ * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
+ * passes through a NULL-check conditional. For the branch wherein the state is
+ * changed to CONST_IMM, the verifier releases the reference.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -229,7 +242,42 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
 {
-	return type == PTR_TO_MAP_VALUE_OR_NULL;
+	return type == PTR_TO_MAP_VALUE_OR_NULL ||
+	       type == PTR_TO_SOCKET_OR_NULL;
+}
+
+static bool type_is_refcounted(enum bpf_reg_type type)
+{
+	return type == PTR_TO_SOCKET;
+}
+
+static bool type_is_refcounted_or_null(enum bpf_reg_type type)
+{
+	return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL;
+}
+
+static bool reg_is_refcounted(const struct bpf_reg_state *reg)
+{
+	return type_is_refcounted(reg->type);
+}
+
+static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
+{
+	return type_is_refcounted_or_null(reg->type);
+}
+
+static bool arg_type_is_refcounted(enum bpf_arg_type type)
+{
+	return type == ARG_PTR_TO_SOCKET;
+}
+
+/* Determine whether the function releases some resources allocated by another
+ * function call. The first reference type argument will be assumed to be
+ * released by release_reference().
+ */
+static bool is_release_function(enum bpf_func_id func_id)
+{
+	return false;
 }
 
 /* string representation of 'enum bpf_reg_type' */
@@ -344,6 +392,12 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 		if (state->stack[i].slot_type[0] == STACK_ZERO)
 			verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE);
 	}
+	if (state->acquired_refs && state->refs[0].id) {
+		verbose(env, " refs=%d", state->refs[0].id);
+		for (i = 1; i < state->acquired_refs; i++)
+			if (state->refs[i].id)
+				verbose(env, ",%d", state->refs[i].id);
+	}
 	verbose(env, "\n");
 }
 
@@ -362,6 +416,8 @@ static int copy_##NAME##_state(struct bpf_func_state *dst,		\
 	       sizeof(*src->FIELD) * (src->COUNT / SIZE));		\
 	return 0;							\
 }
+/* copy_reference_state() */
+COPY_STATE_FN(reference, acquired_refs, refs, 1)
 /* copy_stack_state() */
 COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
 #undef COPY_STATE_FN
@@ -400,6 +456,8 @@ static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
 	state->FIELD = new_##FIELD;					\
 	return 0;							\
 }
+/* realloc_reference_state() */
+REALLOC_STATE_FN(reference, acquired_refs, refs, 1)
 /* realloc_stack_state() */
 REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
 #undef REALLOC_STATE_FN
@@ -411,16 +469,89 @@ REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
  * which realloc_stack_state() copies over. It points to previous
  * bpf_verifier_state which is never reallocated.
  */
-static int realloc_func_state(struct bpf_func_state *state, int size,
-			      bool copy_old)
+static int realloc_func_state(struct bpf_func_state *state, int stack_size,
+			      int refs_size, bool copy_old)
 {
-	return realloc_stack_state(state, size, copy_old);
+	int err = realloc_reference_state(state, refs_size, copy_old);
+	if (err)
+		return err;
+	return realloc_stack_state(state, stack_size, copy_old);
+}
+
+/* Acquire a pointer id from the env and update the state->refs to include
+ * this new pointer reference.
+ * On success, returns a valid pointer id to associate with the register
+ * On failure, returns a negative errno.
+ */
+static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int new_ofs = state->acquired_refs;
+	int id, err;
+
+	err = realloc_reference_state(state, state->acquired_refs + 1, true);
+	if (err)
+		return err;
+	id = ++env->id_gen;
+	state->refs[new_ofs].id = id;
+	state->refs[new_ofs].insn_idx = insn_idx;
+
+	return id;
+}
+
+/* release function corresponding to acquire_reference_state(). Idempotent. */
+static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
+{
+	int i, last_idx;
+
+	if (!ptr_id)
+		return 0;
+
+	last_idx = state->acquired_refs - 1;
+	for (i = 0; i < state->acquired_refs; i++) {
+		if (state->refs[i].id == ptr_id) {
+			if (last_idx && i != last_idx)
+				memcpy(&state->refs[i], &state->refs[last_idx],
+				       sizeof(*state->refs));
+			memset(&state->refs[last_idx], 0, sizeof(*state->refs));
+			state->acquired_refs--;
+			return 0;
+		}
+	}
+	return -EFAULT;
+}
+
+/* variation on the above for cases where we expect that there must be an
+ * outstanding reference for the specified ptr_id.
+ */
+static int release_reference_state(struct bpf_verifier_env *env, int ptr_id)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int err;
+
+	err = __release_reference_state(state, ptr_id);
+	if (WARN_ON_ONCE(err != 0))
+		verbose(env, "verifier internal error: can't release reference\n");
+	return err;
+}
+
+static int transfer_reference_state(struct bpf_func_state *dst,
+				    struct bpf_func_state *src)
+{
+	int err = realloc_reference_state(dst, src->acquired_refs, false);
+	if (err)
+		return err;
+	err = copy_reference_state(dst, src);
+	if (err)
+		return err;
+	return 0;
 }
 
 static void free_func_state(struct bpf_func_state *state)
 {
 	if (!state)
 		return;
+	kfree(state->refs);
 	kfree(state->stack);
 	kfree(state);
 }
@@ -446,10 +577,14 @@ static int copy_func_state(struct bpf_func_state *dst,
 {
 	int err;
 
-	err = realloc_func_state(dst, src->allocated_stack, false);
+	err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs,
+				 false);
+	if (err)
+		return err;
+	memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
+	err = copy_reference_state(dst, src);
 	if (err)
 		return err;
-	memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack));
 	return copy_stack_state(dst, src);
 }
 
@@ -1019,7 +1154,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
 	enum bpf_reg_type type;
 
 	err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
-				 true);
+				 state->acquired_refs, true);
 	if (err)
 		return err;
 	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
@@ -2259,10 +2394,32 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
 	return true;
 }
 
+static bool check_refcount_ok(const struct bpf_func_proto *fn)
+{
+	int count = 0;
+
+	if (arg_type_is_refcounted(fn->arg1_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg2_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg3_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg4_type))
+		count++;
+	if (arg_type_is_refcounted(fn->arg5_type))
+		count++;
+
+	/* We only support one arg being unreferenced at the moment,
+	 * which is sufficient for the helper functions we have right now.
+	 */
+	return count <= 1;
+}
+
 static int check_func_proto(const struct bpf_func_proto *fn)
 {
 	return check_raw_mode_ok(fn) &&
-	       check_arg_pair_ok(fn) ? 0 : -EINVAL;
+	       check_arg_pair_ok(fn) &&
+	       check_refcount_ok(fn) ? 0 : -EINVAL;
 }
 
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2295,12 +2452,57 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 		__clear_all_pkt_pointers(env, vstate->frame[i]);
 }
 
+static void release_reg_references(struct bpf_verifier_env *env,
+				   struct bpf_func_state *state, int id)
+{
+	struct bpf_reg_state *regs = state->regs, *reg;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (regs[i].id == id)
+			mark_reg_unknown(env, regs, i);
+
+	for_each_spilled_reg(i, state, reg) {
+		if (!reg)
+			continue;
+		if (reg_is_refcounted(reg) && reg->id == id)
+			__mark_reg_unknown(reg);
+	}
+}
+
+/* The pointer with the specified id has released its reference to kernel
+ * resources. Identify all copies of the same pointer and clear the reference.
+ */
+static int release_reference(struct bpf_verifier_env *env)
+{
+	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_reg_state *regs = cur_regs(env);
+	int i, ptr_id = 0;
+
+	for (i = BPF_REG_1; i < BPF_REG_6; i++) {
+		if (reg_is_refcounted(&regs[i])) {
+			ptr_id = regs[i].id;
+			break;
+		}
+	}
+	if (WARN_ON_ONCE(!ptr_id)) {
+		/* references must be special pointer types that are checked
+		 * against argument requirements for the release function. */
+		verbose(env, "verifier internal error: can't locate refcounted arg\n");
+		return -EFAULT;
+	}
+	for (i = 0; i <= vstate->curframe; i++)
+		release_reg_references(env, vstate->frame[i], ptr_id);
+
+	return release_reference_state(env, ptr_id);
+}
+
 static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			   int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_func_state *caller, *callee;
-	int i, subprog, target_insn;
+	int i, err, subprog, target_insn;
 
 	if (state->curframe + 1 >= MAX_CALL_FRAMES) {
 		verbose(env, "the call stack of %d frames is too deep\n",
@@ -2338,6 +2540,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			state->curframe + 1 /* frameno within this callchain */,
 			subprog /* subprog number within this prog */);
 
+	/* Transfer references to the callee */
+	err = transfer_reference_state(callee, caller);
+	if (err)
+		return err;
+
 	/* copy r1 - r5 args that callee can access */
 	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
 		callee->regs[i] = caller->regs[i];
@@ -2368,6 +2575,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_func_state *caller, *callee;
 	struct bpf_reg_state *r0;
+	int err;
 
 	callee = state->frame[state->curframe];
 	r0 = &callee->regs[BPF_REG_0];
@@ -2387,6 +2595,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	/* return to the caller whatever r0 had in the callee */
 	caller->regs[BPF_REG_0] = *r0;
 
+	/* Transfer references to the caller */
+	err = transfer_reference_state(caller, callee);
+	if (err)
+		return err;
+
 	*insn_idx = callee->callsite + 1;
 	if (env->log.level) {
 		verbose(env, "returning from callee:\n");
@@ -2498,6 +2711,15 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 			return err;
 	}
 
+	/* If the function is a release() function, mark all copies of the same
+	 * pointer as "freed" in all registers and in the stack.
+	 */
+	if (is_release_function(func_id)) {
+		err = release_reference(env);
+		if (err)
+			return err;
+	}
+
 	regs = cur_regs(env);
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
@@ -2535,9 +2757,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		else if (insn_aux->map_ptr != meta.map_ptr)
 			insn_aux->map_ptr = BPF_MAP_PTR_POISON;
 	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+		int id = acquire_reference_state(env, insn_idx);
+		if (id < 0)
+			return id;
 		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
-		regs[BPF_REG_0].id = ++env->id_gen;
+		regs[BPF_REG_0].id = id;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -3599,7 +3824,8 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src,
 	}
 }
 
-static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
+static void mark_ptr_or_null_reg(struct bpf_func_state *state,
+				 struct bpf_reg_state *reg, u32 id,
 				 bool is_null)
 {
 	if (reg_type_may_be_null(reg->type) && reg->id == id) {
@@ -3625,11 +3851,13 @@ static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
 		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
 			reg->type = PTR_TO_SOCKET;
 		}
-		/* We don't need id from this point onwards anymore, thus we
-		 * should better reset it, so that state pruning has chances
-		 * to take effect.
-		 */
-		reg->id = 0;
+		if (is_null || !reg_is_refcounted(reg)) {
+			/* We don't need id from this point onwards anymore,
+			 * thus we should better reset it, so that state
+			 * pruning has chances to take effect.
+			 */
+			reg->id = 0;
+		}
 	}
 }
 
@@ -3644,15 +3872,18 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 	u32 id = regs[regno].id;
 	int i, j;
 
+	if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
+		__release_reference_state(state, id);
+
 	for (i = 0; i < MAX_BPF_REG; i++)
-		mark_ptr_or_null_reg(&regs[i], id, is_null);
+		mark_ptr_or_null_reg(state, &regs[i], id, is_null);
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
 		for_each_spilled_reg(i, state, reg) {
 			if (!reg)
 				continue;
-			mark_ptr_or_null_reg(reg, id, is_null);
+			mark_ptr_or_null_reg(state, reg, id, is_null);
 		}
 	}
 }
@@ -4475,6 +4706,14 @@ static bool stacksafe(struct bpf_func_state *old,
 	return true;
 }
 
+static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur)
+{
+	if (old->acquired_refs != cur->acquired_refs)
+		return false;
+	return !memcmp(old->refs, cur->refs,
+		       sizeof(*old->refs) * old->acquired_refs);
+}
+
 /* compare two verifier states
  *
  * all states stored in state_list are known to be valid, since
@@ -4520,6 +4759,9 @@ static bool func_states_equal(struct bpf_func_state *old,
 
 	if (!stacksafe(old, cur, idmap))
 		goto out_free;
+
+	if (!refsafe(old, cur))
+		goto out_free;
 	ret = true;
 out_free:
 	kfree(idmap);
@@ -4669,6 +4911,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	return 0;
 }
 
+static int check_reference_leak(struct bpf_verifier_env *env)
+{
+	struct bpf_func_state *state = cur_func(env);
+	int i;
+
+	for (i = 0; i < state->acquired_refs; i++) {
+		verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
+			state->refs[i].id, state->refs[i].insn_idx);
+	}
+	return state->acquired_refs ? -EINVAL : 0;
+}
+
 static int do_check(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state *state;
@@ -4763,6 +5017,7 @@ static int do_check(struct bpf_verifier_env *env)
 
 		regs = cur_regs(env);
 		env->insn_aux_data[insn_idx].seen = true;
+
 		if (class == BPF_ALU || class == BPF_ALU64) {
 			err = check_alu_op(env, insn);
 			if (err)
@@ -4931,6 +5186,10 @@ static int do_check(struct bpf_verifier_env *env)
 					continue;
 				}
 
+				err = check_reference_leak(env);
+				if (err)
+					return err;
+
 				/* eBPF calling convetion is such that R0 is used
 				 * to return the value from eBPF program.
 				 * Make sure that it's readable at this time
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 05/11] bpf: Macrofy stack state copy
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

An upcoming commit will need very similar copy/realloc boilerplate, so
refactor the existing stack copy/realloc functions into macros to
simplify it.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 kernel/bpf/verifier.c | 104 ++++++++++++++++++++++++++++----------------------
 1 file changed, 59 insertions(+), 45 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d38c7c1e9da6..f426ebf2b6bf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -347,60 +347,74 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 	verbose(env, "\n");
 }
 
-static int copy_stack_state(struct bpf_func_state *dst,
-			    const struct bpf_func_state *src)
-{
-	if (!src->stack)
-		return 0;
-	if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
-		/* internal bug, make state invalid to reject the program */
-		memset(dst, 0, sizeof(*dst));
-		return -EFAULT;
-	}
-	memcpy(dst->stack, src->stack,
-	       sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
-	return 0;
-}
+#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE)				\
+static int copy_##NAME##_state(struct bpf_func_state *dst,		\
+			       const struct bpf_func_state *src)	\
+{									\
+	if (!src->FIELD)						\
+		return 0;						\
+	if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) {			\
+		/* internal bug, make state invalid to reject the program */ \
+		memset(dst, 0, sizeof(*dst));				\
+		return -EFAULT;						\
+	}								\
+	memcpy(dst->FIELD, src->FIELD,					\
+	       sizeof(*src->FIELD) * (src->COUNT / SIZE));		\
+	return 0;							\
+}
+/* copy_stack_state() */
+COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
+#undef COPY_STATE_FN
+
+#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE)			\
+static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \
+				  bool copy_old)			\
+{									\
+	u32 old_size = state->COUNT;					\
+	struct bpf_##NAME##_state *new_##FIELD;				\
+	int slot = size / SIZE;						\
+									\
+	if (size <= old_size || !size) {				\
+		if (copy_old)						\
+			return 0;					\
+		state->COUNT = slot * SIZE;				\
+		if (!size && old_size) {				\
+			kfree(state->FIELD);				\
+			state->FIELD = NULL;				\
+		}							\
+		return 0;						\
+	}								\
+	new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \
+				    GFP_KERNEL);			\
+	if (!new_##FIELD)						\
+		return -ENOMEM;						\
+	if (copy_old) {							\
+		if (state->FIELD)					\
+			memcpy(new_##FIELD, state->FIELD,		\
+			       sizeof(*new_##FIELD) * (old_size / SIZE)); \
+		memset(new_##FIELD + old_size / SIZE, 0,		\
+		       sizeof(*new_##FIELD) * (size - old_size) / SIZE); \
+	}								\
+	state->COUNT = slot * SIZE;					\
+	kfree(state->FIELD);						\
+	state->FIELD = new_##FIELD;					\
+	return 0;							\
+}
+/* realloc_stack_state() */
+REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE)
+#undef REALLOC_STATE_FN
 
 /* do_check() starts with zero-sized stack in struct bpf_verifier_state to
  * make it consume minimal amount of memory. check_stack_write() access from
  * the program calls into realloc_func_state() to grow the stack size.
  * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
- * which this function copies over. It points to previous bpf_verifier_state
- * which is never reallocated
+ * which realloc_stack_state() copies over. It points to previous
+ * bpf_verifier_state which is never reallocated.
  */
 static int realloc_func_state(struct bpf_func_state *state, int size,
 			      bool copy_old)
 {
-	u32 old_size = state->allocated_stack;
-	struct bpf_stack_state *new_stack;
-	int slot = size / BPF_REG_SIZE;
-
-	if (size <= old_size || !size) {
-		if (copy_old)
-			return 0;
-		state->allocated_stack = slot * BPF_REG_SIZE;
-		if (!size && old_size) {
-			kfree(state->stack);
-			state->stack = NULL;
-		}
-		return 0;
-	}
-	new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
-				  GFP_KERNEL);
-	if (!new_stack)
-		return -ENOMEM;
-	if (copy_old) {
-		if (state->stack)
-			memcpy(new_stack, state->stack,
-			       sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
-		memset(new_stack + old_size / BPF_REG_SIZE, 0,
-		       sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
-	}
-	state->allocated_stack = slot * BPF_REG_SIZE;
-	kfree(state->stack);
-	state->stack = new_stack;
-	return 0;
+	return realloc_stack_state(state, size, copy_old);
 }
 
 static void free_func_state(struct bpf_func_state *state)
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 04/11] bpf: Add PTR_TO_SOCKET verifier type
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

Teach the verifier a little bit about a new type of pointer, a
PTR_TO_SOCKET. This pointer type is accessed from BPF through the
'struct bpf_sock' structure.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 include/linux/bpf.h          | 19 +++++++++-
 include/linux/bpf_verifier.h |  2 ++
 kernel/bpf/verifier.c        | 86 ++++++++++++++++++++++++++++++++++++++------
 net/core/filter.c            | 30 +++++++++-------
 4 files changed, 114 insertions(+), 23 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a38e474bf7ee..a03b4b0edcb6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -136,7 +136,7 @@ enum bpf_arg_type {
 	/* the following constraints used to prototype bpf_memcmp() and other
 	 * functions that access data on eBPF program stack
 	 */
-	ARG_PTR_TO_MEM,		/* pointer to valid memory (stack, packet, map value) */
+	ARG_PTR_TO_MEM,		/* pointer to valid memory (stack, packet, map value, socket) */
 	ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */
 	ARG_PTR_TO_UNINIT_MEM,	/* pointer to memory does not need to be initialized,
 				 * helper function must fill all bytes or clear
@@ -148,6 +148,7 @@ enum bpf_arg_type {
 
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
+	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
 };
 
 /* type of values returned from helper functions */
@@ -155,6 +156,7 @@ enum bpf_return_type {
 	RET_INTEGER,			/* function returns integer */
 	RET_VOID,			/* function doesn't return anything */
 	RET_PTR_TO_MAP_VALUE_OR_NULL,	/* returns a pointer to map elem value or NULL */
+	RET_PTR_TO_SOCKET_OR_NULL,	/* returns a pointer to a socket or NULL */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@@ -205,6 +207,8 @@ enum bpf_reg_type {
 	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
+	PTR_TO_SOCKET,		 /* reg points to struct bpf_sock */
+	PTR_TO_SOCKET_OR_NULL,	 /* reg points to struct bpf_sock or NULL */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -326,6 +330,11 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
 typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src,
 					unsigned long off, unsigned long len);
+typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type,
+					const struct bpf_insn *src,
+					struct bpf_insn *dst,
+					struct bpf_prog *prog,
+					u32 *target_size);
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
@@ -729,4 +738,12 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+			      struct bpf_insn_access_aux *info);
+u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+			        const struct bpf_insn *si,
+			        struct bpf_insn *insn_buf,
+			        struct bpf_prog *prog,
+			        u32 *target_size);
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index a613b52ce939..9dcd87f1d322 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -57,6 +57,8 @@ struct bpf_reg_state {
 	 * offset, so they can share range knowledge.
 	 * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
 	 * came from, when one is tested for != NULL.
+	 * For PTR_TO_SOCKET this is used to share which pointers retain the
+	 * same reference to the socket, to determine proper reference freeing.
 	 */
 	u32 id;
 	/* Ordering of fields matters.  See states_equal() */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1b31b805dea4..d38c7c1e9da6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -80,8 +80,8 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * (like pointer plus pointer becomes SCALAR_VALUE type)
  *
  * When verifier sees load or store instructions the type of base register
- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer
- * types recognized by check_mem_access() function.
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are
+ * four pointer types recognized by check_mem_access() function.
  *
  * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
  * and the range of [ptr, ptr + map's value_size) is accessible.
@@ -244,6 +244,8 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET]		= "pkt",
 	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
+	[PTR_TO_SOCKET]		= "sock",
+	[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
 };
 
 static void print_liveness(struct bpf_verifier_env *env,
@@ -977,6 +979,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET_END:
 	case CONST_PTR_TO_MAP:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -1360,6 +1364,28 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 	return -EACCES;
 }
 
+static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off,
+			     int size, enum bpf_access_type t)
+{
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = &regs[regno];
+	struct bpf_insn_access_aux info;
+
+	if (reg->smin_value < 0) {
+		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+			regno);
+		return -EACCES;
+	}
+
+	if (!bpf_sock_is_valid_access(off, size, t, &info)) {
+		verbose(env, "invalid bpf_sock_ops access off=%d size=%d\n",
+			off, size);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
 static bool __is_pointer_value(bool allow_ptr_leaks,
 			       const struct bpf_reg_state *reg)
 {
@@ -1475,6 +1501,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 		 */
 		strict = true;
 		break;
+	case PTR_TO_SOCKET:
+		pointer_desc = "sock ";
+		break;
 	default:
 		break;
 	}
@@ -1723,6 +1752,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_packet_access(env, regno, off, size, false);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
+
+	} else if (reg->type == PTR_TO_SOCKET) {
+		if (t == BPF_WRITE) {
+			verbose(env, "cannot write into socket\n");
+			return -EACCES;
+		}
+		err = check_sock_access(env, regno, off, size, t);
+		if (!err && t == BPF_READ && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
+
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
@@ -1941,6 +1980,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_CTX;
 		if (type != expected_type)
 			goto err_type;
+	} else if (arg_type == ARG_PTR_TO_SOCKET) {
+		expected_type = PTR_TO_SOCKET;
+		if (type != expected_type)
+			goto err_type;
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
@@ -2477,6 +2520,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 			insn_aux->map_ptr = meta.map_ptr;
 		else if (insn_aux->map_ptr != meta.map_ptr)
 			insn_aux->map_ptr = BPF_MAP_PTR_POISON;
+	} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+		mark_reg_known_zero(env, regs, BPF_REG_0);
+		regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
+		regs[BPF_REG_0].id = ++env->id_gen;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -2614,6 +2661,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		return -EACCES;
 	case CONST_PTR_TO_MAP:
 	case PTR_TO_PACKET_END:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
 		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
 			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
@@ -3559,6 +3608,8 @@ static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
 			} else {
 				reg->type = PTR_TO_MAP_VALUE;
 			}
+		} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
+			reg->type = PTR_TO_SOCKET;
 		}
 		/* We don't need id from this point onwards anymore, thus we
 		 * should better reset it, so that state pruning has chances
@@ -4333,6 +4384,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_CTX:
 	case CONST_PTR_TO_MAP:
 	case PTR_TO_PACKET_END:
+	case PTR_TO_SOCKET:
+	case PTR_TO_SOCKET_OR_NULL:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
@@ -5188,10 +5241,14 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
 	}
 }
 
-/* convert load instructions that access fields of 'struct __sk_buff'
- * into sequence of instructions that access fields of 'struct sk_buff'
+/* convert load instructions that access fields of a context type into a
+ * sequence of instructions that access fields of the underlying structure:
+ *     struct __sk_buff    -> struct sk_buff
+ *     struct bpf_sock_ops -> struct sock
  */
-static int convert_ctx_accesses(struct bpf_verifier_env *env)
+static int convert_ctx_accesses(struct bpf_verifier_env *env,
+				bpf_convert_ctx_access_t convert_ctx_access,
+				enum bpf_reg_type ctx_type)
 {
 	const struct bpf_verifier_ops *ops = env->ops;
 	int i, cnt, size, ctx_field_size, delta = 0;
@@ -5218,12 +5275,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 	}
 
-	if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux))
+	if (!convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux))
 		return 0;
 
 	insn = env->prog->insnsi + delta;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		enum bpf_reg_type ptr_type;
+
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
@@ -5237,7 +5296,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		else
 			continue;
 
-		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
+		ptr_type = env->insn_aux_data[i + delta].ptr_type;
+		if (ptr_type != ctx_type)
 			continue;
 
 		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
@@ -5269,8 +5329,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 
 		target_size = 0;
-		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
-					      &target_size);
+		cnt = convert_ctx_access(type, insn, insn_buf, env->prog,
+					 &target_size);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
 		    (ctx_field_size && !target_size)) {
 			verbose(env, "bpf verifier is misconfigured\n");
@@ -5785,7 +5845,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 
 	if (ret == 0)
 		/* program is valid, convert *(u32*)(ctx + off) accesses */
-		ret = convert_ctx_accesses(env);
+		ret = convert_ctx_accesses(env, env->ops->convert_ctx_access,
+					   PTR_TO_CTX);
+
+	if (ret == 0)
+		/* Convert *(u32*)(sock_ops + off) accesses */
+		ret = convert_ctx_accesses(env, bpf_sock_convert_ctx_access,
+					   PTR_TO_SOCKET);
 
 	if (ret == 0)
 		ret = fixup_bpf_calls(env);
diff --git a/net/core/filter.c b/net/core/filter.c
index 0baa715e4699..4c35152fb3a8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4496,23 +4496,29 @@ static bool __sock_filter_check_size(int off, int size,
 	return size == size_default;
 }
 
-static bool sock_filter_is_valid_access(int off, int size,
-					enum bpf_access_type type,
-					const struct bpf_prog *prog,
-					struct bpf_insn_access_aux *info)
+bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+			      struct bpf_insn_access_aux *info)
 {
 	if (off < 0 || off >= sizeof(struct bpf_sock))
 		return false;
 	if (off % size != 0)
 		return false;
-	if (!__sock_filter_check_attach_type(off, type,
-					     prog->expected_attach_type))
-		return false;
 	if (!__sock_filter_check_size(off, size, info))
 		return false;
 	return true;
 }
 
+static bool sock_filter_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					const struct bpf_prog *prog,
+					struct bpf_insn_access_aux *info)
+{
+	if (!__sock_filter_check_attach_type(off, type,
+					     prog->expected_attach_type))
+		return false;
+	return bpf_sock_is_valid_access(off, size, type, info);
+}
+
 static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
 				const struct bpf_prog *prog, int drop_verdict)
 {
@@ -5153,10 +5159,10 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
-static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
-					  const struct bpf_insn *si,
-					  struct bpf_insn *insn_buf,
-					  struct bpf_prog *prog, u32 *target_size)
+u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+				const struct bpf_insn *si,
+				struct bpf_insn *insn_buf,
+				struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
 	int off;
@@ -5926,7 +5932,7 @@ const struct bpf_prog_ops lwt_xmit_prog_ops = {
 const struct bpf_verifier_ops cg_sock_verifier_ops = {
 	.get_func_proto		= sock_filter_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
-	.convert_ctx_access	= sock_filter_convert_ctx_access,
+	.convert_ctx_access	= bpf_sock_convert_ctx_access,
 };
 
 const struct bpf_prog_ops cg_sock_prog_ops = {
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 03/11] bpf: Generalize ptr_or_null regs check
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

This check will be reused by an upcoming commit for conditional jump
checks for sockets. Refactor it a bit to simplify the later commit.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 kernel/bpf/verifier.c | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a32b560072d7..1b31b805dea4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -227,6 +227,11 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
 	       type == PTR_TO_PACKET_META;
 }
 
+static bool reg_type_may_be_null(enum bpf_reg_type type)
+{
+	return type == PTR_TO_MAP_VALUE_OR_NULL;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
 	[NOT_INIT]		= "?",
@@ -3531,12 +3536,10 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src,
 	}
 }
 
-static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
-			 bool is_null)
+static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id,
+				 bool is_null)
 {
-	struct bpf_reg_state *reg = &regs[regno];
-
-	if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
+	if (reg_type_may_be_null(reg->type) && reg->id == id) {
 		/* Old offset (both fixed and variable parts) should
 		 * have been known-zero, because we don't allow pointer
 		 * arithmetic on pointers that might be NULL.
@@ -3549,11 +3552,13 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
 		}
 		if (is_null) {
 			reg->type = SCALAR_VALUE;
-		} else if (reg->map_ptr->inner_map_meta) {
-			reg->type = CONST_PTR_TO_MAP;
-			reg->map_ptr = reg->map_ptr->inner_map_meta;
-		} else {
-			reg->type = PTR_TO_MAP_VALUE;
+		} else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
+			if (reg->map_ptr->inner_map_meta) {
+				reg->type = CONST_PTR_TO_MAP;
+				reg->map_ptr = reg->map_ptr->inner_map_meta;
+			} else {
+				reg->type = PTR_TO_MAP_VALUE;
+			}
 		}
 		/* We don't need id from this point onwards anymore, thus we
 		 * should better reset it, so that state pruning has chances
@@ -3566,8 +3571,8 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
 /* The logic is similar to find_good_pkt_pointers(), both could eventually
  * be folded together at some point.
  */
-static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
-			  bool is_null)
+static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
+				  bool is_null)
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
 	struct bpf_reg_state *reg, *regs = state->regs;
@@ -3575,14 +3580,14 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
 	int i, j;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
-		mark_map_reg(regs, i, id, is_null);
+		mark_ptr_or_null_reg(&regs[i], id, is_null);
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
 		for_each_spilled_reg(i, state, reg) {
 			if (!reg)
 				continue;
-			mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
+			mark_ptr_or_null_reg(reg, id, is_null);
 		}
 	}
 }
@@ -3784,12 +3789,14 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	/* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
 	if (BPF_SRC(insn->code) == BPF_K &&
 	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
-	    dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-		/* Mark all identical map registers in each branch as either
+	    reg_type_may_be_null(dst_reg->type)) {
+		/* Mark all identical registers in each branch as either
 		 * safe or unknown depending R == 0 or R != 0 conditional.
 		 */
-		mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
-		mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
+		mark_ptr_or_null_regs(this_branch, insn->dst_reg,
+				      opcode == BPF_JNE);
+		mark_ptr_or_null_regs(other_branch, insn->dst_reg,
+				      opcode == BPF_JEQ);
 	} else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
 					   this_branch, other_branch) &&
 		   is_pointer_value(env, insn->dst_reg)) {
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 02/11] bpf: Simplify ptr_min_max_vals adjustment
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

An upcoming commit will add another two pointer types that need very
similar behaviour, so generalise this function now.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 kernel/bpf/verifier.c                       | 22 ++++++++++------------
 tools/testing/selftests/bpf/test_verifier.c | 14 +++++++-------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f40e089c3893..a32b560072d7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2602,20 +2602,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
-	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-		verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
-			dst);
-		return -EACCES;
-	}
-	if (ptr_reg->type == CONST_PTR_TO_MAP) {
-		verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
-			dst);
+	switch (ptr_reg->type) {
+	case PTR_TO_MAP_VALUE_OR_NULL:
+		verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
+			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
-	}
-	if (ptr_reg->type == PTR_TO_PACKET_END) {
-		verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
-			dst);
+	case CONST_PTR_TO_MAP:
+	case PTR_TO_PACKET_END:
+		verbose(env, "R%d pointer arithmetic on %s prohibited\n",
+			dst, reg_type_str[ptr_reg->type]);
 		return -EACCES;
+	default:
+		break;
 	}
 
 	/* In case of 'scalar += pointer', dst_reg inherits pointer type and id.
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 275b4570b5b8..53439f40e1de 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3497,7 +3497,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
+		.errstr = "R3 pointer arithmetic on pkt_end",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
@@ -4525,7 +4525,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
+		.errstr = "R4 pointer arithmetic on map_value_or_null",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -4546,7 +4546,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
+		.errstr = "R4 pointer arithmetic on map_value_or_null",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -4567,7 +4567,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
+		.errstr = "R4 pointer arithmetic on map_value_or_null",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -6864,7 +6864,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map_in_map = { 3 },
-		.errstr = "R1 pointer arithmetic on CONST_PTR_TO_MAP prohibited",
+		.errstr = "R1 pointer arithmetic on map_ptr prohibited",
 		.result = REJECT,
 	},
 	{
@@ -8538,7 +8538,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
+		.errstr = "R3 pointer arithmetic on pkt_end",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
@@ -8557,7 +8557,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
+		.errstr = "R3 pointer arithmetic on pkt_end",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 01/11] bpf: Add iterator for spilled registers
From: Joe Stringer @ 2018-05-09 21:06 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

Add this iterator for spilled registers, it concentrates the details of
how to get the current frame's spilled registers into a single macro
while clarifying the intention of the code which is calling the macro.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 include/linux/bpf_verifier.h | 11 +++++++++++
 kernel/bpf/verifier.c        | 16 +++++++---------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 8f70dc181e23..a613b52ce939 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -133,6 +133,17 @@ struct bpf_verifier_state {
 	u32 curframe;
 };
 
+#define __get_spilled_reg(slot, frame)					\
+	(((slot < frame->allocated_stack / BPF_REG_SIZE) &&		\
+	  (frame->stack[slot].slot_type[0] == STACK_SPILL))		\
+	 ? &frame->stack[slot].spilled_ptr : NULL)
+
+/* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */
+#define for_each_spilled_reg(iter, frame, reg)				\
+	for (iter = 0, reg = __get_spilled_reg(iter, frame);		\
+	     iter < frame->allocated_stack / BPF_REG_SIZE;		\
+	     iter++, reg = __get_spilled_reg(iter, frame))
+
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d92d9c37affd..f40e089c3893 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2216,10 +2216,9 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
 		if (reg_is_pkt_pointer_any(&regs[i]))
 			mark_reg_unknown(env, regs, i);
 
-	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-		if (state->stack[i].slot_type[0] != STACK_SPILL)
+	for_each_spilled_reg(i, state, reg) {
+		if (!reg)
 			continue;
-		reg = &state->stack[i].spilled_ptr;
 		if (reg_is_pkt_pointer_any(reg))
 			__mark_reg_unknown(reg);
 	}
@@ -3326,10 +3325,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
-		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-			if (state->stack[i].slot_type[0] != STACK_SPILL)
+		for_each_spilled_reg(i, state, reg) {
+			if (!reg)
 				continue;
-			reg = &state->stack[i].spilled_ptr;
 			if (reg->type == type && reg->id == dst_reg->id)
 				reg->range = max(reg->range, new_range);
 		}
@@ -3574,7 +3572,7 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
 			  bool is_null)
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *regs = state->regs;
+	struct bpf_reg_state *reg, *regs = state->regs;
 	u32 id = regs[regno].id;
 	int i, j;
 
@@ -3583,8 +3581,8 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno,
 
 	for (j = 0; j <= vstate->curframe; j++) {
 		state = vstate->frame[j];
-		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
-			if (state->stack[i].slot_type[0] != STACK_SPILL)
+		for_each_spilled_reg(i, state, reg) {
+			if (!reg)
 				continue;
 			mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
 		}
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 00/11] Add socket lookup support
From: Joe Stringer @ 2018-05-09 21:06 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, tgraf, kafai

This series proposes a new helper for the BPF API which allows BPF programs to
perform lookups for sockets in a network namespace. This would allow programs
to determine early on in processing whether the stack is expecting to receive
the packet, and perform some action (eg drop, forward somewhere) based on this
information.

The series is structured roughly into:
* Misc refactor
* Add the socket pointer type
* Add reference tracking to ensure that socket references are freed
* Extend the BPF API to add sk_lookup() / sk_release() functions
* Add tests/documentation

The helper proposed in this series includes a parameter for a tuple which must
be filled in by the caller to determine the socket to look up. The simplest
case would be filling with the contents of the packet, ie mapping the packet's
5-tuple into the parameter. In common cases, it may alternatively be useful to
reverse the direction of the tuple and perform a lookup, to find the socket
that initiates this connection; and if the BPF program ever performs a form of
IP address translation, it may further be useful to be able to look up
arbitrary tuples that are not based upon the packet, but instead based on state
held in BPF maps or hardcoded in the BPF program.

Currently, access into the socket's fields are limited to those which are
otherwise already accessible, and are restricted to read-only access.

A few open points:
* Currently, the lookup interface only returns either a valid socket or a NULL
  pointer. This means that if there is any kind of issue with the tuple, such
  as it provides an unsupported protocol number, or the socket can't be found,
  then we are unable to differentiate these cases from one another. One natural
  approach to improve this could be to return an ERR_PTR from the
  bpf_sk_lookup() helper. This would be more complicated but maybe it's
  worthwhile.
* No ordering is defined between sockets. If the tuple could find multiple
  sockets, then it will arbitrarily return one. It is up to the caller to
  handle this. If we wish to handle this more reliably in future, we could
  encode an ordering preference in the flags field.
* Currently this helper is only defined for TC hook point, but it should also
  be valid at XDP and perhaps some other hooks.

Joe Stringer (11):
  bpf: Add iterator for spilled registers
  bpf: Simplify ptr_min_max_vals adjustment
  bpf: Generalize ptr_or_null regs check
  bpf: Add PTR_TO_SOCKET verifier type
  bpf: Macrofy stack state copy
  bpf: Add reference tracking to verifier
  bpf: Add helper to retrieve socket in BPF
  selftests/bpf: Add tests for reference tracking
  libbpf: Support loading individual progs
  selftests/bpf: Add C tests for reference tracking
  Documentation: Describe bpf reference tracking

 Documentation/networking/filter.txt               |  64 +++
 include/linux/bpf.h                               |  19 +-
 include/linux/bpf_verifier.h                      |  31 +-
 include/uapi/linux/bpf.h                          |  39 +-
 kernel/bpf/verifier.c                             | 548 ++++++++++++++++++----
 net/core/filter.c                                 | 132 +++++-
 tools/include/uapi/linux/bpf.h                    |  40 +-
 tools/lib/bpf/libbpf.c                            |   4 +-
 tools/lib/bpf/libbpf.h                            |   3 +
 tools/testing/selftests/bpf/Makefile              |   2 +-
 tools/testing/selftests/bpf/bpf_helpers.h         |   7 +
 tools/testing/selftests/bpf/test_progs.c          |  38 ++
 tools/testing/selftests/bpf/test_sk_lookup_kern.c | 127 +++++
 tools/testing/selftests/bpf/test_verifier.c       | 373 ++++++++++++++-
 14 files changed, 1299 insertions(+), 128 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_sk_lookup_kern.c

-- 
2.14.1

^ permalink raw reply

* Re: KASAN: use-after-free Read in __dev_queue_xmit
From: Willem de Bruijn @ 2018-05-09 21:05 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Eric Biggers, syzbot, alexander.deucher, Andrey Konovalov,
	Anoob Soman, chris, David Miller, Reshetova, Elena,
	Greg Kroah-Hartman, Kees Cook, LKML, Mike Maloney, mchehab,
	netdev, Rosen, Rami, Sowmini Varadhan, syzkaller-bugs,
	Willem de Bruijn
In-Reply-To: <e7be4d14-be04-c40d-82ad-2050876f9ce9@gmail.com>

On Wed, May 9, 2018 at 3:36 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
> On 05/09/2018 12:21 PM, Willem de Bruijn wrote:
>
>> Indeed. The skb shared info struct is zeroed by dev_validate_header
>> as a result of dev->hard_header_len exceeding skb->end - skb->data.
>>
>> Not exactly sure yet how this can happen. The hard header length space
>> is accounted for during allocation as reserved memory. But,
>> packet_alloc_skb does call skb_reserve(), moving skb->data
>> effectively beyond this reserved region.
>>
>> It may be incorrect to pass skb->data to dev_validate_header, as that
>> does not point to the start of the ll_header anymore. Still figuring out what
>> the right fix is..
>>
>
> I believe the bug happens if the sock_wmalloc() call at line 1921 has to sleep.
>
> device can change (or at lest dev->hard_header_len can change)
>
> So we need to bailout if reserved/hhlen had changed.
>
> Or revert some patches, since dev_hold() and dev_put() are no longer high cost,
> since it is now using per cpu counter.

Oh nice, another bug :/

That seems quite plausible.

This reproducer does not modify hard_header_len, however.
It sends a long array of zero byte requests with sendmmsg to
eventually exceed so_rcvbuf of the error queue. Hard header
length is 116 throughout.

^ permalink raw reply

* [PATCH iproute2] ss: remove non-functional slabinfo
From: Stephen Hemminger @ 2018-05-09 21:00 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger

Ss was using slabinfo to try and intuit TCP statistics.
The slabinfo changed several times since 2.4 and all these statistics
are broken by renames and slab merging. Plus slabinfo does not exist
at all if kernel is compiled with SLUB option.

This has been broken for many years and is not worth fixing;
rather than trying to fix kernel, just trim away the no longer
valid statistics.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 misc/ss.c | 103 ++++--------------------------------------------------
 1 file changed, 6 insertions(+), 97 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index 3ed7e66962f3..41e7762bb61f 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -474,7 +474,6 @@ static FILE *generic_proc_open(const char *env, const char *name)
 							"net/packet")
 #define net_netlink_open()	generic_proc_open("PROC_NET_NETLINK", \
 							"net/netlink")
-#define slabinfo_open()		generic_proc_open("PROC_SLABINFO", "slabinfo")
 #define net_sockstat_open()	generic_proc_open("PROC_NET_SOCKSTAT", \
 							"net/sockstat")
 #define net_sockstat6_open()	generic_proc_open("PROC_NET_SOCKSTAT6", \
@@ -728,67 +727,6 @@ next:
 	return cnt;
 }
 
-/* Get stats from slab */
-
-struct slabstat {
-	int socks;
-	int tcp_ports;
-	int tcp_tws;
-	int tcp_syns;
-	int skbs;
-};
-
-static struct slabstat slabstat;
-
-static int get_slabstat(struct slabstat *s)
-{
-	char buf[256];
-	FILE *fp;
-	int cnt;
-	static int slabstat_valid;
-	static const char * const slabstat_ids[] = {
-		"sock",
-		"tcp_bind_bucket",
-		"tcp_tw_bucket",
-		"tcp_open_request",
-		"skbuff_head_cache",
-	};
-
-	if (slabstat_valid)
-		return 0;
-
-	memset(s, 0, sizeof(*s));
-
-	fp = slabinfo_open();
-	if (!fp)
-		return -1;
-
-	cnt = sizeof(*s)/sizeof(int);
-
-	if (!fgets(buf, sizeof(buf), fp)) {
-		fclose(fp);
-		return -1;
-	}
-	while (fgets(buf, sizeof(buf), fp) != NULL) {
-		int i;
-
-		for (i = 0; i < ARRAY_SIZE(slabstat_ids); i++) {
-			if (memcmp(buf, slabstat_ids[i], strlen(slabstat_ids[i])) == 0) {
-				sscanf(buf, "%*s%d", ((int *)s) + i);
-				cnt--;
-				break;
-			}
-		}
-		if (cnt <= 0)
-			break;
-	}
-
-	slabstat_valid = 1;
-
-	fclose(fp);
-	return 0;
-}
-
 static unsigned long long cookie_sk_get(const uint32_t *cookie)
 {
 	return (((unsigned long long)cookie[1] << 31) << 1) | cookie[0];
@@ -3372,7 +3310,7 @@ static int tcp_show(struct filter *f)
 {
 	FILE *fp = NULL;
 	char *buf = NULL;
-	int bufsize = 64*1024;
+	int bufsize = 1024*1024;
 
 	if (!filter_af_get(f, AF_INET) && !filter_af_get(f, AF_INET6))
 		return 0;
@@ -3387,27 +3325,6 @@ static int tcp_show(struct filter *f)
 		return 0;
 
 	/* Sigh... We have to parse /proc/net/tcp... */
-
-
-	/* Estimate amount of sockets and try to allocate
-	 * huge buffer to read all the table at one read.
-	 * Limit it by 16MB though. The assumption is: as soon as
-	 * kernel was able to hold information about N connections,
-	 * it is able to give us some memory for snapshot.
-	 */
-	if (1) {
-		get_slabstat(&slabstat);
-
-		int guess = slabstat.socks+slabstat.tcp_syns;
-
-		if (f->states&(1<<SS_TIME_WAIT))
-			guess += slabstat.tcp_tws;
-		if (guess > (16*1024*1024)/128)
-			guess = (16*1024*1024)/128;
-		guess *= 128;
-		if (guess > bufsize)
-			bufsize = guess;
-	}
 	while (bufsize >= 64*1024) {
 		if ((buf = malloc(bufsize)) != NULL)
 			break;
@@ -4666,23 +4583,15 @@ static int print_summary(void)
 	if (get_snmp_int("Tcp:", "CurrEstab", &tcp_estab) < 0)
 		perror("ss: get_snmpstat");
 
-	get_slabstat(&slabstat);
-
-	printf("Total: %d (kernel %d)\n", s.socks, slabstat.socks);
+	printf("Total: %d\n", s.socks);
 
-	printf("TCP:   %d (estab %d, closed %d, orphaned %d, synrecv %d, timewait %d/%d), ports %d\n",
-	       s.tcp_total + slabstat.tcp_syns + s.tcp_tws,
-	       tcp_estab,
-	       s.tcp_total - (s.tcp4_hashed+s.tcp6_hashed-s.tcp_tws),
-	       s.tcp_orphans,
-	       slabstat.tcp_syns,
-	       s.tcp_tws, slabstat.tcp_tws,
-	       slabstat.tcp_ports
-	       );
+	printf("TCP:   %d (estab %d, closed %d, orphaned %d, timewait %d)\n",
+	       s.tcp_total + s.tcp_tws, tcp_estab,
+	       s.tcp_total - (s.tcp4_hashed + s.tcp6_hashed - s.tcp_tws),
+	       s.tcp_orphans, s.tcp_tws);
 
 	printf("\n");
 	printf("Transport Total     IP        IPv6\n");
-	printf("*	  %-9d %-9s %-9s\n", slabstat.socks, "-", "-");
 	printf("RAW	  %-9d %-9d %-9d\n", s.raw4+s.raw6, s.raw4, s.raw6);
 	printf("UDP	  %-9d %-9d %-9d\n", s.udp4+s.udp6, s.udp4, s.udp6);
 	printf("TCP	  %-9d %-9d %-9d\n", s.tcp4_hashed+s.tcp6_hashed, s.tcp4_hashed, s.tcp6_hashed);
-- 
2.17.0

^ permalink raw reply related

* Re: Performance regression between 4.13 and 4.14
From: Ben Greear @ 2018-05-09 20:55 UTC (permalink / raw)
  To: Eric Dumazet, netdev
In-Reply-To: <b3acabde-cd6d-589c-05f9-a2e75db94b6b@candelatech.com>

On 05/09/2018 12:02 PM, Ben Greear wrote:
> On 05/09/2018 11:48 AM, Eric Dumazet wrote:
>>
>>
>> On 05/09/2018 11:43 AM, Ben Greear wrote:
>>> On 05/08/2018 10:10 AM, Eric Dumazet wrote:
>>>>
>>>>
>>>> On 05/08/2018 09:44 AM, Ben Greear wrote:
>>>>> Hello,
>>>>>
>>>>> I am trying to track down a performance regression that appears to be between 4.13
>>>>> and 4.14.
>>>>>
>>>>> I first saw the problem with a hacked version of pktgen on some ixgbe NICs.  4.13 can do
>>>>> right at 10G bi-directional on two ports, and 4.14 and later can do only about 6Gbps.
>>>>>
>>>>> I also tried with user-space UDP traffic on a stock kernel, and I can get about 3.2Gbps combined tx+rx
>>>>> on 4.14 and about 4.4Gbps on 4.13.
>>>>>
>>>>> Attempting to bisect seems to be triggering a weirdness in git, and also lots of commits
>>>>> crash or do not bring up networking, which makes the bisect difficult.
>>>>>
>>>>> Looking at perf top, it would appear that some lock is probably to blame.
>>>>
>>>>
>>>> perf record -a -g -e cycles:pp sleep 5
>>>> perf report
>>>>
>>>> Then you'll be able to tell us which lock (or call graph) is killing your perf.
>>>>
>>>
>>> I seem to be chasing multiple issues.  For 4.13, at least part of my problem was that LOCKDEP was enabled,
>>> during my bisect, though it does NOT appear enabled in 4.16.  I think maybe CONFIG_LOCKDEP moved to CONFIG_PROVE_LOCKING
>>> in 4.16, or something like that?  My 4.16 .config does have CONFIG_LOCKDEP_SUPPORT enabled, and I see no option to disable it:
>>>
>>> [greearb@ben-dt3 linux-4.16.x64]$ grep LOCKDEP .config
>>> CONFIG_LOCKDEP_SUPPORT=y
>>>
>>>
>>> For 4.16, I am disabling RETRAMPOLINE...are there any other such things I need
>>> to disable to keep from getting a performance hit from the spectre-related bug
>>> fixes?  At this point, I do not care about the security implications.
>>>
>>> greearb@ben-dt3 linux-4.16.x64]$ grep RETPO .config
>>> # CONFIG_RETPOLINE is not set
>>>
>>>
>>> Thanks,
>>> Ben
>>>
>>
>> No idea really, you mention a 4.13 -> 4.14 regression and jump then to 4.16 :/
>
> I initially saw the problem in 4.16, then bisected, and 4.14 still showed the
> issue.

So, I guess I must have been enabling lockdep the whole time.  This __lock_acquire
is from lockdep as far as I can tell, not normal locking.  I re-built 4.16 after
verifying as best as I could that lockdep was not enabled, and now it performs
as expected.

I'm going to test a patch to change __lock_acquire to __lock_acquire_lockdep so
maybe someone else will not make the same mistake I made.

> +   17.78%    17.78%  kpktgend_1       [kernel.kallsyms]             [k] __lock_acquire.isra.3


Thanks,
Ben


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

* Re: [PATCH v6 05/13] firmware_loader: enhance Kconfig documentation over FW_LOADER
From: Luis R. Rodriguez @ 2018-05-09 20:55 UTC (permalink / raw)
  To: Kees Cook
  Cc: Luis R. Rodriguez, Greg KH, Andrew Morton, Josh Triplett, maco,
	Andy Gross, David Brown, bjorn.andersson, teg, wagi,
	Hans de Goede, Andres Rodriguez, Mimi Zohar, Jakub Kicinski,
	Shuah Khan, Martin Fuzzey, David Howells, pali.rohar,
	Takashi Iwai, Kalle Valo, Arend van Spriel,
	Rafał Miłecki
In-Reply-To: <CAGXu5jJXMxx2Xna_M_g1wsS59XnYKqHAGdxhzDtk3PhnLL-gQQ@mail.gmail.com>

On Tue, May 08, 2018 at 03:42:33PM -0700, Kees Cook wrote:
> On Tue, May 8, 2018 at 11:12 AM, Luis R. Rodriguez <mcgrof@kernel.org> wrote:
> > +         This used to be the default firmware loading facility, and udev used
> > +         to listen for uvents to load firmware for the kernel. The firmware
> > +         loading facility functionality in udev has been removed, as such it
> > +         can no longer be relied upon as a fallback mechanism. Linux no longer
> > +         relies on or uses a fallback mechanism in userspace. If you need to
> > +         rely on one refer to the permissively licensed firmwared:
> 
> Typo: firmware

Thanks fixed all typos except this one, this one is meant to be firmwared as
that is the name of the project, the url is below.

> 
> > +
> > +         https://github.com/teg/firmwared

  Luis

^ permalink raw reply

* Re: [net-next PATCH 1/3] net: Refactor XPS for CPUs and Rx queues
From: Nambiar, Amritha @ 2018-05-09 20:54 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Linux Kernel Network Developers, David S. Miller, Alexander Duyck,
	Sridhar Samudrala, Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <CALx6S354UZXK_Zn79e-tuUaUw3+QDdU-gsmDESY6MHJYFuHbsg@mail.gmail.com>

On 5/9/2018 1:31 PM, Tom Herbert wrote:
> On Thu, Apr 19, 2018 at 6:04 PM, Amritha Nambiar
> <amritha.nambiar@intel.com> wrote:
>> Refactor XPS code to support Tx queue selection based on
>> CPU map or Rx queue map.
>>
>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>> ---
>>  include/linux/netdevice.h |   82 +++++++++++++++++-
>>  net/core/dev.c            |  206 +++++++++++++++++++++++++++++----------------
>>  net/core/net-sysfs.c      |    4 -
>>  3 files changed, 216 insertions(+), 76 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 14e0777..40a9171 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -730,10 +730,21 @@ struct xps_map {
>>   */
>>  struct xps_dev_maps {
>>         struct rcu_head rcu;
>> -       struct xps_map __rcu *cpu_map[0];
>> +       struct xps_map __rcu *attr_map[0];
> 
> This seems unnecessarily complicated to me. Why not just add another
> map called something like "rxq2txq_map". Then when selecting TXQ just
> check the new map first and then the normal cpu_map if there's not a
> hit.
> 

This is just a change in the name to something more generic ('attr')
since the maps can either be cpu based or rxq based. I have added two
map types, XPS_MAP_RXQS, XPS_MAP_CPUS and the TXQ selection (in patch
2/3) works how you described,  first based on the RXQ map and if there
is no hit, falls to the normal CPU map.

>>  };
>> -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +         \
>> +
>> +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +     \
>>         (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
>> +
>> +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
>> +       (_rxqs * (_tcs) * sizeof(struct xps_map *)))
>> +
>> +enum xps_map_type {
>> +       XPS_MAP_RXQS,
>> +       XPS_MAP_CPUS,
>> +       __XPS_MAP_MAX
>> +};
>> +
>>  #endif /* CONFIG_XPS */
>>
>>  #define TC_MAX_QUEUE   16
>> @@ -1867,7 +1878,7 @@ struct net_device {
>>         int                     watchdog_timeo;
>>
>>  #ifdef CONFIG_XPS
>> -       struct xps_dev_maps __rcu *xps_maps;
>> +       struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
>>  #endif
>>  #ifdef CONFIG_NET_CLS_ACT
>>         struct mini_Qdisc __rcu *miniq_egress;
>> @@ -3204,6 +3215,71 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
>>  #ifdef CONFIG_XPS
>>  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>                         u16 index);
>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
>> +                         u16 index, enum xps_map_type type);
>> +
>> +static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
>> +                                 unsigned int nr_bits)
>> +{
>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>> +       WARN_ON_ONCE(j >= nr_bits);
>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
> 
> This #ifdef block appears 3 times in the patch. Seems like it should
> be replace by simple macro.

Sure, will do in the next version.

> 
>> +       return test_bit(j, mask);
>> +}
>> +
>> +static inline bool attr_test_online(unsigned long j,
>> +                                   const unsigned long *online_mask,
>> +                                   unsigned int nr_bits)
>> +{
>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>> +       WARN_ON_ONCE(j >= nr_bits);
>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>> +
>> +       if (online_mask)
>> +               return test_bit(j, online_mask);
>> +
>> +       if (j >= 0 && j < nr_bits)
>> +               return true;
>> +
>> +       return false;
>> +}
>> +
>> +static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
>> +                                        unsigned int nr_bits)
>> +{
>> +       /* -1 is a legal arg here. */
>> +       if (n != -1) {
>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>> +               WARN_ON_ONCE(n >= nr_bits);
>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>> +       }
>> +
>> +       if (srcp)
>> +               return find_next_bit(srcp, nr_bits, n + 1);
>> +
>> +       return n + 1;
>> +}
>> +
>> +static inline int attrmask_next_and(int n, const unsigned long *src1p,
>> +                                   const unsigned long *src2p,
>> +                                   unsigned int nr_bits)
>> +{
>> +       /* -1 is a legal arg here. */
>> +       if (n != -1) {
>> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
>> +               WARN_ON_ONCE(n >= nr_bits);
>> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
>> +       }
>> +
>> +       if (src1p && src2p)
>> +               return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
>> +       else if (src1p)
>> +               return find_next_bit(src1p, nr_bits, n + 1);
>> +       else if (src2p)
>> +               return find_next_bit(src2p, nr_bits, n + 1);
>> +
>> +       return n + 1;
>> +}
>>  #else
>>  static inline int netif_set_xps_queue(struct net_device *dev,
>>                                       const struct cpumask *mask,
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index a490ef6..17c4883 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>>         int pos;
>>
>>         if (dev_maps)
>> -               map = xmap_dereference(dev_maps->cpu_map[tci]);
>> +               map = xmap_dereference(dev_maps->attr_map[tci]);
>>         if (!map)
>>                 return false;
>>
>> @@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>>                         break;
>>                 }
>>
>> -               RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
>> +               RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
>>                 kfree_rcu(map, rcu);
>>                 return false;
>>         }
>> @@ -2138,30 +2138,47 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
>>  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
>>                                    u16 count)
>>  {
>> +       const unsigned long *possible_mask = NULL;
>> +       enum xps_map_type type = XPS_MAP_RXQS;
>>         struct xps_dev_maps *dev_maps;
>> -       int cpu, i;
>>         bool active = false;
>> +       unsigned int nr_ids;
>> +       int i, j;
>>
>>         mutex_lock(&xps_map_mutex);
>> -       dev_maps = xmap_dereference(dev->xps_maps);
>>
>> -       if (!dev_maps)
>> -               goto out_no_maps;
>> +       while (type < __XPS_MAP_MAX) {
>> +               dev_maps = xmap_dereference(dev->xps_maps[type]);
>> +               if (!dev_maps)
>> +                       goto out_no_maps;
>> +
>> +               if (type == XPS_MAP_CPUS) {
>> +                       if (num_possible_cpus() > 1)
>> +                               possible_mask = cpumask_bits(cpu_possible_mask);
>> +                       nr_ids = nr_cpu_ids;
>> +               } else if (type == XPS_MAP_RXQS) {
>> +                       nr_ids = dev->num_rx_queues;
>> +               }
>>
>> -       for_each_possible_cpu(cpu)
>> -               active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
>> -                                              offset, count);
>> +               for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +                    j < nr_ids;)
>> +                       active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
>> +                                                      count);
>> +               if (!active) {
>> +                       RCU_INIT_POINTER(dev->xps_maps[type], NULL);
>> +                       kfree_rcu(dev_maps, rcu);
>> +               }
>>
>> -       if (!active) {
>> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
>> -               kfree_rcu(dev_maps, rcu);
>> +               if (type == XPS_MAP_CPUS) {
>> +                       for (i = offset + (count - 1); count--; i--)
>> +                               netdev_queue_numa_node_write(
>> +                                       netdev_get_tx_queue(dev, i),
>> +                                                           NUMA_NO_NODE);
>> +               }
>> +out_no_maps:
>> +               type++;
>>         }
>>
>> -       for (i = offset + (count - 1); count--; i--)
>> -               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
>> -                                            NUMA_NO_NODE);
>> -
>> -out_no_maps:
>>         mutex_unlock(&xps_map_mutex);
>>  }
>>
>> @@ -2170,11 +2187,11 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
>>         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
>>  }
>>
>> -static struct xps_map *expand_xps_map(struct xps_map *map,
>> -                                     int cpu, u16 index)
>> +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
>> +                                     u16 index, enum xps_map_type type)
>>  {
>> -       struct xps_map *new_map;
>>         int alloc_len = XPS_MIN_MAP_ALLOC;
>> +       struct xps_map *new_map = NULL;
>>         int i, pos;
>>
>>         for (pos = 0; map && pos < map->len; pos++) {
>> @@ -2183,7 +2200,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>                 return map;
>>         }
>>
>> -       /* Need to add queue to this CPU's existing map */
>> +       /* Need to add tx-queue to this CPU's/rx-queue's existing map */
>>         if (map) {
>>                 if (pos < map->alloc_len)
>>                         return map;
>> @@ -2191,9 +2208,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>                 alloc_len = map->alloc_len * 2;
>>         }
>>
>> -       /* Need to allocate new map to store queue on this CPU's map */
>> -       new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
>> -                              cpu_to_node(cpu));
>> +       /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
>> +        *  map
>> +        */
>> +       if (type == XPS_MAP_RXQS)
>> +               new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
>> +       else if (type == XPS_MAP_CPUS)
>> +               new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
>> +                                      cpu_to_node(attr_index));
>>         if (!new_map)
>>                 return NULL;
>>
>> @@ -2205,14 +2227,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>>         return new_map;
>>  }
>>
>> -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>> -                       u16 index)
>> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
>> +                         u16 index, enum xps_map_type type)
>>  {
>> +       const unsigned long *online_mask = NULL, *possible_mask = NULL;
>>         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
>> -       int i, cpu, tci, numa_node_id = -2;
>> +       int i, j, tci, numa_node_id = -2;
>>         int maps_sz, num_tc = 1, tc = 0;
>>         struct xps_map *map, *new_map;
>>         bool active = false;
>> +       unsigned int nr_ids;
>>
>>         if (dev->num_tc) {
>>                 num_tc = dev->num_tc;
>> @@ -2221,16 +2245,33 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>                         return -EINVAL;
>>         }
>>
>> -       maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
>> +       switch (type) {
>> +       case XPS_MAP_RXQS:
>> +               maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
>> +               dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_RXQS]);
>> +               nr_ids = dev->num_rx_queues;
>> +               break;
>> +       case XPS_MAP_CPUS:
>> +               maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
>> +               if (num_possible_cpus() > 1) {
>> +                       online_mask = cpumask_bits(cpu_online_mask);
>> +                       possible_mask = cpumask_bits(cpu_possible_mask);
>> +               }
>> +               dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>> +               nr_ids = nr_cpu_ids;
>> +               break;
>> +       default:
>> +               return -EINVAL;
>> +       }
>> +
>>         if (maps_sz < L1_CACHE_BYTES)
>>                 maps_sz = L1_CACHE_BYTES;
>>
>>         mutex_lock(&xps_map_mutex);
>>
>> -       dev_maps = xmap_dereference(dev->xps_maps);
>> -
>>         /* allocate memory for queue storage */
>> -       for_each_cpu_and(cpu, cpu_online_mask, mask) {
>> +       for (j = -1; j = attrmask_next_and(j, online_mask, mask, nr_ids),
>> +            j < nr_ids;) {
>>                 if (!new_dev_maps)
>>                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
>>                 if (!new_dev_maps) {
>> @@ -2238,73 +2279,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>                         return -ENOMEM;
>>                 }
>>
>> -               tci = cpu * num_tc + tc;
>> -               map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
>> +               tci = j * num_tc + tc;
>> +               map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
>>                                  NULL;
>>
>> -               map = expand_xps_map(map, cpu, index);
>> +               map = expand_xps_map(map, j, index, type);
>>                 if (!map)
>>                         goto error;
>>
>> -               RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +               RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>         }
>>
>>         if (!new_dev_maps)
>>                 goto out_no_new_maps;
>>
>> -       for_each_possible_cpu(cpu) {
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>>                 /* copy maps belonging to foreign traffic classes */
>> -               for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
>> +               for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
>>                         /* fill in the new device map from the old device map */
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>                 }
>>
>>                 /* We need to explicitly update tci as prevous loop
>>                  * could break out early if dev_maps is NULL.
>>                  */
>> -               tci = cpu * num_tc + tc;
>> +               tci = j * num_tc + tc;
>>
>> -               if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
>> -                       /* add queue to CPU maps */
>> +               if (attr_test_mask(j, mask, nr_ids) &&
>> +                   attr_test_online(j, online_mask, nr_ids)) {
>> +                       /* add tx-queue to CPU/rx-queue maps */
>>                         int pos = 0;
>>
>> -                       map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>> +                       map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>                         while ((pos < map->len) && (map->queues[pos] != index))
>>                                 pos++;
>>
>>                         if (pos == map->len)
>>                                 map->queues[map->len++] = index;
>>  #ifdef CONFIG_NUMA
>> -                       if (numa_node_id == -2)
>> -                               numa_node_id = cpu_to_node(cpu);
>> -                       else if (numa_node_id != cpu_to_node(cpu))
>> -                               numa_node_id = -1;
>> +                       if (type == XPS_MAP_CPUS) {
>> +                               if (numa_node_id == -2)
>> +                                       numa_node_id = cpu_to_node(j);
>> +                               else if (numa_node_id != cpu_to_node(j))
>> +                                       numa_node_id = -1;
>> +                       }
>>  #endif
>>                 } else if (dev_maps) {
>>                         /* fill in the new device map from the old device map */
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>                 }
>>
>>                 /* copy maps belonging to foreign traffic classes */
>>                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
>>                         /* fill in the new device map from the old device map */
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>>                 }
>>         }
>>
>> -       rcu_assign_pointer(dev->xps_maps, new_dev_maps);
>> +       if (type == XPS_MAP_RXQS)
>> +               rcu_assign_pointer(dev->xps_maps[XPS_MAP_RXQS], new_dev_maps);
>> +       else if (type == XPS_MAP_CPUS)
>> +               rcu_assign_pointer(dev->xps_maps[XPS_MAP_CPUS], new_dev_maps);
>>
>>         /* Cleanup old maps */
>>         if (!dev_maps)
>>                 goto out_no_old_maps;
>>
>> -       for_each_possible_cpu(cpu) {
>> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
>> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
>> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>>                         if (map && map != new_map)
>>                                 kfree_rcu(map, rcu);
>>                 }
>> @@ -2317,19 +2366,23 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>         active = true;
>>
>>  out_no_new_maps:
>> -       /* update Tx queue numa node */
>> -       netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
>> -                                    (numa_node_id >= 0) ? numa_node_id :
>> -                                    NUMA_NO_NODE);
>> +       if (type == XPS_MAP_CPUS) {
>> +               /* update Tx queue numa node */
>> +               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
>> +                                            (numa_node_id >= 0) ?
>> +                                            numa_node_id : NUMA_NO_NODE);
>> +       }
>>
>>         if (!dev_maps)
>>                 goto out_no_maps;
>>
>> -       /* removes queue from unused CPUs */
>> -       for_each_possible_cpu(cpu) {
>> -               for (i = tc, tci = cpu * num_tc; i--; tci++)
>> +       /* removes tx-queue from unused CPUs/rx-queues */
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>> +               for (i = tc, tci = j * num_tc; i--; tci++)
>>                         active |= remove_xps_queue(dev_maps, tci, index);
>> -               if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
>> +               if (!attr_test_mask(j, mask, nr_ids) ||
>> +                   !attr_test_online(j, online_mask, nr_ids))
>>                         active |= remove_xps_queue(dev_maps, tci, index);
>>                 for (i = num_tc - tc, tci++; --i; tci++)
>>                         active |= remove_xps_queue(dev_maps, tci, index);
>> @@ -2337,7 +2390,10 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>
>>         /* free map if not active */
>>         if (!active) {
>> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
>> +               if (type == XPS_MAP_RXQS)
>> +                       RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_RXQS], NULL);
>> +               else if (type == XPS_MAP_CPUS)
>> +                       RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_CPUS], NULL);
>>                 kfree_rcu(dev_maps, rcu);
>>         }
>>
>> @@ -2347,11 +2403,12 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>         return 0;
>>  error:
>>         /* remove any maps that we added */
>> -       for_each_possible_cpu(cpu) {
>> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
>> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
>> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
>> +            j < nr_ids;) {
>> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
>> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>>                         map = dev_maps ?
>> -                             xmap_dereference(dev_maps->cpu_map[tci]) :
>> +                             xmap_dereference(dev_maps->attr_map[tci]) :
>>                               NULL;
>>                         if (new_map && new_map != map)
>>                                 kfree(new_map);
>> @@ -2363,6 +2420,13 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>>         kfree(new_dev_maps);
>>         return -ENOMEM;
>>  }
>> +
>> +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>> +                       u16 index)
>> +{
>> +       return __netif_set_xps_queue(dev, cpumask_bits(mask), index,
>> +                                    XPS_MAP_CPUS);
>> +}
>>  EXPORT_SYMBOL(netif_set_xps_queue);
>>
>>  #endif
>> @@ -3400,7 +3464,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>         int queue_index = -1;
>>
>>         rcu_read_lock();
>> -       dev_maps = rcu_dereference(dev->xps_maps);
>> +       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>>         if (dev_maps) {
>>                 unsigned int tci = skb->sender_cpu - 1;
>>
>> @@ -3409,7 +3473,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>                         tci += netdev_get_prio_tc_map(dev, skb->priority);
>>                 }
>>
>> -               map = rcu_dereference(dev_maps->cpu_map[tci]);
>> +               map = rcu_dereference(dev_maps->attr_map[tci]);
>>                 if (map) {
>>                         if (map->len == 1)
>>                                 queue_index = map->queues[0];
>> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
>> index c476f07..d7abd33 100644
>> --- a/net/core/net-sysfs.c
>> +++ b/net/core/net-sysfs.c
>> @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
>>         }
>>
>>         rcu_read_lock();
>> -       dev_maps = rcu_dereference(dev->xps_maps);
>> +       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>>         if (dev_maps) {
>>                 for_each_possible_cpu(cpu) {
>>                         int i, tci = cpu * num_tc + tc;
>>                         struct xps_map *map;
>>
>> -                       map = rcu_dereference(dev_maps->cpu_map[tci]);
>> +                       map = rcu_dereference(dev_maps->attr_map[tci]);
>>                         if (!map)
>>                                 continue;
>>
>>

^ permalink raw reply

* [PATCH ipsec-next] xfrm: Allow Output Mark to be Updated Using UPDSA
From: Nathan Harold @ 2018-05-09 20:46 UTC (permalink / raw)
  To: netdev, nharold; +Cc: Nathan Harold

Allow UPDSA to change output_mark to permit
policy separation of packet routing decisions from
SA keying in systems that use mark-based routing.

In the output_mark, used as a routing and firewall
mark for outbound packets, is made update-able which
allows routing decisions to be handled independently
of keying/SA creation. To maintain consistency with
other optional attributes, the output mark is only
updated if sent with a non-zero value. Once set, the
output mark may not be reset to zero, which ensures
that updating the SA does not require the mark to
be re-sent to avoid the value being clobbered.

The per-SA lock and the xfrm_state_lock are taken in
that order to avoid a deadlock with
xfrm_timer_handler(), which also takes the locks in
that order.

Signed-off-by: Nathan Harold <nharold@google.com>
Change-Id: Ia05c6733a94c1901cd1e54eb7c7e237704678d71
---
 net/xfrm/xfrm_state.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index f595797a20ce..5d6085e81030 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1554,6 +1554,13 @@ int xfrm_state_update(struct xfrm_state *x)
 		if (x1->curlft.use_time)
 			xfrm_state_check_expire(x1);
 
+		spin_lock_bh(&net->xfrm.xfrm_state_lock);
+		if (x->props.output_mark) {
+			x1->props.output_mark = x->props.output_mark;
+			__xfrm_state_bump_genids(x1);
+		}
+		spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
 		err = 0;
 		x->km.state = XFRM_STATE_DEAD;
 		__xfrm_state_put(x);
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* Re: [bpf-next v2 8/9] bpf: Provide helper to do forwarding lookups in kernel FIB table
From: Daniel Borkmann @ 2018-05-09 20:44 UTC (permalink / raw)
  To: David Ahern, netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend
In-Reply-To: <69063df2-8987-b5f1-ced2-b2443139b5a5@gmail.com>

On 05/09/2018 06:05 PM, David Ahern wrote:
> On 5/9/18 2:15 AM, Daniel Borkmann wrote:
>>
>> Ohh well, this is causing allmodconfig build warnings (e.g. on x86) as reported today:
> 
> lovely.
> 
>> In file included from include/linux/dma-mapping.h:5:0,
>>                  from include/linux/skbuff.h:34,
>>                  from include/linux/if_ether.h:23,
>>                  from include/uapi/linux/bpf.h:13,
>>                  from include/linux/bpf-cgroup.h:6,
>>                  from include/linux/cgroup-defs.h:22,
>>                  from include/linux/cgroup.h:28,
>>                  from include/linux/perf_event.h:57,
>>                  from include/linux/trace_events.h:10,
>>                  from include/trace/trace_events.h:20,
>>                  from include/trace/define_trace.h:96,
>>                  from drivers/android/binder_trace.h:387,
>>                  from drivers/android/binder.c:5702:
>> include/linux/sizes.h:24:0: warning: "SZ_1K" redefined
>>  #define SZ_1K    0x00000400
>> drivers/android/binder.c:116:0: note: this is the location of the previous definition
>>  #define SZ_1K                               0x400
> 
> binder.c has very few recent commits to it. Are you ok with me
> submitting the change with the others in this set (with proper cc's of
> course)?
> 
>> fs/ecryptfs/miscdev.c:206:0: warning: "PKT_TYPE_OFFSET" redefined
>>  #define PKT_TYPE_OFFSET  0
>> In file included from include/linux/if_ether.h:23:0,
>>                  from include/uapi/linux/bpf.h:13,
>>                  from include/linux/bpf-cgroup.h:6,
>>                  from include/linux/cgroup-defs.h:22,
>>                  from include/linux/cgroup.h:28,
>>                  from include/linux/writeback.h:183,
>>                  from include/linux/backing-dev.h:16,
>>                  from fs/ecryptfs/ecryptfs_kernel.h:41,
>>                  from fs/ecryptfs/miscdev.c:30:
>> include/linux/skbuff.h:753:0: note: this is the location of the previous definition
>>  #define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)
> 
> And this one I renamed to SKB_PKT_TYPE_OFFSET
> 
> With that it compiles cleanly.

Generally, no objection. However, could we get rid of the two extra includes altogether
to avoid running into any such dependency issue? Right now the only includes we have in
the bpf uapi header is linux/types.h and linux/bpf_common.h (latter has no extra deps
by itself). Both the ETH_ALEN and struct in6_addr are in uapi and therefore never allowed
to change so we can e.g. avoid to use ETH_ALEN and just have the value instead. In the
other places of the header we use __u32 remote_ipv6[4], __u32 src_ip6[4] etc to denote
a v6 address, we could do the same here and should be all good then.

^ permalink raw reply

* Re: [PATCH v3 next-next] drivers: net: davinci_mdio: prevent spurious timeout
From: Grygorii Strashko @ 2018-05-09 20:36 UTC (permalink / raw)
  To: Sekhar Nori, David S . Miller; +Cc: linux-omap, netdev, Andrew Lunn
In-Reply-To: <20180509154515.5968-1-nsekhar@ti.com>



On 05/09/2018 10:45 AM, Sekhar Nori wrote:
> A well timed kernel preemption in the time_after() loop
> in wait_for_idle() can result in a spurious timeout
> error to be returned.
> 
> Fix it by using readl_poll_timeout() which takes care of
> this issue.
> 
> Reviewed-by: Andrew Lunn <andrew@lunn.ch>
> Signed-off-by: Sekhar Nori <nsekhar@ti.com>
> ---
> v3: simplify return path based on comment from Andrew
> 
> The issue has not been personally observed by me, but has
> been reported by users. Sending for next-next given the
> non-critical nature. There is seems to be no easy way to
> reproduce this.
> 

Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>



-- 
regards,
-grygorii

^ permalink raw reply

* Re: RTL8723BE performance regression
From: João Paulo Rechi Vita @ 2018-05-09 20:33 UTC (permalink / raw)
  To: Pkshih
  Cc: linux-kernel@vger.kernel.org, Larry.Finger@lwfinger.net,
	jprvita@endlessm.com, Birming Chiu, drake@endlessm.com,
	Chaoming_Li, kvalo@codeaurora.org, 莊彥宣,
	derosier@gmail.com, Steven Ting, netdev@vger.kernel.org,
	linux@endlessm.com, Shaofu, linux-wireless@vger.kernel.org
In-Reply-To: <1525768634.2885.11.camel@realtek.com>

On Tue, May 8, 2018 at 1:37 AM, Pkshih <pkshih@realtek.com> wrote:
> On Mon, 2018-05-07 at 14:49 -0700, João Paulo Rechi Vita wrote:
>> On Tue, May 1, 2018 at 10:58 PM, Pkshih <pkshih@realtek.com> wrote:
>> > On Wed, 2018-05-02 at 05:44 +0000, Pkshih wrote:
>> >>
>> >> > -----Original Message-----
>> >> > From: João Paulo Rechi Vita [mailto:jprvita@gmail.com]
>> >> > Sent: Wednesday, May 02, 2018 6:41 AM
>> >> > To: Larry Finger
>> >> > Cc: Steve deRosier; 莊彥宣; Pkshih; Birming Chiu; Shaofu; Steven Ting; Chaoming_Li; Kalle Valo;
>> >> > linux-wireless; Network Development; LKML; Daniel Drake; João Paulo Rechi Vita; linux@endless
>> m.c
>> >> om
>> >> > Subject: Re: RTL8723BE performance regression
>> >> >
>> >> > On Tue, Apr 3, 2018 at 7:51 PM, Larry Finger <Larry.Finger@lwfinger.net> wrote:
>> >> > > On 04/03/2018 09:37 PM, João Paulo Rechi Vita wrote:
>> >> > >>
>> >> > >> On Tue, Apr 3, 2018 at 7:28 PM, Larry Finger <Larry.Finger@lwfinger.net>
>> >> > >> wrote:
>> >> > >>
>> >> > >> (...)
>> >> > >>
>> >> > >>> As the antenna selection code changes affected your first bisection, do
>> >> > >>> you
>> >> > >>> have one of those HP laptops with only one antenna and the incorrect
>> >> > >>> coding
>> >> > >>> in the FUSE?
>> >> > >>
>> >> > >>
>> >> > >> Yes, that is why I've been passing ant_sel=1 during my tests -- this
>> >> > >> was needed to achieve a good performance in the past, before this
>> >> > >> regression. I've also opened the laptop chassis and confirmed the
>> >> > >> antenna cable is plugged to the connector labeled with "1" on the
>> >> > >> card.
>> >> > >>
>> >> > >>> If so, please make sure that you still have the same signal
>> >> > >>> strength for good and bad cases. I have tried to keep the driver and the
>> >> > >>> btcoex code in sync, but there may be some combinations of antenna
>> >> > >>> configuration and FUSE contents that cause the code to fail.
>> >> > >>>
>> >> > >>
>> >> > >> What is the recommended way to monitor the signal strength?
>> >> > >
>> >> > >
>> >> > > The btcoex code is developed for multiple platforms by a different group
>> >> > > than the Linux driver. I think they made a change that caused ant_sel to
>> >> > > switch from 1 to 2. At least numerous comments at
>> >> > > github.com/lwfinger/rtlwifi_new claimed they needed to make that change.
>> >> > >
>> >> > > Mhy recommended method is to verify the wifi device name with "iw dev". Then
>> >> > > using that device
>> >> > >
>> >> > > sudo iw dev <dev_name> scan | egrep "SSID|signal"
>> >> > >
>> >> >
>> >> > I have confirmed that the performance regression is indeed tied to
>> >> > signal strength: on the good cases signal was between -16 and -8 dBm,
>> >> > whereas in bad cases signal was always between -50 to - 40 dBm. I've
>> >> > also switched to testing bandwidth in controlled LAN environment using
>> >> > iperf3, as suggested by Steve deRosier, with the DUT being the only
>> >> > machine connected to the 2.4 GHz radio and the machine running the
>> >> > iperf3 server connected via ethernet.
>> >> >
>> >>
>> >> We have new experimental results in commit af8a41cccf8f46 ("rtlwifi: cleanup
>> >> 8723be ant_sel definition"). You can use the above commit and do the same
>> >> experiments (with ant_sel=0, 1 and 2) in your side, and then share your results.
>> >> Since performance is tied to signal strength, you can only share signal strength.
>> >>
>> >
>> > Please pay attention to cold reboot once ant_sel is changed.
>> >
>>
>> I've tested the commit mentioned above and it fixes the problem on top
>> of v4.16 (in addition to the latest wireless-drivers-next also been
>> fixed as it already contains such commit). On v4.15, we also need the
>> following commits before "af8a41cccf8f rtlwifi: cleanup 8723be ant_sel
>> definition" to have a good performance again:
>>
>>   874e837d67d0 rtlwifi: fill FW version and subversion
>>   a44709bba70f rtlwifi: btcoex: Add power_on_setting routine
>>   40d9dd4f1c5d rtlwifi: btcoex: Remove global variables from btcoex
>
> v4.15 isn't longterm version and had been EOL.
>

Right, but this is a performace regression in comparison to v4.11, so
if "af8a41cccf8f rtlwifi: cleanup 8723be ant_sel definition" is marked
for stable, shouldn't these other patches be brought as well? All
releases since v4.11 are probably affected, but honestly I don't have
a strong understanding of how the stable trees operate in situations
like this.

>>
>> Surprisingly, it seems forcing ant_sel=1 is not needed anymore on
>> these machines, as the shown by the numbers bellow (ant_sel=0 means
>> that actually no parameter was passed to the module). I have powered
>> off the machine and done a cold boot for every test. It seems
>> something have changed in the antenna auto-selection code since v4.11,
>> the latest point where I could confirm we definitely need to force
>> ant_sel=1. I've been trying to understand what causes this difference,
>> but haven't made progress on that so far, so any suggestions are
>> appreciated (we are trying to decide if we can confidently drop the
>> downstream DMI quirks for these specific machines).
>>
> I think your rtl8723be module programed correct efuse content, so it
> works properly with ant_sel=0, and quirk isn't required for your
> machine.
>
>>   w-d-n ant_sel=0: -14.00 dBm,  69.5 Mbps -> good
>>   w-d-n ant_sel=1: -10.00 dBm,  41.1 Mbps -> good
>>   w-d-n ant_sel=2: -44.00 dBm,   607 kbps -> bad
>>
>>   v4.16 ant_sel=0: -12.00 dBm,  63.0 Mbps -> good
>>   v4.16 ant_sel=1: - 8.00 dBm,  69.0 Mbps -> good
>>   v4.16 ant_sel=2: -50.00 dBm,   224 kbps -> bad
>>
>>   v4.15 ant_sel=0: - 8.00 dBm,  33.0 Mbps -> good
>>   v4.15 ant_sel=1: -10.00 dBm,  38.1 Mbps -> good
>>   v4.15 ant_sel=2: -48.00 dBm,   206 kbps -> bad
>>
>
> With your results, the efuse content is programmed as one or two antenna
> on AUX path.
>

With v4.11 I had good performance results on this very same machine
(thus same efuse contents) only when passing ant_sel=1, so there has
to be some change on the code that parses the efuse contents and
decides which antenna will be used.

--
João Paulo Rechi Vita
http://about.me/jprvita

^ permalink raw reply

* Re: [net-next PATCH 1/3] net: Refactor XPS for CPUs and Rx queues
From: Tom Herbert @ 2018-05-09 20:31 UTC (permalink / raw)
  To: Amritha Nambiar
  Cc: Linux Kernel Network Developers, David S. Miller, Alexander Duyck,
	Sridhar Samudrala, Eric Dumazet, Hannes Frederic Sowa
In-Reply-To: <152418626993.5832.6216077462845621375.stgit@anamdev.jf.intel.com>

On Thu, Apr 19, 2018 at 6:04 PM, Amritha Nambiar
<amritha.nambiar@intel.com> wrote:
> Refactor XPS code to support Tx queue selection based on
> CPU map or Rx queue map.
>
> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
> ---
>  include/linux/netdevice.h |   82 +++++++++++++++++-
>  net/core/dev.c            |  206 +++++++++++++++++++++++++++++----------------
>  net/core/net-sysfs.c      |    4 -
>  3 files changed, 216 insertions(+), 76 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 14e0777..40a9171 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -730,10 +730,21 @@ struct xps_map {
>   */
>  struct xps_dev_maps {
>         struct rcu_head rcu;
> -       struct xps_map __rcu *cpu_map[0];
> +       struct xps_map __rcu *attr_map[0];

This seems unnecessarily complicated to me. Why not just add another
map called something like "rxq2txq_map". Then when selecting TXQ just
check the new map first and then the normal cpu_map if there's not a
hit.

>  };
> -#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +         \
> +
> +#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +     \
>         (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
> +
> +#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
> +       (_rxqs * (_tcs) * sizeof(struct xps_map *)))
> +
> +enum xps_map_type {
> +       XPS_MAP_RXQS,
> +       XPS_MAP_CPUS,
> +       __XPS_MAP_MAX
> +};
> +
>  #endif /* CONFIG_XPS */
>
>  #define TC_MAX_QUEUE   16
> @@ -1867,7 +1878,7 @@ struct net_device {
>         int                     watchdog_timeo;
>
>  #ifdef CONFIG_XPS
> -       struct xps_dev_maps __rcu *xps_maps;
> +       struct xps_dev_maps __rcu *xps_maps[__XPS_MAP_MAX];
>  #endif
>  #ifdef CONFIG_NET_CLS_ACT
>         struct mini_Qdisc __rcu *miniq_egress;
> @@ -3204,6 +3215,71 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
>  #ifdef CONFIG_XPS
>  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>                         u16 index);
> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
> +                         u16 index, enum xps_map_type type);
> +
> +static inline bool attr_test_mask(unsigned long j, const unsigned long *mask,
> +                                 unsigned int nr_bits)
> +{
> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
> +       WARN_ON_ONCE(j >= nr_bits);
> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */

This #ifdef block appears 3 times in the patch. Seems like it should
be replace by simple macro.

> +       return test_bit(j, mask);
> +}
> +
> +static inline bool attr_test_online(unsigned long j,
> +                                   const unsigned long *online_mask,
> +                                   unsigned int nr_bits)
> +{
> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
> +       WARN_ON_ONCE(j >= nr_bits);
> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
> +
> +       if (online_mask)
> +               return test_bit(j, online_mask);
> +
> +       if (j >= 0 && j < nr_bits)
> +               return true;
> +
> +       return false;
> +}
> +
> +static inline unsigned int attrmask_next(int n, const unsigned long *srcp,
> +                                        unsigned int nr_bits)
> +{
> +       /* -1 is a legal arg here. */
> +       if (n != -1) {
> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
> +               WARN_ON_ONCE(n >= nr_bits);
> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
> +       }
> +
> +       if (srcp)
> +               return find_next_bit(srcp, nr_bits, n + 1);
> +
> +       return n + 1;
> +}
> +
> +static inline int attrmask_next_and(int n, const unsigned long *src1p,
> +                                   const unsigned long *src2p,
> +                                   unsigned int nr_bits)
> +{
> +       /* -1 is a legal arg here. */
> +       if (n != -1) {
> +#ifdef CONFIG_DEBUG_PER_CPU_MAPS
> +               WARN_ON_ONCE(n >= nr_bits);
> +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
> +       }
> +
> +       if (src1p && src2p)
> +               return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
> +       else if (src1p)
> +               return find_next_bit(src1p, nr_bits, n + 1);
> +       else if (src2p)
> +               return find_next_bit(src2p, nr_bits, n + 1);
> +
> +       return n + 1;
> +}
>  #else
>  static inline int netif_set_xps_queue(struct net_device *dev,
>                                       const struct cpumask *mask,
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a490ef6..17c4883 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2092,7 +2092,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>         int pos;
>
>         if (dev_maps)
> -               map = xmap_dereference(dev_maps->cpu_map[tci]);
> +               map = xmap_dereference(dev_maps->attr_map[tci]);
>         if (!map)
>                 return false;
>
> @@ -2105,7 +2105,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
>                         break;
>                 }
>
> -               RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
> +               RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
>                 kfree_rcu(map, rcu);
>                 return false;
>         }
> @@ -2138,30 +2138,47 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
>  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
>                                    u16 count)
>  {
> +       const unsigned long *possible_mask = NULL;
> +       enum xps_map_type type = XPS_MAP_RXQS;
>         struct xps_dev_maps *dev_maps;
> -       int cpu, i;
>         bool active = false;
> +       unsigned int nr_ids;
> +       int i, j;
>
>         mutex_lock(&xps_map_mutex);
> -       dev_maps = xmap_dereference(dev->xps_maps);
>
> -       if (!dev_maps)
> -               goto out_no_maps;
> +       while (type < __XPS_MAP_MAX) {
> +               dev_maps = xmap_dereference(dev->xps_maps[type]);
> +               if (!dev_maps)
> +                       goto out_no_maps;
> +
> +               if (type == XPS_MAP_CPUS) {
> +                       if (num_possible_cpus() > 1)
> +                               possible_mask = cpumask_bits(cpu_possible_mask);
> +                       nr_ids = nr_cpu_ids;
> +               } else if (type == XPS_MAP_RXQS) {
> +                       nr_ids = dev->num_rx_queues;
> +               }
>
> -       for_each_possible_cpu(cpu)
> -               active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
> -                                              offset, count);
> +               for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
> +                    j < nr_ids;)
> +                       active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
> +                                                      count);
> +               if (!active) {
> +                       RCU_INIT_POINTER(dev->xps_maps[type], NULL);
> +                       kfree_rcu(dev_maps, rcu);
> +               }
>
> -       if (!active) {
> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
> -               kfree_rcu(dev_maps, rcu);
> +               if (type == XPS_MAP_CPUS) {
> +                       for (i = offset + (count - 1); count--; i--)
> +                               netdev_queue_numa_node_write(
> +                                       netdev_get_tx_queue(dev, i),
> +                                                           NUMA_NO_NODE);
> +               }
> +out_no_maps:
> +               type++;
>         }
>
> -       for (i = offset + (count - 1); count--; i--)
> -               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
> -                                            NUMA_NO_NODE);
> -
> -out_no_maps:
>         mutex_unlock(&xps_map_mutex);
>  }
>
> @@ -2170,11 +2187,11 @@ static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
>         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
>  }
>
> -static struct xps_map *expand_xps_map(struct xps_map *map,
> -                                     int cpu, u16 index)
> +static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
> +                                     u16 index, enum xps_map_type type)
>  {
> -       struct xps_map *new_map;
>         int alloc_len = XPS_MIN_MAP_ALLOC;
> +       struct xps_map *new_map = NULL;
>         int i, pos;
>
>         for (pos = 0; map && pos < map->len; pos++) {
> @@ -2183,7 +2200,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>                 return map;
>         }
>
> -       /* Need to add queue to this CPU's existing map */
> +       /* Need to add tx-queue to this CPU's/rx-queue's existing map */
>         if (map) {
>                 if (pos < map->alloc_len)
>                         return map;
> @@ -2191,9 +2208,14 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>                 alloc_len = map->alloc_len * 2;
>         }
>
> -       /* Need to allocate new map to store queue on this CPU's map */
> -       new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
> -                              cpu_to_node(cpu));
> +       /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
> +        *  map
> +        */
> +       if (type == XPS_MAP_RXQS)
> +               new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
> +       else if (type == XPS_MAP_CPUS)
> +               new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
> +                                      cpu_to_node(attr_index));
>         if (!new_map)
>                 return NULL;
>
> @@ -2205,14 +2227,16 @@ static struct xps_map *expand_xps_map(struct xps_map *map,
>         return new_map;
>  }
>
> -int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
> -                       u16 index)
> +int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
> +                         u16 index, enum xps_map_type type)
>  {
> +       const unsigned long *online_mask = NULL, *possible_mask = NULL;
>         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
> -       int i, cpu, tci, numa_node_id = -2;
> +       int i, j, tci, numa_node_id = -2;
>         int maps_sz, num_tc = 1, tc = 0;
>         struct xps_map *map, *new_map;
>         bool active = false;
> +       unsigned int nr_ids;
>
>         if (dev->num_tc) {
>                 num_tc = dev->num_tc;
> @@ -2221,16 +2245,33 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>                         return -EINVAL;
>         }
>
> -       maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
> +       switch (type) {
> +       case XPS_MAP_RXQS:
> +               maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
> +               dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_RXQS]);
> +               nr_ids = dev->num_rx_queues;
> +               break;
> +       case XPS_MAP_CPUS:
> +               maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
> +               if (num_possible_cpus() > 1) {
> +                       online_mask = cpumask_bits(cpu_online_mask);
> +                       possible_mask = cpumask_bits(cpu_possible_mask);
> +               }
> +               dev_maps = xmap_dereference(dev->xps_maps[XPS_MAP_CPUS]);
> +               nr_ids = nr_cpu_ids;
> +               break;
> +       default:
> +               return -EINVAL;
> +       }
> +
>         if (maps_sz < L1_CACHE_BYTES)
>                 maps_sz = L1_CACHE_BYTES;
>
>         mutex_lock(&xps_map_mutex);
>
> -       dev_maps = xmap_dereference(dev->xps_maps);
> -
>         /* allocate memory for queue storage */
> -       for_each_cpu_and(cpu, cpu_online_mask, mask) {
> +       for (j = -1; j = attrmask_next_and(j, online_mask, mask, nr_ids),
> +            j < nr_ids;) {
>                 if (!new_dev_maps)
>                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
>                 if (!new_dev_maps) {
> @@ -2238,73 +2279,81 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>                         return -ENOMEM;
>                 }
>
> -               tci = cpu * num_tc + tc;
> -               map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
> +               tci = j * num_tc + tc;
> +               map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
>                                  NULL;
>
> -               map = expand_xps_map(map, cpu, index);
> +               map = expand_xps_map(map, j, index, type);
>                 if (!map)
>                         goto error;
>
> -               RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
> +               RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>         }
>
>         if (!new_dev_maps)
>                 goto out_no_new_maps;
>
> -       for_each_possible_cpu(cpu) {
> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
> +            j < nr_ids;) {
>                 /* copy maps belonging to foreign traffic classes */
> -               for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
> +               for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
>                         /* fill in the new device map from the old device map */
> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>                 }
>
>                 /* We need to explicitly update tci as prevous loop
>                  * could break out early if dev_maps is NULL.
>                  */
> -               tci = cpu * num_tc + tc;
> +               tci = j * num_tc + tc;
>
> -               if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
> -                       /* add queue to CPU maps */
> +               if (attr_test_mask(j, mask, nr_ids) &&
> +                   attr_test_online(j, online_mask, nr_ids)) {
> +                       /* add tx-queue to CPU/rx-queue maps */
>                         int pos = 0;
>
> -                       map = xmap_dereference(new_dev_maps->cpu_map[tci]);
> +                       map = xmap_dereference(new_dev_maps->attr_map[tci]);
>                         while ((pos < map->len) && (map->queues[pos] != index))
>                                 pos++;
>
>                         if (pos == map->len)
>                                 map->queues[map->len++] = index;
>  #ifdef CONFIG_NUMA
> -                       if (numa_node_id == -2)
> -                               numa_node_id = cpu_to_node(cpu);
> -                       else if (numa_node_id != cpu_to_node(cpu))
> -                               numa_node_id = -1;
> +                       if (type == XPS_MAP_CPUS) {
> +                               if (numa_node_id == -2)
> +                                       numa_node_id = cpu_to_node(j);
> +                               else if (numa_node_id != cpu_to_node(j))
> +                                       numa_node_id = -1;
> +                       }
>  #endif
>                 } else if (dev_maps) {
>                         /* fill in the new device map from the old device map */
> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>                 }
>
>                 /* copy maps belonging to foreign traffic classes */
>                 for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
>                         /* fill in the new device map from the old device map */
> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
> -                       RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
> +                       RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
>                 }
>         }
>
> -       rcu_assign_pointer(dev->xps_maps, new_dev_maps);
> +       if (type == XPS_MAP_RXQS)
> +               rcu_assign_pointer(dev->xps_maps[XPS_MAP_RXQS], new_dev_maps);
> +       else if (type == XPS_MAP_CPUS)
> +               rcu_assign_pointer(dev->xps_maps[XPS_MAP_CPUS], new_dev_maps);
>
>         /* Cleanup old maps */
>         if (!dev_maps)
>                 goto out_no_old_maps;
>
> -       for_each_possible_cpu(cpu) {
> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
> -                       map = xmap_dereference(dev_maps->cpu_map[tci]);
> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
> +            j < nr_ids;) {
> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
> +                       map = xmap_dereference(dev_maps->attr_map[tci]);
>                         if (map && map != new_map)
>                                 kfree_rcu(map, rcu);
>                 }
> @@ -2317,19 +2366,23 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>         active = true;
>
>  out_no_new_maps:
> -       /* update Tx queue numa node */
> -       netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
> -                                    (numa_node_id >= 0) ? numa_node_id :
> -                                    NUMA_NO_NODE);
> +       if (type == XPS_MAP_CPUS) {
> +               /* update Tx queue numa node */
> +               netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
> +                                            (numa_node_id >= 0) ?
> +                                            numa_node_id : NUMA_NO_NODE);
> +       }
>
>         if (!dev_maps)
>                 goto out_no_maps;
>
> -       /* removes queue from unused CPUs */
> -       for_each_possible_cpu(cpu) {
> -               for (i = tc, tci = cpu * num_tc; i--; tci++)
> +       /* removes tx-queue from unused CPUs/rx-queues */
> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
> +            j < nr_ids;) {
> +               for (i = tc, tci = j * num_tc; i--; tci++)
>                         active |= remove_xps_queue(dev_maps, tci, index);
> -               if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
> +               if (!attr_test_mask(j, mask, nr_ids) ||
> +                   !attr_test_online(j, online_mask, nr_ids))
>                         active |= remove_xps_queue(dev_maps, tci, index);
>                 for (i = num_tc - tc, tci++; --i; tci++)
>                         active |= remove_xps_queue(dev_maps, tci, index);
> @@ -2337,7 +2390,10 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>
>         /* free map if not active */
>         if (!active) {
> -               RCU_INIT_POINTER(dev->xps_maps, NULL);
> +               if (type == XPS_MAP_RXQS)
> +                       RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_RXQS], NULL);
> +               else if (type == XPS_MAP_CPUS)
> +                       RCU_INIT_POINTER(dev->xps_maps[XPS_MAP_CPUS], NULL);
>                 kfree_rcu(dev_maps, rcu);
>         }
>
> @@ -2347,11 +2403,12 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>         return 0;
>  error:
>         /* remove any maps that we added */
> -       for_each_possible_cpu(cpu) {
> -               for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
> -                       new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
> +       for (j = -1; j = attrmask_next(j, possible_mask, nr_ids),
> +            j < nr_ids;) {
> +               for (i = num_tc, tci = j * num_tc; i--; tci++) {
> +                       new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
>                         map = dev_maps ?
> -                             xmap_dereference(dev_maps->cpu_map[tci]) :
> +                             xmap_dereference(dev_maps->attr_map[tci]) :
>                               NULL;
>                         if (new_map && new_map != map)
>                                 kfree(new_map);
> @@ -2363,6 +2420,13 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
>         kfree(new_dev_maps);
>         return -ENOMEM;
>  }
> +
> +int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
> +                       u16 index)
> +{
> +       return __netif_set_xps_queue(dev, cpumask_bits(mask), index,
> +                                    XPS_MAP_CPUS);
> +}
>  EXPORT_SYMBOL(netif_set_xps_queue);
>
>  #endif
> @@ -3400,7 +3464,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>         int queue_index = -1;
>
>         rcu_read_lock();
> -       dev_maps = rcu_dereference(dev->xps_maps);
> +       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>         if (dev_maps) {
>                 unsigned int tci = skb->sender_cpu - 1;
>
> @@ -3409,7 +3473,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>                         tci += netdev_get_prio_tc_map(dev, skb->priority);
>                 }
>
> -               map = rcu_dereference(dev_maps->cpu_map[tci]);
> +               map = rcu_dereference(dev_maps->attr_map[tci]);
>                 if (map) {
>                         if (map->len == 1)
>                                 queue_index = map->queues[0];
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index c476f07..d7abd33 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
>         }
>
>         rcu_read_lock();
> -       dev_maps = rcu_dereference(dev->xps_maps);
> +       dev_maps = rcu_dereference(dev->xps_maps[XPS_MAP_CPUS]);
>         if (dev_maps) {
>                 for_each_possible_cpu(cpu) {
>                         int i, tci = cpu * num_tc + tc;
>                         struct xps_map *map;
>
> -                       map = rcu_dereference(dev_maps->cpu_map[tci]);
> +                       map = rcu_dereference(dev_maps->attr_map[tci]);
>                         if (!map)
>                                 continue;
>
>

^ permalink raw reply

* Re: [PATCH v6 12/13] Documentation: remove stale firmware API reference
From: Luis R. Rodriguez @ 2018-05-09 19:43 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: Luis R. Rodriguez, gregkh, akpm, keescook, josh, maco, andy.gross,
	david.brown, bjorn.andersson, teg, wagi, hdegoede, andresx7,
	zohar, kubakici, shuah, mfuzzey, dhowells, pali.rohar, tiwai,
	kvalo, arend.vanspriel, zajec5, nbroeking, markivx, broonie,
	dmitry.torokhov, dwmw2, torvalds, Abhay_Salunke, jewalt, oneukum,
	cantabile.desu, ast, hare, jejb
In-Reply-To: <20180509121209.1e332f63@vento.lan>

On Wed, May 09, 2018 at 12:12:09PM -0300, Mauro Carvalho Chehab wrote:
> Em Tue,  8 May 2018 11:12:46 -0700
> "Luis R. Rodriguez" <mcgrof@kernel.org> escreveu:
> 
> > It refers to a pending patch, but this was merged eons ago.
> 
> Didn't know that such patch was already merged. Great!
> 
> Regards,
> Mauro
> 
> > 
> > Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
> > ---
> >  Documentation/dell_rbu.txt | 3 ---
> >  1 file changed, 3 deletions(-)
> > 
> > diff --git a/Documentation/dell_rbu.txt b/Documentation/dell_rbu.txt
> > index 0fdb6aa2704c..077fdc29a0d0 100644
> > --- a/Documentation/dell_rbu.txt
> > +++ b/Documentation/dell_rbu.txt
> > @@ -121,9 +121,6 @@ read back the image downloaded.
> >  
> >  .. note::
> >  
> > -   This driver requires a patch for firmware_class.c which has the modified
> > -   request_firmware_nowait function.
> > -
> 
> >     Also after updating the BIOS image a user mode application needs to execute
> >     code which sends the BIOS update request to the BIOS. So on the next reboot
> >     the BIOS knows about the new image downloaded and it updates itself.
> 
> You should likely remove the "Also" here.

Good catch. Will adjust.

> 
> With that,
> 
> Reviewed-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>

Great, thanks.

  Luis

^ permalink raw reply

* pull-request: mac80211-next 2018-05-09
From: Johannes Berg @ 2018-05-09 19:40 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-wireless

Hi Dave,

Nothing major here either. I'm still waiting for the HE/802.11ax
stuff, but even when we do get that it'll just add rates and not
be very exciting at this point :-)

Please pull and let me know if there's any problem.

Thanks,
johannes



The following changes since commit 415787d7799f4fccbe8d49cb0b8e5811be6b0389:

  ipv6: frags: fix a lockdep false positive (2018-04-18 23:19:39 -0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git tags/mac80211-next-for-davem-2018-05-09

for you to fetch changes up to 57c6cb81717f957fb741f2e4c79bd0e2f4f55910:

  mac80211: ethtool: avoid 32 bit multiplication overflow (2018-05-08 15:02:03 +0200)

----------------------------------------------------------------
While we're still waiting for HE/802.11ax code, we just
have a few things:
 * some new statistics (ACK RSSI, TXQ)
 * TXQ configuration
 * infrastructure to handle CSA in firmware
 * and some various cleanups/improvements

----------------------------------------------------------------
Amar Singhal (1):
      cfg80211: Call reg_notifier for self managed hints conditionally

Balaji Pothunoori (2):
      cfg80211: average ack rssi support for data frames
      mac80211: average ack rssi support for data frames

Bjoern Johansson (1):
      mac80211_hwsim: indicate support for powersave.

Colin Ian King (1):
      mac80211: ethtool: avoid 32 bit multiplication overflow

Gregory Greenman (1):
      mac80211: add api to set CSA counter in mac80211

Haim Dreyfuss (1):
      nl80211: Add wmm rule attribute to NL80211_CMD_GET_WIPHY dump command

Johannes Berg (4):
      mac80211: rename rtap_vendor_space to rtap_space
      mac80211: clean up rate info bandwidth setting
      mac80211: ethtool: memset the whole sinfo struct to 0
      mac80211: remove pointless flags=0 assignment

Toke Høiland-Jørgensen (3):
      regulatory: Rename confusing 'country IE' in log output
      cfg80211: Expose TXQ stats and parameters to userspace
      mac80211: Support the new cfg80211 TXQ stats API

 drivers/net/wireless/mac80211_hwsim.c |   1 +
 include/net/cfg80211.h                |  53 +++++++
 include/net/mac80211.h                |  13 ++
 include/uapi/linux/nl80211.h          |  93 ++++++++++++
 net/mac80211/cfg.c                    |  99 +++++++++++++
 net/mac80211/ethtool.c                |  37 +++--
 net/mac80211/ieee80211_i.h            |   3 +
 net/mac80211/main.c                   |   3 +
 net/mac80211/rx.c                     |  40 +++---
 net/mac80211/sta_info.c               |  24 +++-
 net/mac80211/sta_info.h               |   2 +
 net/mac80211/status.c                 |   2 +
 net/mac80211/tx.c                     |  45 ++++++
 net/mac80211/util.c                   |   6 +-
 net/wireless/nl80211.c                | 262 ++++++++++++++++++++++++++++++----
 net/wireless/rdev-ops.h               |  12 ++
 net/wireless/reg.c                    |  37 ++++-
 net/wireless/trace.h                  |  14 ++
 net/wireless/wext-compat.c            |  23 +--
 19 files changed, 683 insertions(+), 86 deletions(-)

^ permalink raw reply

* Re: KASAN: use-after-free Read in __dev_queue_xmit
From: Eric Dumazet @ 2018-05-09 19:36 UTC (permalink / raw)
  To: Willem de Bruijn, Eric Biggers
  Cc: Eric Dumazet, syzbot, alexander.deucher, Andrey Konovalov,
	Anoob Soman, chris, David Miller, Reshetova, Elena,
	Greg Kroah-Hartman, Kees Cook, LKML, Mike Maloney, mchehab,
	netdev, Rosen, Rami, Sowmini Varadhan, syzkaller-bugs,
	Willem de Bruijn
In-Reply-To: <CAF=yD-JPmU-q1iUoxtmyQa4BM=fU3CMR3BUDGNMg-4v8=2_deA@mail.gmail.com>



On 05/09/2018 12:21 PM, Willem de Bruijn wrote:

> Indeed. The skb shared info struct is zeroed by dev_validate_header
> as a result of dev->hard_header_len exceeding skb->end - skb->data.
> 
> Not exactly sure yet how this can happen. The hard header length space
> is accounted for during allocation as reserved memory. But,
> packet_alloc_skb does call skb_reserve(), moving skb->data
> effectively beyond this reserved region.
> 
> It may be incorrect to pass skb->data to dev_validate_header, as that
> does not point to the start of the ll_header anymore. Still figuring out what
> the right fix is..
> 

I believe the bug happens if the sock_wmalloc() call at line 1921 has to sleep.

device can change (or at lest dev->hard_header_len can change)

So we need to bailout if reserved/hhlen had changed.

Or revert some patches, since dev_hold() and dev_put() are no longer high cost,
since it is now using per cpu counter.

 

^ permalink raw reply

* pull-request: mac80211 2018-05-09
From: Johannes Berg @ 2018-05-09 19:36 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-wireless

Hi Dave,

We just have a few fixes this time around.

Please pull and let me know if there's any problem.

Thanks,
johannes



The following changes since commit 64e86fec54069266ba32be551d7b7f75e88ab60c:

  net: qualcomm: rmnet: Fix warning seen with fill_info (2018-04-18 21:23:06 -0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git tags/mac80211-for-davem-2018-05-09

for you to fetch changes up to 914eac248d876f9c00cd1792ffec3d182c863f13:

  mac80211: use timeout from the AddBA response instead of the request (2018-05-07 20:35:15 +0200)

----------------------------------------------------------------
We only have a few fixes this time:
 * WMM element validation
 * SAE timeout
 * add-BA timeout
 * docbook parsing
 * a few memory leaks in error paths

----------------------------------------------------------------
Ilan Peer (2):
      mac80211: Fix condition validating WMM IE
      mac80211: Adjust SAE authentication timeout

Johan Hovold (1):
      rfkill: gpio: fix memory leak in probe error path

Johannes Berg (1):
      cfg80211: limit wiphy names to 128 bytes

Randy Dunlap (1):
      mac80211: fix kernel-doc "bad line" warning

Sara Sharon (1):
      mac80211: use timeout from the AddBA response instead of the request

Srinivas Dasari (1):
      nl80211: Free connkeys on external authentication failure

YueHaibing (1):
      mac80211_hwsim: fix a possible memory leak in hwsim_new_radio_nl()

weiyongjun (A) (1):
      cfg80211: fix possible memory leak in regdb_query_country()

 drivers/net/wireless/mac80211_hwsim.c |  1 +
 include/net/mac80211.h                |  2 +-
 include/uapi/linux/nl80211.h          |  2 ++
 net/mac80211/agg-tx.c                 |  4 ++++
 net/mac80211/mlme.c                   | 27 +++++++++++++++++++--------
 net/mac80211/tx.c                     |  3 ++-
 net/rfkill/rfkill-gpio.c              |  7 ++++++-
 net/wireless/core.c                   |  3 +++
 net/wireless/nl80211.c                |  1 +
 net/wireless/reg.c                    |  1 +
 10 files changed, 40 insertions(+), 11 deletions(-)

^ permalink raw reply

* Re: [PATCH net-next v3 0/2] socket statistics for ss
From: Stephen Hemminger @ 2018-05-09 19:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: davem, gerrit, kuznet, yoshfuji, netdev, dccp, Stephen Hemminger
In-Reply-To: <ba77dfc0-56fe-9c1c-f68d-e8c06ffa1cb4@gmail.com>

On Wed, 9 May 2018 11:53:50 -0700
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On 05/09/2018 11:44 AM, Stephen Hemminger wrote:
> > On Wed, 9 May 2018 10:53:58 -0700
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >   
> >> On 05/09/2018 10:31 AM, Stephen Hemminger wrote:  
> >>> On Wed, 9 May 2018 10:18:23 -0700
> >>> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >>>     
> >>>> On 05/09/2018 08:22 AM, Stephen Hemminger wrote:
> >>>>    
> >>>>> I am not sure if these patches are worth applying.
> >>>>> The 'ss -s' command has had missing values since 2.4 kernel.
> >>>>> And the first complaints came in only this year.
> >>>>>
> >>>>> Another alternative would be just to remove these fields from ss -s
> >>>>> output and move on.
> >>>>>       
> >>>>
> >>>> Anyway your patches are not netns ready, so lets remove these fields from ss.
> >>>>
> >>>> Or you have to spend _much_ more time on writing and testing the kernel part.
> >>>>
> >>>> Thanks.    
> >>>
> >>> The patches only expose the existing TCP socket accounting infrastructure.
> >>> Several other pieces that sockstat has are not netns aware.
> >>> That is a completely different problem.    
> >>
> >>
> >> Adding a new field counting 'bounds ports' without being netns ready is a total mistake,
> >> as it is useless by current standards.
> >>
> >> The first thing that users will do is add proper netns support, with extra complexity in the kernel.
> >>
> >> So, instead of pushing some incomplete feature, trying to fool ourselves with a sentiment of 'small cost'
> >> that will later need another 100 lines of code in the kernel, please give us the complete picture.
> >>
> >> I am just saying, you can of course ignore my feedback.  
> > 
> > The current TCP hashinfo should be moved into netns. The current method of scanning and matching
> > by net namespace is a scalability issue now.  
> 
> It is not the plan yet, and we have no scalability issue.
> 
> Before switching to netns hash table, this would need rhashtable conversions
> but so far this has not been done.
> 
> - Time to create/delete netns is critical.
> - Adding few Mbytes of overhead per netns is a nogo,
> 
> Please do not change subject, this is adding noise to this particular thread.

Back to original subject. My current intention is to just pull all these statistics
from ss command. They are always zero now, and very few people noticed and  no one
really needs them.

^ permalink raw reply

* Re: [PATCH net-next v10 3/4] netdev: octeon-ethernet: Add Cavium Octeon III support.
From: Steven J. Hill @ 2018-05-09 19:29 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20180508.222824.409067086455315118.davem@davemloft.net>

On 05/08/2018 09:28 PM, David Miller wrote:
> 
> That's all I have the stomache for at the moment.
> 
> This thing is really large, making it nearly impossible to review
> as one huge patch #3. Perhaps you can find a way to split it up
> logically somehow?
> 
Hey David.

This code was inherited by me, so there a lot of parts I personally
have not looked at. The third patch indeed needs to be broken up.
Let me devote some hours to cleaning, splitting and simplifying. My
apologies for the time you spent looking at it. The first patch,
however, only makes changes in the existing 'staging' directory. It
seems like that one could go in without trouble? Cheers.

Steve

^ permalink raw reply

* Re: [PATCH net-next 3/3] net/mlx4_core: Use msi_x module param to limit num of MSI-X irqs
From: Ajaykumar Hotchandani @ 2018-05-09 19:29 UTC (permalink / raw)
  To: Tariq Toukan, David S. Miller; +Cc: netdev, Eran Ben Elisha
In-Reply-To: <1525879744-1858-4-git-send-email-tariqt@mellanox.com>

Thanks Tariq.

Reviewed-by: Ajaykumar Hotchandani <ajaykumar.hotchandani@oracle.com>

On 05/09/2018 08:29 AM, Tariq Toukan wrote:
> Extend the boolean interpretation of msi_x module parameter
> to numerical, as follows:
>
> 0   - Don't use MSI-X.
> 1   - Use MSI-X, driver decides the num of MSI-X irqs.
>> =2 - Use MSI-X, limit number of MSI-X irqs to msi_x.
>        In SRIOV, this limits the number of MSI-X irqs per VF.
>
> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
> Cc: Ajaykumar Hotchandani <ajaykumar.hotchandani@oracle.com>
> ---
>   drivers/net/ethernet/mellanox/mlx4/main.c | 10 +++++++++-
>   1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
> index b6aaf34d6648..80a75c80a463 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/main.c
> @@ -73,7 +73,7 @@
>   
>   static int msi_x = 1;
>   module_param(msi_x, int, 0444);
> -MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
> +MODULE_PARM_DESC(msi_x, "0 - don't use MSI-X, 1 - use MSI-X, >1 - limit number of MSI-X irqs to msi_x");
>   
>   #else /* CONFIG_PCI_MSI */
>   
> @@ -2815,6 +2815,9 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
>   				dev->caps.num_eqs - dev->caps.reserved_eqs,
>   				MAX_MSIX);
>   
> +		if (msi_x > 1)
> +			nreq = min_t(int, nreq, msi_x);
> +
>   		entries = kcalloc(nreq, sizeof(*entries), GFP_KERNEL);
>   		if (!entries)
>   			goto no_msi;
> @@ -4182,6 +4185,11 @@ static int mlx4_resume(struct pci_dev *pdev)
>   
>   static int __init mlx4_verify_params(void)
>   {
> +	if (msi_x < 0) {
> +		pr_warn("mlx4_core: bad msi_x: %d\n", msi_x);
> +		return -1;
> +	}
> +
>   	if ((log_num_mac < 0) || (log_num_mac > 7)) {
>   		pr_warn("mlx4_core: bad num_mac: %d\n", log_num_mac);
>   		return -1;

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox