Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC bpf-next 07/11] bpf: Add helper to retrieve socket in BPF
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

This patch adds a new BPF helper function, sk_lookup() which allows BPF
programs to find out if there is a socket listening on this host, and
returns a socket pointer which the BPF program can then access to
determine, for instance, whether to forward or drop traffic. sk_lookup()
takes a reference on the socket, so when a BPF program makes use of this
function, it must subsequently pass the returned pointer into the newly
added sk_release() to return the reference.

By way of example, the following pseudocode would filter inbound
connections at TC ingress if there is no corresponding service listening
for the traffic:

  struct bpf_sock_tuple tuple;
  struct bpf_sock_ops *sk;

  populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
  sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
  if (!sk) {
    // Couldn't find a socket listening for this traffic. Drop.
    return TC_ACT_SHOT;
  }
  bpf_sk_release(sk, 0);
  return TC_ACT_OK;

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 include/uapi/linux/bpf.h                  |  39 +++++++++++-
 kernel/bpf/verifier.c                     |   8 ++-
 net/core/filter.c                         | 102 ++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            |  40 +++++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |   7 ++
 5 files changed, 193 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d615c777b573..29f38838dbca 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1828,6 +1828,25 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
+ * 	Description
+ * 		Look for socket matching 'tuple'. The return value must be checked,
+ * 		and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ *  int bpf_sk_release(sock, flags)
+ * 	Description
+ * 		Release the reference held by 'sock'.
+ * 		@sock: Pointer reference to release. Must be found via bpf_sk_lookup().
+ * 		@flags: flags value
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1898,7 +1917,9 @@ union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),	\
+	FN(sk_lookup),			\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2060,6 +2081,22 @@ struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} saddr;
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} daddr;
+	__be16 sport;
+	__be16 dport;
+	__u32 dst_if;
+	__u8 family;
+	__u8 proto;
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92b9a5dc465a..579012c483e4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
  * passes through a NULL-check conditional. For the branch wherein the state is
  * changed to CONST_IMM, the verifier releases the reference.
+ *
+ * For each helper function that allocates a reference, such as bpf_sk_lookup(),
+ * there is a corresponding release function, such as bpf_sk_release(). When
+ * a reference type passes into the release function, the verifier also releases
+ * the reference. If any unchecked or unreleased reference remains at the end of
+ * the program, the verifier rejects it.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -277,7 +283,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type)
  */
 static bool is_release_function(enum bpf_func_id func_id)
 {
-	return false;
+	return func_id == BPF_FUNC_sk_release;
 }
 
 /* string representation of 'enum bpf_reg_type' */
diff --git a/net/core/filter.c b/net/core/filter.c
index 4c35152fb3a8..751c255d17d3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -58,8 +58,12 @@
 #include <net/busy_poll.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
+#include <net/udp.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/net_namespace.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+/* Find the TCP/UDP socket matching 'tuple' in 'net'. Returns the lookup
+ * result, or ERR_PTR(-EOPNOTSUPP) for an unsupported family or protocol.
+ */
+static struct sock *
+sk_lookup(struct net *net, struct bpf_sock_tuple *tuple)
+{
+	struct in6_addr *src6 = (struct in6_addr *)&tuple->saddr.ipv6;
+	struct in6_addr *dst6 = (struct in6_addr *)&tuple->daddr.ipv6;
+	int dst_if = (int)tuple->dst_if;
+
+	/* Reject anything that is neither IPv4 nor IPv6 up front. */
+	if (tuple->family != AF_INET && tuple->family != AF_INET6)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (tuple->proto == IPPROTO_TCP) {
+		if (tuple->family == AF_INET)
+			return inet_lookup(net, &tcp_hashinfo, NULL, 0,
+					   tuple->saddr.ipv4, tuple->sport,
+					   tuple->daddr.ipv4, tuple->dport,
+					   dst_if);
+		else
+			return inet6_lookup(net, &tcp_hashinfo, NULL, 0,
+					    src6, tuple->sport,
+					    dst6, tuple->dport, dst_if);
+	} else if (tuple->proto == IPPROTO_UDP) {
+		if (tuple->family == AF_INET)
+			return udp4_lib_lookup(net, tuple->saddr.ipv4,
+					       tuple->sport, tuple->daddr.ipv4,
+					       tuple->dport, dst_if);
+		else
+			return udp6_lib_lookup(net, src6, tuple->sport,
+					       dst6, tuple->dport, dst_if);
+	}
+
+	/* Only TCP and UDP lookups are supported. */
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+BPF_CALL_5(bpf_sk_lookup, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+	struct net *caller_net = dev_net(skb->dev);
+	struct sock *sk = NULL;
+	struct net *net;
+
+	/* XXX: Perform verification-time checking of tuple size? */
+	if (unlikely(len != sizeof(struct bpf_sock_tuple) || flags))
+		goto out;
+
+	net = get_net_ns_by_id(caller_net, netns_id);
+	if (unlikely(!net))
+		goto out;
+
+	sk = sk_lookup(net, tuple);
+	put_net(net);
+	if (IS_ERR_OR_NULL(sk))
+		sk = NULL;
+	else
+		sk = sk_to_full_sk(sk);
+out:
+	return (unsigned long) sk;
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_proto = {
+	.func		= bpf_sk_lookup,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
+{
+	sock_gen_put(sk);
+	if (unlikely(flags))
+		return -EINVAL;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_release_proto = {
+	.func		= bpf_sk_release,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_SOCKET,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -4181,6 +4275,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
+	case BPF_FUNC_sk_lookup:
+		return &bpf_sk_lookup_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4292,6 +4390,10 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_uid_proto;
 	case BPF_FUNC_sk_redirect_map:
 		return &bpf_sk_redirect_map_proto;
+	case BPF_FUNC_sk_lookup:
+		return &bpf_sk_lookup_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index fff51c187d1e..29f38838dbca 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -117,6 +117,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
 };
 
 enum bpf_prog_type {
@@ -1827,6 +1828,25 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
+ * 	Description
+ * 		Look for socket matching 'tuple'. The return value must be checked,
+ * 		and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ *  int bpf_sk_release(sock, flags)
+ * 	Description
+ * 		Release the reference held by 'sock'.
+ * 		@sock: Pointer reference to release. Must be found via bpf_sk_lookup().
+ * 		@flags: flags value
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1897,7 +1917,9 @@ union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),	\
+	FN(sk_lookup),			\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2059,6 +2081,22 @@ struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} saddr;
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} daddr;
+	__be16 sport;
+	__be16 dport;
+	__u32 dst_if;
+	__u8 family;
+	__u8 proto;
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 265f8e0e8ada..4dc311ea0c16 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -103,6 +103,13 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 	(void *) BPF_FUNC_skb_get_xfrm_state;
 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
 	(void *) BPF_FUNC_get_stack;
+static struct bpf_sock *(*bpf_sk_lookup)(void *ctx,
+					 struct bpf_sock_tuple *tuple,
+					 int size, unsigned int netns_id,
+					 unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup;
+static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) =
+	(void *) BPF_FUNC_sk_release;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 08/11] selftests/bpf: Add tests for reference tracking
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

Add verifier tests covering socket reference tracking through
bpf_sk_lookup() / bpf_sk_release():

reference tracking: leak potential reference
reference tracking: leak potential reference on stack
reference tracking: leak potential reference on stack 2
reference tracking: zero potential reference
reference tracking: copy and zero potential references
reference tracking: release reference without check
reference tracking: release reference
reference tracking: release reference 2
reference tracking: release reference twice
reference tracking: release reference twice inside branch
reference tracking: alloc, check, free in one subbranch
reference tracking: alloc, check, free in both subbranches
reference tracking in call: free reference in subprog
reference tracking in call: free reference in subprog and outside
reference tracking in call: alloc & leak reference in subprog
reference tracking in call: alloc in subprog, release outside
reference tracking in call: sk_ptr leak into caller stack
reference tracking in call: sk_ptr spill into caller stack

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 tools/testing/selftests/bpf/test_verifier.c | 359 ++++++++++++++++++++++++++++
 1 file changed, 359 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 53439f40e1de..150c7c19eb51 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2017 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -152,6 +153,23 @@ static void bpf_fill_jump_around_ld_abs(struct bpf_test *self)
 	insn[i] = BPF_EXIT_INSN();
 }
 
+#define BPF_SK_LOOKUP						\
+	/* struct bpf_sock_tuple tuple = {} */			\
+	BPF_MOV64_IMM(BPF_REG_2, 0),				\
+	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -16),	\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -24),	\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -32),	\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -40),	\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -48),	\
+	/* sk = sk_lookup(ctx, &tuple, sizeof tuple, 0, 0) */	\
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),			\
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48),			\
+	BPF_MOV64_IMM(BPF_REG_3, 44),				\
+	BPF_MOV64_IMM(BPF_REG_4, 0),				\
+	BPF_MOV64_IMM(BPF_REG_5, 0),				\
+	BPF_EMIT_CALL(BPF_FUNC_sk_lookup)
+
 static struct bpf_test tests[] = {
 	{
 		"add+sub+mul",
@@ -11974,6 +11992,347 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.retval = 10,
 	},
+	{
+		"reference tracking: leak potential reference",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: leak potential reference on stack",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: leak potential reference on stack 2",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: zero potential reference",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference */
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: copy and zero potential references",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_7, 0), /* leak reference */
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: release reference without check",
+		.insns = {
+			BPF_SK_LOOKUP,
+			/* reference in r0 may be NULL */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=sock_or_null expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: release reference",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: release reference 2",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: release reference twice",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=inv expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: release reference twice inside branch",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* goto end */
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=inv expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: alloc, check, free in one subbranch",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 16),
+			/* if (offsetof(skb, mark) > data_len) exit; */
+			BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_SK_LOOKUP,
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 1), /* mark == 0? */
+			/* Leak reference in R0 */
+			BPF_EXIT_INSN(),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* sk NULL? */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: alloc, check, free in both subbranches",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 16),
+			/* if (offsetof(skb, mark) > data_len) exit; */
+			BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_SK_LOOKUP,
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 5), /* mark == 0? */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* sk NULL? */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* sk NULL? */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking in call: free reference in subprog",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking in call: free reference in subprog and outside",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=inv expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking in call: alloc & leak reference in subprog",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_4),
+			BPF_SK_LOOKUP,
+			/* spill unchecked sk_ptr into stack of caller */
+			BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking in call: alloc in subprog, release outside",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_SK_LOOKUP,
+			BPF_EXIT_INSN(), /* return sk */
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.retval = POINTER_VALUE,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking in call: sk_ptr leak into caller stack",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5),
+			/* spill unchecked sk_ptr into stack of caller */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_5, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			BPF_SK_LOOKUP,
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking in call: sk_ptr spill into caller stack",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 9),
+			/* spill unchecked sk_ptr into stack of caller */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_5, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+			/* now the sk_ptr is verified, free the reference */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			BPF_SK_LOOKUP,
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 09/11] libbpf: Support loading individual progs
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

Allow the individual program load to be invoked. This will help with
testing, where a single ELF may contain several sections, some of which
denote subprograms that are expected to fail verification, along with
some which are expected to pass verification. By allowing programs to be
iterated and individually loaded, each program can be independently
checked against its expected verification result.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 tools/lib/bpf/libbpf.c | 4 ++--
 tools/lib/bpf/libbpf.h | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 7bcdca13083a..04e3754bcf30 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -268,7 +268,7 @@ struct bpf_object {
 };
 #define obj_elf_valid(o)	((o)->efile.elf)
 
-static void bpf_program__unload(struct bpf_program *prog)
+void bpf_program__unload(struct bpf_program *prog)
 {
 	int i;
 
@@ -1338,7 +1338,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type,
 	return ret;
 }
 
-static int
+int
 bpf_program__load(struct bpf_program *prog,
 		  char *license, u32 kern_version)
 {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 197f9ce2248c..c07e9969e4ed 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -112,10 +112,13 @@ void *bpf_program__priv(struct bpf_program *prog);
 
 const char *bpf_program__title(struct bpf_program *prog, bool needs_copy);
 
+int bpf_program__load(struct bpf_program *prog, char *license,
+		      u32 kern_version);
 int bpf_program__fd(struct bpf_program *prog);
 int bpf_program__pin_instance(struct bpf_program *prog, const char *path,
 			      int instance);
 int bpf_program__pin(struct bpf_program *prog, const char *path);
+void bpf_program__unload(struct bpf_program *prog);
 
 struct bpf_insn;
 
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 10/11] selftests/bpf: Add C tests for reference tracking
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 tools/testing/selftests/bpf/Makefile              |   2 +-
 tools/testing/selftests/bpf/test_progs.c          |  38 +++++++
 tools/testing/selftests/bpf/test_sk_lookup_kern.c | 127 ++++++++++++++++++++++
 3 files changed, 166 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_sk_lookup_kern.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 9d762184b805..cf71baa9d51d 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -33,7 +33,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
-	test_get_stack_rawtp.o
+	test_get_stack_rawtp.o test_sk_lookup_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index ed197eef1cfc..6d868a031b00 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1409,6 +1409,43 @@ static void test_get_stack_raw_tp(void)
 	bpf_object__close(obj);
 }
 
+static void test_reference_tracking(void)
+{
+	const char *file = "./test_sk_lookup_kern.o";
+	struct bpf_object *obj;
+	struct bpf_program *prog;
+	__u32 duration = 0;	/* never timed here; zeroed in case CHECK() reads it */
+	int err = 0;
+
+	obj = bpf_object__open(file);
+	if (IS_ERR(obj)) {
+		error_cnt++;
+		return;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		const char *title;
+
+		/* Ignore .text sections */
+		title = bpf_program__title(prog, false);
+		if (strstr(title, ".text") != NULL)
+			continue;
+
+		bpf_program__set_type(prog, BPF_PROG_TYPE_SCHED_CLS);
+
+		/* Expect verifier failure if test name has 'fail' */
+		if (strstr(title, "fail") != NULL) {
+			libbpf_set_print(NULL, NULL, NULL);
+			err = !bpf_program__load(prog, "GPL", 0);
+			libbpf_set_print(printf, printf, NULL);
+		} else {
+			err = bpf_program__load(prog, "GPL", 0);
+		}
+		CHECK(err, title, "\n");
+	}
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
 	jit_enabled = is_jit_enabled();
@@ -1427,6 +1464,7 @@ int main(void)
 	test_stacktrace_build_id();
 	test_stacktrace_map_raw_tp();
 	test_get_stack_raw_tp();
+	test_reference_tracking();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/test_sk_lookup_kern.c
new file mode 100644
index 000000000000..4f7383a31916
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sk_lookup_kern.c
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <sys/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
+static void *fill_ip(struct bpf_sock_tuple *tuple, void *data, __u64 nh_off,
+		     void *data_end, __u16 eth_proto)
+{
+	__u64 ihl_len;
+
+	if (eth_proto == bpf_htons(ETH_P_IP)) {
+		struct iphdr *iph = (struct iphdr *)(data + nh_off);
+
+		if (iph + 1 > data_end)
+			return NULL;
+		ihl_len = iph->ihl * 4;
+
+		tuple->family = AF_INET;
+		tuple->proto = iph->protocol;
+		tuple->saddr.ipv4 = iph->saddr;
+		tuple->daddr.ipv4 = iph->daddr;
+	} else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
+
+		if (ip6h + 1 > data_end)
+			return NULL;
+		ihl_len = sizeof(*ip6h);
+
+		tuple->family = AF_INET6;
+		tuple->proto = ip6h->nexthdr;
+		*((struct in6_addr *)&tuple->saddr.ipv6) = ip6h->saddr;
+		*((struct in6_addr *)&tuple->daddr.ipv6) = ip6h->daddr;
+	} else {
+		return NULL;
+	}
+
+	/* Callers only parse a TCP header at the returned L4 offset. */
+	return tuple->proto == IPPROTO_TCP ? data + nh_off + ihl_len : NULL;
+}
+
+SEC("sk_lookup_success")
+int bpf_sk_lookup_test0(struct __sk_buff *skb)
+{
+	void *data_end = (void *)(long)skb->data_end;
+	void *data = (void *)(long)skb->data;
+	struct ethhdr *eth = (struct ethhdr *)(data);
+	struct bpf_sock_tuple tuple = {};
+	struct tcphdr *tcp;
+	struct bpf_sock *sk;
+	void *l4;
+
+	if (eth + 1 > data_end)
+		return TC_ACT_SHOT;
+
+	l4 = fill_ip(&tuple, data, sizeof(*eth), data_end, eth->h_proto);
+	if (!l4 || l4 + sizeof *tcp > data_end)
+		return TC_ACT_SHOT;
+
+	tcp = l4;
+	tuple.sport = tcp->source;
+	tuple.dport = tcp->dest;
+
+	sk = bpf_sk_lookup(skb, &tuple, sizeof(tuple), 0, 0);
+	if (sk)
+		bpf_sk_release(sk, 0);
+	return sk ? TC_ACT_OK : TC_ACT_UNSPEC;
+}
+
+SEC("fail_no_release")
+int bpf_sk_lookup_test1(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+
+	bpf_sk_lookup(skb, &tuple, sizeof(tuple), 0, 0);
+	return 0;
+}
+
+SEC("fail_release_twice")
+int bpf_sk_lookup_test2(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	struct bpf_sock *sk;
+
+	sk = bpf_sk_lookup(skb, &tuple, sizeof(tuple), 0, 0);
+	bpf_sk_release(sk, 0);
+	bpf_sk_release(sk, 0);
+	return 0;
+}
+
+SEC("fail_release_unchecked")
+int bpf_sk_lookup_test3(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	struct bpf_sock *sk;
+
+	sk = bpf_sk_lookup(skb, &tuple, sizeof(tuple), 0, 0);
+	bpf_sk_release(sk, 0);
+	return 0;
+}
+
+void lookup_no_release(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	bpf_sk_lookup(skb, &tuple, sizeof(tuple), 0, 0);
+}
+
+SEC("fail_no_release_subcall")
+int bpf_sk_lookup_test4(struct __sk_buff *skb)
+{
+	lookup_no_release(skb);
+	return 0;
+}
-- 
2.14.1

^ permalink raw reply related

* [RFC bpf-next 11/11] Documentation: Describe bpf reference tracking
From: Joe Stringer @ 2018-05-09 21:07 UTC (permalink / raw)
  To: daniel; +Cc: netdev, ast, john.fastabend, kafai
In-Reply-To: <20180509210709.7201-1-joe@wand.net.nz>

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 Documentation/networking/filter.txt | 64 +++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 5032e1263bc9..77be17977bc5 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1125,6 +1125,14 @@ pointer type.  The types of pointers describe their base, as follows:
     PTR_TO_STACK        Frame pointer.
     PTR_TO_PACKET       skb->data.
     PTR_TO_PACKET_END   skb->data + headlen; arithmetic forbidden.
+    PTR_TO_SOCKET       Pointer to struct bpf_sock_ops, implicitly refcounted.
+    PTR_TO_SOCKET_OR_NULL
+                        Either a pointer to a socket, or NULL; socket lookup
+                        returns this type, which becomes a PTR_TO_SOCKET when
+                        checked != NULL. PTR_TO_SOCKET is reference-counted,
+                        so programs must release the reference through the
+                        socket release function before the end of the program.
+                        Arithmetic on these pointers is forbidden.
 However, a pointer may be offset from this base (as a result of pointer
 arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable
 offset'.  The former is used when an exactly-known value (e.g. an immediate
@@ -1168,6 +1176,13 @@ over the Ethernet header, then reads IHL and addes (IHL * 4), the resulting
 pointer will have a variable offset known to be 4n+2 for some n, so adding the 2
 bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through
 that pointer are safe.
+The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common
+to all copies of the pointer returned from a socket lookup. This has similar
+behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but
+it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly
+represents a reference to the corresponding 'struct sock'. To ensure that the
+reference is not leaked, it is imperative to NULL-check the reference and, in
+the non-NULL case, pass the valid reference to the socket release function.
 
 Direct packet access
 --------------------
@@ -1441,6 +1456,55 @@ Error:
   8: (7a) *(u64 *)(r0 +0) = 1
   R0 invalid mem access 'imm'
 
+Program that performs a socket lookup then sets the pointer to NULL without
+checking it, so that the reference obtained from the lookup is never
+released:
+  BPF_MOV64_IMM(BPF_REG_2, 0),
+  BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_MOV64_IMM(BPF_REG_3, 4),
+  BPF_MOV64_IMM(BPF_REG_4, 0),
+  BPF_MOV64_IMM(BPF_REG_5, 0),
+  BPF_EMIT_CALL(BPF_FUNC_sk_lookup),
+  BPF_MOV64_IMM(BPF_REG_0, 0),
+  BPF_EXIT_INSN(),
+Error:
+  0: (b7) r2 = 0
+  1: (63) *(u32 *)(r10 -8) = r2
+  2: (bf) r2 = r10
+  3: (07) r2 += -8
+  4: (b7) r3 = 4
+  5: (b7) r4 = 0
+  6: (b7) r5 = 0
+  7: (85) call bpf_sk_lookup#65
+  8: (b7) r0 = 0
+  9: (95) exit
+  Unreleased reference id=1, alloc_insn=7
+
+Program that performs a socket lookup but does not NULL-check the returned
+value:
+  BPF_MOV64_IMM(BPF_REG_2, 0),
+  BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_MOV64_IMM(BPF_REG_3, 4),
+  BPF_MOV64_IMM(BPF_REG_4, 0),
+  BPF_MOV64_IMM(BPF_REG_5, 0),
+  BPF_EMIT_CALL(BPF_FUNC_sk_lookup),
+  BPF_EXIT_INSN(),
+Error:
+  0: (b7) r2 = 0
+  1: (63) *(u32 *)(r10 -8) = r2
+  2: (bf) r2 = r10
+  3: (07) r2 += -8
+  4: (b7) r3 = 4
+  5: (b7) r4 = 0
+  6: (b7) r5 = 0
+  7: (85) call bpf_sk_lookup#65
+  8: (95) exit
+  Unreleased reference id=1, alloc_insn=7
+
 Testing
 -------
 
-- 
2.14.1

^ permalink raw reply related

* [PATCH net] hv_netvsc: set master device
From: Stephen Hemminger @ 2018-05-09 21:09 UTC (permalink / raw)
  To: davem; +Cc: netdev, Stephen Hemminger, Stephen Hemminger

The hyper-v transparent bonding should have used master_dev_link.
The netvsc device should look like a master bond device not
like the upper side of a tunnel.

This makes the semantics the same so that userspace applications
looking at network devices see the correct master relationship.

Fixes: 0c195567a8f6 ("netvsc: transparent VF management")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/hyperv/netvsc_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index ecc84954c511..da07ccdf84bf 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1840,7 +1840,8 @@ static int netvsc_vf_join(struct net_device *vf_netdev,
 		goto rx_handler_failed;
 	}
 
-	ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
+	ret = netdev_master_upper_dev_link(vf_netdev, ndev,
+					   NULL, NULL, NULL);
 	if (ret != 0) {
 		netdev_err(vf_netdev,
 			   "can not set master device %s (err = %d)\n",
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 1/1] net sched actions: fix invalid pointer dereferencing if skbedit flags missing
From: Roman Mashak @ 2018-05-09 21:18 UTC (permalink / raw)
  To: davem
  Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, alexander.duyck,
	Roman Mashak

When an application fails to pass flags in the netlink TLV for a new skbedit
action, the kernel produces the following oops:

[    8.307732] BUG: unable to handle kernel paging request at 0000000000021130
[    8.309167] PGD 80000000193d1067 P4D 80000000193d1067 PUD 180e0067 PMD 0 
[    8.310595] Oops: 0000 [#1] SMP PTI
[    8.311334] Modules linked in: kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper serio_raw
[    8.314190] CPU: 1 PID: 397 Comm: tc Not tainted 4.17.0-rc3+ #357
[    8.315252] RIP: 0010:__tcf_idr_release+0x33/0x140
[    8.316203] RSP: 0018:ffffa0718038f840 EFLAGS: 00010246
[    8.317123] RAX: 0000000000000001 RBX: 0000000000021100 RCX: 0000000000000000
[    8.319831] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000021100
[    8.321181] RBP: 0000000000000000 R08: 000000000004adf8 R09: 0000000000000122
[    8.322645] R10: 0000000000000000 R11: ffffffff9e5b01ed R12: 0000000000000000
[    8.324157] R13: ffffffff9e0d3cc0 R14: 0000000000000000 R15: 0000000000000000
[    8.325590] FS:  00007f591292e700(0000) GS:ffff8fcf5bc40000(0000) knlGS:0000000000000000
[    8.327001] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    8.327987] CR2: 0000000000021130 CR3: 00000000180e6004 CR4: 00000000001606a0
[    8.329289] Call Trace:
[    8.329735]  tcf_skbedit_init+0xa7/0xb0
[    8.330423]  tcf_action_init_1+0x362/0x410
[    8.331139]  ? try_to_wake_up+0x44/0x430
[    8.331817]  tcf_action_init+0x103/0x190
[    8.332511]  tc_ctl_action+0x11a/0x220
[    8.333174]  rtnetlink_rcv_msg+0x23d/0x2e0
[    8.333902]  ? _cond_resched+0x16/0x40
[    8.334569]  ? __kmalloc_node_track_caller+0x5b/0x2c0
[    8.335440]  ? rtnl_calcit.isra.31+0xf0/0xf0
[    8.336178]  netlink_rcv_skb+0xdb/0x110
[    8.336855]  netlink_unicast+0x167/0x220
[    8.337550]  netlink_sendmsg+0x2a7/0x390
[    8.338258]  sock_sendmsg+0x30/0x40
[    8.338865]  ___sys_sendmsg+0x2c5/0x2e0
[    8.339531]  ? pagecache_get_page+0x27/0x210
[    8.340271]  ? filemap_fault+0xa2/0x630
[    8.340943]  ? page_add_file_rmap+0x108/0x200
[    8.341732]  ? alloc_set_pte+0x2aa/0x530
[    8.342573]  ? finish_fault+0x4e/0x70
[    8.343332]  ? __handle_mm_fault+0xbc1/0x10d0
[    8.344337]  ? __sys_sendmsg+0x53/0x80
[    8.345040]  __sys_sendmsg+0x53/0x80
[    8.345678]  do_syscall_64+0x4f/0x100
[    8.346339]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[    8.347206] RIP: 0033:0x7f591191da67
[    8.347831] RSP: 002b:00007fff745abd48 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
[    8.349179] RAX: ffffffffffffffda RBX: 00007fff745abe70 RCX: 00007f591191da67
[    8.350431] RDX: 0000000000000000 RSI: 00007fff745abdc0 RDI: 0000000000000003
[    8.351659] RBP: 000000005af35251 R08: 0000000000000001 R09: 0000000000000000
[    8.352922] R10: 00000000000005f1 R11: 0000000000000246 R12: 0000000000000000
[    8.354183] R13: 00007fff745afed0 R14: 0000000000000001 R15: 00000000006767c0
[    8.355400] Code: 41 89 d4 53 89 f5 48 89 fb e8 aa 20 fd ff 85 c0 0f 84 ed 00
00 00 48 85 db 0f 84 cf 00 00 00 40 84 ed 0f 85 cd 00 00 00 45 84 e4 <8b> 53 30
74 0d 85 d2 b8 ff ff ff ff 0f 8f b3 00 00 00 8b 43 2c 
[    8.358699] RIP: __tcf_idr_release+0x33/0x140 RSP: ffffa0718038f840
[    8.359770] CR2: 0000000000021130
[    8.360438] ---[ end trace 60c66be45dfc14f0 ]---

The caller calls the action's ->init() and passes a pointer to "struct tc_action *a",
which may later be initialized to point at the existing action; otherwise
"struct tc_action *a" is still invalid, and therefore dereferencing it is an error.

So checking flags should be done as early as possible, before idr lookups.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
 net/sched/act_skbedit.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index ddf69fc01bdf..6c88037faf51 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -114,17 +114,15 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 		mask = nla_data(tb[TCA_SKBEDIT_MASK]);
 	}
 
+	if (!flags)
+		return -EINVAL;
+
 	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
 
 	exists = tcf_idr_check(tn, parm->index, a, bind);
 	if (exists && bind)
 		return 0;
 
-	if (!flags) {
-		tcf_idr_release(*a, bind);
-		return -EINVAL;
-	}
-
 	if (!exists) {
 		ret = tcf_idr_create(tn, parm->index, est, a,
 				     &act_skbedit_ops, bind, false);
-- 
2.7.4

^ permalink raw reply related

* Re: [bpf-next v2 8/9] bpf: Provide helper to do forwarding lookups in kernel FIB table
From: David Ahern @ 2018-05-09 21:29 UTC (permalink / raw)
  To: Daniel Borkmann, netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend
In-Reply-To: <88bd5281-2fa6-41d1-98f3-33c3d4a95674@iogearbox.net>

On 5/9/18 2:44 PM, Daniel Borkmann wrote:
> Generally, no objection. However, could we get rid of the two extra includes altogether
> to avoid running into any such dependency issue? Right now the only includes we have in
> the bpf uapi header is linux/types.h and linux/bpf_common.h (latter has no extra deps
> by itself). Both the ETH_ALEN and struct in6_addr are in uapi and therefore never allowed
> to change so we can e.g. avoid to use ETH_ALEN and just have the value instead. In the
> other places of the header we use __u32 remote_ipv6[4], __u32 src_ip6[4] etc to denote
> a v6 address, we could do the same here and should be all good then.

I was able to drop the include of linux/in6.h and still use in6_addr. I
would prefer to keep in6_addr since it works and avoid the need to add
typecasts.

As for ETH_ALEN, I could redefine it but it just kicks the can down the
road. If if_ether.h is included after bpf.h, it will cause redefinition
warnings.

^ permalink raw reply

* Re: pull-request: mac80211-next 2018-05-09
From: Johannes Berg @ 2018-05-09 21:29 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-wireless
In-Reply-To: <20180509194034.11992-1-johannes@sipsolutions.net>

Hi,

Sorry, scratch that.

I forgot that this commit:

> Toke Høiland-Jørgensen (3):

>       cfg80211: Expose TXQ stats and parameters to userspace

caused a bunch of "too much stack" warnings - I should put in at least
the non-driver fix for that first, and then coordinate with Kalle to
send the driver fixes in too.

johannes

^ permalink raw reply

* Re: [bpf-next v2 8/9] bpf: Provide helper to do forwarding lookups in kernel FIB table
From: David Ahern @ 2018-05-09 21:39 UTC (permalink / raw)
  To: Daniel Borkmann, netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend
In-Reply-To: <b2ebbdde-3198-9c58-b4cb-e44602568134@gmail.com>

On 5/9/18 3:29 PM, David Ahern wrote:
> On 5/9/18 2:44 PM, Daniel Borkmann wrote:
>> Generally, no objection. However, could we get rid of the two extra includes altogether
>> to avoid running into any such dependency issue? Right now the only includes we have in
>> the bpf uapi header is linux/types.h and linux/bpf_common.h (latter has no extra deps
>> by itself). Both the ETH_ALEN and struct in6_addr are in uapi and therefore never allowed
>> to change so we can e.g. avoid to use ETH_ALEN and just have the value instead. In the
>> other places of the header we use __u32 remote_ipv6[4], __u32 src_ip6[4] etc to denote
>> a v6 address, we could do the same here and should be all good then.
> 
> I was able to drop the include of linux/in6.h and still use in6_addr. I
> would prefer to keep in6_addr since it works and avoid the need to add
> typecasts.

Never mind; that was working because if_ether.h was pulling in skbuff.h
which included in6.h.
> 
> As for ETH_ALEN, I could redefine it but it just kicks the can down the
> road. If if_ether.h is included after bpf.h, it will cause redefinition
> warnings.
> 

I guess I will continue the open coded magic numbers for mac and ipv6
addresses.

^ permalink raw reply

* Re: [bpf-next v2 8/9] bpf: Provide helper to do forwarding lookups in kernel FIB table
From: Alexei Starovoitov @ 2018-05-09 21:49 UTC (permalink / raw)
  To: David Ahern
  Cc: Daniel Borkmann, netdev, borkmann, ast, davem, shm, roopa, brouer,
	toke, john.fastabend
In-Reply-To: <d6d01920-efd0-7e8b-f9ba-5e3b642160d3@gmail.com>

On Wed, May 09, 2018 at 03:39:52PM -0600, David Ahern wrote:
> On 5/9/18 3:29 PM, David Ahern wrote:
> > On 5/9/18 2:44 PM, Daniel Borkmann wrote:
> >> Generally, no objection. However, could we get rid of the two extra includes altogether
> >> to avoid running into any such dependency issue? Right now the only includes we have in
> >> the bpf uapi header is linux/types.h and linux/bpf_common.h (latter has no extra deps
> >> by itself). Both the ETH_ALEN and struct in6_addr are in uapi and therefore never allowed
> >> to change so we can e.g. avoid to use ETH_ALEN and just have the value instead. In the
> >> other places of the header we use __u32 remote_ipv6[4], __u32 src_ip6[4] etc to denote
> >> a v6 address, we could do the same here and should be all good then.
> > 
> > I was able to drop the include of linux/in6.h and still use in6_addr. I
> > would prefer to keep in6_addr since it works and avoid the need to add
> > typecasts.
> 
> Never mind; that was working because if_ether.h was pulling in skbuff.h
> which included in6.h.
> > 
> > As for ETH_ALEN, I could redefine it but it just kicks the can down the
> > road. If if_ether.h is included after bpf.h, it will cause redefinition
> > warnings.
> > 
> 
> I guess I will continue the open coded magic numbers for mac and ipv6
> addresses.

That's the only way.
Adding
+#include <linux/if_ether.h>
+#include <linux/in6.h>
to uapi/bpf.h is a no-go. It will cause all sorts of breakage,
not only to the kernel build, as we realized, but to various user space apps too.
Please use be32 ipv6[4] and hard coded mac instead.

^ permalink raw reply

* Re: [bpf-next v2 8/9] bpf: Provide helper to do forwarding lookups in kernel FIB table
From: Daniel Borkmann @ 2018-05-09 21:49 UTC (permalink / raw)
  To: David Ahern, netdev, borkmann, ast
  Cc: davem, shm, roopa, brouer, toke, john.fastabend
In-Reply-To: <d6d01920-efd0-7e8b-f9ba-5e3b642160d3@gmail.com>

On 05/09/2018 11:39 PM, David Ahern wrote:
> On 5/9/18 3:29 PM, David Ahern wrote:
>> On 5/9/18 2:44 PM, Daniel Borkmann wrote:
>>> Generally, no objection. However, could we get rid of the two extra includes altogether
>>> to avoid running into any such dependency issue? Right now the only includes we have in
>>> the bpf uapi header is linux/types.h and linux/bpf_common.h (latter has no extra deps
>>> by itself). Both the ETH_ALEN and struct in6_addr are in uapi and therefore never allowed
>>> to change so we can e.g. avoid to use ETH_ALEN and just have the value instead. In the
>>> other places of the header we use __u32 remote_ipv6[4], __u32 src_ip6[4] etc to denote
>>> a v6 address, we could do the same here and should be all good then.
>>
>> I was able to drop the include of linux/in6.h and still use in6_addr. I
>> would prefer to keep in6_addr since it works and avoid the need to add
>> typecasts.
> 
> Never mind; that was working because if_ether.h was pulling in skbuff.h
> which included in6.h.
>
>> As for ETH_ALEN, I could redefine it but it just kicks the can down the
>> road. If if_ether.h is included after bpf.h, it will cause redefinition
>> warnings.
> 
> I guess I will continue the open coded magic numbers for mac and ipv6
> addresses.

Agree, it will avoid breakage. We cannot assume that every BPF prog out there has
one specific ordering of if_ether.h and bpf.h includes. Open coding the numbers
seems best here.

^ permalink raw reply

* Re: [PATCH net] tc-testing: fix tdc tests for 'bpf' action
From: Lucas Bates @ 2018-05-09 21:51 UTC (permalink / raw)
  To: Davide Caratti
  Cc: Roman Mashak, David Miller, Linux Kernel Network Developers
In-Reply-To: <54d69bac92e9c0a216997261ef7b1e0eb0dd28c9.1525884149.git.dcaratti@redhat.com>

On Wed, May 9, 2018 at 12:45 PM, Davide Caratti <dcaratti@redhat.com> wrote:
> - correct a typo in the value of 'matchPattern' of test 282d, potentially
>  causing false negative
> - allow errors when 'teardown' executes '$TC action flush action bpf' in
>  test 282d, to fix false positive when it is run with act_bpf unloaded
> - correct the value of 'matchPattern' in test e939, causing false positive
>  in case the BPF JIT is enabled
>
> Fixes: 440ea4ae1828 ("tc-testing: add selftests for 'bpf' action")
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---
>  tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json | 11 ++++++++---
>  1 file changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json b/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json
> index 5b012f4981d4..6f289a49e5ec 100644
> --- a/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json
> +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json
> @@ -66,7 +66,7 @@
>          "cmdUnderTest": "$TC action add action bpf object-file _b.o index 667",
>          "expExitCode": "0",
>          "verifyCmd": "$TC action get action bpf index 667",
> -        "matchPattern": "action order [0-9]*: bpf _b.o:\\[action\\] id [0-9]* tag 3b185187f1855c4c default-action pipe.*index 667 ref",
> +        "matchPattern": "action order [0-9]*: bpf _b.o:\\[action\\] id [0-9]* tag 3b185187f1855c4c( jited)? default-action pipe.*index 667 ref",
>          "matchCount": "1",
>          "teardown": [
>              "$TC action flush action bpf",
> @@ -92,10 +92,15 @@
>          "cmdUnderTest": "$TC action add action bpf object-file _c.o index 667",
>          "expExitCode": "255",
>          "verifyCmd": "$TC action get action bpf index 667",
> -        "matchPattern": "action order [0-9]*: bpf _b.o:\\[action\\] id [0-9].*index 667 ref",
> +        "matchPattern": "action order [0-9]*: bpf _c.o:\\[action\\] id [0-9].*index 667 ref",
>          "matchCount": "0",
>          "teardown": [
> -            "$TC action flush action bpf",
> +            [
> +                "$TC action flush action bpf",
> +                0,
> +                1,
> +                255
> +            ],
>              "rm -f _c.o"
>          ]
>      },
> --
> 2.14.3
>

Acked-by: Lucas Bates <lucasb@mojatatu.com>

^ permalink raw reply

* Re: [PATCH] selinux: add AF_UNSPEC and INADDR_ANY checks to selinux_socket_bind()
From: Paul Moore @ 2018-05-09 22:02 UTC (permalink / raw)
  To: Stephen Smalley, Alexey Kodanev, Richard Haines
  Cc: selinux, Eric Paris, linux-security-module, netdev
In-Reply-To: <CAHC9VhSg9UAmXaK0xsy_h9HfzM4uUBqCNKN5qtf3hHLSi=ZENw@mail.gmail.com>

On Wed, May 9, 2018 at 11:34 AM, Paul Moore <paul@paul-moore.com> wrote:
> On Wed, May 9, 2018 at 11:11 AM, Stephen Smalley <sds@tycho.nsa.gov> wrote:
>> On 05/09/2018 11:01 AM, Paul Moore wrote:
>>> On Wed, May 9, 2018 at 8:37 AM, Stephen Smalley <sds@tycho.nsa.gov> wrote:
>>>> On 05/08/2018 08:25 PM, Paul Moore wrote:
>>>>> On Tue, May 8, 2018 at 2:40 PM, Stephen Smalley <sds@tycho.nsa.gov> wrote:
>>>>>> On 05/08/2018 01:05 PM, Paul Moore wrote:
>>>>>>> On Tue, May 8, 2018 at 10:05 AM, Alexey Kodanev
>>>>>>> <alexey.kodanev@oracle.com> wrote:
>>>>>>>> Commit d452930fd3b9 ("selinux: Add SCTP support") breaks compatibility
>>>>>>>> with the old programs that can pass sockaddr_in with AF_UNSPEC and
>>>>>>>> INADDR_ANY to bind(). As a result, bind() returns EAFNOSUPPORT error.
>>>>>>>> It was found with LTP/asapi_01 test.
>>>>>>>>
>>>>>>>> Similar to commit 29c486df6a20 ("net: ipv4: relax AF_INET check in
>>>>>>>> bind()"), which relaxed AF_INET check for compatibility, add AF_UNSPEC
>>>>>>>> case to AF_INET and make sure that the address is INADDR_ANY.
>>>>>>>>
>>>>>>>> Also, in the end of selinux_socket_bind(), instead of adding AF_UNSPEC
>>>>>>>> to 'address->sa_family == AF_INET', verify AF_INET6 first.
>>>>>>>>
>>>>>>>> Fixes: d452930fd3b9 ("selinux: Add SCTP support")
>>>>>>>> Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
>>>>>>>> ---
>>>>>>>>  security/selinux/hooks.c | 12 +++++++++---
>>>>>>>>  1 file changed, 9 insertions(+), 3 deletions(-)
>>>>>>>
>>>>>>> Thanks for finding and reporting this regression.
>>>>>>>
>>>>>>> I think I would prefer to avoid having to duplicate the
>>>>>>> AF_UNSPEC/INADDR_ANY checking logic in the SELinux hook, even though
>>>>>>> it is a small bit of code and unlikely to change.  I'm wondering if it
>>>>>>> would be better to check both the socket and sockaddr address family
>>>>>>> in the main if conditional inside selinux_socket_bind(), what do you
>>>>>>> think?  Another option would be to go back to just checking the
>>>>>>> sockaddr address family; we moved away from that for a reason which
>>>>>>> escapes me at the moment (code cleanliness?), but perhaps that was a
>>>>>>> mistake.
>>>>>>
>>>>>> We've always used the sk->sk_family there; it was only the recent code from Richard that started
>>>>>> using the socket address family.
>>>>>
>>>>> Yes I know, I thought I was the one that suggested it at some point
>>>>> (I'll take the blame) ... although now that I'm looking at the git
>>>>> log, maybe I'm confusing it with something else.
>>>>>
>>>>>>> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
>>>>>>> index 4cafe6a19167..a3789b167667 100644
>>>>>>> --- a/security/selinux/hooks.c
>>>>>>> +++ b/security/selinux/hooks.c
>>>>>>> @@ -4577,6 +4577,7 @@ static int selinux_socket_bind(struct socket *sock, struc>
>>>>>>> {
>>>>>>>        struct sock *sk = sock->sk;
>>>>>>>        u16 family;
>>>>>>> +       u16 family_sa;
>>>>>>>        int err;
>>>>>>>
>>>>>>>        err = sock_has_perm(sk, SOCKET__BIND);
>>>>>>> @@ -4585,7 +4586,9 @@ static int selinux_socket_bind(struct socket *sock, struc>
>>>>>>>
>>>>>>>        /* If PF_INET or PF_INET6, check name_bind permission for the port. */
>>>>>>>        family = sk->sk_family;
>>>>>>> -       if (family == PF_INET || family == PF_INET6) {
>>>>>>> +       family_sa = address->sa_family;
>>>>>>> +       if ((family == PF_INET || family == PF_INET6) &&
>>>>>>> +           (family_sa == PF_INET || family_sa == PF_INET6)) {
>>>>>>
>>>>>> Wouldn't this allow bypassing the name_bind permission check by passing in AF_UNSPEC?
>>>>>
>>>>> I believe this name_bind permission check is skipped for AF_UNSPEC
>>>>> already, isn't it?  The only way the name_bind check would be
>>>>> triggered is if the source port, snum, was non-zero and I didn't think
>>>>> that was really legal for AF_UNSPEC/INADDR_ANY, is it?
>>>>
>>>> 1) What in inet_bind() prevents that from occurring?
>>>> 2) Regardless, what about the node_bind check?
>>>
>>> Fair enough.  As mentioned above, perhaps the right fix is to move the
>>> address family checking back to how it was pre-SCTP.
>>
>> It isn't clear to me how to do that without breaking SCTP multiple address binding, which is why
>> Richard had to switch to checking address->sa_family instead of just using the sk->sk_family.
>> Alexey's original fix might be the simplest solution.
>
> I'm going to have to apologize, I'm traveling at the moment and more
> distracted than usual as a result.  Let me take a closer look later
> today.  It may be that Alexey's original fix is the only practical
> solution, but I really would like to avoid having to duplicate checks
> like that in the SELinux code.

I just had a better look at this and I believe that Alexey and Stephen
are right: this is the best option.  My apologies for the noise
earlier.  However, while looking at the code I think there are some
additional necessary changes:

* In the case of an SCTP socket, we should return -EINVAL, just as we
do with other address families.
* While not strictly related to AF_UNSPEC, we really should be passing
the address family of the sockaddr, and not the socket, to functions
that need to interpret the bind address/port.

I'm waiting for my kernel to compile so I haven't given this any
sanity testing, but the patch below is what I think we need ...

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4cafe6a19167..5f30045b2053 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4576,6 +4576,7 @@ static int selinux_socket_post_create(struct socket *sock,
int family,
static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, i
nt addrlen)
{
       struct sock *sk = sock->sk;
+       struct sk_security_struct *sksec = sk->sk_security;
       u16 family;
       int err;

@@ -4587,13 +4588,13 @@ static int selinux_socket_bind(struct socket *sock, stru
ct sockaddr *address, in
       family = sk->sk_family;
       if (family == PF_INET || family == PF_INET6) {
               char *addrp;
-               struct sk_security_struct *sksec = sk->sk_security;
               struct common_audit_data ad;
               struct lsm_network_audit net = {0,};
               struct sockaddr_in *addr4 = NULL;
               struct sockaddr_in6 *addr6 = NULL;
               unsigned short snum;
               u32 sid, node_perm;
+               u16 family_sa = address->sa_family;

               /*
                * sctp_bindx(3) calls via selinux_sctp_bind_connect()
@@ -4601,11 +4602,19 @@ static int selinux_socket_bind(struct socket *sock, stru
ct sockaddr *address, in
                * need to check address->sa_family as it is possible to have
                * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
                */
-               switch (address->sa_family) {
+               switch (family_sa) {
+               case AF_UNSPEC:
               case AF_INET:
                       if (addrlen < sizeof(struct sockaddr_in))
                               return -EINVAL;
                       addr4 = (struct sockaddr_in *)address;
+                       if (family_sa == AF_UNSPEC) {
+                               /* see "__inet_bind()", we only want to allow
+                                * AF_UNSPEC if the address is INADDR_ANY */
+                               if (addr4->sin_addr.s_addr != htonl(INADDR_ANY))
+                                       goto err_af;
+                               family_sa = AF_INET;
+                       }
                       snum = ntohs(addr4->sin_port);
                       addrp = (char *)&addr4->sin_addr.s_addr;
                       break;
@@ -4617,15 +4626,14 @@ static int selinux_socket_bind(struct socket *sock, stru
ct sockaddr *address, in
                       addrp = (char *)&addr6->sin6_addr.s6_addr;
                       break;
               default:
-                       /* Note that SCTP services expect -EINVAL, whereas
-                        * others expect -EAFNOSUPPORT.
-                        */
-                       if (sksec->sclass == SECCLASS_SCTP_SOCKET)
-                               return -EINVAL;
-                       else
-                               return -EAFNOSUPPORT;
+                       goto err_af;
               }

+               ad.type = LSM_AUDIT_DATA_NET;
+               ad.u.net = &net;
+               ad.u.net->sport = htons(snum);
+               ad.u.net->family = family_sa;
+
               if (snum) {
                       int low, high;

@@ -4637,10 +4645,6 @@ static int selinux_socket_bind(struct socket *sock, struc
t sockaddr *address, in
                                                     snum, &sid);
                               if (err)
                                       goto out;
-                               ad.type = LSM_AUDIT_DATA_NET;
-                               ad.u.net = &net;
-                               ad.u.net->sport = htons(snum);
-                               ad.u.net->family = family;
                               err = avc_has_perm(&selinux_state,
                                                  sksec->sid, sid,
                                                  sksec->sclass,
@@ -4672,16 +4676,11 @@ static int selinux_socket_bind(struct socket *sock, stru
ct sockaddr *address, in
                       break;
               }

-               err = sel_netnode_sid(addrp, family, &sid);
+               err = sel_netnode_sid(addrp, family_sa, &sid);
               if (err)
                       goto out;

-               ad.type = LSM_AUDIT_DATA_NET;
-               ad.u.net = &net;
-               ad.u.net->sport = htons(snum);
-               ad.u.net->family = family;
-
-               if (address->sa_family == AF_INET)
+               if (family_sa == AF_INET)
                       ad.u.net->v4info.saddr = addr4->sin_addr.s_addr;
               else
                       ad.u.net->v6info.saddr = addr6->sin6_addr;
@@ -4694,6 +4693,12 @@ static int selinux_socket_bind(struct socket *sock, struc
t sockaddr *address, in
       }
out:
       return err;
+err_af:
+       /* Note that SCTP services expect -EINVAL, others -EAFNOSUPPORT. */
+       if (sksec->sclass == SECCLASS_SCTP_SOCKET)
+               return -EINVAL;
+       else
+               return -EAFNOSUPPORT;
}

/* This supports connect(2) and SCTP connect services such as sctp_connectx(3)

-- 
paul moore
www.paul-moore.com

^ permalink raw reply related

* libbpf backward compatibility (was: [PATCH bpf-next v5 09/10] bpf: btf: Add BTF support to libbpf)
From: Jakub Kicinski @ 2018-05-09 22:17 UTC (permalink / raw)
  To: Martin KaFai Lau, Alexei Starovoitov, Daniel Borkmann,
	Andrey Ignatov
  Cc: netdev, kernel-team, David Beckett
In-Reply-To: <20180418225606.2771620-10-kafai@fb.com>

On Wed, 18 Apr 2018 15:56:05 -0700, Martin KaFai Lau wrote:
> diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
> index 39f6a0d64a3b..01bda076310f 100644
> --- a/tools/lib/bpf/bpf.h
> +++ b/tools/lib/bpf/bpf.h
> @@ -26,6 +26,20 @@
>  #include <linux/bpf.h>
>  #include <stddef.h>
>  
> +struct bpf_create_map_attr {
> +	const char *name;
> +	enum bpf_map_type map_type;
> +	__u32 map_flags;
> +	__u32 key_size;
> +	__u32 value_size;
> +	__u32 max_entries;
> +	__u32 numa_node;
> +	__u32 btf_fd;
> +	__u32 btf_key_id;
> +	__u32 btf_value_id;
> +};
> +
> +int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
>  int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
>  			int key_size, int value_size, int max_entries,
>  			__u32 map_flags, int node);
> @@ -87,4 +101,6 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len);
>  int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
>  		   __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt);
>  int bpf_raw_tracepoint_open(const char *name, int prog_fd);
> +int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
> +		 bool do_log);
>  #endif

Does libbpf have to provide backward compatibility?  Are the function
prototypes not supposed to change?

Recently a number of *_xattr functions were added, these are nice for
limiting the number of parameters passed to functions, but they don't
buy us anything in terms of extensibility unless we can grow the
structures.  Should structure size be passed in as one of parameters?

^ permalink raw reply

* Re: [PATCH] net/mlx5: Fix mlx5_get_vector_affinity function
From: Guenter Roeck @ 2018-05-09 22:19 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Israel Rukshin, Max Gurtovoy, Matan Barak, Doug Ledford,
	linux-rdma, linux-kernel, netdev
In-Reply-To: <alpine.DEB.2.21.1805060929540.1685@nanos.tec.linutronix.de>

On Sun, May 06, 2018 at 09:33:26AM +0200, Thomas Gleixner wrote:
> On Sat, 5 May 2018, Guenter Roeck wrote:
> 
> > On Thu, Apr 12, 2018 at 09:49:11AM +0000, Israel Rukshin wrote:
> > > Adding the vector offset when calling to mlx5_vector2eqn() is wrong.
> > > This is because mlx5_vector2eqn() checks if EQ index is equal to vector number
> > > and the fact that the internal completion vectors that mlx5 allocates
> > > don't get an EQ index.
> > > 
> > > The second problem here is that using effective_affinity_mask gives the same
> > > CPU for different vectors.
> > > This leads to unmapped queues when calling it from blk_mq_rdma_map_queues().
> > > This doesn't happen when using affinity_hint mask.
> > > 
> > Except that affinity_hint is only defined if SMP is enabled. Without:
> > 
> > include/linux/mlx5/driver.h: In function ‘mlx5_get_vector_affinity_hint’:
> > include/linux/mlx5/driver.h:1299:13: error:
> >         ‘struct irq_desc’ has no member named ‘affinity_hint’
> > 
> > Note that this is the only use of affinity_hint outside kernel/irq.
> > Don't other drivers have similar problems ?
> 
> Aside of that.
> 
> > >  static inline const struct cpumask *
> > > -mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector)
> > > +mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int vector)
> > >  {
> > > -	const struct cpumask *mask;
> > >  	struct irq_desc *desc;
> > >  	unsigned int irq;
> > >  	int eqn;
> > >  	int err;
> > >  
> > > -	err = mlx5_vector2eqn(dev, MLX5_EQ_VEC_COMP_BASE + vector, &eqn, &irq);
> > > +	err = mlx5_vector2eqn(dev, vector, &eqn, &irq);
> > >  	if (err)
> > >  		return NULL;
> > >  
> > >  	desc = irq_to_desc(irq);
> > > -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
> > > -	mask = irq_data_get_effective_affinity_mask(&desc->irq_data);
> > > -#else
> > > -	mask = desc->irq_common_data.affinity;
> > > -#endif
> > > -	return mask;
> > > +	return desc->affinity_hint;
> 
> NAK.
> 

The offending patch is upstream, breaking non-SMP test builds, and I have
not seen any feedback from the submitter. Any suggestion how to proceed ?

Guenter

> Nothing in regular device drivers is supposed to ever fiddle with struct
> irq_desc. The existing code is already a violation of that rule and needs
> to be fixed, but not in that way.
> 
> The logic here is completely screwed. affinity_hint is set by the driver,
> so the driver already knows what it is. If the driver does not set it, then
> the thing is NULL.
> 
> Thanks,
> 
> 	tglx
> 

^ permalink raw reply

* Re: libbpf backward compatibility (was: [PATCH bpf-next v5 09/10] bpf: btf: Add BTF support to libbpf)
From: Jakub Kicinski @ 2018-05-09 22:20 UTC (permalink / raw)
  To: Martin KaFai Lau, Alexei Starovoitov, Daniel Borkmann,
	Andrey Ignatov
  Cc: netdev, kernel-team, David Beckett
In-Reply-To: <20180509151712.7d6826f7@cakuba.netronome.com>

On Wed, 9 May 2018 15:17:12 -0700, Jakub Kicinski wrote:
> On Wed, 18 Apr 2018 15:56:05 -0700, Martin KaFai Lau wrote:
> > diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
> > index 39f6a0d64a3b..01bda076310f 100644
> > --- a/tools/lib/bpf/bpf.h
> > +++ b/tools/lib/bpf/bpf.h
> > @@ -26,6 +26,20 @@
> >  #include <linux/bpf.h>
> >  #include <stddef.h>
> >  
> > +struct bpf_create_map_attr {
> > +	const char *name;
> > +	enum bpf_map_type map_type;
> > +	__u32 map_flags;
> > +	__u32 key_size;
> > +	__u32 value_size;
> > +	__u32 max_entries;
> > +	__u32 numa_node;
> > +	__u32 btf_fd;
> > +	__u32 btf_key_id;
> > +	__u32 btf_value_id;
> > +};
> > +
> > +int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
> >  int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
> >  			int key_size, int value_size, int max_entries,
> >  			__u32 map_flags, int node);
> > @@ -87,4 +101,6 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len);
> >  int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
> >  		   __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt);
> >  int bpf_raw_tracepoint_open(const char *name, int prog_fd);
> > +int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
> > +		 bool do_log);
> >  #endif  
> 
> Does libbpf have to provide backward compatibility?  Are the function
> prototypes not supposed to change?
> 
> Recently a number of *_xattr functions were added, these are nice for
> limiting the number of parameters passed to functions, but they don't
> buy us anything in terms of extensibility unless we can grow the
> structures.  Should structure size be passed in as one of parameters?

Or is the backward compatibility not at the binary level so we can grow
the structures at will?

^ permalink raw reply

* Re: [PATCH bpf-next v5 3/4] bpf: selftest additions for SOCKHASH
From: Daniel Borkmann @ 2018-05-09 22:53 UTC (permalink / raw)
  To: John Fastabend, borkmann, ast; +Cc: netdev
In-Reply-To: <1525562710-11603-4-git-send-email-john.fastabend@gmail.com>

Hi John,

On 05/06/2018 01:25 AM, John Fastabend wrote:
> This runs existing SOCKMAP tests with SOCKHASH map type. To do this
> we push programs into include file and build two BPF programs. One
> for SOCKHASH and one for SOCKMAP.
> 
> We then run the entire test suite with each type.
> 
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> Acked-by: David S. Miller <davem@davemloft.net>
[...]
> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> index 9d76218..28316f1 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -33,7 +33,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
>  	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
>  	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
>  	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
> -	test_get_stack_rawtp.o
> +	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o
>  
>  # Order correspond to 'make run_tests' order
>  TEST_PROGS := test_kmod.sh \
> diff --git a/tools/testing/selftests/bpf/test_sockhash_kern.c b/tools/testing/selftests/bpf/test_sockhash_kern.c
> new file mode 100644
> index 0000000..3bf4ad4
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/test_sockhash_kern.c
> @@ -0,0 +1,4 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
> +#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKHASH
> +#include "./test_sockmap_kern.h"
> diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
> index 29c022d..df7afc7 100644
> --- a/tools/testing/selftests/bpf/test_sockmap.c
> +++ b/tools/testing/selftests/bpf/test_sockmap.c
> @@ -47,7 +47,8 @@
>  #define S1_PORT 10000
>  #define S2_PORT 10001
>  
> -#define BPF_FILENAME "test_sockmap_kern.o"
> +#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
> +#define BPF_SOCKHASH_FILENAME "test_sockmap_kern.o"

Is this testing the right thing? Shouldn't the BPF_SOCKHASH_FILENAME above say "test_sockhash_kern.o"
in order to select the correct BPF prog for the hashmap? It seems we're testing the sock/arraymap twice here.

>  #define CG_PATH "/sockmap"
>  
[...]
> +static int test_suite(void)
> +{
> +	int err;
> +
> +	err = __test_suite(BPF_SOCKMAP_FILENAME);
> +	if (err)
> +		goto out;
> +	err = __test_suite(BPF_SOCKHASH_FILENAME);
> +out:
> +	return err;
> +}
> +
Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH v4 0/2] selftests/bpf
From: Daniel Borkmann @ 2018-05-09 22:56 UTC (permalink / raw)
  To: Sirio Balmelli; +Cc: netdev
In-Reply-To: <20180508133544.6kwfglryatxkzatp@vm4>

On 05/08/2018 03:35 PM, Sirio Balmelli wrote:
> Review of v3 patch much appreciated.
> 
> Respun the series to omit the Makefile include, will work on
> a separate patch for that;
> replied to the v3 thread with queries specific to the include issue.
> 
> best,
> 
> Sirio
> 
> Sirio Balmelli (2):
>   selftests/bpf: add architecture-agnostic headers
>   selftests/bpf: ignore build products
> 
>  tools/bpf/bpftool/.gitignore           |  3 +++
>  tools/include/uapi/asm/bitsperlong.h   | 18 ++++++++++++++++++
>  tools/include/uapi/asm/errno.h         | 17 +++++++++++++++++
>  tools/testing/selftests/bpf/.gitignore |  1 +
>  4 files changed, 39 insertions(+)
>  create mode 100644 tools/bpf/bpftool/.gitignore
>  create mode 100644 tools/include/uapi/asm/bitsperlong.h
>  create mode 100644 tools/include/uapi/asm/errno.h

Applied to bpf-next, thanks Sirio!

^ permalink raw reply

* Re: [PATCH bpf-next] bpf: sync tools bpf.h uapi header
From: Daniel Borkmann @ 2018-05-09 22:59 UTC (permalink / raw)
  To: Prashant Bhole, Alexei Starovoitov; +Cc: David S . Miller, netdev
In-Reply-To: <20180509020459.6564-1-bhole_prashant_q7@lab.ntt.co.jp>

On 05/09/2018 04:04 AM, Prashant Bhole wrote:
> sync the header from include/uapi/linux/bpf.h which was updated to add
> fib lookup helper function. This fixes selftests/bpf build failure
> 
> Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> ---
>  tools/include/uapi/linux/bpf.h | 84 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 83 insertions(+), 1 deletion(-)

This doesn't apply cleanly to bpf-next, please respin.

^ permalink raw reply

* Re: [PATCH v6 05/13] firmware_loader: enhance Kconfig documentation over FW_LOADER
From: Kees Cook @ 2018-05-09 23:01 UTC (permalink / raw)
  To: Luis R. Rodriguez
  Cc: Greg KH, Andrew Morton, Josh Triplett, maco, Andy Gross,
	David Brown, bjorn.andersson, Tom Gundersen, wagi, Hans de Goede,
	Andres Rodriguez, Mimi Zohar, Jakub Kicinski, Shuah Khan,
	Martin Fuzzey, David Howells, pali.rohar, Takashi Iwai,
	Kalle Valo, Arend van Spriel, Rafał Miłecki,
	Nicolas Broeking
In-Reply-To: <20180509205544.GW27853@wotan.suse.de>

On Wed, May 9, 2018 at 1:55 PM, Luis R. Rodriguez <mcgrof@kernel.org> wrote:
> On Tue, May 08, 2018 at 03:42:33PM -0700, Kees Cook wrote:
>> On Tue, May 8, 2018 at 11:12 AM, Luis R. Rodriguez <mcgrof@kernel.org> wrote:
>> > +         This used to be the default firmware loading facility, and udev used
>> > +         to listen for uvents to load firmware for the kernel. The firmware
>> > +         loading facility functionality in udev has been removed, as such it
>> > +         can no longer be relied upon as a fallback mechanism. Linux no longer
>> > +         relies on or uses a fallback mechanism in userspace. If you need to
>> > +         rely on one refer to the permissively licensed firmwared:
>>
>> Typo: firmware
>
> Thanks fixed all typos except this one, this one is meant to be firmwared as
> that is the name of the project, the url is below.
>
>>
>> > +
>> > +         https://github.com/teg/firmwared

Oh! Yes, hah. :) Thanks!

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply

* Proposal
From: Zeliha Omer Faruk @ 2018-05-09 23:04 UTC (permalink / raw)




-- 
Hello

Greetings to you. Please, I have a business proposal for you; contact me
for more details ASAP. Thanks.

Best Regards,
Miss.Zeliha ömer faruk
Esentepe Mahallesi Büyükdere
Caddesi Kristal Kule Binasi
No:215
Sisli - Istanbul, Turkey

^ permalink raw reply

* Re: [PATCH net-next RFC 1/3] net: Add support to configure SR-IOV VF minimum and maximum queues.
From: Jakub Kicinski @ 2018-05-09 23:15 UTC (permalink / raw)
  To: Michael Chan, Or Gerlitz; +Cc: davem, netdev
In-Reply-To: <1525864903-32619-2-git-send-email-michael.chan@broadcom.com>

On Wed,  9 May 2018 07:21:41 -0400, Michael Chan wrote:
> VF Queue resources are always limited and there is currently no
> infrastructure to allow the admin. on the host to add or reduce queue
> resources for any particular VF.  With ever increasing number of VFs
> being supported, it is desirable to allow the admin. to configure queue
> resources differently for the VFs.  Some VFs may require more or fewer
> queues due to different bandwidth requirements or different number of
> vCPUs in the VM.  This patch adds the infrastructure to do that by
> adding IFLA_VF_QUEUES netlink attribute and a new .ndo_set_vf_queues()
> to the net_device_ops.
> 
> Four parameters are exposed for each VF:
> 
> o min_tx_queues - Guaranteed or current tx queues assigned to the VF.

This muxing of semantics may be a little awkward and unnecessary, would
it make sense for struct ifla_vf_info to have separate fields for
current number of queues and the admin-set guaranteed min?

Is there a real world use case for the min value or are you trying to
make the API feature complete?

> o max_tx_queues - Maximum but not necessarily guaranteed tx queues
>   available to the VF.
> 
> o min_rx_queues - Guaranteed or current rx queues assigned to the VF.
> 
> o max_rx_queues - Maximum but not necessarily guaranteed rx queues
>   available to the VF.
> 
> The "ip link set" command will subsequently be patched to support the new
> operation to set the above parameters.
> 
> After the admin. makes a change to the above parameters, the corresponding
> VF will have a new range of channels to set using ethtool -L.
> 
> Signed-off-by: Michael Chan <michael.chan@broadcom.com>

In switchdev mode we can use number of queues on the representor as a
proxy for max number of queues allowed for the ASIC port.  This works
better when representors are muxed in the first place than when they
have actual queues backing them.  WDYT about such scheme, Or?  A very
pleasant side-effect is that one can configure qdiscs and get stats
per-HW queue.

^ permalink raw reply

* Re: [PATCH] net/mlx5: Fix mlx5_get_vector_affinity function
From: Saeed Mahameed @ 2018-05-09 23:19 UTC (permalink / raw)
  To: linux@roeck-us.net, tglx@linutronix.de
  Cc: netdev@vger.kernel.org, Max Gurtovoy, Israel Rukshin,
	linux-rdma@vger.kernel.org, Matan Barak, dledford@redhat.com,
	linux-kernel@vger.kernel.org
In-Reply-To: <20180509221906.GA7548@roeck-us.net>

On Wed, 2018-05-09 at 15:19 -0700, Guenter Roeck wrote:
> On Sun, May 06, 2018 at 09:33:26AM +0200, Thomas Gleixner wrote:
> > On Sat, 5 May 2018, Guenter Roeck wrote:
> > 
> > > On Thu, Apr 12, 2018 at 09:49:11AM +0000, Israel Rukshin wrote:
> > > > Adding the vector offset when calling to mlx5_vector2eqn() is
> > > > wrong.
> > > > This is because mlx5_vector2eqn() checks if EQ index is equal
> > > > to vector number
> > > > and the fact that the internal completion vectors that mlx5
> > > > allocates
> > > > don't get an EQ index.
> > > > 
> > > > The second problem here is that using effective_affinity_mask
> > > > gives the same
> > > > CPU for different vectors.
> > > > This leads to unmapped queues when calling it from
> > > > blk_mq_rdma_map_queues().
> > > > This doesn't happen when using affinity_hint mask.
> > > > 
> > > 
> > > Except that affinity_hint is only defined if SMP is enabled.
> > > Without:
> > > 
> > > include/linux/mlx5/driver.h: In function
> > > ‘mlx5_get_vector_affinity_hint’:
> > > include/linux/mlx5/driver.h:1299:13: error:
> > >         ‘struct irq_desc’ has no member named ‘affinity_hint’
> > > 
> > > Note that this is the only use of affinity_hint outside
> > > kernel/irq.
> > > Don't other drivers have similar problems ?
> > 
> > Aside of that.
> > 
> > > >  static inline const struct cpumask *
> > > > -mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int
> > > > vector)
> > > > +mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int
> > > > vector)
> > > >  {
> > > > -	const struct cpumask *mask;
> > > >  	struct irq_desc *desc;
> > > >  	unsigned int irq;
> > > >  	int eqn;
> > > >  	int err;
> > > >  
> > > > -	err = mlx5_vector2eqn(dev, MLX5_EQ_VEC_COMP_BASE +
> > > > vector, &eqn, &irq);
> > > > +	err = mlx5_vector2eqn(dev, vector, &eqn, &irq);
> > > >  	if (err)
> > > >  		return NULL;
> > > >  
> > > >  	desc = irq_to_desc(irq);
> > > > -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
> > > > -	mask = irq_data_get_effective_affinity_mask(&desc-
> > > > >irq_data);
> > > > -#else
> > > > -	mask = desc->irq_common_data.affinity;
> > > > -#endif
> > > > -	return mask;
> > > > +	return desc->affinity_hint;
> > 
> > NAK.
> > 
> 
> The offending patch is upstream, breaking non-SMP test builds, and I
> have
> not seen any feedback from the submitter. Any suggestion how to
> proceed ?
> 
> Guenter
> 
> > 
> 

Hi Guenter and Thomas,

Max and Israel are handling this internally to find a solution that
provides the needed functionality for rdma and addresses your comments.

Thanks,
Saeed.

^ permalink raw reply

* Re: linux-next: Tree for May 9 (mlx5)
From: Saeed Mahameed @ 2018-05-09 23:21 UTC (permalink / raw)
  To: sfr@canb.auug.org.au, rdunlap@infradead.org, Israel Rukshin,
	linux-next@vger.kernel.org, Max Gurtovoy
  Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	Leon Romanovsky, Matan Barak
In-Reply-To: <826ab15b-f2bc-0c5d-8d5e-6466badcd3e0@infradead.org>

On Wed, 2018-05-09 at 08:31 -0700, Randy Dunlap wrote:
> On 05/09/2018 04:21 AM, Stephen Rothwell wrote:
> > Hi all,
> > 
> > Changes since 20180508:
> > 
> 
> on x86_64:
> # CONFIG_SMP is not set
> 
> In file included from
> ../drivers/net/ethernet/mellanox/mlx5/core/main.c:43:0:
> ../include/linux/mlx5/driver.h: In function
> 'mlx5_get_vector_affinity_hint':
> ../include/linux/mlx5/driver.h:1299:13: error: 'struct irq_desc' has
> no member named 'affinity_hint'
>   return desc->affinity_hint;
>              ^
> 
> 

Hi Stephen,

Israel and Max are working on a solution, will provide it ASAP.

Thanks,
Saeed.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox