Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next 07/11] bpf: Add helper to retrieve socket in BPF
From: Joe Stringer @ 2018-09-12  0:36 UTC (permalink / raw)
  To: daniel
  Cc: netdev, ast, john.fastabend, tgraf, kafai, nitin.hande,
	mauricio.vasquez
In-Reply-To: <20180912003640.28316-1-joe@wand.net.nz>

This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and
bpf_sk_lookup_udp() which allows BPF programs to find out if there is a
socket listening on this host, and returns a socket pointer which the
BPF program can then access to determine, for instance, whether to
forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the
socket, so when a BPF program makes use of this function, it must
subsequently pass the returned pointer into the newly added sk_release()
to return the reference.

By way of example, the following pseudocode would filter inbound
connections at XDP if there is no corresponding service listening for
the traffic:

  struct bpf_sock_tuple tuple;
  struct bpf_sock_ops *sk;

  populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
  sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0);
  if (!sk) {
    // Couldn't find a socket listening for this traffic. Drop.
    return TC_ACT_SHOT;
  }
  bpf_sk_release(sk, 0);
  return TC_ACT_OK;

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 include/uapi/linux/bpf.h                  |  54 +++++++-
 kernel/bpf/verifier.c                     |   8 +-
 net/core/filter.c                         | 145 ++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            |  54 +++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  12 ++
 5 files changed, 270 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 66917a4eba27..8ed6e293113f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2141,6 +2141,41 @@ union bpf_attr {
  *		request in the skb.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)
+ * 	Decription
+ * 		Look for TCP socket matching 'tuple'. The return value must
+ * 		be checked, and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@netns: network namespace id
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, flags)
+ * 	Decription
+ * 		Look for UDP socket matching 'tuple'. The return value must
+ * 		be checked, and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@netns: network namespace id
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ *  int bpf_sk_release(sock, flags)
+ * 	Description
+ * 		Release the reference held by 'sock'.
+ * 		@sock: Pointer reference to release. Must be found via
+ * 		       bpf_sk_lookup_xxx().
+ * 		@flags: flags value
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2226,7 +2261,10 @@ union bpf_attr {
 	FN(get_current_cgroup_id),	\
 	FN(get_local_storage),		\
 	FN(sk_select_reuseport),	\
-	FN(skb_ancestor_cgroup_id),
+	FN(skb_ancestor_cgroup_id),	\
+	FN(sk_lookup_tcp),		\
+	FN(sk_lookup_udp),		\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2395,6 +2433,20 @@ struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} saddr;
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} daddr;
+	__be16 sport;
+	__be16 dport;
+	__u8 family;
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 67c62ef67d37..37feedaaa1c3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
  * passes through a NULL-check conditional. For the branch wherein the state is
  * changed to CONST_IMM, the verifier releases the reference.
+ *
+ * For each helper function that allocates a reference, such as
+ * bpf_sk_lookup_tcp(), there is a corresponding release function, such as
+ * bpf_sk_release(). When a reference type passes into the release function,
+ * the verifier also releases the reference. If any unchecked or unreleased
+ * reference remains at the end of the program, the verifier rejects it.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -300,7 +306,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type)
  */
 static bool is_release_function(enum bpf_func_id func_id)
 {
-	return false;
+	return func_id == BPF_FUNC_sk_release;
 }
 
 /* string representation of 'enum bpf_reg_type' */
diff --git a/net/core/filter.c b/net/core/filter.c
index 23e6e5202138..b092cae8efd5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -58,13 +58,17 @@
 #include <net/busy_poll.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
+#include <net/udp.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
 #include <linux/inetdevice.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
 #include <net/ip_fib.h>
 #include <net/flow.h>
 #include <net/arp.h>
 #include <net/ipv6.h>
+#include <net/net_namespace.h>
 #include <linux/seg6_local.h>
 #include <net/seg6.h>
 #include <net/seg6_local.h>
@@ -4811,6 +4815,135 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
 };
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
+struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
+		       struct sk_buff *skb, u8 proto)
+{
+	int dif = skb->dev->ifindex;
+	bool refcounted = false;
+	struct sock *sk = NULL;
+
+	if (tuple->family == AF_INET) {
+		int sdif = inet_sdif(skb);
+
+		if (proto == IPPROTO_TCP)
+			sk = __inet_lookup(net, &tcp_hashinfo, skb, 0,
+					   tuple->saddr.ipv4, tuple->sport,
+					   tuple->daddr.ipv4, tuple->dport,
+					   dif, sdif, &refcounted);
+		else
+			sk = __udp4_lib_lookup(net,
+					       tuple->saddr.ipv4, tuple->sport,
+					       tuple->daddr.ipv4, tuple->dport,
+					       dif, sdif, &udp_table, skb);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else {
+		struct in6_addr *src6 = (struct in6_addr *)&tuple->saddr.ipv6;
+		struct in6_addr *dst6 = (struct in6_addr *)&tuple->daddr.ipv6;
+		int sdif = inet6_sdif(skb);
+
+		if (proto == IPPROTO_TCP)
+			sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0,
+					    src6, tuple->sport,
+					    dst6, tuple->dport,
+					    dif, sdif, &refcounted);
+		else
+			sk = __udp6_lib_lookup(net, src6, tuple->sport,
+					       dst6, tuple->dport,
+					       dif, sdif, &udp_table, skb);
+	}
+#endif
+
+	if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
+		WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+		sk = NULL;
+	}
+	return sk;
+}
+
+/* bpf_sk_lookup performs the core lookup for different types of sockets,
+ * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
+ * Returns the socket as an 'unsigned long' to simplify the casting in the
+ * callers to satisfy BPF_CALL declarations.
+ */
+static unsigned long
+bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+	      u8 proto, u32 netns_id, u64 flags)
+{
+	struct net *caller_net = dev_net(skb->dev);
+	struct sock *sk = NULL;
+	struct net *net;
+
+	if (unlikely(len != sizeof(struct bpf_sock_tuple) || flags ||
+		     (tuple->family != AF_INET && tuple->family != AF_INET6)))
+		goto out;
+
+	if (netns_id)
+		net = get_net_ns_by_id(caller_net, netns_id);
+	else
+		net = caller_net;
+	if (unlikely(!net))
+		goto out;
+	sk = sk_lookup(net, tuple, skb, proto);
+	put_net(net);
+
+	if (sk)
+		sk = sk_to_full_sk(sk);
+out:
+	return (unsigned long) sk;
+}
+
+BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+	return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
+	.func		= bpf_sk_lookup_tcp,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+	return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
+	.func		= bpf_sk_lookup_udp,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
+{
+	if (!sock_flag(sk, SOCK_RCU_FREE))
+		sock_gen_put(sk);
+
+	if (unlikely(flags))
+		return -EINVAL;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_release_proto = {
+	.func		= bpf_sk_release,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_SOCKET,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_pkt_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
@@ -5017,6 +5150,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_ancestor_cgroup_id:
 		return &bpf_skb_ancestor_cgroup_id_proto;
 #endif
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -5117,6 +5256,12 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sk_redirect_hash_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 66917a4eba27..8ed6e293113f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2141,6 +2141,41 @@ union bpf_attr {
  *		request in the skb.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock_ops *bpf_sk_lookup_tcp(ctx, tuple, tuple_size, netns, flags)
+ * 	Decription
+ * 		Look for TCP socket matching 'tuple'. The return value must
+ * 		be checked, and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@netns: network namespace id
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ * struct bpf_sock_ops *bpf_sk_lookup_udp(ctx, tuple, tuple_size, netns, flags)
+ * 	Decription
+ * 		Look for UDP socket matching 'tuple'. The return value must
+ * 		be checked, and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@netns: network namespace id
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ *  int bpf_sk_release(sock, flags)
+ * 	Description
+ * 		Release the reference held by 'sock'.
+ * 		@sock: Pointer reference to release. Must be found via
+ * 		       bpf_sk_lookup_xxx().
+ * 		@flags: flags value
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2226,7 +2261,10 @@ union bpf_attr {
 	FN(get_current_cgroup_id),	\
 	FN(get_local_storage),		\
 	FN(sk_select_reuseport),	\
-	FN(skb_ancestor_cgroup_id),
+	FN(skb_ancestor_cgroup_id),	\
+	FN(sk_lookup_tcp),		\
+	FN(sk_lookup_udp),		\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2395,6 +2433,20 @@ struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} saddr;
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} daddr;
+	__be16 sport;
+	__be16 dport;
+	__u8 family;
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index e4be7730222d..88ce00c3aa0f 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -143,6 +143,18 @@ static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) =
 	(void *) BPF_FUNC_skb_cgroup_id;
 static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) =
 	(void *) BPF_FUNC_skb_ancestor_cgroup_id;
+static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx,
+					     struct bpf_sock_tuple *tuple,
+					     int size, unsigned int netns_id,
+					     unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup_tcp;
+static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
+					     struct bpf_sock_tuple *tuple,
+					     int size, unsigned int netns_id,
+					     unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup_udp;
+static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) =
+	(void *) BPF_FUNC_sk_release;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 08/11] selftests/bpf: Add tests for reference tracking
From: Joe Stringer @ 2018-09-12  0:36 UTC (permalink / raw)
  To: daniel
  Cc: netdev, ast, john.fastabend, tgraf, kafai, nitin.hande,
	mauricio.vasquez
In-Reply-To: <20180912003640.28316-1-joe@wand.net.nz>

reference tracking: leak potential reference
reference tracking: leak potential reference on stack
reference tracking: leak potential reference on stack 2
reference tracking: zero potential reference
reference tracking: copy and zero potential references
reference tracking: release reference without check
reference tracking: release reference
reference tracking: release reference twice
reference tracking: release reference twice inside branch
reference tracking: alloc, check, free in one subbranch
reference tracking: alloc, check, free in both subbranches
reference tracking in call: free reference in subprog
reference tracking in call: free reference in subprog and outside
reference tracking in call: alloc & leak reference in subprog
reference tracking in call: alloc in subprog, release outside
reference tracking in call: sk_ptr leak into caller stack
reference tracking in call: sk_ptr spill into caller stack

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 tools/testing/selftests/bpf/test_verifier.c | 359 ++++++++++++++++++++
 1 file changed, 359 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index ceb55a9f3da9..eb760ead257a 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
  * Copyright (c) 2017 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -177,6 +178,23 @@ static void bpf_fill_rand_ld_dw(struct bpf_test *self)
 	self->retval = (uint32_t)res;
 }
 
+#define BPF_SK_LOOKUP							\
+	/* struct bpf_sock_tuple tuple = {} */				\
+	BPF_MOV64_IMM(BPF_REG_2, 0),					\
+	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),			\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -16),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -24),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -32),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -40),		\
+	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -48),		\
+	/* sk = sk_lookup_tcp(ctx, &tuple, sizeof tuple, 0, 0) */	\
+	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),				\
+	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48),				\
+	BPF_MOV64_IMM(BPF_REG_3, 44),					\
+	BPF_MOV64_IMM(BPF_REG_4, 0),					\
+	BPF_MOV64_IMM(BPF_REG_5, 0),					\
+	BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp)
+
 static struct bpf_test tests[] = {
 	{
 		"add+sub+mul",
@@ -12441,6 +12459,222 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
 	},
+	{
+		"reference tracking: leak potential reference",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: leak potential reference on stack",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: leak potential reference on stack 2",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: zero potential reference",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference */
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: copy and zero potential references",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_7, 0), /* leak reference */
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: release reference without check",
+		.insns = {
+			BPF_SK_LOOKUP,
+			/* reference in r0 may be NULL */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=sock_or_null expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: release reference",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: release reference 2",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking: release reference twice",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=inv expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: release reference twice inside branch",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* goto end */
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=inv expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: alloc, check, free in one subbranch",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 16),
+			/* if (offsetof(skb, mark) > data_len) exit; */
+			BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_SK_LOOKUP,
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 1), /* mark == 0? */
+			/* Leak reference in R0 */
+			BPF_EXIT_INSN(),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* sk NULL? */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking: alloc, check, free in both subbranches",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 16),
+			/* if (offsetof(skb, mark) > data_len) exit; */
+			BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1),
+			BPF_EXIT_INSN(),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_SK_LOOKUP,
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 5), /* mark == 0? */
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* sk NULL? */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* sk NULL? */
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking in call: free reference in subprog",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
 	{
 		"pass modified ctx pointer to helper, 1",
 		.insns = {
@@ -12511,6 +12745,131 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 		.result = ACCEPT,
 	},
+	{
+		"reference tracking in call: free reference in subprog and outside",
+		.insns = {
+			BPF_SK_LOOKUP,
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "type=inv expected=sock",
+		.result = REJECT,
+	},
+	{
+		"reference tracking in call: alloc & leak reference in subprog",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_4),
+			BPF_SK_LOOKUP,
+			/* spill unchecked sk_ptr into stack of caller */
+			BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking in call: alloc in subprog, release outside",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_SK_LOOKUP,
+			BPF_EXIT_INSN(), /* return sk */
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.retval = POINTER_VALUE,
+		.result = ACCEPT,
+	},
+	{
+		"reference tracking in call: sk_ptr leak into caller stack",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5),
+			/* spill unchecked sk_ptr into stack of caller */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_5, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			BPF_SK_LOOKUP,
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.errstr = "Unreleased reference",
+		.result = REJECT,
+	},
+	{
+		"reference tracking in call: sk_ptr spill into caller stack",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+
+			/* subprog 1 */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 9),
+			/* spill unchecked sk_ptr into stack of caller */
+			BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_5, 0),
+			BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+			/* now the sk_ptr is verified, free the reference */
+			BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_4, 0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_EMIT_CALL(BPF_FUNC_sk_release),
+			BPF_EXIT_INSN(),
+
+			/* subprog 2 */
+			BPF_SK_LOOKUP,
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+		.result = ACCEPT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 09/11] libbpf: Support loading individual progs
From: Joe Stringer @ 2018-09-12  0:36 UTC (permalink / raw)
  To: daniel
  Cc: netdev, ast, john.fastabend, tgraf, kafai, nitin.hande,
	mauricio.vasquez
In-Reply-To: <20180912003640.28316-1-joe@wand.net.nz>

Allow the individual program load to be invoked. This will help with
testing, where a single ELF may contain several sections, some of which
denote subprograms that are expected to fail verification, along with
some which are expected to pass verification. By allowing programs to be
iterated and individually loaded, each program can be independently
checked against its expected verification result.

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 tools/lib/bpf/libbpf.c | 4 ++--
 tools/lib/bpf/libbpf.h | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 8476da7f2720..aadf05f6bfa0 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -227,7 +227,7 @@ struct bpf_object {
 };
 #define obj_elf_valid(o)	((o)->efile.elf)
 
-static void bpf_program__unload(struct bpf_program *prog)
+void bpf_program__unload(struct bpf_program *prog)
 {
 	int i;
 
@@ -1375,7 +1375,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type,
 	return ret;
 }
 
-static int
+int
 bpf_program__load(struct bpf_program *prog,
 		  char *license, u32 kern_version)
 {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index e3b00e23e181..40e4395f1c07 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -126,10 +126,13 @@ void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex);
 
 const char *bpf_program__title(struct bpf_program *prog, bool needs_copy);
 
+int bpf_program__load(struct bpf_program *prog, char *license,
+		      u32 kern_version);
 int bpf_program__fd(struct bpf_program *prog);
 int bpf_program__pin_instance(struct bpf_program *prog, const char *path,
 			      int instance);
 int bpf_program__pin(struct bpf_program *prog, const char *path);
+void bpf_program__unload(struct bpf_program *prog);
 
 struct bpf_insn;
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 10/11] selftests/bpf: Add C tests for reference tracking
From: Joe Stringer @ 2018-09-12  0:36 UTC (permalink / raw)
  To: daniel
  Cc: netdev, ast, john.fastabend, tgraf, kafai, nitin.hande,
	mauricio.vasquez
In-Reply-To: <20180912003640.28316-1-joe@wand.net.nz>

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 tools/testing/selftests/bpf/Makefile          |   2 +-
 tools/testing/selftests/bpf/test_progs.c      |  38 ++++++
 .../selftests/bpf/test_sk_lookup_kern.c       | 128 ++++++++++++++++++
 3 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_sk_lookup_kern.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index fff7fb1285fc..311b7bc9e37a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
 	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
-	test_skb_cgroup_id_kern.o
+	test_skb_cgroup_id_kern.o test_sk_lookup_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 63a671803ed6..e8becca9c521 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1698,6 +1698,43 @@ static void test_task_fd_query_tp(void)
 				   "sys_enter_read");
 }
 
+static void test_reference_tracking()
+{
+	const char *file = "./test_sk_lookup_kern.o";
+	struct bpf_object *obj;
+	struct bpf_program *prog;
+	__u32 duration;
+	int err = 0;
+
+	obj = bpf_object__open(file);
+	if (IS_ERR(obj)) {
+		error_cnt++;
+		return;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		const char *title;
+
+		/* Ignore .text sections */
+		title = bpf_program__title(prog, false);
+		if (strstr(title, ".text") != NULL)
+			continue;
+
+		bpf_program__set_type(prog, BPF_PROG_TYPE_SCHED_CLS);
+
+		/* Expect verifier failure if test name has 'fail' */
+		if (strstr(title, "fail") != NULL) {
+			libbpf_set_print(NULL, NULL, NULL);
+			err = !bpf_program__load(prog, "GPL", 0);
+			libbpf_set_print(printf, printf, NULL);
+		} else {
+			err = bpf_program__load(prog, "GPL", 0);
+		}
+		CHECK(err, title, "\n");
+	}
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
 	jit_enabled = is_jit_enabled();
@@ -1719,6 +1756,7 @@ int main(void)
 	test_get_stack_raw_tp();
 	test_task_fd_query_rawtp();
 	test_task_fd_query_tp();
+	test_reference_tracking();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/test_sk_lookup_kern.c
new file mode 100644
index 000000000000..321a2299a3ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sk_lookup_kern.c
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <sys/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
+static void *fill_ip(struct bpf_sock_tuple *tuple, void *data, __u64 nh_off,
+		     void *data_end, __u16 eth_proto)
+{
+	__u64 ihl_len;
+	__u8 proto;
+
+	if (eth_proto == bpf_htons(ETH_P_IP)) {
+		struct iphdr *iph = (struct iphdr *)(data + nh_off);
+
+		if (iph + 1 > data_end)
+			return NULL;
+		ihl_len = iph->ihl * 4;
+		proto = iph->protocol;
+
+		tuple->family = AF_INET;
+		tuple->saddr.ipv4 = iph->saddr;
+		tuple->daddr.ipv4 = iph->daddr;
+	} else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
+
+		if (ip6h + 1 > data_end)
+			return NULL;
+		ihl_len = sizeof(*ip6h);
+		proto = ip6h->nexthdr;
+
+		tuple->family = AF_INET6;
+		*((struct in6_addr *)&tuple->saddr.ipv6) = ip6h->saddr;
+		*((struct in6_addr *)&tuple->daddr.ipv6) = ip6h->daddr;
+	}
+
+	if (proto != IPPROTO_TCP)
+		return NULL;
+
+	return data + nh_off + ihl_len;
+}
+
+SEC("sk_lookup_success")
+int bpf_sk_lookup_test0(struct __sk_buff *skb)
+{
+	void *data_end = (void *)(long)skb->data_end;
+	void *data = (void *)(long)skb->data;
+	struct ethhdr *eth = (struct ethhdr *)(data);
+	struct bpf_sock_tuple tuple = {};
+	struct tcphdr *tcp;
+	struct bpf_sock *sk;
+	void *l4;
+
+	if (eth + 1 > data_end)
+		return TC_ACT_SHOT;
+
+	l4 = fill_ip(&tuple, data, sizeof(*eth), data_end, eth->h_proto);
+	if (!l4 || l4 + sizeof *tcp > data_end)
+		return TC_ACT_SHOT;
+
+	tcp = l4;
+	tuple.sport = tcp->source;
+	tuple.dport = tcp->dest;
+
+	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	if (sk)
+		bpf_sk_release(sk, 0);
+	return sk ? TC_ACT_OK : TC_ACT_UNSPEC;
+}
+
+SEC("fail_no_release")
+int bpf_sk_lookup_test1(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+
+	bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	return 0;
+}
+
+SEC("fail_release_twice")
+int bpf_sk_lookup_test2(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	struct bpf_sock *sk;
+
+	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	bpf_sk_release(sk, 0);
+	bpf_sk_release(sk, 0);
+	return 0;
+}
+
+SEC("fail_release_unchecked")
+int bpf_sk_lookup_test3(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	struct bpf_sock *sk;
+
+	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+	bpf_sk_release(sk, 0);
+	return 0;
+}
+
+void lookup_no_release(struct __sk_buff *skb)
+{
+	struct bpf_sock_tuple tuple = {};
+	bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), 0, 0);
+}
+
+SEC("fail_no_release_subcall")
+int bpf_sk_lookup_test4(struct __sk_buff *skb)
+{
+	lookup_no_release(skb);
+	return 0;
+}
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 11/11] Documentation: Describe bpf reference tracking
From: Joe Stringer @ 2018-09-12  0:36 UTC (permalink / raw)
  To: daniel
  Cc: netdev, ast, john.fastabend, tgraf, kafai, nitin.hande,
	mauricio.vasquez
In-Reply-To: <20180912003640.28316-1-joe@wand.net.nz>

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 Documentation/networking/filter.txt | 64 +++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index e6b4ebb2b243..4443ce958862 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1125,6 +1125,14 @@ pointer type.  The types of pointers describe their base, as follows:
     PTR_TO_STACK        Frame pointer.
     PTR_TO_PACKET       skb->data.
     PTR_TO_PACKET_END   skb->data + headlen; arithmetic forbidden.
+    PTR_TO_SOCKET       Pointer to struct bpf_sock_ops, implicitly refcounted.
+    PTR_TO_SOCKET_OR_NULL
+                        Either a pointer to a socket, or NULL; socket lookup
+                        returns this type, which becomes a PTR_TO_SOCKET when
+                        checked != NULL. PTR_TO_SOCKET is reference-counted,
+                        so programs must release the reference through the
+                        socket release function before the end of the program.
+                        Arithmetic on these pointers is forbidden.
 However, a pointer may be offset from this base (as a result of pointer
 arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable
 offset'.  The former is used when an exactly-known value (e.g. an immediate
@@ -1171,6 +1179,13 @@ over the Ethernet header, then reads IHL and addes (IHL * 4), the resulting
 pointer will have a variable offset known to be 4n+2 for some n, so adding the 2
 bytes (NET_IP_ALIGN) gives a 4-byte alignment and so word-sized accesses through
 that pointer are safe.
+The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common
+to all copies of the pointer returned from a socket lookup. This has similar
+behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but
+it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly
+represents a reference to the corresponding 'struct sock'. To ensure that the
+reference is not leaked, it is imperative to NULL-check the reference and in
+the non-NULL case, and pass the valid reference to the socket release function.
 
 Direct packet access
 --------------------
@@ -1444,6 +1459,55 @@ Error:
   8: (7a) *(u64 *)(r0 +0) = 1
   R0 invalid mem access 'imm'
 
+Program that performs a socket lookup then sets the pointer to NULL without
+checking it:
+value:
+  BPF_MOV64_IMM(BPF_REG_2, 0),
+  BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_MOV64_IMM(BPF_REG_3, 4),
+  BPF_MOV64_IMM(BPF_REG_4, 0),
+  BPF_MOV64_IMM(BPF_REG_5, 0),
+  BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
+  BPF_MOV64_IMM(BPF_REG_0, 0),
+  BPF_EXIT_INSN(),
+Error:
+  0: (b7) r2 = 0
+  1: (63) *(u32 *)(r10 -8) = r2
+  2: (bf) r2 = r10
+  3: (07) r2 += -8
+  4: (b7) r3 = 4
+  5: (b7) r4 = 0
+  6: (b7) r5 = 0
+  7: (85) call bpf_sk_lookup_tcp#65
+  8: (b7) r0 = 0
+  9: (95) exit
+  Unreleased reference id=1, alloc_insn=7
+
+Program that performs a socket lookup but does not NULL-check the returned
+value:
+  BPF_MOV64_IMM(BPF_REG_2, 0),
+  BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
+  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+  BPF_MOV64_IMM(BPF_REG_3, 4),
+  BPF_MOV64_IMM(BPF_REG_4, 0),
+  BPF_MOV64_IMM(BPF_REG_5, 0),
+  BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
+  BPF_EXIT_INSN(),
+Error:
+  0: (b7) r2 = 0
+  1: (63) *(u32 *)(r10 -8) = r2
+  2: (bf) r2 = r10
+  3: (07) r2 += -8
+  4: (b7) r3 = 4
+  5: (b7) r4 = 0
+  6: (b7) r5 = 0
+  7: (85) call bpf_sk_lookup_tcp#65
+  8: (95) exit
+  Unreleased reference id=1, alloc_insn=7
+
 Testing
 -------
 
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH] net: dsa: mv88e6xxx: Add MV88E6352 DT compatible
From: Andrew Lunn @ 2018-09-12  0:46 UTC (permalink / raw)
  To: Marek Vasut; +Cc: netdev
In-Reply-To: <30900c22-1717-c907-7129-f2dcb90bbc6c@denx.de>

On Wed, Sep 12, 2018 at 02:04:54AM +0200, Marek Vasut wrote:
> On 09/12/2018 01:32 AM, Andrew Lunn wrote:
> >> compatible = "marvell,mv88e6352", "marvell,mv88e6085";
> > 
> > Just "marvell,mv88e6085";
> > 
> > Please take a look at all the other DT files using the Marvell
> > chips. You will only ever find "marvell,mv88e6085" or
> > "marvell,mv88e6190", because everything is compatible to one of these
> > two.
> 
> Well, until you find a difference between those chips which you cannot
> discern based solely on the ID register content. Then it's better to
> have a compatible to match on which matches the actual chip.

Hi Marek

We have been around this loop before. The problem with putting in a
more specific compatible is that nothing is validating it. So it is
going to be wrong, simple because people cut/paste, and don't
necessary change it. So when we do need to look at it, we cannot look
at it, because it is wrong.

I would only add a more specific compatible if and when we need it, it
is actually used, and we can verify it is correct.

   Andrew

^ permalink raw reply

* Re: Regression caused by commit 7bb05b85bc2d ("r8169: don't use MSI-X on RTL8106e")
From: Kai-Heng Feng @ 2018-09-12  5:57 UTC (permalink / raw)
  To: Jian-Hong Pan
  Cc: Thomas Gleixner, Linux Netdev List, Linux Kernel Mailing List,
	Daniel Drake
In-Reply-To: <CAPpJ_ecuPysq2SE7=X7ip-3M=4yOcwuu6m_2292WgYA8j6gqjw@mail.gmail.com>

at 12:56, Jian-Hong Pan <jian-hong@endlessm.com> wrote:

> 2018-09-12 11:42 GMT+08:00 Kai-Heng Feng <kai.heng.feng@canonical.com>:
>> Hi Jian-Hong,
>>
>> There's a Dell machine with RTL8106e stops to work after S3 since the  
>> commit
>> introduced.
>> So I am wondering if it's possible to revert the commit and use
>> DMI/subsystem id based quirk table?
>>
>> It's because of commit bc976233a872 ("genirq/msi, x86/vector: Prevent
>> reservation mode for non maskable MSI") cleared the reservation mode,  
>> and I
>> can see this after S3:
>>
>> [   94.872838] do_IRQ: 3.33 No irq handler for vector
>>
>> If the device uses MSI-X instead of MSI, the issue doesn't happen  
>> because of
>> reservation mode.
>
> Interesting!  Opposite symptom!
> Could you help try the patch
> https://marc.info/?l=linux-pci&m=153629858601668&w=4 with and without
> reverting the commit?

Same issue after applying this patch. MSI-X works, MSI doesn't work.

>
> If the patch does not work, another suggestion: You can try falling
> back to only PCI_IRQ_LEGACY.

This device is capable of using MSI-X, I don't think falls back to use  
legacy is a good idea.
Instead, using a quirk table should be more appropriate.

Kai-Heng

>
> Regards,
> Jian-Hong Pan
>
>> Hi Thomas,
>>
>> Is it something should be handled by x86 BIOS? Because I don't see this
>> issue when I use Suspend-to-Idle, which doesn't use BIOS to do suspend.
>>
>> Kai-Heng

^ permalink raw reply

* Re: [PATCH] xen-netback: remove unecessary condition check before debugfs_remove_recursive
From: David Miller @ 2018-09-12  6:02 UTC (permalink / raw)
  To: zhongjiang; +Cc: paul.durrant, wei.liu2, xen-devel, netdev, linux-kernel
In-Reply-To: <1536414822-32911-1-git-send-email-zhongjiang@huawei.com>

From: zhong jiang <zhongjiang@huawei.com>
Date: Sat, 8 Sep 2018 21:53:42 +0800

> debugfs_remove_recursive has taken IS_ERR_OR_NULL into account. So just
> remove the condition check before debugfs_remove_recursive.
> 
> Signed-off-by: zhong jiang <zhongjiang@huawei.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH v2] ethernet: hnae: drop adhoc assert() macros
From: David Miller @ 2018-09-12  6:07 UTC (permalink / raw)
  To: igor.stoppa
  Cc: igor.stoppa, huangdaode, yisen.zhuang, salil.mehta, netdev,
	linux-kernel
In-Reply-To: <20180908150142.27976-1-igor.stoppa@huawei.com>

From: Igor Stoppa <igor.stoppa@gmail.com>
Date: Sat,  8 Sep 2018 18:01:42 +0300

> Replace assert() with a less misleading test_condition() using WARN()
> Drop one check which had bitrotted and didn't compile anymore.
> 
> Signed-off-by: Igor Stoppa <igor.stoppa@huawei.com>

I'm still kind of not happy about this.

Make the driver use kernel interfaces like WARN_ON_ONCE()
etc. directly instead of defining alias CPP macros private to the
driver.

If it needs to be conditional upon DEBUG, we have pr_debug() and
the likes as well.

Thank you.

^ permalink raw reply

* Re: [PATCH net-next v3 01/17] asm: simd context helper API
From: Kevin Easton @ 2018-09-12  6:14 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: linux-kernel, netdev, davem, gregkh, Andy Lutomirski,
	Thomas Gleixner, Samuel Neves, linux-arch
In-Reply-To: <20180911010838.8818-2-Jason@zx2c4.com>

On Mon, Sep 10, 2018 at 07:08:21PM -0600, Jason A. Donenfeld wrote:
> Sometimes it's useful to amortize calls to XSAVE/XRSTOR and the related
> FPU/SIMD functions over a number of calls, because FPU restoration is
> quite expensive. This adds a simple header for carrying out this pattern:
> 
>     simd_context_t simd_context = simd_get();
>     while ((item = get_item_from_queue()) != NULL) {
>         encrypt_item(item, simd_context);
>         simd_context = simd_relax(simd_context);
>     }
>     simd_put(simd_context);
> 
> The relaxation step ensures that we don't trample over preemption, and
> the get/put API should be a familiar paradigm in the kernel.

Given that it's always supposed to be used like that, mightn't it be
better if simd_relax() took a pointer to the context, so the call is
just

    simd_relax(&simd_context);

?

The inlining means that there won't actually be a pointer dereference in
the emitted code.

If simd_put() also took a pointer then it could set the context back to
HAVE_NO_SIMD as well?

    - Kevin

^ permalink raw reply

* Re: Regression caused by commit 7bb05b85bc2d ("r8169: don't use MSI-X on RTL8106e")
From: Thomas Gleixner @ 2018-09-12  6:32 UTC (permalink / raw)
  To: Kai-Heng Feng; +Cc: jian-hong, Linux Netdev List, Linux Kernel Mailing List
In-Reply-To: <5D685053-1A33-4553-8678-A50C542466FD@canonical.com>

On Wed, 12 Sep 2018, Kai-Heng Feng wrote:

> There's a Dell machine with RTL8106e stops to work after S3 since the
> commit introduced. So I am wondering if it's possible to revert the
> commit and use DMI/subsystem id based quirk table?

Probably.

> It's because of commit bc976233a872 ("genirq/msi, x86/vector: Prevent
> reservation mode for non maskable MSI") cleared the reservation mode, and I
> can see this after S3:
> 
> [   94.872838] do_IRQ: 3.33 No irq handler for vector

It's not because of that commit, really. There is a interrupt sent after
resume to the wrong vector for whatever reason. The MSI vector cannot be
masked it seems in the device, but the driver should quiescen the device to
a point where it does not send interrupts.

> If the device uses MSI-X instead of MSI, the issue doesn't happen because of
> reservation mode.

Reservation mode has absolutely nothing to do with that. What prevents the
issue is the fact that MSI-X can be masked by the IRQ core.

> Is it something should be handled by x86 BIOS? Because I don't see this issue
> when I use Suspend-to-Idle, which doesn't use BIOS to do suspend.

Suspend to idle works completely different and I don't see the BIOS at
fault here. it's more an issue of MSI not being maskable on that device,
which can't be fixed in BIOS or it's some half quiescened state which is
used when suspending and that's a pure driver issue.

Thanks,

	tglx

^ permalink raw reply

* [PATCHv4 iproute2] bridge/mdb: fix missing new line when show bridge mdb
From: Hangbin Liu @ 2018-09-12  1:39 UTC (permalink / raw)
  To: netdev; +Cc: Phil Sutter, Stephen Hemminger, David Ahern, Hangbin Liu
In-Reply-To: <1536118423-20604-1-git-send-email-liuhangbin@gmail.com>

The bridge mdb show is broken on current iproute2. e.g.
]# bridge mdb show
34: br0  veth0_br  224.1.1.2  temp 34: br0  veth0_br  224.1.1.1  temp

After fix:
]# bridge mdb show
34: br0  veth0_br  224.1.1.2  temp
34: br0  veth0_br  224.1.1.1  temp

v2: Use json print lib as Stephen suggested.
v3: No need to use is_json_context() as print_string() could handle both cases.
v4: use new function print_nl() to print new line in non-json mode.

Reported-by: Ying Xu <yinxu@redhat.com>
Fixes: c7c1a1ef51aea ("bridge: colorize output and use JSON print library")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 bridge/mdb.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/bridge/mdb.c b/bridge/mdb.c
index cc1b454..841a361 100644
--- a/bridge/mdb.c
+++ b/bridge/mdb.c
@@ -107,6 +107,10 @@ static void br_print_router_ports(FILE *f, struct rtattr *attr,
 			fprintf(f, "%s ", port_ifname);
 		}
 	}
+
+	if (!show_stats)
+		print_nl();
+
 	close_json_array(PRINT_JSON, NULL);
 }
 
@@ -157,6 +161,8 @@ static void print_mdb_entry(FILE *f, int ifindex, const struct br_mdb_entry *e,
 		print_string(PRINT_ANY, "timer", " %s",
 			     format_timer(timer));
 	}
+
+	print_nl();
 	close_json_object();
 }
 
-- 
2.5.5

^ permalink raw reply related

* [PATCH v2] PCI: Reprogram bridge prefetch registers on resume
From: Daniel Drake @ 2018-09-12  6:45 UTC (permalink / raw)
  To: bhelgaas-hpIqsD4AKlfQT0dZR+AlfA
  Cc: andy.shevchenko-VuQAYsv1563Yd54FQh9/CA,
	mika.westerberg-VuQAYsv1563Yd54FQh9/CA,
	linux-pm-u79uwXL29TY76Z2rM5mHXA, linux-pci-u79uwXL29TY76Z2rM5mHXA,
	rafael.j.wysocki-ral2JQCrhuEAvxtiuMwx3w,
	nic_swsd-Rasf1IRRPZFBDgjK7y7TUQ,
	keith.busch-ral2JQCrhuEAvxtiuMwx3w, netdev-u79uwXL29TY76Z2rM5mHXA,
	nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	rchang-eYqpPyKDWXRBDgjK7y7TUQ, linux-6IF/jdPJHihWk0Htik3J/w,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	jonathan.derrick-ral2JQCrhuEAvxtiuMwx3w,
	hkallweit1-Re5JQEeQqe8AvxtiuMwx3w

On 38+ Intel-based Asus products, the nvidia GPU becomes unusable
after S3 suspend/resume. The affected products include multiple
generations of nvidia GPUs and Intel SoCs. After resume, nouveau logs
many errors such as:

    fifo: fault 00 [READ] at 0000005555555000 engine 00 [GR] client 04 [HUB/FE] reason 4a [] on channel -1 [007fa91000 unknown]
    DRM: failed to idle channel 0 [DRM]

Similarly, the nvidia proprietary driver also fails after resume
(black screen, 100% CPU usage in Xorg process). We shipped a sample
to Nvidia for diagnosis, and their response indicated that it's a
problem with the parent PCI bridge (on the Intel SoC), not the GPU.

Runtime suspend/resume works fine, only S3 suspend is affected.

We found a workaround: on resume, rewrite the Intel PCI bridge
'Prefetchable Base Upper 32 Bits' register (PCI_PREF_BASE_UPPER32). In
the cases that I checked, this register has value 0 and we just have to
rewrite that value.

Linux already saves and restores PCI config space during suspend/resume,
but this register was being skipped because upon resume, it already
has value 0 (the correct, pre-suspend value).

Intel appear to have previously acknowledged this behaviour and the
requirement to rewrite this register.
https://bugzilla.kernel.org/show_bug.cgi?id=116851#c23

Based on that, rewrite the prefetch register values even when that
appears unnecessary.

We have confirmed this solution on all the affected models we have
in-hands (X542UQ, UX533FD, X530UN, V272UN).

Additionally, this solves an issue where r8169 MSI-X interrupts were
broken after S3 suspend/resume on Asus X441UAR. This issue was recently
worked around in commit 7bb05b85bc2d ("r8169: don't use MSI-X on
RTL8106e"). It also fixes the same issue on RTL6186evl/8111evl on an
Aimfor-tech laptop that we had not yet patched. I suspect it will also
fix the issue that was worked around in commit 7c53a722459c ("r8169:
don't use MSI-X on RTL8168g").

Thomas Martitz reports that this change also solves an issue where
the AMD Radeon Polaris 10 GPU on the HP Zbook 14u G5 is unresponsive
after S3 suspend/resume.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=201069
Signed-off-by: Daniel Drake <drake@endlessm.com>
---

Notes:
    Replaces patch:
       PCI: add prefetch quirk to work around Asus/Nvidia suspend issues
    
    Some of the more verbose info was moved to bugzilla:
    https://bugzilla.kernel.org/show_bug.cgi?id=201069
    
    This patch is aimed at v4.19 (and maybe v4.18-stable); we may follow
    up with more intrusive improvements for v4.20+.
    
    v2: reimplement the register restore within the existing
        pci_restore_config_space() code.

 drivers/pci/pci.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 29ff9619b5fa..e1704100e72d 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1289,13 +1289,15 @@ int pci_save_state(struct pci_dev *dev)
 EXPORT_SYMBOL(pci_save_state);
 
 static void pci_restore_config_dword(struct pci_dev *pdev, int offset,
-				     u32 saved_val, int retry)
+				     u32 saved_val, int retry, bool force)
 {
 	u32 val;
 
-	pci_read_config_dword(pdev, offset, &val);
-	if (val == saved_val)
-		return;
+	if (!force) {
+		pci_read_config_dword(pdev, offset, &val);
+		if (val == saved_val)
+			return;
+	}
 
 	for (;;) {
 		pci_dbg(pdev, "restoring config space at offset %#x (was %#x, writing %#x)\n",
@@ -1313,25 +1315,34 @@ static void pci_restore_config_dword(struct pci_dev *pdev, int offset,
 }
 
 static void pci_restore_config_space_range(struct pci_dev *pdev,
-					   int start, int end, int retry)
+					   int start, int end, int retry,
+					   bool force)
 {
 	int index;
 
 	for (index = end; index >= start; index--)
 		pci_restore_config_dword(pdev, 4 * index,
 					 pdev->saved_config_space[index],
-					 retry);
+					 retry, force);
 }
 
 static void pci_restore_config_space(struct pci_dev *pdev)
 {
 	if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
-		pci_restore_config_space_range(pdev, 10, 15, 0);
+		pci_restore_config_space_range(pdev, 10, 15, 0, false);
 		/* Restore BARs before the command register. */
-		pci_restore_config_space_range(pdev, 4, 9, 10);
-		pci_restore_config_space_range(pdev, 0, 3, 0);
+		pci_restore_config_space_range(pdev, 4, 9, 10, false);
+		pci_restore_config_space_range(pdev, 0, 3, 0, false);
+	} else if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+		pci_restore_config_space_range(pdev, 12, 15, 0, false);
+		/* Force rewriting of prefetch registers to avoid
+		 * S3 resume issues on Intel PCI bridges that occur when
+		 * these registers are not explicitly written.
+		 */
+		pci_restore_config_space_range(pdev, 9, 11, 0, true);
+		pci_restore_config_space_range(pdev, 0, 8, 0, false);
 	} else {
-		pci_restore_config_space_range(pdev, 0, 15, 0);
+		pci_restore_config_space_range(pdev, 0, 15, 0, false);
 	}
 }
 
-- 
2.17.1

_______________________________________________
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau

^ permalink raw reply related

* Re: [PATCH 1/2] erspan: return PACKET_REJECT when the appropriate tunnel is not found
From: David Miller @ 2018-09-12  6:52 UTC (permalink / raw)
  To: yanhaishuang; +Cc: kuznet, netdev, linux-kernel, u9012063
In-Reply-To: <1536589188-27550-1-git-send-email-yanhaishuang@cmss.chinamobile.com>

From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Mon, 10 Sep 2018 22:19:47 +0800

> If erspan tunnel hasn't been established, we'd better send icmp port
> unreachable message after receive erspan packets.
> 
> Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN")
> Cc: William Tu <u9012063@gmail.com>
> Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>

Applied and queued up for -stable.

^ permalink raw reply

* Re: [PATCH 2/2] erspan: fix error handling for erspan tunnel
From: David Miller @ 2018-09-12  6:52 UTC (permalink / raw)
  To: yanhaishuang; +Cc: kuznet, netdev, linux-kernel, u9012063
In-Reply-To: <1536589188-27550-2-git-send-email-yanhaishuang@cmss.chinamobile.com>

From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Mon, 10 Sep 2018 22:19:48 +0800

> When processing icmp unreachable message for erspan tunnel, tunnel id
> should be erspan_net_id instead of ipgre_net_id.
> 
> Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN")
> Cc: William Tu <u9012063@gmail.com>
> Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>

Applied and queued up for -stable.

^ permalink raw reply

* Re: [PATCH] net: ipv4: Use BUG_ON directly instead of a if condition followed by BUG
From: David Miller @ 2018-09-12  6:54 UTC (permalink / raw)
  To: zhongjiang; +Cc: edumazet, kuznet, yoshfuji, netdev, linux-kernel
In-Reply-To: <1536590282-23899-1-git-send-email-zhongjiang@huawei.com>

From: zhong jiang <zhongjiang@huawei.com>
Date: Mon, 10 Sep 2018 22:38:02 +0800

> The if condition can be removed if we use BUG_ON directly.
> The issule is detected with the help of Coccinelle.
> 
> Signed-off-by: zhong jiang <zhongjiang@huawei.com>

You keep asking if this is worthy to do.

I think the most worthy thing to do is actually build test your
patches.

^ permalink raw reply

* [PATCH 1/2] r8169: Align ASPM/CLKREQ setting function with vendor driver
From: Kai-Heng Feng @ 2018-09-12  6:58 UTC (permalink / raw)
  To: nic_swsd; +Cc: davem, hkallweit1, netdev, linux-kernel, Kai-Heng Feng

There's a small delay after setting ASPM in vendor drivers, r8101 and
r8168.
In addition, those drivers enable ASPM before ClkReq, also change that
to align with vendor driver.

I haven't seen anything bad becasue of this, but I think it's better to
keep in sync with vendor driver.

Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
 drivers/net/ethernet/realtek/r8169.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index b08d51bf7a20..5091e9fd601f 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4775,12 +4775,14 @@ static void rtl_pcie_state_l2l3_enable(struct rtl8169_private *tp, bool enable)
 static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable)
 {
 	if (enable) {
-		RTL_W8(tp, Config2, RTL_R8(tp, Config2) | ClkReqEn);
 		RTL_W8(tp, Config5, RTL_R8(tp, Config5) | ASPM_en);
+		RTL_W8(tp, Config2, RTL_R8(tp, Config2) | ClkReqEn);
 	} else {
 		RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~ClkReqEn);
 		RTL_W8(tp, Config5, RTL_R8(tp, Config5) & ~ASPM_en);
 	}
+
+	udelay(10);
 }
 
 static void rtl_hw_start_8168bb(struct rtl8169_private *tp)
-- 
2.17.1

^ permalink raw reply related

* [PATCH 2/2] r8169: enable ASPM on RTL8106E
From: Kai-Heng Feng @ 2018-09-12  6:58 UTC (permalink / raw)
  To: nic_swsd; +Cc: davem, hkallweit1, netdev, linux-kernel, Kai-Heng Feng
In-Reply-To: <20180912065821.7198-1-kai.heng.feng@canonical.com>

The Intel SoC was prevented from entering lower idle state because
of RTL8106E's ASPM was not enabled.

So enable ASPM on RTL8106E (chip version 39).
Now the Intel SoC can enter lower idle state, power consumption and
temperature are much lower.

Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
 drivers/net/ethernet/realtek/r8169.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 5091e9fd601f..3752b1bbe6db 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5627,6 +5627,8 @@ static void rtl_hw_start_8402(struct rtl8169_private *tp)
 
 static void rtl_hw_start_8106(struct rtl8169_private *tp)
 {
+	rtl_hw_aspm_clkreq_enable(tp, false);
+
 	/* Force LAN exit from ASPM if Rx/Tx are not idle */
 	RTL_W32(tp, FuncEvent, RTL_R32(tp, FuncEvent) | 0x002800);
 
@@ -5635,6 +5637,7 @@ static void rtl_hw_start_8106(struct rtl8169_private *tp)
 	RTL_W8(tp, DLLPR, RTL_R8(tp, DLLPR) & ~PFM_EN);
 
 	rtl_pcie_state_l2l3_enable(tp, false);
+	rtl_hw_aspm_clkreq_enable(tp, true);
 }
 
 static void rtl_hw_start_8101(struct rtl8169_private *tp)
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH v2] r8169: Clear RTL_FLAG_TASK_*_PENDING when clearing RTL_FLAG_TASK_ENABLED
From: David Miller @ 2018-09-12  7:06 UTC (permalink / raw)
  To: kai.heng.feng; +Cc: nic_swsd, netdev, linux-kernel, hkallweit1
In-Reply-To: <20180910175143.23878-1-kai.heng.feng@canonical.com>

From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Tue, 11 Sep 2018 01:51:43 +0800

> After system suspend, sometimes the r8169 doesn't work when ethernet
> cable gets pluggued.
> 
> This issue happens because rtl_reset_work() doesn't get called from
> rtl8169_runtime_resume(), after system suspend.
> 
> In rtl_task(), RTL_FLAG_TASK_* only gets cleared if this condition is
> met:
> if (!netif_running(dev) ||
>     !test_bit(RTL_FLAG_TASK_ENABLED, tp->wk.flags))
>     ...
> 
> If RTL_FLAG_TASK_ENABLED was cleared during system suspend while
> RTL_FLAG_TASK_RESET_PENDING was set, the next rtl_schedule_task() won't
> schedule task as the flag is still there.
> 
> So in addition to clearing RTL_FLAG_TASK_ENABLED, also clears other
> flags.
> 
> Cc: Heiner Kallweit <hkallweit1@gmail.com>
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ---
> v2:
> - Use bitmap_zero(), suggested by Florian Fainelli.
> - Assign zero to first enum member in rtl_flag, just in case.

Applied, thank you.

^ permalink raw reply

* [PATCH net] geneve: add ttl inherit support
From: Hangbin Liu @ 2018-09-12  2:04 UTC (permalink / raw)
  To: netdev
  Cc: Pravin Shelar, David S. Miller, Jiri Benc, Stefano Brivio,
	Hangbin Liu

Similar with commit 72f6d71e491e6 ("vxlan: add ttl inherit support"),
currently ttl == 0 means "use whatever default value" on geneve instead
of inherit inner ttl. To respect compatibility with old behavior, let's
add a new IFLA_GENEVE_TTL_INHERIT for geneve ttl inherit support.

Reported-by: Jianlin Shi <jishi@redhat.com>
Suggested-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 drivers/net/geneve.c               | 41 ++++++++++++++++++++++++++++++--------
 include/uapi/linux/if_link.h       |  1 +
 tools/include/uapi/linux/if_link.h |  1 +
 3 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6acb6b5..6625fab 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -69,6 +69,7 @@ struct geneve_dev {
 	struct gro_cells   gro_cells;
 	bool		   collect_md;
 	bool		   use_udp6_rx_checksums;
+	bool		   ttl_inherit;
 };
 
 struct geneve_sock {
@@ -843,7 +844,11 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		ttl = key->ttl;
 	} else {
 		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb);
-		ttl = key->ttl ? : ip4_dst_hoplimit(&rt->dst);
+		if (geneve->ttl_inherit)
+			ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
+		else
+			ttl = key->ttl;
+		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 	}
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
 
@@ -889,7 +894,11 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	} else {
 		prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel),
 					   ip_hdr(skb), skb);
-		ttl = key->ttl ? : ip6_dst_hoplimit(dst);
+		if (geneve->ttl_inherit)
+			ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
+		else
+			ttl = key->ttl;
+		ttl = ttl ? : ip6_dst_hoplimit(dst);
 	}
 	err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
 	if (unlikely(err))
@@ -1091,6 +1100,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
 	[IFLA_GENEVE_UDP_CSUM]		= { .type = NLA_U8 },
 	[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
 	[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
+	[IFLA_GENEVE_TTL_INHERIT]	= { .type = NLA_U8 },
 };
 
 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1170,7 +1180,8 @@ static bool geneve_dst_addr_equal(struct ip_tunnel_info *a,
 static int geneve_configure(struct net *net, struct net_device *dev,
 			    struct netlink_ext_ack *extack,
 			    const struct ip_tunnel_info *info,
-			    bool metadata, bool ipv6_rx_csum)
+			    bool metadata, bool ipv6_rx_csum,
+			    bool ttl_inherit)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *t, *geneve = netdev_priv(dev);
@@ -1219,6 +1230,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	geneve->info = *info;
 	geneve->collect_md = metadata;
 	geneve->use_udp6_rx_checksums = ipv6_rx_csum;
+	geneve->ttl_inherit = ttl_inherit;
 
 	err = register_netdevice(dev);
 	if (err)
@@ -1237,7 +1249,8 @@ static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port)
 static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack,
 			  struct ip_tunnel_info *info, bool *metadata,
-			  bool *use_udp6_rx_checksums, bool changelink)
+			  bool *use_udp6_rx_checksums, bool *ttl_inherit,
+			  bool changelink)
 {
 	int attrtype;
 
@@ -1315,6 +1328,9 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 	if (data[IFLA_GENEVE_TTL])
 		info->key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
 
+	if (data[IFLA_GENEVE_TTL_INHERIT])
+		*ttl_inherit = true;
+
 	if (data[IFLA_GENEVE_TOS])
 		info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
@@ -1438,17 +1454,18 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 {
 	bool use_udp6_rx_checksums = false;
 	struct ip_tunnel_info info;
+	bool ttl_inherit = false;
 	bool metadata = false;
 	int err;
 
 	init_tnl_info(&info, GENEVE_UDP_PORT);
 	err = geneve_nl2info(tb, data, extack, &info, &metadata,
-			     &use_udp6_rx_checksums, false);
+			     &use_udp6_rx_checksums, &ttl_inherit, false);
 	if (err)
 		return err;
 
 	err = geneve_configure(net, dev, extack, &info, metadata,
-			       use_udp6_rx_checksums);
+			       use_udp6_rx_checksums, ttl_inherit);
 	if (err)
 		return err;
 
@@ -1511,6 +1528,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct ip_tunnel_info info;
 	bool metadata;
 	bool use_udp6_rx_checksums;
+	bool ttl_inherit;
 	int err;
 
 	/* If the geneve device is configured for metadata (or externally
@@ -1523,8 +1541,9 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	memcpy(&info, &geneve->info, sizeof(info));
 	metadata = geneve->collect_md;
 	use_udp6_rx_checksums = geneve->use_udp6_rx_checksums;
+	ttl_inherit = geneve->ttl_inherit;
 	err = geneve_nl2info(tb, data, extack, &info, &metadata,
-			     &use_udp6_rx_checksums, true);
+			     &use_udp6_rx_checksums, &ttl_inherit, true);
 	if (err)
 		return err;
 
@@ -1537,6 +1556,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	geneve->info = info;
 	geneve->collect_md = metadata;
 	geneve->use_udp6_rx_checksums = use_udp6_rx_checksums;
+	geneve->ttl_inherit = ttl_inherit;
 	geneve_unquiesce(geneve, gs4, gs6);
 
 	return 0;
@@ -1562,6 +1582,7 @@ static size_t geneve_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
 		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */
 		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */
+		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */
 		0;
 }
 
@@ -1569,6 +1590,7 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct ip_tunnel_info *info = &geneve->info;
+	bool ttl_inherit = geneve->ttl_inherit;
 	bool metadata = geneve->collect_md;
 	__u8 tmp_vni[3];
 	__u32 vni;
@@ -1614,6 +1636,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		goto nla_put_failure;
 #endif
 
+	if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
+		goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -1650,7 +1675,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 		return dev;
 
 	init_tnl_info(&info, dst_port);
-	err = geneve_configure(net, dev, NULL, &info, true, true);
+	err = geneve_configure(net, dev, NULL, &info, true, true, false);
 	if (err) {
 		free_netdev(dev);
 		return ERR_PTR(err);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 43391e2..5118af5 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -554,6 +554,7 @@ enum {
 	IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
 	IFLA_GENEVE_LABEL,
+	IFLA_GENEVE_TTL_INHERIT,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index cf01b68..1678d08 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -541,6 +541,7 @@ enum {
 	IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
 	IFLA_GENEVE_LABEL,
+	IFLA_GENEVE_TTL_INHERIT,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
2.5.5

^ permalink raw reply related

* Re: [PATCH net-next] net: dsa: b53: Only call b53_port_event() for SGMII ports
From: David Miller @ 2018-09-12  7:07 UTC (permalink / raw)
  To: f.fainelli; +Cc: netdev, andrew, vivien.didelot, linux-kernel
In-Reply-To: <20180910234707.566-1-f.fainelli@gmail.com>

From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 10 Sep 2018 16:47:07 -0700

> Built-in PHY ports are still being polled, avoid generating spurious
> and duplicate events which the PHY library resolves through polling
> anyways.
> 
> Fixes: 0e01491de646 ("net: dsa: b53: Add SerDes support")
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Applied, thanks Florian.

^ permalink raw reply

* [RFC] netlink: add NLA_REJECT policy type
From: Johannes Berg @ 2018-09-12  7:32 UTC (permalink / raw)
  To: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: Johannes Berg

From: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

In some situations some netlink attributes may be used for output
only (kernel->userspace) or may be reserved for future use. It's
then helpful to be able to prevent userspace from using them in
messages sent to the kernel, since they'd otherwise be ignored and
any future will become impossible if this happens.

Add NLA_REJECT to the policy which does nothing but reject (with
EINVAL) validation of any messages containing this attribute.

The specific case I have in mind now is a shared nested attribute
containing request/response data, and it would be pointless and
potentially confusing to have userspace include response data in
the messages that actually contain a request.

Signed-off-by: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 include/net/netlink.h | 1 +
 lib/nlattr.c          | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 0c154f98e987..ea7e87564f59 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -180,6 +180,7 @@ enum {
 	NLA_S32,
 	NLA_S64,
 	NLA_BITFIELD32,
+	NLA_REJECT,
 	__NLA_TYPE_MAX,
 };
 
diff --git a/lib/nlattr.c b/lib/nlattr.c
index e335bcafa9e4..4644ae74bf63 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -87,6 +87,9 @@ static int validate_nla(const struct nlattr *nla, int maxtype,
 	}
 
 	switch (pt->type) {
+	case NLA_REJECT:
+		return -EINVAL;
+
 	case NLA_FLAG:
 		if (attrlen > 0)
 			return -ERANGE;
-- 
2.14.4

^ permalink raw reply related

* Re: [PATCH net-next v2 2/5] virtio_ring: support creating packed ring
From: Tiwei Bie @ 2018-09-12  7:51 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: jasowang, virtualization, linux-kernel, netdev, virtio-dev, wexu,
	jfreimann
In-Reply-To: <20180910022837.GA11822@debian>

On Mon, Sep 10, 2018 at 10:28:37AM +0800, Tiwei Bie wrote:
> On Fri, Sep 07, 2018 at 10:03:24AM -0400, Michael S. Tsirkin wrote:
> > On Wed, Jul 11, 2018 at 10:27:08AM +0800, Tiwei Bie wrote:
> > > This commit introduces the support for creating packed ring.
> > > All split ring specific functions are added _split suffix.
> > > Some necessary stubs for packed ring are also added.
> > > 
> > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > 
[...]
> > > +
> > > +/*
> > > + * The layout for the packed ring is a continuous chunk of memory
> > > + * which looks like this.
> > > + *
> > > + * struct vring_packed {
> > > + *	// The actual descriptors (16 bytes each)
> > > + *	struct vring_packed_desc desc[num];
> > > + *
> > > + *	// Padding to the next align boundary.
> > > + *	char pad[];
> > > + *
> > > + *	// Driver Event Suppression
> > > + *	struct vring_packed_desc_event driver;
> > > + *
> > > + *	// Device Event Suppression
> > > + *	struct vring_packed_desc_event device;
> > > + * };
> > > + */
> > 
> > Why not just allocate event structures separately?
> > Is it a win to have them share a cache line for some reason?

Users may call vring_new_virtqueue() with preallocated
memory to setup the ring, e.g.:

https://github.com/torvalds/linux/blob/11da3a7f84f1/drivers/s390/virtio/virtio_ccw.c#L513-L522
https://github.com/torvalds/linux/blob/11da3a7f84f1/drivers/misc/mic/vop/vop_main.c#L306-L307

Below is the corresponding definition in split ring:

https://github.com/oasis-tcs/virtio-spec/blob/89dd55f5e606/split-ring.tex#L64-L78

If my understanding is correct, this is just for legacy
interfaces, and we won't define layout in packed ring
and don't need to support vring_new_virtqueue() in packed
ring. Is it correct? Thanks!



> 
> Will do that.
> 
> > 
> > > +static inline void vring_init_packed(struct vring_packed *vr, unsigned int num,
> > > +				     void *p, unsigned long align)
> > > +{
> > > +	vr->num = num;
> > > +	vr->desc = p;
> > > +	vr->driver = (void *)ALIGN(((uintptr_t)p +
> > > +		sizeof(struct vring_packed_desc) * num), align);
> > > +	vr->device = vr->driver + 1;
> > > +}
> > 
> > What's all this about alignment? Where does it come from?
> 
> It comes from the `vring_align` parameter of vring_create_virtqueue()
> and vring_new_virtqueue():
> 
> https://github.com/torvalds/linux/blob/a49a9dcce802/drivers/virtio/virtio_ring.c#L1061
> https://github.com/torvalds/linux/blob/a49a9dcce802/drivers/virtio/virtio_ring.c#L1123
> 
> Should I just ignore it in packed ring?
> 
> CCW defined this:
> 
> #define KVM_VIRTIO_CCW_RING_ALIGN 4096
> 
> I'm not familiar with CCW. Currently, in this patch set, packed ring
> isn't enabled on CCW dues to some legacy accessors are not implemented
> in packed ring yet.
> 
> > 
> > > +
> > > +static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
> > > +{
> > > +	return ((sizeof(struct vring_packed_desc) * num + align - 1)
> > > +		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> > > +}
> [...]
> > > @@ -1129,10 +1388,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> > >  				      void (*callback)(struct virtqueue *vq),
> > >  				      const char *name)
> > >  {
> > > -	struct vring vring;
> > > -	vring_init(&vring, num, pages, vring_align);
> > > -	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> > > -				     notify, callback, name);
> > > +	union vring_union vring;
> > > +	bool packed;
> > > +
> > > +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> > > +	if (packed)
> > > +		vring_init_packed(&vring.vring_packed, num, pages, vring_align);
> > > +	else
> > > +		vring_init(&vring.vring_split, num, pages, vring_align);
> > 
> > 
> > vring_init in the UAPI header is more or less a bug.
> > I'd just stop using it, keep it around for legacy userspace.
> 
> Got it. I'd like to do that. Thanks.
> 
> > 
> > > +
> > > +	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> > > +				     context, notify, callback, name);
> > >  }
> > >  EXPORT_SYMBOL_GPL(vring_new_virtqueue);
> > >  
> > > @@ -1142,7 +1408,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
> > >  
> > >  	if (vq->we_own_ring) {
> > >  		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
> > > -				 vq->vring.desc, vq->queue_dma_addr);
> > > +				 vq->packed ? (void *)vq->vring_packed.desc :
> > > +					      (void *)vq->vring.desc,
> > > +				 vq->queue_dma_addr);
> > >  	}
> > >  	list_del(&_vq->list);
> > >  	kfree(vq);
> > > @@ -1184,7 +1452,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
> > >  
> > >  	struct vring_virtqueue *vq = to_vvq(_vq);
> > >  
> > > -	return vq->vring.num;
> > > +	return vq->packed ? vq->vring_packed.num : vq->vring.num;
> > >  }
> > >  EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
> > >  
> > > @@ -1227,6 +1495,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
> > >  
> > >  	BUG_ON(!vq->we_own_ring);
> > >  
> > > +	if (vq->packed)
> > > +		return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
> > > +				(char *)vq->vring_packed.desc);
> > > +
> > >  	return vq->queue_dma_addr +
> > >  		((char *)vq->vring.avail - (char *)vq->vring.desc);
> > >  }
> > > @@ -1238,11 +1510,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
> > >  
> > >  	BUG_ON(!vq->we_own_ring);
> > >  
> > > +	if (vq->packed)
> > > +		return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
> > > +				(char *)vq->vring_packed.desc);
> > > +
> > >  	return vq->queue_dma_addr +
> > >  		((char *)vq->vring.used - (char *)vq->vring.desc);
> > >  }
> > >  EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
> > >  
> > > +/* Only available for split ring */
> > >  const struct vring *virtqueue_get_vring(struct virtqueue *vq)
> > >  {
> > >  	return &to_vvq(vq)->vring;
> > > diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> > > index fab02133a919..992142b35f55 100644
> > > --- a/include/linux/virtio_ring.h
> > > +++ b/include/linux/virtio_ring.h
> > > @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
> > >  struct virtio_device;
> > >  struct virtqueue;
> > >  
> > > +union vring_union {
> > > +	struct vring vring_split;
> > > +	struct vring_packed vring_packed;
> > > +};
> > > +
> > >  /*
> > >   * Creates a virtqueue and allocates the descriptor ring.  If
> > >   * may_reduce_num is set, then this may allocate a smaller ring than
> > > @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
> > >  
> > >  /* Creates a virtqueue with a custom layout. */
> > >  struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > -					struct vring vring,
> > > +					union vring_union vring,
> > > +					bool packed,
> > >  					struct virtio_device *vdev,
> > >  					bool weak_barriers,
> > >  					bool ctx,
> > > -- 
> > > 2.18.0

^ permalink raw reply

* [PATCH] docs: net: Remove TCP congestion document
From: Tobin C. Harding @ 2018-09-12  8:14 UTC (permalink / raw)
  To: David S. Miller
  Cc: Tobin C. Harding, Eric Dumazet, netdev, linux-doc, linux-kernel

Document is stale, let's remove it.

Remove TCP congestion document.

Signed-off-by: Tobin C. Harding <me@tobin.cc>
---

Previous patch converted this file to rst.

 [PATCH RESEND net-next] docs: net: Convert tcp.txt to RST format

Response to that patch on LKML by Eric:

 I dunno, this 'doc' is probably useless and should be deleted.

Removing as suggested.

thanks,
Tobin.

 Documentation/networking/00-INDEX |   2 -
 Documentation/networking/tcp.txt  | 101 ------------------------------
 2 files changed, 103 deletions(-)
 delete mode 100644 Documentation/networking/tcp.txt

diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX
index 02a323c43261..dcbccae4043e 100644
--- a/Documentation/networking/00-INDEX
+++ b/Documentation/networking/00-INDEX
@@ -198,8 +198,6 @@ tc-actions-env-rules.txt
 	- rules for traffic control (tc) actions.
 timestamping.txt
 	- overview of network packet timestamping variants.
-tcp.txt
-	- short blurb on how TCP output takes place.
 tcp-thin.txt
 	- kernel tuning options for low rate 'thin' TCP streams.
 team.txt
diff --git a/Documentation/networking/tcp.txt b/Documentation/networking/tcp.txt
deleted file mode 100644
index 9c7139d57e57..000000000000
--- a/Documentation/networking/tcp.txt
+++ /dev/null
@@ -1,101 +0,0 @@
-TCP protocol
-============
-
-Last updated: 3 June 2017
-
-Contents
-========
-
-- Congestion control
-- How the new TCP output machine [nyi] works
-
-Congestion control
-==================
-
-The following variables are used in the tcp_sock for congestion control:
-snd_cwnd		The size of the congestion window
-snd_ssthresh		Slow start threshold. We are in slow start if
-			snd_cwnd is less than this.
-snd_cwnd_cnt		A counter used to slow down the rate of increase
-			once we exceed slow start threshold.
-snd_cwnd_clamp		This is the maximum size that snd_cwnd can grow to.
-snd_cwnd_stamp		Timestamp for when congestion window last validated.
-snd_cwnd_used		Used as a highwater mark for how much of the
-			congestion window is in use. It is used to adjust
-			snd_cwnd down when the link is limited by the
-			application rather than the network.
-
-As of 2.6.13, Linux supports pluggable congestion control algorithms.
-A congestion control mechanism can be registered through functions in
-tcp_cong.c. The functions used by the congestion control mechanism are
-registered via passing a tcp_congestion_ops struct to
-tcp_register_congestion_control. As a minimum, the congestion control
-mechanism must provide a valid name and must implement either ssthresh,
-cong_avoid and undo_cwnd hooks or the "omnipotent" cong_control hook.
-
-Private data for a congestion control mechanism is stored in tp->ca_priv.
-tcp_ca(tp) returns a pointer to this space.  This is preallocated space - it
-is important to check the size of your private data will fit this space, or
-alternatively, space could be allocated elsewhere and a pointer to it could
-be stored here.
-
-There are three kinds of congestion control algorithms currently: The
-simplest ones are derived from TCP reno (highspeed, scalable) and just
-provide an alternative congestion window calculation. More complex
-ones like BIC try to look at other events to provide better
-heuristics.  There are also round trip time based algorithms like
-Vegas and Westwood+.
-
-Good TCP congestion control is a complex problem because the algorithm
-needs to maintain fairness and performance. Please review current
-research and RFC's before developing new modules.
-
-The default congestion control mechanism is chosen based on the
-DEFAULT_TCP_CONG Kconfig parameter. If you really want a particular default
-value then you can set it using sysctl net.ipv4.tcp_congestion_control. The
-module will be autoloaded if needed and you will get the expected protocol. If
-you ask for an unknown congestion method, then the sysctl attempt will fail.
-
-If you remove a TCP congestion control module, then you will get the next
-available one. Since reno cannot be built as a module, and cannot be
-removed, it will always be available.
-
-How the new TCP output machine [nyi] works.
-===========================================
-
-Data is kept on a single queue. The skb->users flag tells us if the frame is
-one that has been queued already. To add a frame we throw it on the end. Ack
-walks down the list from the start.
-
-We keep a set of control flags
-
-
-	sk->tcp_pend_event
-
-		TCP_PEND_ACK			Ack needed
-		TCP_ACK_NOW			Needed now
-		TCP_WINDOW			Window update check
-		TCP_WINZERO			Zero probing
-
-
-	sk->transmit_queue		The transmission frame begin
-	sk->transmit_new		First new frame pointer
-	sk->transmit_end		Where to add frames
-
-	sk->tcp_last_tx_ack		Last ack seen
-	sk->tcp_dup_ack			Dup ack count for fast retransmit
-
-
-Frames are queued for output by tcp_write. We do our best to send the frames
-off immediately if possible, but otherwise queue and compute the body
-checksum in the copy. 
-
-When a write is done we try to clear any pending events and piggy back them.
-If the window is full we queue full sized frames. On the first timeout in
-zero window we split this.
-
-On a timer we walk the retransmit list to send any retransmits, update the
-backoff timers etc. A change of route table stamp causes a change of header
-and recompute. We add any new tcp level headers and refinish the checksum
-before sending. 
-
-- 
2.17.1

^ permalink raw reply related

* Re: Regression caused by commit 7bb05b85bc2d ("r8169: don't use MSI-X on RTL8106e")
From: Kai-Heng Feng @ 2018-09-12  8:19 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: jian-hong, Linux Netdev List, Linux Kernel Mailing List
In-Reply-To: <alpine.DEB.2.21.1809120826310.1427@nanos.tec.linutronix.de>

at 14:32, Thomas Gleixner <tglx@linutronix.de> wrote:

> On Wed, 12 Sep 2018, Kai-Heng Feng wrote:
>
>> There's a Dell machine with RTL8106e stops to work after S3 since the
>> commit introduced. So I am wondering if it's possible to revert the
>> commit and use DMI/subsystem id based quirk table?
>
> Probably.

Hopefully Jian-Hong can cook up a quirk table for the issue.

>
>> It's because of commit bc976233a872 ("genirq/msi, x86/vector: Prevent
>> reservation mode for non maskable MSI") cleared the reservation mode,  
>> and I
>> can see this after S3:
>>
>> [   94.872838] do_IRQ: 3.33 No irq handler for vector
>
> It's not because of that commit, really. There is a interrupt sent after
> resume to the wrong vector for whatever reason. The MSI vector cannot be
> masked it seems in the device, but the driver should quiescen the device to
> a point where it does not send interrupts.

Understood.

>
>> If the device uses MSI-X instead of MSI, the issue doesn't happen  
>> because of
>> reservation mode.
>
> Reservation mode has absolutely nothing to do with that. What prevents the
> issue is the fact that MSI-X can be masked by the IRQ core.

So in this case I think keep the device using MSI-X is a better route, it's  
MSI-X capable anyway.

>
>> Is it something should be handled by x86 BIOS? Because I don't see this  
>> issue
>> when I use Suspend-to-Idle, which doesn't use BIOS to do suspend.
>
> Suspend to idle works completely different and I don't see the BIOS at
> fault here. it's more an issue of MSI not being maskable on that device,
> which can't be fixed in BIOS or it's some half quiescened state which is
> used when suspending and that's a pure driver issue.

Understood.
Thanks for all the info!

Kai-Heng

>
> Thanks,
>
> 	tglx

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox