Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next v4 01/11] bpf: Make SOCK_OPS_GET_TCP size independent
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

Make SOCK_OPS_GET_TCP helper macro size independent (before only worked
with 4-byte fields.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 net/core/filter.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 130b842..099ff9fd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4449,9 +4449,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 /* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+#define SOCK_OPS_GET_TCP(FIELD_NAME)					      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) >      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
 						is_fullsock),		      \
@@ -4463,16 +4464,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
+		*insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock,	      \
+						   FIELD_NAME), si->dst_reg,  \
+				      si->dst_reg,			      \
 				      offsetof(struct tcp_sock, FIELD_NAME)); \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP32(snd_cwnd);
+		SOCK_OPS_GET_TCP(snd_cwnd);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP32(srtt_us);
+		SOCK_OPS_GET_TCP(srtt_us);
 		break;
 	}
 	return insn - insn_buf;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 06/11] bpf: Add sock_ops RTO callback
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

Adds an optional call to sock_ops BPF program based on whether the
BPF_SOCK_OPS_RTO_CB_FLAG is set in bpf_sock_ops_flags.
The BPF program is passed 2 arguments: icsk_retransmits and whether the
RTO has expired.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h | 5 +++++
 include/uapi/linux/tcp.h | 3 +++
 net/ipv4/tcp_timer.c     | 7 +++++++
 3 files changed, 15 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f7adeca..a1316c7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -998,6 +998,11 @@ enum {
 					 * a congestion threshold. RTTs above
 					 * this indicate congestion
 					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index b4a4f64..089c19e 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -259,6 +259,9 @@ struct tcp_md5sig {
 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];		/* key (binary) */
 };
 
+/* Definitions for bpf_sock_ops_flags */
+#define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+
 /* INET_DIAG_MD5SIG */
 struct tcp_diag_md5sig {
 	__u8	tcpm_family;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6db3124..257abdd 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -213,11 +213,18 @@ static int tcp_write_timeout(struct sock *sk)
 						icsk->icsk_user_timeout);
 	}
 	tcp_fastopen_active_detect_blackhole(sk, expired);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
+		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
+				  icsk->icsk_retransmits,
+				  icsk->icsk_rto, (int)expired);
+
 	if (expired) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
 	}
+
 	return 0;
 }
 
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 04/11] bpf: Support passing args to sock_ops bpf function
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

Adds support for passing up to 4 arguments to sock_ops bpf functions. It
reusues the reply union, so the bpf_sock_ops structures are not
increased in size.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/linux/filter.h   |  1 +
 include/net/tcp.h        | 64 ++++++++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/bpf.h |  5 ++--
 net/ipv4/tcp.c           |  2 +-
 net/ipv4/tcp_nv.c        |  2 +-
 net/ipv4/tcp_output.c    |  2 +-
 6 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2e6e889..97f9a02 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1001,6 +1001,7 @@ struct bpf_sock_ops_kern {
 	struct	sock *sk;
 	u32	op;
 	union {
+		u32 args[4];
 		u32 reply;
 		u32 replylong[4];
 	};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 108d16a..8e9111f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2005,7 +2005,7 @@ void tcp_cleanup_ulp(struct sock *sk);
  * program loaded).
  */
 #ifdef CONFIG_BPF
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
@@ -2018,6 +2018,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 
 	sock_ops.sk = sk;
 	sock_ops.op = op;
+	if (nargs > 0)
+		memcpy(sock_ops.args, args, nargs*sizeof(u32));
 
 	ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
 	if (ret == 0)
@@ -2026,18 +2028,70 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 		ret = -1;
 	return ret;
 }
+
+static inline int tcp_call_bpf_1arg(struct sock *sk, int op, u32 arg)
+{
+	return tcp_call_bpf(sk, op, 1, &arg);
+}
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	u32 args[2] = {arg1, arg2};
+
+	return tcp_call_bpf(sk, op, 2, args);
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3)
+{
+	u32 args[3] = {arg1, arg2, arg3};
+
+	return tcp_call_bpf(sk, op, 3, args);
+}
+
+static inline int tcp_call_bpf_4arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3, u32 arg4)
+{
+	u32 args[4] = {arg1, arg2, arg3, arg4};
+
+	return tcp_call_bpf(sk, op, 4, args);
+}
+
 #else
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	return -EPERM;
 }
+
+static inline int tcp_call_bpf_1arg(struct sock *sk, int op, u32 arg)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+	u32 arg3)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_4arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3, u32 arg4)
+{
+	return -EPERM;
+}
+
 #endif
 
 static inline u32 tcp_timeout_init(struct sock *sk)
 {
 	int timeout;
 
-	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
+	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
 
 	if (timeout <= 0)
 		timeout = TCP_TIMEOUT_INIT;
@@ -2048,7 +2102,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 {
 	int rwnd;
 
-	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
+	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
 
 	if (rwnd < 0)
 		rwnd = 0;
@@ -2057,7 +2111,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 
 static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 {
-	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
 #if IS_ENABLED(CONFIG_SMC)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f2f8b36..3f2ee04 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -945,8 +945,9 @@ struct bpf_map_info {
 struct bpf_sock_ops {
 	__u32 op;
 	union {
-		__u32 reply;
-		__u32 replylong[4];
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
 	};
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7ac583a..a1fe7a7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -463,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
 	tcp_mtup_init(sk);
 	icsk->icsk_af_ops->rebuild_header(sk);
 	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, bpf_op);
+	tcp_call_bpf(sk, bpf_op, 0, NULL);
 	tcp_init_congestion_control(sk);
 	tcp_init_buffer_space(sk);
 }
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 0b5a05b..ddbce73 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk)
 	 * within a datacenter, where we have reasonable estimates of
 	 * RTTs
 	 */
-	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT);
+	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);
 	if (base_rtt > 0) {
 		ca->nv_base_rtt = base_rtt;
 		ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 04be9f83..b093985 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3468,7 +3468,7 @@ int tcp_connect(struct sock *sk)
 	struct sk_buff *buff;
 	int err;
 
-	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
+	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
 
 	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
 		return -EHOSTUNREACH; /* Routing failure or similar. */
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 05/11] bpf: Adds field bpf_sock_ops_flags to tcp_sock
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

Adds field bpf_sock_ops_flags to tcp_sock and bpf_sock_ops. Its primary
use is to determine if there should be calls to sock_ops bpf program at
various points in the TCP code. The field is initialized to zero,
disabling the calls. A sock_ops BPF program can set, per connection and
as necessary, when the connection is established.

It also adds support for reading and writting the field within a
sock_ops BPF program.

Examples of where to call the bpf program:

1) When RTO fires
2) When a packet is retransmitted
3) When the connection terminates
4) When a packet is sent
5) When a packet is received

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/linux/tcp.h      | 8 ++++++++
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 7 +++++++
 3 files changed, 16 insertions(+)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4f93f095..62f4388 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -373,6 +373,14 @@ struct tcp_sock {
 	 */
 	struct request_sock *fastopen_rsk;
 	u32	*saved_syn;
+
+/* Sock_ops bpf program related variables */
+#ifdef CONFIG_BPF
+	u32     bpf_sock_ops_flags;     /* values defined in uapi/linux/tcp.h */
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_flags & ARG)
+#else
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
+#endif
 };
 
 enum tsq_enum {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3f2ee04..f7adeca 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -962,6 +962,7 @@ struct bpf_sock_ops {
 				 */
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_flags; /* flags defined in uapi/linux/tcp.h */
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 978ac78..76fd6e9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3843,6 +3843,7 @@ static bool sock_ops_is_valid_access(int off, int size,
 		switch (off) {
 		case offsetof(struct bpf_sock_ops, op) ...
 		     offsetof(struct bpf_sock_ops, replylong[3]):
+		case offsetof(struct bpf_sock_ops, bpf_sock_ops_flags):
 			break;
 		default:
 			return false;
@@ -4525,6 +4526,12 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sock_ops, srtt_us):
 		SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, bpf_sock_ops_flags):
+		SOCK_OPS_GET_OR_SET_FIELD(bpf_sock_ops_flags,
+					  bpf_sock_ops_flags,
+					  struct tcp_sock, type);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 03/11] bpf: Add write access to tcp_sock and sock fields
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to
struct tcp_sock or struct sock fields. This required adding a new
field "temp" to struct bpf_sock_ops_kern for temporary storage that
is used by sock_ops_convert_ctx_access. It is used to store and recover
the contents of a register, so the register can be used to store the
address of the sk. Since we cannot overwrite the dst_reg because it
contains the pointer to ctx, nor the src_reg since it contains the value
we want to store, we need an extra register to contain the address
of the sk.

Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the
GET or SET macros depending on the value of the TYPE field.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/linux/filter.h |  3 +++
 include/net/tcp.h      |  2 +-
 net/core/filter.c      | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2b0df27..2e6e889 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1005,6 +1005,9 @@ struct bpf_sock_ops_kern {
 		u32 replylong[4];
 	};
 	u32	is_fullsock;
+	u64	temp;			/* Used by sock_ops_convert_ctx_access
+					 * as temporary storaage of a register
+					 */
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6939e69..108d16a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2010,7 +2010,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
 
-	memset(&sock_ops, 0, sizeof(sock_ops));
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
 	if (sk_fullsock(sk)) {
 		sock_ops.is_fullsock = 1;
 		sock_owned_by_me(sk);
diff --git a/net/core/filter.c b/net/core/filter.c
index 7064862..978ac78 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4470,6 +4470,54 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(OBJ, OBJ_FIELD));	      \
 	} while (0)
 
+/* Helper macro for adding write access to tcp_sock or sock fields.
+ * The macro is called with two registers, dst_reg which contains a pointer
+ * to ctx (context) and src_reg which contains the value that should be
+ * stored. However, we need an additional register since we cannot overwrite
+ * dst_reg because it may be used later in the program.
+ * Instead we "borrow" one of the other register. We first save its value
+ * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
+ * it at the end of the macro.
+ */
+#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
+	do {								      \
+		int reg = BPF_REG_9;					      \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \
+		if (si->dst_reg == reg || si->src_reg == reg)		      \
+			reg--;						      \
+		if (si->dst_reg == reg || si->src_reg == reg)		      \
+			reg--;						      \
+		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);		      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD),	      \
+				      reg, si->src_reg,			      \
+				      offsetof(OBJ, OBJ_FIELD));	      \
+		*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+	} while (0)
+
+#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)	      \
+	do {								      \
+		if (TYPE == BPF_WRITE)					      \
+			SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
+		else							      \
+			SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
+	} while (0)
+
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
 		SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
 		break;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

Changed SOCK_OPS_GET_TCP to SOCK_OPS_GET_FIELD and added 2
arguments so now it can also work with struct sock fields.
The first argument is the name of the field in the bpf_sock_ops
struct, the 2nd argument is the name of the field in the OBJ struct.

Previous: SOCK_OPS_GET_TCP(FIELD_NAME)
New:      SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)

Where OBJ is either "struct tcp_sock" or "struct sock" (without
quotation). BPF_FIELD is the name of the field in the bpf_sock_ops
struct and OBJ_FIELD is the name of the field in the OBJ struct.

Although the field names are currently the same, the kernel struct names
could change in the future and this change makes it easier to support
that.

Note that adding access to tcp_sock fields in sock_ops programs does
not preclude the tcp_sock fields from being removed as long as we are
willing to do one of the following:

  1) Return a fixed value (e.x. 0 or 0xffffffff), or
  2) Make the verifier fail if that field is accessed (i.e. program
    fails to load) so the user will know that field is no longer
    supported.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 net/core/filter.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 099ff9fd..7064862 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4448,11 +4448,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
-/* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP(FIELD_NAME)					      \
+/* Helper macro for adding read access to tcp_sock or sock fields. */
+#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) >      \
-			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
 						is_fullsock),		      \
@@ -4464,18 +4464,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock,	      \
-						   FIELD_NAME), si->dst_reg,  \
-				      si->dst_reg,			      \
-				      offsetof(struct tcp_sock, FIELD_NAME)); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,		      \
+						       OBJ_FIELD),	      \
+				      si->dst_reg, si->dst_reg,		      \
+				      offsetof(OBJ, OBJ_FIELD));	      \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP(snd_cwnd);
+		SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP(srtt_us);
+		SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
 		break;
 	}
 	return insn - insn_buf;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 00/11] bpf: More sock_ops callbacks
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng

This patchset adds support for:

- direct R or R/W access to many tcp_sock fields
- passing up to 4 arguments to sock_ops BPF functions
- tcp_sock field bpf_sock_ops_flags for controlling callbacks
- optionally calling sock_ops BPF program when RTO fires
- optionally calling sock_ops BPF program when packet is retransmitted
- optionally calling sock_ops BPF program when TCP state changes
- access to tclass and sk_txhash
- new selftest

v2: Fixed commit message 0/11. The commit is to "bpf-next" but the patch
    below used "bpf" and Patchwork didn't work correctly.
v3: Cleaned RTO callback as per  Yuchung's comment
    Added BPF enum for TCP states as per  Alexei's comment
v4: Fixed compile warnings related to detecting changes between TCP
    internal states and the BPF defined states.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>

Consists of the following patches:
[PATCH bpf-next v4 01/11] bpf: Make SOCK_OPS_GET_TCP size independent
[PATCH bpf-next v4 02/11] bpf: Make SOCK_OPS_GET_TCP struct
[PATCH bpf-next v4 03/11] bpf: Add write access to tcp_sock and sock
[PATCH bpf-next v4 04/11] bpf: Support passing args to sock_ops bpf
[PATCH bpf-next v4 05/11] bpf: Adds field bpf_sock_ops_flags to
[PATCH bpf-next v4 06/11] bpf: Add sock_ops RTO callback
[PATCH bpf-next v4 07/11] bpf: Add support for reading sk_state and
[PATCH bpf-next v4 08/11] bpf: Add sock_ops R/W access to tclass &
[PATCH bpf-next v4 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB
[PATCH bpf-next v4 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB
[PATCH bpf-next v4 11/11] bpf: add selftest for tcpbpf

 include/linux/filter.h                         |   4 +
 include/linux/tcp.h                            |   8 ++
 include/net/tcp.h                              |  66 +++++++++-
 include/uapi/linux/bpf.h                       |  61 ++++++++-
 include/uapi/linux/tcp.h                       |   5 +
 net/core/filter.c                              | 221 ++++++++++++++++++++++++++++++--
 net/ipv4/tcp.c                                 |  26 +++-
 net/ipv4/tcp_nv.c                              |   2 +-
 net/ipv4/tcp_output.c                          |   5 +-
 net/ipv4/tcp_timer.c                           |   7 +
 tools/include/uapi/linux/bpf.h                 |  70 +++++++++-
 tools/testing/selftests/bpf/Makefile           |   4 +-
 tools/testing/selftests/bpf/tcp_client.py      |  52 ++++++++
 tools/testing/selftests/bpf/tcp_server.py      |  79 ++++++++++++
 tools/testing/selftests/bpf/test_tcpbpf_kern.c | 125 ++++++++++++++++++
 tools/testing/selftests/bpf/test_tcpbpf_user.c | 113 ++++++++++++++++
 16 files changed, 823 insertions(+), 25 deletions(-)

^ permalink raw reply

* [PATCH bpf-next v4 07/11] bpf: Add support for reading sk_state and more
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

Add support for reading many more tcp_sock fields

  state,	same as sk->sk_state
  rtt_min	same as sk->rtt_min.s[0].v (current rtt_min)
  snd_ssthresh
  rcv_nxt
  snd_nxt
  snd_una
  mss_cache
  ecn_flags
  rate_delivered
  rate_interval_us
  packets_out
  retrans_out
  total_retrans
  segs_in
  data_segs_in
  segs_out
  data_segs_out
  bytes_received (__u64)
  bytes_acked    (__u64)

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h |  19 +++++++++
 net/core/filter.c        | 101 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a1316c7..a8f4cf0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -963,6 +963,25 @@ struct bpf_sock_ops {
 	__u32 snd_cwnd;
 	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
 	__u32 bpf_sock_ops_flags; /* flags defined in uapi/linux/tcp.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u64 bytes_received;
+	__u64 bytes_acked;
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 76fd6e9..d4c5c1a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3829,7 +3829,7 @@ static bool __is_valid_sock_ops_access(int off, int size)
 	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
+	if (size != sizeof(__u32) && size != sizeof(__u64))
 		return false;
 
 	return true;
@@ -4449,6 +4449,32 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
+	case offsetof(struct bpf_sock_ops, state):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_state));
+		break;
+
+	case offsetof(struct bpf_sock_ops, rtt_min):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+			     sizeof(struct minmax));
+		BUILD_BUG_ON(sizeof(struct minmax) <
+			     sizeof(struct minmax_sample));
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct tcp_sock, rtt_min) +
+				      FIELD_SIZEOF(struct minmax_sample, t));
+		break;
+
 /* Helper macro for adding read access to tcp_sock or sock fields. */
 #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
 	do {								      \
@@ -4532,6 +4558,79 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					  bpf_sock_ops_flags,
 					  struct tcp_sock, type);
 		break;
+
+	case offsetof(struct bpf_sock_ops, snd_ssthresh):
+		SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rcv_nxt):
+		SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_nxt):
+		SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, snd_una):
+		SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, mss_cache):
+		SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, ecn_flags):
+		SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_delivered):
+		SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, rate_interval_us):
+		SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, packets_out):
+		SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, retrans_out):
+		SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, total_retrans):
+		SOCK_OPS_GET_FIELD(total_retrans, total_retrans,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_in):
+		SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_in):
+		SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, segs_out):
+		SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, data_segs_out):
+		SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_received):
+		SOCK_OPS_GET_FIELD(bytes_received, bytes_received,
+				   struct tcp_sock);
+		break;
+
+	case offsetof(struct bpf_sock_ops, bytes_acked):
+		SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 08/11] bpf: Add sock_ops R/W access to tclass & sk_txhash
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

Adds direct R/W access to sk_txhash and access to tclass for ipv6 flows
through getsockopt and setsockopt. Sample usage for tclass:

  bpf_getsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v))

where skops is a pointer to the ctx (struct bpf_sock_ops).

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h |  1 +
 net/core/filter.c        | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a8f4cf0..df1e73e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -982,6 +982,7 @@ struct bpf_sock_ops {
 	__u32 data_segs_out;
 	__u64 bytes_received;
 	__u64 bytes_acked;
+	__u32 sk_txhash;
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index d4c5c1a..4d2ff88 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3230,6 +3230,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			ret = -EINVAL;
 		}
 #ifdef CONFIG_INET
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (level == SOL_IPV6) {
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+			return -EINVAL;
+
+		val = *((int *)optval);
+		/* Only some options are supported */
+		switch (optname) {
+		case IPV6_TCLASS:
+			if (val < -1 || val > 0xff) {
+				ret = -EINVAL;
+			} else {
+				struct ipv6_pinfo *np = inet6_sk(sk);
+
+				if (val == -1)
+					val = 0;
+				np->tclass = val;
+			}
+			break;
+		default:
+			ret = -EINVAL;
+		}
+#endif
 	} else if (level == SOL_TCP &&
 		   sk->sk_prot->setsockopt == tcp_setsockopt) {
 		if (optname == TCP_CONGESTION) {
@@ -3239,7 +3262,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			strncpy(name, optval, min_t(long, optlen,
 						    TCP_CA_NAME_MAX-1));
 			name[TCP_CA_NAME_MAX-1] = 0;
-			ret = tcp_set_congestion_control(sk, name, false, reinit);
+			ret = tcp_set_congestion_control(sk, name, false,
+							 reinit);
 		} else {
 			struct tcp_sock *tp = tcp_sk(sk);
 
@@ -3305,6 +3329,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		} else {
 			goto err_clear;
 		}
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (level == SOL_IPV6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+			goto err_clear;
+
+		/* Only some options are supported */
+		switch (optname) {
+		case IPV6_TCLASS:
+			*((int *)optval) = (int)np->tclass;
+			break;
+		default:
+			goto err_clear;
+		}
+#endif
 	} else {
 		goto err_clear;
 	}
@@ -3844,6 +3884,7 @@ static bool sock_ops_is_valid_access(int off, int size,
 		case offsetof(struct bpf_sock_ops, op) ...
 		     offsetof(struct bpf_sock_ops, replylong[3]):
 		case offsetof(struct bpf_sock_ops, bpf_sock_ops_flags):
+		case offsetof(struct bpf_sock_ops, sk_txhash):
 			break;
 		default:
 			return false;
@@ -4631,6 +4672,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sock_ops, bytes_acked):
 		SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
 		break;
+
+	case offsetof(struct bpf_sock_ops, sk_txhash):
+		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
+					  struct sock, type);
+		break;
 	}
 	return insn - insn_buf;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 11/11] bpf: add selftest for tcpbpf
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

Added a selftest for tcpbpf (sock_ops) that checks that the appropriate
callbacks occured and that it can access tcp_sock fields and that their
values are correct.

Run with command: ./test_tcpbpf_user

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 tools/include/uapi/linux/bpf.h                 |  70 +++++++++++++-
 tools/testing/selftests/bpf/Makefile           |   4 +-
 tools/testing/selftests/bpf/tcp_client.py      |  52 ++++++++++
 tools/testing/selftests/bpf/tcp_server.py      |  79 ++++++++++++++++
 tools/testing/selftests/bpf/test_tcpbpf_kern.c | 125 +++++++++++++++++++++++++
 tools/testing/selftests/bpf/test_tcpbpf_user.c | 113 ++++++++++++++++++++++
 6 files changed, 438 insertions(+), 5 deletions(-)
 create mode 100755 tools/testing/selftests/bpf/tcp_client.py
 create mode 100755 tools/testing/selftests/bpf/tcp_server.py
 create mode 100644 tools/testing/selftests/bpf/test_tcpbpf_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_tcpbpf_user.c

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4e8c60a..1fcd86f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -945,8 +945,9 @@ struct bpf_map_info {
 struct bpf_sock_ops {
 	__u32 op;
 	union {
-		__u32 reply;
-		__u32 replylong[4];
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
 	};
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
@@ -955,6 +956,33 @@ struct bpf_sock_ops {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 is_fullsock;	/* Some TCP fields are only valid if
+				 * there is a full socket. If not, the
+				 * fields read as zero.
+				 */
+	__u32 snd_cwnd;
+	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_flags; /* flags defined in uapi/linux/tcp.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u64 bytes_received;
+	__u64 bytes_acked;
+	__u32 sk_txhash;
 };
 
 /* List of known BPF sock_ops operators.
@@ -990,6 +1018,41 @@ enum {
 					 * a congestion threshold. RTTs above
 					 * this indicate congestion
 					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+	BPF_TCP_ESTABLISHED = 1,
+	BPF_TCP_SYN_SENT,
+	BPF_TCP_SYN_RECV,
+	BPF_TCP_FIN_WAIT1,
+	BPF_TCP_FIN_WAIT2,
+	BPF_TCP_TIME_WAIT,
+	BPF_TCP_CLOSE,
+	BPF_TCP_CLOSE_WAIT,
+	BPF_TCP_LAST_ACK,
+	BPF_TCP_LISTEN,
+	BPF_TCP_CLOSING,	/* Now a valid state */
+	BPF_TCP_NEW_SYN_RECV,
+
+	BPF_TCP_MAX_STATES	/* Leave at the end! */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
@@ -1009,7 +1072,8 @@ struct bpf_perf_event_value {
 #define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
 
 struct bpf_cgroup_dev_ctx {
-	__u32 access_type; /* (access << 16) | type */
+	/* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */
+	__u32 access_type;
 	__u32 major;
 	__u32 minor;
 };
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 1304753..8e032a2 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -14,12 +14,12 @@ CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../i
 LDLIBS += -lcap -lelf -lrt
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
-	test_align test_verifier_log test_dev_cgroup
+	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
-	test_l4lb_noinline.o test_xdp_noinline.o
+	test_l4lb_noinline.o test_xdp_noinline.o test_tcpbpf_kern.o
 
 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
 	test_offload.py
diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py
new file mode 100755
index 0000000..ac2ce32
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_client.py
@@ -0,0 +1,52 @@
+#!/usr/local/bin/python
+#
+# SPDX-License-Identifier: GPL-2.0
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):
+    buf = ''
+    while len(buf) < n:
+        rem = n - len(buf)
+        try: s = sock.recv(rem)
+        except (socket.error), e: return ''
+        buf += s
+    return buf
+
+def send(sock, s):
+    total = len(s)
+    count = 0
+    while count < total:
+        try: n = sock.send(s)
+        except (socket.error), e: n = 0
+        if n == 0:
+            return count;
+        count += n
+    return count
+
+
+serverPort = int(sys.argv[1])
+HostName = socket.gethostname()
+
+time.sleep(1)
+
+# create active socket
+sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+try:
+    sock.connect((HostName, serverPort))
+except socket.error as e:
+    sys.exit(1)
+
+buf = ''
+n = 0
+while n < 1000:
+    buf += '+'
+    n += 1
+
+n = send(sock, buf)
+n = read(sock, 500)
+sys.exit(0)
diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py
new file mode 100755
index 0000000..9e5db0d
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_server.py
@@ -0,0 +1,79 @@
+#!/usr/local/bin/python
+#
+# SPDX-License-Identifier: GPL-2.0
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):
+    buf = ''
+    while len(buf) < n:
+        rem = n - len(buf)
+        try: s = sock.recv(rem)
+        except (socket.error), e: return ''
+        buf += s
+    return buf
+
+def send(sock, s):
+    total = len(s)
+    count = 0
+    while count < total:
+        try: n = sock.send(s)
+        except (socket.error), e: n = 0
+        if n == 0:
+            return count;
+        count += n
+    return count
+
+
+SERVER_PORT = 12877
+MAX_PORTS = 2
+
+serverPort = SERVER_PORT
+serverSocket = None
+
+HostName = socket.gethostname()
+
+# create passive socket
+serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+host = socket.gethostname()
+
+while serverPort < SERVER_PORT + 5:
+	try: serverSocket.bind((host, serverPort))
+	except socket.error as msg:
+            serverPort += 1
+            continue
+	break
+
+cmdStr = ("./tcp_client.py %d &") % (serverPort)
+os.system(cmdStr)
+
+buf = ''
+n = 0
+while n < 500:
+    buf += '.'
+    n += 1
+
+serverSocket.listen(MAX_PORTS)
+readList = [serverSocket]
+
+while True:
+    readyRead, readyWrite, inError = \
+        select.select(readList, [], [], 10)
+
+    if len(readyRead) > 0:
+        waitCount = 0
+        for sock in readyRead:
+            if sock == serverSocket:
+                (clientSocket, address) = serverSocket.accept()
+                address = str(address[0])
+                readList.append(clientSocket)
+            else:
+                s = read(sock, 1000)
+                n = send(sock, buf)
+                sock.close()
+                time.sleep(1)
+                sys.exit(0)
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
new file mode 100644
index 0000000..0fa7429
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/in6.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <netinet/in.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+struct globals {
+	__u32 event_map;
+	__u32 total_retrans;
+	__u32 data_segs_in;
+	__u32 data_segs_out;
+	__u64 bytes_received;
+	__u64 bytes_acked;
+};
+
+struct bpf_map_def SEC("maps") global_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct globals),
+	.max_entries = 2,
+};
+
+static inline void update_event_map(int event)
+{
+	__u32 key = 0;
+	struct globals g, *gp;
+
+	gp = bpf_map_lookup_elem(&global_map, &key);
+	if (gp == NULL) {
+		struct globals g = {0, 0, 0, 0, 0, 0};
+
+		g.event_map |= (1 << event);
+		bpf_map_update_elem(&global_map, &key, &g,
+			    BPF_ANY);
+	} else {
+		g = *gp;
+		g.event_map |= (1 << event);
+		bpf_map_update_elem(&global_map, &key, &g,
+			    BPF_ANY);
+	}
+}
+
+int _version SEC("version") = 1;
+
+SEC("sockops")
+int bpf_testcb(struct bpf_sock_ops *skops)
+{
+	int rv = -1;
+	int op;
+	int init_seq = 0;
+	int ret = 0;
+	int v = 0;
+
+	/* For testing purposes, only execute rest of BPF program
+	 * if remote port number is in the range 12877..12887
+	 * I.e. the active side of the connection
+	 */
+	if ((bpf_ntohl(skops->remote_port) < 12877 ||
+	     bpf_ntohl(skops->remote_port) >= 12887)) {
+		skops->reply = -1;
+		return 1;
+	}
+
+	op = (int) skops->op;
+
+	/* Check that both hosts are within same datacenter. For this example
+	 * it is the case when the first 5.5 bytes of their IPv6 addresses are
+	 * the same.
+	 */
+	if (1) {
+		update_event_map(op);
+
+		switch (op) {
+		case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+			skops->bpf_sock_ops_flags = 0xfff;
+			init_seq = skops->snd_nxt;
+			break;
+		case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+			init_seq = skops->snd_nxt;
+			skops->bpf_sock_ops_flags = 0xfff;
+			skops->sk_txhash = 0x12345f;
+			v = 0xff;
+			ret = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v,
+					     sizeof(v));
+			break;
+		case BPF_SOCK_OPS_RTO_CB:
+			break;
+		case BPF_SOCK_OPS_RETRANS_CB:
+			break;
+		case BPF_SOCK_OPS_STATE_CB:
+			if (skops->args[1] == BPF_TCP_CLOSE) {
+				__u32 key = 0;
+				struct globals g, *gp;
+
+				gp = bpf_map_lookup_elem(&global_map, &key);
+				if (!gp)
+					break;
+				g = *gp;
+				g.total_retrans = skops->total_retrans;
+				g.data_segs_in = skops->data_segs_in;
+				g.data_segs_out = skops->data_segs_out;
+				g.bytes_received = skops->bytes_received;
+				g.bytes_acked = skops->bytes_acked;
+				bpf_map_update_elem(&global_map, &key, &g,
+						    BPF_ANY);
+			}
+			break;
+		default:
+			rv = -1;
+		}
+	} else {
+		rv = -1;
+	}
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c
new file mode 100644
index 0000000..38665ea
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <assert.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "bpf_util.h"
+#include <linux/perf_event.h>
+
+struct globals {
+	__u32 event_map;
+	__u32 total_retrans;
+	__u32 data_segs_in;
+	__u32 data_segs_out;
+	__u64 bytes_received;
+	__u64 bytes_acked;
+};
+
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *map;
+
+	map = bpf_object__find_map_by_name(obj, name);
+	if (!map) {
+		printf("%s:FAIL:map '%s' not found\n", test, name);
+		return -1;
+	}
+	return bpf_map__fd(map);
+}
+
+#define SYSTEM(CMD)						\
+	do {							\
+		if (system(CMD)) {				\
+			printf("system(%s) FAILS!\n", CMD);	\
+		}						\
+	} while (0)
+
+int main(int argc, char **argv)
+{
+	struct globals g = {0, 0, 0, 0, 0, 0};
+	__u32 key = 0;
+	int rv;
+	int pid;
+	int error = EXIT_FAILURE;
+	int cg_fd, prog_fd, map_fd;
+	char cmd[100], *dir;
+	const char *file = "test_tcpbpf_kern.o";
+	struct bpf_object *obj;
+	struct stat buffer;
+
+	dir = "/tmp/cgroupv2/foo";
+
+	if (stat(dir, &buffer) != 0) {
+		SYSTEM("mkdir -p /tmp/cgroupv2");
+		SYSTEM("mount -t cgroup2 none /tmp/cgroupv2");
+		SYSTEM("mkdir -p /tmp/cgroupv2/foo");
+	}
+	pid = (int) getpid();
+	sprintf(cmd, "echo %d >> /tmp/cgroupv2/foo/cgroup.procs", pid);
+	SYSTEM(cmd);
+
+	cg_fd = open(dir, O_DIRECTORY, O_RDONLY);
+	if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) {
+//	if (load_bpf_file(prog)) {
+		printf("FAILED: load_bpf_file failed for: %s\n", file);
+//		printf("%s", bpf_log_buf);
+		goto err;
+	}
+
+	rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (rv) {
+		printf("FAILED: bpf_prog_attach: %d (%s)\n",
+		       error, strerror(errno));
+		goto err;
+	}
+
+	SYSTEM("./tcp_server.py");
+
+	map_fd = bpf_find_map(__func__, obj, "global_map");
+	if (map_fd < 0)
+		goto err;
+
+	rv = bpf_map_lookup_elem(map_fd, &key, &g);
+	if (rv != 0) {
+		printf("FAILED: bpf_map_lookup_elem returns %d\n", rv);
+		goto err;
+	}
+
+	if (g.bytes_received != 501 || g.bytes_acked != 1002 ||
+	    g.data_segs_in != 1 || g.data_segs_out != 1 ||
+		g.event_map != 0x45e) {
+		printf("FAILED: Wrong stats\n");
+		goto err;
+	}
+	printf("PASSED!\n");
+	error = 0;
+err:
+	bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+	return error;
+}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

Adds support for calling sock_ops BPF program when there is a
retransmission. Two arguments are used; one for the sequence number and
other for the number of segments retransmitted. Does not include syn-ack
retransmissions.

New op: BPF_SOCK_OPS_RETRANS_CB.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h | 4 ++++
 include/uapi/linux/tcp.h | 1 +
 net/ipv4/tcp_output.c    | 3 +++
 3 files changed, 8 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index df1e73e..f6ab1da 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1023,6 +1023,10 @@ enum {
 					 * Arg2: value of icsk_rto
 					 * Arg3: whether RTO has expired
 					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 089c19e..dc36d3c 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -261,6 +261,7 @@ struct tcp_md5sig {
 
 /* Definitions for bpf_sock_ops_flags */
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
 
 /* INET_DIAG_MD5SIG */
 struct tcp_diag_md5sig {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b093985..8109675 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2907,6 +2907,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if (likely(!err)) {
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		trace_tcp_retransmit_skb(sk, skb);
+		if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
+			tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RETRANS_CB,
+					  TCP_SKB_CB(skb)->seq, segs);
 	} else if (err != -EBUSY) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB
From: Lawrence Brakmo @ 2018-01-04 23:55 UTC (permalink / raw)
  To: netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann,
	Eric Dumazet, Neal Cardwell, Yuchung Cheng
In-Reply-To: <20180104235533.3672006-1-brakmo@fb.com>

Adds support for calling sock_ops BPF program when there is a TCP state
change. Two arguments are used; one for the old state and another for
the new state.

There is a new enum in include/uapi/linux/bpf.h that exports the TCP
states that prepends BPF_ to the current TCP state names. If it is ever
necessary to change the internal TCP state values (other than adding
more to the end), then it will become necessary to convert from the
internal TCP state value to the BPF value before calling the BPF
sock_ops function. There are a set of compile checks added in tcp.c
to detect if the internal and BPF values differ so we can make the
necessary fixes.

New op: BPF_SOCK_OPS_STATE_CB.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++++
 include/uapi/linux/tcp.h |  1 +
 net/ipv4/tcp.c           | 24 ++++++++++++++++++++++++
 3 files changed, 51 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f6ab1da..1fcd86f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1027,6 +1027,32 @@ enum {
 					 * Arg1: sequence number of 1st byte
 					 * Arg2: # segments
 					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+	BPF_TCP_ESTABLISHED = 1,
+	BPF_TCP_SYN_SENT,
+	BPF_TCP_SYN_RECV,
+	BPF_TCP_FIN_WAIT1,
+	BPF_TCP_FIN_WAIT2,
+	BPF_TCP_TIME_WAIT,
+	BPF_TCP_CLOSE,
+	BPF_TCP_CLOSE_WAIT,
+	BPF_TCP_LAST_ACK,
+	BPF_TCP_LISTEN,
+	BPF_TCP_CLOSING,	/* Now a valid state */
+	BPF_TCP_NEW_SYN_RECV,
+
+	BPF_TCP_MAX_STATES	/* Leave at the end! */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index dc36d3c..211322c 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -262,6 +262,7 @@ struct tcp_md5sig {
 /* Definitions for bpf_sock_ops_flags */
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
 #define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
+#define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
 
 /* INET_DIAG_MD5SIG */
 struct tcp_diag_md5sig {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a1fe7a7..7709a23 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2044,6 +2044,30 @@ void tcp_set_state(struct sock *sk, int state)
 {
 	int oldstate = sk->sk_state;
 
+	/* We defined a new enum for TCP states that are exported in BPF
+	 * so as not force the internal TCP states to be frozen. The
+	 * following checks will detect if an internal state value ever
+	 * differs from the BPF value. If this ever happens, then we will
+	 * need to remap the internal value to the BPF value before calling
+	 * tcp_call_bpf_2arg.
+	 */
+	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
+
 	switch (state) {
 	case TCP_ESTABLISHED:
 		if (oldstate != TCP_ESTABLISHED)
-- 
2.9.5

^ permalink raw reply related

* Fw: [Bug 198353] New: Generic netlink family bug when multicast groups are greater than 13
From: Stephen Hemminger @ 2018-01-05  0:30 UTC (permalink / raw)
  To: netdev



Begin forwarded message:

Date: Fri, 05 Jan 2018 00:21:42 +0000
From: bugzilla-daemon@bugzilla.kernel.org
To: stephen@networkplumber.org
Subject: [Bug 198353] New: Generic netlink family bug when multicast groups are greater than 13


https://bugzilla.kernel.org/show_bug.cgi?id=198353

            Bug ID: 198353
           Summary: Generic netlink family bug when multicast groups are
                    greater than 13
           Product: Networking
           Version: 2.5
    Kernel Version: 4.4
          Hardware: All
                OS: Linux
              Tree: Mainline
            Status: NEW
          Severity: blocking
          Priority: P1
         Component: Other
          Assignee: stephen@networkplumber.org
          Reporter: jaipal.katkuri@itron.com
        Regression: No

Created attachment 273405
  --> https://bugzilla.kernel.org/attachment.cgi?id=273405&action=edit  
linux-4.4-netlink-genlmcgroup.patch

Kernel Version: 4.4
File: netlink/genetlink.c

While reserving multicast groups for a netlink family in the kernel, if the
request has more than 13 groups the code allocates only 13 groups and returns
success.

In the function "genl_allocate_reserve_groups" the variable "id" is returned
with the first available multicast group slot but it doesn't check if the
request "n_groups" is more than 13.

Attached patch fixes the problem.

-- 
You are receiving this mail because:
You are the assignee for the bug.

^ permalink raw reply

* Re: Issue with commit fea23fb591cc "net: phy: convert read-modify-write to phy_modify()"
From: Ivan Khoronzhuk @ 2018-01-05  0:44 UTC (permalink / raw)
  To: Russell King - ARM Linux, Grygorii Strashko
  Cc: Heiner Kallweit, Andrew Lunn, David S. Miller,
	netdev@vger.kernel.org
In-Reply-To: <20180104114439.GL28752@n2100.armlinux.org.uk>

+ G.Strashko
The below change also brokes phy connect for am572x..

 int genphy_restart_aneg(struct phy_device *phydev)
 {
-       int ctl = phy_read(phydev, MII_BMCR);
-
-       if (ctl < 0)
-               return ctl;
-
-       ctl |= BMCR_ANENABLE | BMCR_ANRESTART;
-
        /* Don't isolate the PHY if we're negotiating */
-       ctl &= ~BMCR_ISOLATE;
-
-       return phy_write(phydev, MII_BMCR, ctl);
+       return phy_modify(phydev, MII_BMCR, ~BMCR_ISOLATE,
+                         BMCR_ANENABLE | BMCR_ANRESTART);



-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply

* Re: Issue with commit fea23fb591cc "net: phy: convert read-modify-write to phy_modify()"
From: Russell King - ARM Linux @ 2018-01-05  1:02 UTC (permalink / raw)
  To: Ivan Khoronzhuk
  Cc: Grygorii Strashko, Heiner Kallweit, Andrew Lunn, David S. Miller,
	netdev@vger.kernel.org
In-Reply-To: <20180105004405.GA20332@khorivan>

On Fri, Jan 05, 2018 at 02:44:07AM +0200, Ivan Khoronzhuk wrote:
> + G.Strashko
> The below change also brokes phy connect for am572x..
> 
>  int genphy_restart_aneg(struct phy_device *phydev)
>  {
> -       int ctl = phy_read(phydev, MII_BMCR);
> -
> -       if (ctl < 0)
> -               return ctl;
> -
> -       ctl |= BMCR_ANENABLE | BMCR_ANRESTART;
> -
>         /* Don't isolate the PHY if we're negotiating */
> -       ctl &= ~BMCR_ISOLATE;
> -
> -       return phy_write(phydev, MII_BMCR, ctl);
> +       return phy_modify(phydev, MII_BMCR, ~BMCR_ISOLATE,
> +                         BMCR_ANENABLE | BMCR_ANRESTART);
> 

This should fix it, but it needs a bit more work, as it's also
recently been pointed out that the comment is wrong.  Also, as I
said for the original patch, it needs a thorough review - and
because the last one didn't get that, the problem is now
compounded by two patches needing a thorough review, or both
being looked at as one patch combined.

I'm afraid I've not had time to fully complete this patch yet
because of the recent issues around CPU cache speculation, news
of which broke without warning on Wednesday.

I tested what's in net-next again, and it passes my tests locally -
it's only if I take an interface through an up-down-up sequence
that I see a problem (as discovered by Andrew) which explains why
my testing did not find it.

8<===
From: Russell King <rmk+kernel@armlinux.org.uk>
To: Andrew Lunn <andrew@lunn.ch>,Florian Fainelli <f.fainelli@gmail.com>
Cc: netdev@vger.kernel.org
Subject: [PATCH] net: phy: fix wrong masks to phy_modify()

The mask argument for phy_modify() in several locations was inverted.

Fixes: fea23fb591cc ("net: phy: convert read-modify-write to phy_modify()")
Reported-by: Heiner Kallweit <hkallweit1@gmail.com>
Tested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
---
 drivers/net/phy/at803x.c     |  2 +-
 drivers/net/phy/marvell.c    | 19 +++++++++----------
 drivers/net/phy/phy_device.c |  6 +++---
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index e86c1b8b1b51..a80cce11b252 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -231,7 +231,7 @@ static int at803x_suspend(struct phy_device *phydev)
 
 static int at803x_resume(struct phy_device *phydev)
 {
-	return phy_modify(phydev, MII_BMCR, ~(BMCR_PDOWN | BMCR_ISOLATE), 0);
+	return phy_modify(phydev, MII_BMCR, BMCR_PDOWN | BMCR_ISOLATE, 0);
 }
 
 static int at803x_probe(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 8212c2fd7fe1..5d28063e9327 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -668,7 +668,7 @@ static int m88e3016_config_init(struct phy_device *phydev)
 
 	/* Enable Scrambler and Auto-Crossover */
 	ret = phy_modify(phydev, MII_88E3016_PHY_SPEC_CTRL,
-			 ~MII_88E3016_DISABLE_SCRAMBLER,
+			 MII_88E3016_DISABLE_SCRAMBLER,
 			 MII_88E3016_AUTO_MDIX_CROSSOVER);
 	if (ret < 0)
 		return ret;
@@ -684,9 +684,9 @@ static int m88e1111_config_init_hwcfg_mode(struct phy_device *phydev,
 		mode |= MII_M1111_HWCFG_FIBER_COPPER_AUTO;
 
 	return phy_modify(phydev, MII_M1111_PHY_EXT_SR,
-			  (u16)~(MII_M1111_HWCFG_MODE_MASK |
-				 MII_M1111_HWCFG_FIBER_COPPER_AUTO |
-				 MII_M1111_HWCFG_FIBER_COPPER_RES),
+			  MII_M1111_HWCFG_MODE_MASK |
+			  MII_M1111_HWCFG_FIBER_COPPER_AUTO |
+			  MII_M1111_HWCFG_FIBER_COPPER_RES,
 			  mode);
 }
 
@@ -705,8 +705,7 @@ static int m88e1111_config_init_rgmii_delays(struct phy_device *phydev)
 	}
 
 	return phy_modify(phydev, MII_M1111_PHY_EXT_CR,
-			  (u16)~(MII_M1111_RGMII_RX_DELAY |
-				 MII_M1111_RGMII_TX_DELAY),
+			  MII_M1111_RGMII_RX_DELAY | MII_M1111_RGMII_TX_DELAY,
 			  delay);
 }
 
@@ -833,7 +832,7 @@ static int m88e1510_config_init(struct phy_device *phydev)
 
 		/* In reg 20, write MODE[2:0] = 0x1 (SGMII to Copper) */
 		err = phy_modify(phydev, MII_88E1510_GEN_CTRL_REG_1,
-				 ~MII_88E1510_GEN_CTRL_REG_1_MODE_MASK,
+				 MII_88E1510_GEN_CTRL_REG_1_MODE_MASK,
 				 MII_88E1510_GEN_CTRL_REG_1_MODE_SGMII);
 		if (err < 0)
 			return err;
@@ -957,7 +956,7 @@ static int m88e1145_config_init_rgmii(struct phy_device *phydev)
 		if (err < 0)
 			return err;
 
-		err = phy_modify(phydev, 0x1e, 0xf03f,
+		err = phy_modify(phydev, 0x1e, 0x0fc0,
 				 2 << 9 | /* 36 ohm */
 				 2 << 6); /* 39 ohm */
 		if (err < 0)
@@ -1379,7 +1378,7 @@ static int m88e1318_set_wol(struct phy_device *phydev,
 
 		/* Setup LED[2] as interrupt pin (active low) */
 		err = __phy_modify(phydev, MII_88E1318S_PHY_LED_TCR,
-				   (u16)~MII_88E1318S_PHY_LED_TCR_FORCE_INT,
+				   MII_88E1318S_PHY_LED_TCR_FORCE_INT,
 				   MII_88E1318S_PHY_LED_TCR_INTn_ENABLE |
 				   MII_88E1318S_PHY_LED_TCR_INT_ACTIVE_LOW);
 		if (err < 0)
@@ -1419,7 +1418,7 @@ static int m88e1318_set_wol(struct phy_device *phydev,
 
 		/* Clear WOL status and disable magic packet matching */
 		err = __phy_modify(phydev, MII_88E1318S_PHY_WOL_CTRL,
-				   (u16)~MII_88E1318S_PHY_WOL_CTRL_MAGIC_PACKET_MATCH_ENABLE,
+				   MII_88E1318S_PHY_WOL_CTRL_MAGIC_PACKET_MATCH_ENABLE,
 				   MII_88E1318S_PHY_WOL_CTRL_CLEAR_WOL_STATUS);
 		if (err < 0)
 			goto error;
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index e5ddc5ae56d1..e1acb3052515 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1353,7 +1353,7 @@ EXPORT_SYMBOL(genphy_setup_forced);
 int genphy_restart_aneg(struct phy_device *phydev)
 {
 	/* Don't isolate the PHY if we're negotiating */
-	return phy_modify(phydev, MII_BMCR, ~BMCR_ISOLATE,
+	return phy_modify(phydev, MII_BMCR, BMCR_ISOLATE,
 			  BMCR_ANENABLE | BMCR_ANRESTART);
 }
 EXPORT_SYMBOL(genphy_restart_aneg);
@@ -1626,13 +1626,13 @@ EXPORT_SYMBOL(genphy_suspend);
 
 int genphy_resume(struct phy_device *phydev)
 {
-	return phy_modify(phydev, MII_BMCR, ~BMCR_PDOWN, 0);
+	return phy_modify(phydev, MII_BMCR, BMCR_PDOWN, 0);
 }
 EXPORT_SYMBOL(genphy_resume);
 
 int genphy_loopback(struct phy_device *phydev, bool enable)
 {
-	return phy_modify(phydev, MII_BMCR, ~BMCR_LOOPBACK,
+	return phy_modify(phydev, MII_BMCR, BMCR_LOOPBACK,
 			  enable ? BMCR_LOOPBACK : 0);
 }
 EXPORT_SYMBOL(genphy_loopback);
-- 
2.7.4

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up

^ permalink raw reply related

* Re: [RFC 11/14] tcp_md5: Move TCP-MD5 code out of TCP itself
From: Christoph Paasch @ 2018-01-05  1:15 UTC (permalink / raw)
  To: Mat Martineau; +Cc: netdev, Eric Dumazet, Alexei Starovoitov
In-Reply-To: <alpine.OSX.2.21.1801021115090.8545@ssempron-mobl.amr.corp.intel.com>

Hello Mat,

On 02/01/18 - 11:39:23, Mat Martineau wrote:
> 
> Hi Christoph -
> 
> On Mon, 18 Dec 2017, Christoph Paasch wrote:
> 
> > This is all just copy-pasting the TCP_MD5-code into functions that are
> > placed in net/ipv4/tcp_md5.c.
> > 
> > Signed-off-by: Christoph Paasch <cpaasch@apple.com>
> > Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
> > ---
> > include/linux/inet_diag.h |    1 +
> > include/linux/tcp_md5.h   |  138 ++++++
> > include/net/tcp.h         |   77 ----
> > net/ipv4/Makefile         |    1 +
> > net/ipv4/tcp.c            |  133 +-----
> > net/ipv4/tcp_diag.c       |   81 +---
> > net/ipv4/tcp_input.c      |   38 --
> > net/ipv4/tcp_ipv4.c       |  520 ++-------------------
> > net/ipv4/tcp_md5.c        | 1102 +++++++++++++++++++++++++++++++++++++++++++++
> > net/ipv4/tcp_minisocks.c  |   27 +-
> > net/ipv4/tcp_output.c     |    4 +-
> > net/ipv6/tcp_ipv6.c       |  318 +------------
> > 12 files changed, 1305 insertions(+), 1135 deletions(-)
> > create mode 100644 include/linux/tcp_md5.h
> > create mode 100644 net/ipv4/tcp_md5.c
> 
> ...
> 
> > diff --git a/include/linux/tcp_md5.h b/include/linux/tcp_md5.h
> > new file mode 100644
> > index 000000000000..f6a681cdded4
> > --- /dev/null
> > +++ b/include/linux/tcp_md5.h
> > @@ -0,0 +1,138 @@
> 
> There's no license info in this new file. Take a look at the SPDX
> identifiers recently added as the first line of some files (like
> tcp_vegas.h) for one way to do it.

Thanks, I added the SPDX-identifier line.

> 
> 
> > +#ifndef _LINUX_TCP_MD5_H
> > +#define _LINUX_TCP_MD5_H
> > +
> > +#include <linux/skbuff.h>
> > +
> > +#ifdef CONFIG_TCP_MD5SIG
> > +#include <linux/types.h>
> > +
> > +#include <net/tcp.h>
> > +
> > +union tcp_md5_addr {
> > +	struct in_addr  a4;
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +	struct in6_addr	a6;
> > +#endif
> > +};
> > +
> > +/* - key database */
> > +struct tcp_md5sig_key {
> > +	struct hlist_node	node;
> > +	u8			keylen;
> > +	u8			family; /* AF_INET or AF_INET6 */
> > +	union tcp_md5_addr	addr;
> > +	u8			prefixlen;
> > +	u8			key[TCP_MD5SIG_MAXKEYLEN];
> > +	struct rcu_head		rcu;
> > +};
> > +
> > +/* - sock block */
> > +struct tcp_md5sig_info {
> > +	struct hlist_head	head;
> > +	struct rcu_head		rcu;
> > +};
> > +
> > +union tcp_md5sum_block {
> > +	struct tcp4_pseudohdr ip4;
> > +#if IS_ENABLED(CONFIG_IPV6)
> > +	struct tcp6_pseudohdr ip6;
> > +#endif
> > +};
> > +
> > +/* - pool: digest algorithm, hash description and scratch buffer */
> > +struct tcp_md5sig_pool {
> > +	struct ahash_request	*md5_req;
> > +	void			*scratch;
> > +};
> > +
> > +extern const struct tcp_sock_af_ops tcp_sock_ipv4_specific;
> > +extern const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
> > +extern const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
> > +
> > +/* - functions */
> > +int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
> > +			const struct sock *sk, const struct sk_buff *skb);
> > +
> > +struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
> > +					 const struct sock *addr_sk);
> > +
> > +void tcp_v4_md5_destroy_sock(struct sock *sk);
> > +
> > +int tcp_v4_md5_send_response_prepare(struct sk_buff *skb, u8 flags,
> > +				     unsigned int remaining,
> > +				     struct tcp_out_options *opts,
> > +				     const struct sock *sk);
> > +
> > +void tcp_v4_md5_send_response_write(__be32 *topt, struct sk_buff *skb,
> > +				    struct tcphdr *t1,
> > +				    struct tcp_out_options *opts,
> > +				    const struct sock *sk);
> > +
> > +int tcp_v6_md5_send_response_prepare(struct sk_buff *skb, u8 flags,
> > +				     unsigned int remaining,
> > +				     struct tcp_out_options *opts,
> > +				     const struct sock *sk);
> > +
> > +void tcp_v6_md5_send_response_write(__be32 *topt, struct sk_buff *skb,
> > +				    struct tcphdr *t1,
> > +				    struct tcp_out_options *opts,
> > +				    const struct sock *sk);
> > +
> > +bool tcp_v4_inbound_md5_hash(const struct sock *sk,
> > +			     const struct sk_buff *skb);
> > +
> > +void tcp_v4_md5_syn_recv_sock(const struct sock *listener, struct sock *sk);
> > +
> > +void tcp_v6_md5_syn_recv_sock(const struct sock *listener, struct sock *sk);
> > +
> > +void tcp_md5_time_wait(struct sock *sk, struct inet_timewait_sock *tw);
> > +
> > +struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk,
> > +					 const struct sock *addr_sk);
> > +
> > +int tcp_v6_md5_hash_skb(char *md5_hash,
> > +			const struct tcp_md5sig_key *key,
> > +			const struct sock *sk,
> > +			const struct sk_buff *skb);
> > +
> > +bool tcp_v6_inbound_md5_hash(const struct sock *sk,
> > +			     const struct sk_buff *skb);
> > +
> > +static inline void tcp_md5_twsk_destructor(struct sock *sk)
> > +{
> > +	struct tcp_timewait_sock *twsk = tcp_twsk(sk);
> > +
> > +	if (twsk->tw_md5_key)
> > +		kfree_rcu(twsk->tw_md5_key, rcu);
> > +}
> > +
> > +static inline void tcp_md5_add_header_len(const struct sock *listener,
> > +					  struct sock *sk)
> > +{
> > +	struct tcp_sock *tp = tcp_sk(sk);
> > +
> > +	if (tp->af_specific->md5_lookup(listener, sk))
> > +		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
> > +}
> > +
> > +int tcp_md5_diag_get_aux(struct sock *sk, bool net_admin, struct sk_buff *skb);
> > +
> > +int tcp_md5_diag_get_aux_size(struct sock *sk, bool net_admin);
> > +
> > +#else
> > +
> > +static inline bool tcp_v4_inbound_md5_hash(const struct sock *sk,
> > +					   const struct sk_buff *skb)
> > +{
> > +	return false;
> > +}
> > +
> > +static inline bool tcp_v6_inbound_md5_hash(const struct sock *sk,
> > +					   const struct sk_buff *skb)
> > +{
> > +	return false;
> > +}
> > +
> > +#endif
> > +
> > +#endif /* _LINUX_TCP_MD5_H */
> 
> ...
> 
> > diff --git a/net/ipv4/tcp_md5.c b/net/ipv4/tcp_md5.c
> > new file mode 100644
> > index 000000000000..a31b404e6dbf
> > --- /dev/null
> > +++ b/net/ipv4/tcp_md5.c
> > @@ -0,0 +1,1102 @@
> 
> This new file needs license info too, maybe a SPDX identifier like
> tcp_input.c

Same here, added the SPDX-line.

Thanks for spotting this.


Christoph

^ permalink raw reply

* Re: [PATCH v6 3/6] can: m_can: Add PM Runtime
From: Yang, Wenyou @ 2018-01-05  1:23 UTC (permalink / raw)
  To: Faiz Abbas, Marc Kleine-Budde, wg, robh+dt, mark.rutland
  Cc: linux-can, netdev, devicetree, linux-kernel, nsekhar, fcooper,
	robh, sergei.shtylyov
In-Reply-To: <6bdaab16-cb16-e039-473c-52dd295bd4ba@ti.com>



On 2018/1/4 23:17, Faiz Abbas wrote:
> Hi,
>
> On Wednesday 03 January 2018 08:47 PM, Marc Kleine-Budde wrote:
>> On 01/03/2018 04:06 PM, Faiz Abbas wrote:
>>> Hi,
>>>
>>> On Wednesday 03 January 2018 07:55 PM, Marc Kleine-Budde wrote:
>>>> On 01/03/2018 01:39 PM, Faiz Abbas wrote:
>>>>> On Tuesday 02 January 2018 09:37 PM, Marc Kleine-Budde wrote:
>>>>>> On 12/22/2017 02:31 PM, Faiz Abbas wrote:
>>>>>>> From: Franklin S Cooper Jr <fcooper@ti.com>
>>>>>>>
>>>>>>> Add support for PM Runtime which is the new way to handle managing clocks.
>>>>>>> However, to avoid breaking SoCs not using PM_RUNTIME leave the old clk
>>>>>>> management approach in place.
>>>>>> There is no PM_RUNTIME anymore since 464ed18ebdb6 ("PM: Eliminate
>>>>>> CONFIG_PM_RUNTIME")
>>>>> Ok. Will change the commit message.
>>>>>
>>>>>> Have a look at the discussion: https://patchwork.kernel.org/patch/9436507/ :
>>>>>>
>>>>>>>> Well, I admit it would be nicer if drivers didn't have to worry about
>>>>>>>> whether or not CONFIG_PM was enabled.  A slightly cleaner approach
>>>>>>>> from the one outlined above would have the probe routine do this:
>>>>>>>>
>>>>>>>> 	my_power_up(dev);
>>>>>>>> 	pm_runtime_set_active(dev);
>>>>>>>> 	pm_runtime_get_noresume(dev);
>>>>>>>> 	pm_runtime_enable(dev);
>>>>> This discussion seems to be about cases in which CONFIG_PM is not
>>>>> enabled. CONFIG_PM is always selected in the case of omap devices.
>>>> Yes, but in the commit message you state that you need to support
>>>> systems that don't have PM_RUNTIME enabled. The only mainline SoCs I see
>>>> is "arch/arm/boot/dts/sama5d2.dtsi" so far. Please check if they select
>>>> CONFIG_PM, then we can make the driver much simpler.
>>> Actually the old clock management (for hclk which is the interface
>>> clock) is still required as mentioned in the cover letter. Will change
>>> the rather misleading description.
>> Ok. So you can use the code as discussed on
>> https://patchwork.kernel.org/patch/9436507/ ?
> Looking at the kernel configuration, it seems like SAMA5D2 platform
> selects CONFIG_PM (Wenyou, please confirm).
Confirmed.  The CONFIG_PM is selected.

> So, it seems like the only
> users of this driver always have CONFIG_PM enabled.
>
> So I guess the best way is to maintain the current code for pm_runtime_*
> and move the clock enable/disable to pm_runtime callbacks.
>
> Something like this:
>
> m_can_runtime_resume()
> {
> 	clk_prepare_enable(cclk);
> 	clk_prepare_enable(hclk);
> }
>
> m_can_runtime_suspend()
> {
> 	clk_disable_unprepare(cclk);
> 	clk_disable_unprepare(hclk);
> }
>
> SET_RUNTIME_PM_OPS(m_can_runtime_suspend, m_can_runtime_resume, NULL)
>
> static void m_can_start(struct net_device *dev)
> {
> 	pm_runtime_get_sync(dev)
> 	...
> }
>
> static void m_can_stop(struct net_device *dev)
> {
> 	...
> 	pm_runtime_put_sync(dev)
> }
>
> Does that sound okay? If yes, I will go work on the implementation.
>
> Thanks,
> Faiz
Best Regards,
Wenyou Yang

^ permalink raw reply

* Re: [PATCH net-next v2] xen-netback: make copy batch size configurable
From: Joao Martins @ 2018-01-05  1:31 UTC (permalink / raw)
  To: David Miller; +Cc: netdev@vger.kernel.org, wei.liu2, paul.durrant, xen-devel
In-Reply-To: <20171226.172222.668725258435898199.davem@davemloft.net>

On 12/26/2017 10:22 PM, David Miller wrote:
> From: Joao Martins <joao.m.martins@oracle.com>
> Date: Thu, 21 Dec 2017 17:24:28 +0000
> 
>> Commit eb1723a29b9a ("xen-netback: refactor guest rx") refactored Rx
>> handling and as a result decreased max grant copy ops from 4352 to 64.
>> Before this commit it would drain the rx_queue (while there are
>> enough slots in the ring to put packets) then copy to all pages and write
>> responses on the ring. With the refactor we do almost the same albeit
>> the last two steps are done every COPY_BATCH_SIZE (64) copies.
>>
>> For big packets, the value of 64 means copying 3 packets best case scenario
>> (17 copies) and worst-case only 1 packet (34 copies, i.e. if all frags
>> plus head cross the 4k grant boundary) which could be the case when
>> packets go from local backend process.
>>
>> Instead of making it static to 64 grant copies, lets allow the user to
>> select its value (while keeping the current as default) by introducing
>> the `copy_batch_size` module parameter. This allows users to select
>> the higher batches (i.e. for better throughput with big packets) as it
>> was prior to the above mentioned commit.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>> Changes since v1:
>>  * move rx_copy.{idx,op} reallocation to separate helper
>>  Addressed Paul's comments:
>>  * rename xenvif_copy_state#size field to batch_size
>>  * argument `size` should be unsigned int
>>  * vfree is safe with NULL
>>  * realloc rx_copy.{idx,op} after copy op flush
> 
> I truly dislike things of this nature.
> 
> When you give the user a numerical value to set, they have to pick
> something.  This in turn requires deep, weird, knowledge of how the
> driver implements RX packet processing.
> 
> That's asbolutely unacceptable.  Can you imagine being an admin and
> trying to figure out what random number to plug into this thing?
> 
> "maximum number of grant copies on RX"
> 
> I've been the networking maintainer for more than 2 decades and I
> have no idea whatsoever what kind of value I might want to set
> there.
> 
> Nobody should have to know this other than people working on the
> driver.
> 
Sorry, I didn't consider that.

> Instead, the issue is that the driver can optimize for throughput
> or something else (latency, RX packing, I don't know exactly what
> it is, but you're keeping the default value so it has some merit
> right?).  Therefore, what you need to export is a boolean which
> is self describing.
> 
I kept the default because of the recent work on tidy up
netback rx structures. But while it gives much less overhead when going
with bigger numbers of VMs/interfaces, it limits throughput per
interface (hence the parameter).

> "rx_optimize_throughput"
> 
> That's it.  And you, the smart person who knows what this grant
> copy mumbo jumbo means, can pick a specific value to use for
> high throughput.
>
OK.

Joao

^ permalink raw reply

* Re: [PATCH v2 0/6] wl1251: Fix MAC address for Nokia N900
From: Luis R. Rodriguez @ 2018-01-05  1:45 UTC (permalink / raw)
  To: Pali Rohár
  Cc: Luis R. Rodriguez, Greg Kroah-Hartman, Kalle Valo, David Gnedt,
	Daniel Wagner, Tony Lindgren, Sebastian Reichel, Pavel Machek,
	Ivaylo Dimitrov, Aaro Koskinen, Grazvydas Ignotas, linux-kernel,
	linux-wireless, netdev
In-Reply-To: <20180102192345.dnywuv4sapltrfrl@pali>

On Tue, Jan 02, 2018 at 08:23:45PM +0100, Pali Rohár wrote:
> On Friday 10 November 2017 00:38:22 Pali Rohár wrote:
> > This patch series fix processing MAC address for wl1251 chip found in Nokia N900.
> > 
> > Changes since v1:
> > * Added Acked-by for Pavel Machek
> > * Fixed grammar
> > * Magic numbers for NVS offsets are replaced by defines
> > * Check for validity of mac address NVS data is moved into function
> > * Changed order of patches as Pavel requested
> > 
> > Pali Rohár (6):
> >   wl1251: Update wl->nvs_len after wl->nvs is valid
> >   wl1251: Generate random MAC address only if driver does not have
> >     valid
> >   wl1251: Parse and use MAC address from supplied NVS data
> >   wl1251: Set generated MAC address back to NVS data
> >   firmware: Add request_firmware_prefer_user() function
> >   wl1251: Use request_firmware_prefer_user() for loading NVS
> >     calibration data
> > 
> >  drivers/base/firmware_class.c          |   45 +++++++++++++-
> >  drivers/net/wireless/ti/wl1251/Kconfig |    1 +
> >  drivers/net/wireless/ti/wl1251/main.c  |  104 ++++++++++++++++++++++++++------
> >  include/linux/firmware.h               |    9 +++
> >  4 files changed, 138 insertions(+), 21 deletions(-)
> 
> Hi! Are there any comments for first 4 patches? If not, could they be
> accepted and merged?

Since the first 4 patches do not touch the firmware API they seem fine to me so
long as the maintainer accepts them. Maybe resend and clarify you have dropped
the other ones and amend with the new tags.

  Luis

^ permalink raw reply

* Re: r8169 take too long to complete driver initialization
From: Chris Chiu @ 2018-01-05  2:17 UTC (permalink / raw)
  To: nic_swsd, netdev, Linux Kernel, Linux Upstreaming Team
In-Reply-To: <CAB4CAweXip+hDprJc6biwR=UtGF1xu-06HdB4ddzg9UZGL+Ngw@mail.gmail.com>

On Wed, Dec 20, 2017 at 4:41 PM, Chris Chiu <chiu@endlessm.com> wrote:
> Hi,
>     We've hit a suspend/resume issue on a Acer desktop caused by r8169
> driver. The dmseg
> https://gist.github.com/mschiu77/b741849b5070281daaead8dfee312d1a
> shows it's still in msleep() within a mutex lock.
>     After looking into the code, it's caused by the
> rtl8168ep_stop_cmac() which is waiting 100 seconds for
> rtl_ocp_tx_cond. The following dmesg states that the r8169 driver is
> loaded.
>
> [   20.270526] r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>
> But it takes > 100 seconds to get the following messages
>
> [  140.400223] r8169 0000:02:00.0 (unnamed net_device)
> (uninitialized): rtl_ocp_tx_cond == 1 (loop: 2000, delay: 50).
> [  140.413294] r8169 0000:02:00.0 eth0: RTL8168ep/8111ep at
> 0xffffb16c80db1000, f8:0f:41:ea:74:0d, XID 10200800 IRQ 46
> [  140.413297] r8169 0000:02:00.0 eth0: jumbo features [frames: 9200
> bytes, tx checksumming: ko]
>
> So any trial to suspend the machine during this period would always
> get device/resource busy message then abort. Is this  rtl_ocp_tx_cond
> necessary? Because the ethernet is still working and I don't see any
> problem. I don't know it should be considered normal or not. Please
> let me know if any more information required. Thanks
>
> Chris

gentle ping,

cheers.

^ permalink raw reply

* Re: Investment
From: James Tyler, MBA, CFA @ 2018-01-05  2:23 UTC (permalink / raw)


Thank you for your time,

We are looking for clients in your country with good business or project that requires financing to execute.

Please get back to me if you are interested in this or you know anybody who has good business ideas but lack the necessary capital to fund his projects so we can establish working relationship.

Sincerely,

James Tyler, MBA, CFA
Investment analyst.

^ permalink raw reply

* [PATCH 0/2] Consolidate BUILD_BUG macros
From: Masahiro Yamada @ 2018-01-05  3:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ian Abbott, Masahiro Yamada, Hideaki YOSHIFUJI, netdev,
	Alexey Kuznetsov, linux-kernel, David S. Miller

Masahiro Yamada (2):
  genl_magic: remove own BUILD_BUG_ON*() defines
  build_bug.h: remove BUILD_BUG_ON_NULL()

 include/linux/build_bug.h       |  2 --
 include/linux/genl_magic_func.h | 12 +-----------
 net/ipv6/mcast.c                |  8 ++++----
 3 files changed, 5 insertions(+), 17 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH 2/2] build_bug.h: remove BUILD_BUG_ON_NULL()
From: Masahiro Yamada @ 2018-01-05  3:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ian Abbott, Masahiro Yamada, Hideaki YOSHIFUJI, netdev,
	Alexey Kuznetsov, linux-kernel, David S. Miller
In-Reply-To: <1515121833-3174-1-git-send-email-yamada.masahiro@socionext.com>

This macro is only used by net/ipv6/mcast.c, but there is no reason
why it must be BUILD_BUG_ON_NULL().

Replace it with BUILD_BUG_ON_ZERO(), and remove BUILD_BUG_ON_NULL()
definition from <linux/build_bug.h>.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---

 include/linux/build_bug.h | 2 --
 net/ipv6/mcast.c          | 8 ++++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h
index 3efed0d..43d1fd5 100644
--- a/include/linux/build_bug.h
+++ b/include/linux/build_bug.h
@@ -8,7 +8,6 @@
 #define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0)
 #define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0)
 #define BUILD_BUG_ON_ZERO(e) (0)
-#define BUILD_BUG_ON_NULL(e) ((void *)0)
 #define BUILD_BUG_ON_INVALID(e) (0)
 #define BUILD_BUG_ON_MSG(cond, msg) (0)
 #define BUILD_BUG_ON(condition) (0)
@@ -28,7 +27,6 @@
  * aren't permitted).
  */
 #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:(-!!(e)); }))
-#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:(-!!(e)); }))
 
 /*
  * BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 8446426..b28d4aa 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -65,10 +65,10 @@
 #include <net/ip6_checksum.h>
 
 /* Ensure that we have struct in6_addr aligned on 32bit word. */
-static void *__mld2_query_bugs[] __attribute__((__unused__)) = {
-	BUILD_BUG_ON_NULL(offsetof(struct mld2_query, mld2q_srcs) % 4),
-	BUILD_BUG_ON_NULL(offsetof(struct mld2_report, mld2r_grec) % 4),
-	BUILD_BUG_ON_NULL(offsetof(struct mld2_grec, grec_mca) % 4)
+static int __mld2_query_bugs[] __attribute__((__unused__)) = {
+	BUILD_BUG_ON_ZERO(offsetof(struct mld2_query, mld2q_srcs) % 4),
+	BUILD_BUG_ON_ZERO(offsetof(struct mld2_report, mld2r_grec) % 4),
+	BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4)
 };
 
 static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT;
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH] MAINTAINERS: Update my email address.
From: Pravin Shelar @ 2018-01-05  3:28 UTC (permalink / raw)
  To: David Miller; +Cc: Linux Kernel Network Developers
In-Reply-To: <20180104.133849.130715444571006602.davem@davemloft.net>

On Thu, Jan 4, 2018 at 10:38 AM, David Miller <davem@davemloft.net> wrote:
> From: Pravin B Shelar <pshelar@ovn.org>
> Date: Tue,  2 Jan 2018 20:14:42 -0800
>
>> Signed-off-by: Pravin Shelar <pshelar@ovn.org>
>
> Applied, but please take Joe's feedback into consideration.

I was bit busy so could not send update soon. I am planing on updating mailmap.

Thanks.

^ permalink raw reply

* Re: [PATCH iproute2-next v1 3/9] rdma: Add filtering infrastructure
From: David Ahern @ 2018-01-05  3:29 UTC (permalink / raw)
  To: Leon Romanovsky, Doug Ledford, Jason Gunthorpe
  Cc: RDMA mailing list, netdev, Stephen Hemminger, Leon Romanovsky
In-Reply-To: <20180104070150.15625-4-leon@kernel.org>

On 1/4/18 12:01 AM, Leon Romanovsky wrote:
> diff --git a/rdma/utils.c b/rdma/utils.c
> index af2b374d..446c23da 100644
> --- a/rdma/utils.c
> +++ b/rdma/utils.c
> @@ -114,6 +114,225 @@ static void dev_map_cleanup(struct rd *rd)
>  	}
>  }
>  
> +static int add_filter(struct rd *rd, char *key, char *value,
> +		      const char * const valid_filters[])
> +{
> +	char cset[] = "1234567890,-";
> +	struct filter_entry *fe;
> +	bool key_found = false;
> +	int idx = 0;
> +	int ret;
> +
> +	fe = calloc(1, sizeof(*fe));
> +	if (!fe)
> +		return -ENOMEM;
> +
> +	while (valid_filters[idx]) {
> +		if (!strcmpx(key, valid_filters[idx])) {
> +			key_found = true;
> +			break;
> +		}
> +		idx++;
> +	}
> +	if (!key_found) {
> +		pr_err("Unsupported filter option: %s\n", key);
> +		ret = -EINVAL;
> +		goto err;
> +	}
> +
> +	/*
> +	 * Check the filter validity, not optimal, but works
> +	 *
> +	 * Actually, there are three types of filters
> +	 *  numeric - for example PID or QPN
> +	 *  string  - for example states, currently only "display"
> +	 *            is from this type
> +	 *  link    - user requested to filter on specific link
> +	 *            e.g. mlx5_1/1, mlx5_1/-, mlx5_1 ...
> +	 */
> +	if (strcmpx(key, "link") && strcmpx(key, "display") &&
> +	    strspn(value, cset) != strlen(value)) {
> +		pr_err("%s filter accepts \"%s\" characters only\n", key, cset);
> +		ret = -EINVAL;
> +		goto err;
> +	}
> +
> +	fe->key = strdup(key);
> +	fe->value = strdup(value);
> +

Missed this the other day. 2 more strdup's that should be checked.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox