Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Adds support for calling sock_ops BPF program when there is a
retransmission. Two arguments are used; one for the sequence number and
other for the number of segments retransmitted. Does not include syn-ack
retransmissions.

New op: BPF_SOCK_OPS_RETRANS_CB.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h | 4 ++++
 include/uapi/linux/tcp.h | 1 +
 net/ipv4/tcp_output.c    | 3 +++
 3 files changed, 8 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fe2b692..7165619 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1014,6 +1014,10 @@ enum {
 					 * Arg2: value of icsk_rto
 					 * Arg3: whether RTO has expired
 					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 */
 };
 
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 089c19e..dc36d3c 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -261,6 +261,7 @@ struct tcp_md5sig {
 
 /* Definitions for bpf_sock_ops_flags */
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
 
 /* INET_DIAG_MD5SIG */
 struct tcp_diag_md5sig {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 50cb242..b8ad088 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2910,6 +2910,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if (likely(!err)) {
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		trace_tcp_retransmit_skb(sk, skb);
+		if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
+			tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RETRANS_CB,
+					  TCP_SKB_CB(skb)->seq, segs);
 	} else if (err != -EBUSY) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
 	}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Make SOCK_OPS_GET_TCP helper macro size independent (before only worked
with 4-byte fields.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 net/core/filter.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 754abe1..d47d126 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4448,9 +4448,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		break;
 
 /* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP32(FIELD_NAME)					      \
+#define SOCK_OPS_GET_TCP(FIELD_NAME)					      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) >      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
 						is_fullsock),		      \
@@ -4462,16 +4463,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,        \
+		*insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock,	      \
+						   FIELD_NAME), si->dst_reg,  \
+				      si->dst_reg,			      \
 				      offsetof(struct tcp_sock, FIELD_NAME)); \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP32(snd_cwnd);
+		SOCK_OPS_GET_TCP(snd_cwnd);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP32(srtt_us);
+		SOCK_OPS_GET_TCP(srtt_us);
 		break;
 	}
 	return insn - insn_buf;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Changed SOCK_OPS_GET_TCP to SOCK_OPS_GET_FIELD and added a new
argument so now it can also work with struct sock fields.

Previous: SOCK_OPS_GET_TCP(FIELD_NAME)
New:      SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ)

Where OBJ is either "struct tcp_sock" or "struct sock" (without
quotation). Assumes FIELD_NAME is a field in the struct
bpf_sock_ops and in the OBJ specified.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 net/core/filter.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index d47d126..f808269 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4447,10 +4447,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 					       is_fullsock));
 		break;
 
-/* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP(FIELD_NAME)					      \
+/* Helper macro for adding read access to tcp_sock or sock fields. */
+#define SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ)				      \
 	do {								      \
-		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) >      \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, FIELD_NAME) >		      \
 			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
 						struct bpf_sock_ops_kern,     \
@@ -4463,18 +4463,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 						struct bpf_sock_ops_kern, sk),\
 				      si->dst_reg, si->src_reg,		      \
 				      offsetof(struct bpf_sock_ops_kern, sk));\
-		*insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock,	      \
-						   FIELD_NAME), si->dst_reg,  \
-				      si->dst_reg,			      \
-				      offsetof(struct tcp_sock, FIELD_NAME)); \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,		      \
+						       FIELD_NAME),	      \
+				      si->dst_reg, si->dst_reg,		      \
+				      offsetof(OBJ, FIELD_NAME));	      \
 	} while (0)
 
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
-		SOCK_OPS_GET_TCP(snd_cwnd);
+		SOCK_OPS_GET_FIELD(snd_cwnd, struct tcp_sock);
 		break;
 
 	case offsetof(struct bpf_sock_ops, srtt_us):
-		SOCK_OPS_GET_TCP(srtt_us);
+		SOCK_OPS_GET_FIELD(srtt_us, struct tcp_sock);
 		break;
 	}
 	return insn - insn_buf;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to
struct tcp_sock or struct sock fields. This required adding a new
field "temp" to struct bpf_sock_ops_kern for temporary storage that
is used by sock_ops_convert_ctx_access. It is used to store and recover
the contents of a register, so the register can be used to store the
address of the sk. Since we cannot overwrite the dst_reg because it
contains the pointer to ctx, nor the src_reg since it contains the value
we want to store, we need an extra register to contain the address
of the sk.

Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the
GET or SET macros depending on the value of the TYPE field.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/linux/filter.h |  3 +++
 include/net/tcp.h      |  2 +-
 net/core/filter.c      | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5feb441..8929162 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -987,6 +987,9 @@ struct bpf_sock_ops_kern {
 		u32 replylong[4];
 	};
 	u32	is_fullsock;
+	u64	temp;			/* Used by sock_ops_convert_ctx_access
+					 * as temporary storaage of a register
+					 */
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6cc205c..e0213f1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2011,7 +2011,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
 
-	memset(&sock_ops, 0, sizeof(sock_ops));
+	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, is_fullsock));
 	if (sk_fullsock(sk)) {
 		sock_ops.is_fullsock = 1;
 		sock_owned_by_me(sk);
diff --git a/net/core/filter.c b/net/core/filter.c
index f808269..97e65df 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4469,6 +4469,52 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(OBJ, FIELD_NAME));	      \
 	} while (0)
 
+/* Helper macro for adding write access to tcp_sock or sock fields.
+ * The macro is called with two registers, dst_reg which contains a pointer
+ * to ctx (context) and src_reg which contains the value that should be
+ * stored. However, we need an aditional register since we cannot overwrite
+ * dst_reg because it may be used later in the program.
+ * Instead we "borrow" one of the other register. We first save its value
+ * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
+ * it at the end of the macro.
+ */
+#define SOCK_OPS_SET_FIELD(FIELD_NAME, OBJ)				      \
+	do {								      \
+		int reg = BPF_REG_9;					      \
+		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, FIELD_NAME) >		      \
+			     FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME));  \
+		while (si->dst_reg == reg || si->src_reg == reg)	      \
+			reg--;						      \
+		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern,     \
+						is_fullsock),		      \
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       is_fullsock));		      \
+		*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);		      \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
+						struct bpf_sock_ops_kern, sk),\
+				      reg, si->dst_reg,			      \
+				      offsetof(struct bpf_sock_ops_kern, sk));\
+		*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, FIELD_NAME),      \
+				      reg, si->src_reg,			      \
+				      offsetof(OBJ, FIELD_NAME));	      \
+		*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,		      \
+				      offsetof(struct bpf_sock_ops_kern,      \
+					       temp));			      \
+	} while (0)
+
+#define SOCK_OPS_GET_OR_SET_FIELD(FIELD_NAME, OBJ, TYPE)		      \
+	do {								      \
+		if (TYPE == BPF_WRITE)					      \
+			SOCK_OPS_SET_FIELD(FIELD_NAME, OBJ);		      \
+		else							      \
+			SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ);		      \
+	} while (0)
+
 	case offsetof(struct bpf_sock_ops, snd_cwnd):
 		SOCK_OPS_GET_FIELD(snd_cwnd, struct tcp_sock);
 		break;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 04/11] bpf: Support passing args to sock_ops bpf function
From: Lawrence Brakmo @ 2017-12-19  6:21 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20171219062200.372711-1-brakmo@fb.com>

Adds support for passing up to 4 arguments to sock_ops bpf functions. It
reusues the reply union, so the bpf_sock_ops structures are not
increased in size.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/linux/filter.h   |  1 +
 include/net/tcp.h        | 64 ++++++++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/bpf.h |  5 ++--
 net/ipv4/tcp.c           |  2 +-
 net/ipv4/tcp_nv.c        |  2 +-
 net/ipv4/tcp_output.c    |  2 +-
 6 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 8929162..2a09f27 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -983,6 +983,7 @@ struct bpf_sock_ops_kern {
 	struct	sock *sk;
 	u32	op;
 	union {
+		u32 args[4];
 		u32 reply;
 		u32 replylong[4];
 	};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e0213f1..c262be6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2006,7 +2006,7 @@ void tcp_cleanup_ulp(struct sock *sk);
  * program loaded).
  */
 #ifdef CONFIG_BPF
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	struct bpf_sock_ops_kern sock_ops;
 	int ret;
@@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 
 	sock_ops.sk = sk;
 	sock_ops.op = op;
+	if (nargs > 0)
+		memcpy(sock_ops.args, args, nargs*sizeof(u32));
 
 	ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
 	if (ret == 0)
@@ -2027,18 +2029,70 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
 		ret = -1;
 	return ret;
 }
+
+static inline int tcp_call_bpf_1arg(struct sock *sk, int op, u32 arg)
+{
+	return tcp_call_bpf(sk, op, 1, &arg);
+}
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	u32 args[2] = {arg1, arg2};
+
+	return tcp_call_bpf(sk, op, 2, args);
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3)
+{
+	u32 args[3] = {arg1, arg2, arg3};
+
+	return tcp_call_bpf(sk, op, 3, args);
+}
+
+static inline int tcp_call_bpf_4arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3, u32 arg4)
+{
+	u32 args[4] = {arg1, arg2, arg3, arg4};
+
+	return tcp_call_bpf(sk, op, 4, args);
+}
+
 #else
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 {
 	return -EPERM;
 }
+
+static inline int tcp_call_bpf_1arg(struct sock *sk, int op, u32 arg)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+	u32 arg3)
+{
+	return -EPERM;
+}
+
+static inline int tcp_call_bpf_4arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+				    u32 arg3, u32 arg4)
+{
+	return -EPERM;
+}
+
 #endif
 
 static inline u32 tcp_timeout_init(struct sock *sk)
 {
 	int timeout;
 
-	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
+	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
 
 	if (timeout <= 0)
 		timeout = TCP_TIMEOUT_INIT;
@@ -2049,7 +2103,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 {
 	int rwnd;
 
-	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
+	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
 
 	if (rwnd < 0)
 		rwnd = 0;
@@ -2058,7 +2112,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 
 static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 {
-	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
 #if IS_ENABLED(CONFIG_SMC)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 595bda1..addd849 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -936,8 +936,9 @@ struct bpf_map_info {
 struct bpf_sock_ops {
 	__u32 op;
 	union {
-		__u32 reply;
-		__u32 replylong[4];
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
 	};
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1803116..817df3f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -465,7 +465,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
 	tcp_mtup_init(sk);
 	icsk->icsk_af_ops->rebuild_header(sk);
 	tcp_init_metrics(sk);
-	tcp_call_bpf(sk, bpf_op);
+	tcp_call_bpf(sk, bpf_op, 0, NULL);
 	tcp_init_congestion_control(sk);
 	tcp_init_buffer_space(sk);
 }
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 0b5a05b..ddbce73 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk)
 	 * within a datacenter, where we have reasonable estimates of
 	 * RTTs
 	 */
-	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT);
+	base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);
 	if (base_rtt > 0) {
 		ca->nv_base_rtt = base_rtt;
 		ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a4d214c..50cb242 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3471,7 +3471,7 @@ int tcp_connect(struct sock *sk)
 	struct sk_buff *buff;
 	int err;
 
-	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
+	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
 
 	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
 		return -EHOSTUNREACH; /* Routing failure or similar. */
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH v2 2/3] vsprintf: print <no-symbol> if symbol not found
From: Joe Perches @ 2017-12-19  6:18 UTC (permalink / raw)
  To: Tobin C. Harding, kernel-hardening
  Cc: Steven Rostedt, Tycho Andersen, Linus Torvalds, Kees Cook,
	Andrew Morton, Daniel Borkmann, Masahiro Yamada,
	Alexei Starovoitov, linux-kernel, Network Development
In-Reply-To: <1513654094-16832-3-git-send-email-me@tobin.cc>

On Tue, 2017-12-19 at 14:28 +1100, Tobin C. Harding wrote:
> Depends on: commit 40eee173a35e ("kallsyms: don't leak address when
> symbol not found")
> 
> Currently vsprintf for specifiers %p[SsB] relies on the behaviour of
> kallsyms (sprint_symbol()) and prints the actual address if a symbol is
> not found. Previous patch changes this behaviour so that sprint_symbol()
> returns an error if symbol not found. With this patch in place we can
> print a sanitized message '<symbol not found>' instead of leaking the
> address.
> 
> Print '<symbol not found>' for printk specifier %p[sSB] if symbol look
> up fails.
> 
> Signed-off-by: Tobin C. Harding <me@tobin.cc>
> ---
>  lib/vsprintf.c | 11 ++++++++---
>  1 file changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> index 01c3957b2de6..820ed4fe6e6c 100644
> --- a/lib/vsprintf.c
> +++ b/lib/vsprintf.c
> @@ -674,6 +674,8 @@ char *symbol_string(char *buf, char *end, void *ptr,
>  	unsigned long value;
>  #ifdef CONFIG_KALLSYMS
>  	char sym[KSYM_SYMBOL_LEN];
> +	const char *sym_not_found = "<symbol not found>";

This will be reinitialized on every use.

> +	int ret;
>  #endif
>  
>  	if (fmt[1] == 'R')
> @@ -682,11 +684,14 @@ char *symbol_string(char *buf, char *end, void *ptr,
>  
>  #ifdef CONFIG_KALLSYMS
>  	if (*fmt == 'B')
> -		sprint_backtrace(sym, value);
> +		ret = sprint_backtrace(sym, value);
>  	else if (*fmt != 'f' && *fmt != 's')
> -		sprint_symbol(sym, value);
> +		ret = sprint_symbol(sym, value);
>  	else
> -		sprint_symbol_no_offset(sym, value);
> +		ret = sprint_symbol_no_offset(sym, value);
> +
> +	if (ret == -1)
> +		strcpy(sym, sym_not_found);


This could avoid the unnecessary strcpy if sym_not_found
was not used at all and this was used instead

	if (ret == -1)
		return string(buf, end, "<symbol not found>", spec);

	return string(buf, end, sym, spec);

or maybe

	return string(buf, end, ret == -1 ? "<symbol not found>" : sum, spec);

>  
>  	return string(buf, end, sym, spec);
>  #else

^ permalink raw reply

* net/wireless/certs/*.x509 binary files
From: Randy Dunlap @ 2017-12-19  6:01 UTC (permalink / raw)
  To: LKML; +Cc: netdev@vger.kernel.org, Johannes Berg

This is just an FYI/acknowledgment that net/wireless/certs/*.x509
binary file(s) practically kills use of Linux kernel tarballs.

Of course, someone can always enable EXPERT and CFG80211_CERTIFICATION_ONUS
and disable the REGDB kconfig symbols to get around this.
Oh, and then chmod +x tools/objtool/sync-check.sh (unrelated problem).

Then you are good to go. :)

Background:
I was getting build errors on net/wireless/shipped-certs.o and the
build log didn't help me at all. It just said something like,
Error: build failed on net/wireless/shipped-certs.o.
Even building with V=1 didn't help.
So I finally discovered the reason and worked around it.

-- 
~Randy

PS:  Yes, I know about git.

^ permalink raw reply

* Re: r8169 regression: UDP packets dropped intermittantly
From: Jonathan Woithe @ 2017-12-19  5:45 UTC (permalink / raw)
  To: Holger Hoffstätte; +Cc: netdev, linux-kernel
In-Reply-To: <20171218223224.GA13172@marvin.atrad.com.au>

Hi again

This is a follow up to my earlier message.

On Tue, Dec 19, 2017 at 09:02:25AM +1030, Jonathan Woithe wrote:
> On Mon, Dec 18, 2017 at 02:38:53PM +0100, Holger Hoffstätte wrote:
> > Since I've seen your postings several times now with no comment or resolution
> > I've decided to try your reproducer on my own systems. In short, I cannot
> > reproduce any packet loss, despite having 2 (cheap) 1Gb switches between the
> > two machines. Both are running 4.14.7.
> 
> Thanks for trying the test program on your system.  The result indicates
> that the problem might be specific to the behaviour of a particular network
> variant of the r8169 chip.

I was able to temporarily acquire a PCIe card which uses the r8169 driver.
This allowed me to run the reproducer on the same machine with two different
r8169-based cards.  The original NIC is this:

  05:01.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL-8169 
  Gigabit Ethernet (rev 10) [10ec:8169]
          Subsystem: Netgear GA311 [1385:311a]

The PCIe card is this:

  02:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8111/8168B 
  PCI Express Gigabit Ethernet controller (rev 06) [10ec:8168]
          Subsystem: Realtek Semiconductor Co., Ltd. Device 0123 [10ec:0123]

The test was conducted with kernel 4.3.0 since both the 4.3.0 driver (which
triggers the fault) and the forward ported driver (which predates commit
da78dbff2e05630921c551dbbc70a4b7981a8fff) was available.  For the record,
the machine used as the slave in these tests (the one receiving the 6 byte
request and sending the 14 byte response) was using its onboard NIC:

  00:19.0 Ethernet controller: Intel Corporation 82579V Gigabit Network 
  Connection (rev 05) [8086:1503]
          Subsystem: Gigabyte Technology Co., Ltd 82579V Gigabit Network 
          Connection [1458:e000]

Test outcomes were as follows:

  PCIe card, unpatched 4.3.0 r8169 driver: no error (tested for 1 hour)
  PCIe card, forward ported r8169 driver:  no error (tested for 1 hour)

  GA311 card, unpatched 4.3.0 r8169 driver: test fail in under 4 minutes
  GA311 card, forward ported r8169 driver:  no error (tested for 1 hour)

For completeness, I then booted 4.14 and repeated the test with its r8168
driver.  The PCIe card ran for an hour without triggering the error, while
the GA311 triggered it quickly (in under 3 minutes).

This clearly indicates that not every card using the r8169 driver is
vulnerable to the problem.  It also explains why Holger was unable to
reproduce the result on his system: the PCIe cards do not appear to suffer
from the problem.  Most likely the PCI RTL-8169 chip is affected, but newer
PCIe variations do not.  However, obviously more testing will be required
with a wider variety of cards if this inference is to hold up.

The above result (and those from Holger) allow the problem description to be
refined a little: changes in commit da78dbff2e05630921c551dbbc70a4b7981a8fff
cause GA311 NICs (and possibly other PCI cards using an RTL-8169) to have
trouble with small UDP packets, while PCIe variants are seemingly
unaffected.

Does this help?

Regards
  jonathan

^ permalink raw reply

* [PATCH V4 14/26] pch_gbe: deprecate pci_get_bus_and_slot()
From: Sinan Kaya @ 2017-12-19  5:37 UTC (permalink / raw)
  To: linux-pci, timur
  Cc: linux-arm-msm, linux-arm-kernel, Sinan Kaya, David S. Miller,
	Kees Cook, Eric Dumazet, Tobias Klauser, Andy Shevchenko,
	open list:NETWORKING DRIVERS, open list
In-Reply-To: <1513661883-28662-1-git-send-email-okaya@codeaurora.org>

pci_get_bus_and_slot() is restrictive such that it assumes domain=0 as
where a PCI device is present. This restricts the device drivers to be
reused for other domain numbers.

Getting ready to remove pci_get_bus_and_slot() function in favor of
pci_get_domain_bus_and_slot().

Use the domain information from pdev while calling into
pci_get_domain_bus_and_slot() function.

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
index 40e52ff..7cd4946 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
@@ -2594,8 +2594,10 @@ static int pch_gbe_probe(struct pci_dev *pdev,
 	if (adapter->pdata && adapter->pdata->platform_init)
 		adapter->pdata->platform_init(pdev);
 
-	adapter->ptp_pdev = pci_get_bus_and_slot(adapter->pdev->bus->number,
-					       PCI_DEVFN(12, 4));
+	adapter->ptp_pdev =
+		pci_get_domain_bus_and_slot(pci_domain_nr(adapter->pdev->bus),
+					    adapter->pdev->bus->number,
+					    PCI_DEVFN(12, 4));
 
 	netdev->netdev_ops = &pch_gbe_netdev_ops;
 	netdev->watchdog_timeo = PCH_GBE_WATCHDOG_PERIOD;
-- 
1.9.1

^ permalink raw reply related

* [PATCH V4 13/26] bnx2x: deprecate pci_get_bus_and_slot()
From: Sinan Kaya @ 2017-12-19  5:37 UTC (permalink / raw)
  To: linux-pci, timur
  Cc: linux-arm-msm, linux-arm-kernel, Sinan Kaya, Ariel Elior,
	supporter:BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER,
	open list:BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER, open list
In-Reply-To: <1513661883-28662-1-git-send-email-okaya@codeaurora.org>

pci_get_bus_and_slot() is restrictive such that it assumes domain=0 as
where a PCI device is present. This restricts the device drivers to be
reused for other domain numbers.

Getting ready to remove pci_get_bus_and_slot() function in favor of
pci_get_domain_bus_and_slot().

Introduce bnx2x_vf_domain() function to extract the domain information
and save it to VF specific data structure.

Use the saved domain value while calling pci_get_domain_bus_and_slot().

Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 10 +++++++++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
index 3591077..ffa7959 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
@@ -812,7 +812,7 @@ static u8 bnx2x_vf_is_pcie_pending(struct bnx2x *bp, u8 abs_vfid)
 	if (!vf)
 		return false;
 
-	dev = pci_get_bus_and_slot(vf->bus, vf->devfn);
+	dev = pci_get_domain_bus_and_slot(vf->domain, vf->bus, vf->devfn);
 	if (dev)
 		return bnx2x_is_pcie_pending(dev);
 	return false;
@@ -1041,6 +1041,13 @@ void bnx2x_iov_init_dmae(struct bnx2x *bp)
 		REG_WR(bp, DMAE_REG_BACKWARD_COMP_EN, 0);
 }
 
+static int bnx2x_vf_domain(struct bnx2x *bp, int vfid)
+{
+	struct pci_dev *dev = bp->pdev;
+
+	return pci_domain_nr(dev->bus);
+}
+
 static int bnx2x_vf_bus(struct bnx2x *bp, int vfid)
 {
 	struct pci_dev *dev = bp->pdev;
@@ -1606,6 +1613,7 @@ int bnx2x_iov_nic_init(struct bnx2x *bp)
 		struct bnx2x_virtf *vf = BP_VF(bp, vfid);
 
 		/* fill in the BDF and bars */
+		vf->domain = bnx2x_vf_domain(bp, vfid);
 		vf->bus = bnx2x_vf_bus(bp, vfid);
 		vf->devfn = bnx2x_vf_devfn(bp, vfid);
 		bnx2x_vf_set_bars(bp, vf);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
index 53466f6..eb814c6 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h
@@ -182,6 +182,7 @@ struct bnx2x_virtf {
 	u32 error;	/* 0 means all's-well */
 
 	/* BDF */
+	unsigned int domain;
 	unsigned int bus;
 	unsigned int devfn;
 
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v4 25/36] nds32: Miscellaneous header files
From: Greentime Hu @ 2017-12-19  5:34 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Greentime, Linux Kernel Mailing List, linux-arch, Thomas Gleixner,
	Jason Cooper, Marc Zyngier, Rob Herring, Networking, Vincent Chen,
	DTML, Al Viro, David Howells, Will Deacon, Daniel Lezcano,
	linux-serial-u79uwXL29TY76Z2rM5mHXA, Geert Uytterhoeven,
	Linus Walleij, Mark Rutland, Greg KH, Guo Ren
In-Reply-To: <CAK8P3a1EOD3WB=c4JybBD5bazhVJ5reuegyA1Hyoj-nPHpsW6g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Hi, Arnd:

2017-12-18 19:13 GMT+08:00 Arnd Bergmann <arnd-r2nGTMty4D4@public.gmane.org>:
> On Mon, Dec 18, 2017 at 7:46 AM, Greentime Hu <green.hu-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> From: Greentime Hu <greentime-MUIXKm3Oiri1Z/+hSey0Gg@public.gmane.org>
>>
>> This patch introduces some miscellaneous header files.
>
>> +static inline void __delay(unsigned long loops)
>> +{
>> +       __asm__ __volatile__(".align 2\n"
>> +                            "1:\n"
>> +                            "\taddi\t%0, %0, -1\n"
>> +                            "\tbgtz\t%0, 1b\n"
>> +                            :"=r"(loops)
>> +                            :"0"(loops));
>> +}
>> +
>> +static inline void __udelay(unsigned long usecs, unsigned long lpj)
>> +{
>> +       usecs *= (unsigned long)(((0x8000000000000000ULL / (500000 / HZ)) +
>> +                                 0x80000000ULL) >> 32);
>> +       usecs = (unsigned long)(((unsigned long long)usecs * lpj) >> 32);
>> +       __delay(usecs);
>> +}
>
> Do you have a reliable clocksource that you can read here instead of doing the
> loop? It's generally preferred to have an accurate delay if at all possible, the
> delay loop calibration is only for those architectures that don't have any
> way to observe how much time has passed accurately.
>

We currently only have atcpit100 as clocksource but it is an IP of  SoC.
These delay API will be unavailable if we changed to another SoC
unless all these timer driver provided the same APIs.
It may suffer our customers if they forget to port these APIs in their
timer drivers when they try to use nds32 in the first beginning.
Or maybe I can use a CONFIG_USE_ACCURATE_DELAY to keep these 2
implementions for these purposes?
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Linux ECN Handling
From: Steve Ibanez @ 2017-12-19  5:16 UTC (permalink / raw)
  To: Neal Cardwell
  Cc: Eric Dumazet, Yuchung Cheng, Daniel Borkmann, Netdev,
	Florian Westphal, Mohammad Alizadeh, Lawrence Brakmo
In-Reply-To: <CADVnQy=9xxQ2XH0rKhR+kHp6J9nOvM7e=6z-AJPAiTaap0e_bA@mail.gmail.com>

Hi Neal,

I started looking into this receiver ACKing issue today. Strangely,
when I tried adding printk statements at the top of the
tcp_v4_do_rcv(), tcp_rcv_established(), __tcp_ack_snd_check() and
tcp_send_delayed_ack() functions they were never executed on the
machine running the iperf3 server (i.e. the destination of the flows).
Maybe the iperf3 server is using its own TCP stack?

In any case, the ACKing problem is reproducible using just normal
iperf for which I do see my printk statements being executed. I can
now confirm that when the CWR marked packet (for which no ACK is sent)
arrives at the receiver, the __tcp_ack_snd_check() function is never
invoked; and hence neither is the tcp_send_delayed_ack() function.
Hopefully this helps narrow down where the issue might be? I started
adding some printk statements into the tcp_rcv_established() function,
but I'm not sure where the best places to look would be so I wanted to
ask your advice on this.

In case you're interested, I instrumented the __tcp_ack_snd_check()
function with the following printk statements:

@@ -5057,9 +5117,15 @@ static inline void tcp_data_snd_check(struct sock *sk)
 /*
  * Check if sending an ack is needed.
  */
-static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible,
const struct tcphdr *th)
 {
        struct tcp_sock *tp = tcp_sk(sk);
+        struct inet_sock *inet = inet_sk(sk);

            /* More than one full frame received... */
        if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
@@ -5071,21 +5137,31 @@ static void __tcp_ack_snd_check(struct sock
*sk, int ofo_possible)
            tcp_in_quickack_mode(sk) ||
            /* We have out of order data. */
            (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
+                // SI: Debugging TCP ECN handeling
+                if (sk->sk_family == AF_INET && th->cwr) {
+                        printk("tcp_debug: __tcp_ack_snd_check:
%pI4/%u CWR set and sending ACK now - rcv_nxt=%u\n",
+                                 &inet->inet_daddr,
ntohs(inet->inet_sport), tp->rcv_nxt);
+                 }
                /* Then ack it now */
                tcp_send_ack(sk);
        } else {
+                // SI: Debugging TCP ECN handeling
+                if (sk->sk_family == AF_INET && th->cwr) {
+                        printk("tcp_debug: __tcp_ack_snd_check:
%pI4/%u CWR set and sending delayed ACK - rcv_nxt=%u\n",
+                                 &inet->inet_daddr,
ntohs(inet->inet_sport), tp->rcv_nxt);
+                 }
                /* Else, send delayed ack. */
-               tcp_send_delayed_ack(sk);
+               tcp_send_delayed_ack(sk, th);
        }
 }

In the kernel log on the receiver, I see the following sequence of
events at a timeout:

[ 2730.145023] tcp_debug: __tcp_ack_snd_check: 10.0.0.5/916 CWR set
and sending ACK now - rcv_nxt=2317949387
[ 2730.145543] tcp_debug: __tcp_ack_snd_check: 10.0.0.5/916 CWR set
and sending ACK now - rcv_nxt=2318243331 <-- last log statement before
timeout
[ 2730.452540] tcp_debug: __tcp_ack_snd_check: 10.0.0.5/916 CWR set
and sending ACK now - rcv_nxt=2318593747
[ 2730.453137] tcp_debug: __tcp_ack_snd_check: 10.0.0.5/916 CWR set
and sending ACK now - rcv_nxt=2318813843

>From the tcpdump trace at the receiver's interface I see that the last
log statement before the timeout corresponds exactly to one CWR packet
before the unACKed CWR packet. For example, in this case, the CWR
packet to which the indicated log statement corresponds has seqNo =
2318196995 and length 46336 ==> 2318196995 + 46336 = 2318243331, which
is exactly the rcv_nxt value of the indicated log statement. And then
unACKed CWR packet arrives and it is completely missing from the log
file, there is no indication of sending a delayed ACK either. Hence my
conclusion that the __tcp_ack_snd_check() function is never invoked by
the receiver upon receiving the unACKed CWR packet.

Sorry if that was long and verbose, I just wanted to be clear on what
I had done. Please do let me know if you have any questions though.

Thanks,
-Steve


On Tue, Dec 5, 2017 at 12:04 PM, Neal Cardwell <ncardwell@google.com> wrote:
> On Tue, Dec 5, 2017 at 2:36 PM, Steve Ibanez <sibanez@stanford.edu> wrote:
>> Hi Neal,
>>
>> I've included a link to small trace of 13 packets which is different
>> from the screenshot I attached in my last email, but shows the same
>> sequence of events. It's a bit hard to read the tcptrace due to the
>> 300ms timeout, so I figured this was the best approach.
>>
>> slice.pcap: https://drive.google.com/open?id=1hYXbUClHGbQv1hWG1HZWDO2WYf30N6G8
>
> Thanks for the trace! Attached is a screen shot (first screen shot is
> for the arriving packets with CWR; second is after the RTO). The
> sender behavior looks reasonable. I don't see why the receiver is not
> ACKing. As you say, it does look like a receiver bug. You could try
> adding instrumentation to try to isolate why the receiver is not
> sending an ACK immediately. You might instrument __tcp_ack_snd_check()
> and tcp_send_delayed_ack() so that when the most recent incoming
> packet had cwr set they printk to log what they are deciding in this
> case. Perhaps the tcp_send_delayed_ack()  code is hitting the max_ato
> = HZ / 2 code path?
>
> neal

^ permalink raw reply

* Re: [Patch net-next] net_sched: properly check for empty skb array on error path
From: John Fastabend @ 2017-12-19  4:54 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers
In-Reply-To: <CAM_iQpU8EuhBvfFtf0t0TCS-XKLffuF+E6PBM7JS8Y5dh5sR8w@mail.gmail.com>

On 12/18/2017 08:31 PM, Cong Wang wrote:
> On Mon, Dec 18, 2017 at 7:58 PM, John Fastabend
> <john.fastabend@gmail.com> wrote:
>> On 12/18/2017 06:20 PM, Cong Wang wrote:
>>> On Mon, Dec 18, 2017 at 5:25 PM, John Fastabend
>>> <john.fastabend@gmail.com> wrote:
>>>> On 12/18/2017 02:34 PM, Cong Wang wrote:
>>>>> First, the check of &q->ring.queue against NULL is wrong, it
>>>>> is always false. We should check the value rather than the address.
>>>>>
>>>>
>>>> Thanks.
>>>>
>>>>> Secondly, we need the same check in pfifo_fast_reset() too,
>>>>> as both ->reset() and ->destroy() are called in qdisc_destroy().
>>>>>
>>>>
>>>> not that it hurts to have the check here, but if init fails
>>>> in qdisc_create it seems only ->destroy() is called without
>>>> a ->reset().
>>>>
>>>> Is there another path for init() to fail that I'm missing.
>>>
>>> Pretty sure ->reset() is called in qdisc_destroy() and also before
>>> ->destroy():
>>>
>>
>> Except, the failed init path does not call qdisc_destroy.
>>
>> static struct Qdisc *qdisc_create(struct net_device *dev,
>> [...]
>>
>>         if (ops->init) {
>>                 err = ops->init(sch, tca[TCA_OPTIONS]);
>>                 if (err != 0)
>>                         goto err_out5;
>>         }
>> [...]
>>
>> err_out5:
>>         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
>>         if (ops->destroy)
>>                 ops->destroy(sch);
> 
> Didn't I say qdisc_destroy() rather than ->destroy()? :-)
> 

Yep, thanks for the fix.

Acked-by: John Fastabend <john.fastabend@gmail.com>

^ permalink raw reply

* RE: [PATCH net-next] netdevsim: correctly check return value of debugfs_create_dir
From: Prashant Bhole @ 2017-12-19  4:45 UTC (permalink / raw)
  To: 'Jakub Kicinski', 'David Miller'; +Cc: netdev
In-Reply-To: <20171211104029.63635671@cakuba.netronome.com>


> From: Jakub Kicinski [mailto:jakub.kicinski@netronome.com]
> 
> On Mon, 11 Dec 2017 13:46:48 +0900, Prashant Bhole wrote:
> > > From: David Miller [mailto:davem@davemloft.net]
> > >
> > > From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> > > Date: Fri,  8 Dec 2017 09:52:50 +0900
> > >
> > > > Return value is now checked with IS_ERROR_OR_NULL because
> > > > debugfs_create_dir doesn't return error value. It either returns
> > > > NULL or a valid pointer.
> > > >
> > > > Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> > > > ---
> > > >  drivers/net/netdevsim/netdev.c | 2 +-
> > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/net/netdevsim/netdev.c
> > > > b/drivers/net/netdevsim/netdev.c index eb8c679fca9f..88d8ee2c89da
> > > > 100644
> > > > --- a/drivers/net/netdevsim/netdev.c
> > > > +++ b/drivers/net/netdevsim/netdev.c
> > > > @@ -469,7 +469,7 @@ static int __init nsim_module_init(void)
> > > >  	int err;
> > > >
> > > >  	nsim_ddir = debugfs_create_dir(DRV_NAME, NULL);
> > > > -	if (IS_ERR(nsim_ddir))
> > > > +	if (IS_ERR_OR_NULL(nsim_ddir))
> > > >  		return PTR_ERR(nsim_ddir);
> > >
> > > debugfs_create_dir() should really be fixed, either it uses error
> > > pointers consistently and therefore always provides a suitable error
> > > code to return
> > or it
> > > always uses NULL.
> > >
> > > This in-between behavior makes using it as an interface painful
> > > because no
> > clear
> > > meaning is given to NULL.
> > >
> > > So please do the work necessary to make debugfs_create_dir()'s
> > > return semantics clearer and more useful.
> > >
> > > Thank you.
> >
> > Dave,
> > Thanks for comments. I will try to fix error handling in netdevsim
first.
> >
> > Jakub,
> > Let's decide with an example. The typical directory structure for
> > netdevsim interface is as below:
> > /sys/kernel/debug/netdevsim/sim0/bpf_bound_progs/
> > Please let me know if you are ok with following:
> >
> > 1) If debugfs_create_dir() fails in module_init, let's keep it fatal
> > error with corrected condition:
> > +	if (IS_ERR_OR_NULL(nsim_ddir))
> > +		return -ENOMEM;
> >
> > 2) In case sim0 or bpf_bound_progs are  fail to create, we need to add
> > checks before creating any file in them.
> 
> Fine with me, although if you fix DebugFS first you could use the real
error from
> the start here.

Jakub, Dave,
Sorry for late reply.
I tried to evaluate whether fixing return value of debugfs_create_dir() (and
friends) will be useful or not because it has not been changed since very
long time. Now I am not much convinced about changing this api. 

Important and possible error codes could be -EEXIST and -ENOMEM. Suppose
-EEXIST is returned, IMO the directory shouldn't exists in the first place
because it is specific to particular module. Also, there is no point in
creating file in such directory, because directory owner (creator) might
remove it too. This means there are less chances that api change will be
useful. Please let me know your opinion on it.

If you are ok with above explanation, shall I submit v2 for this patch?

-Prashant  

^ permalink raw reply

* Re: [Patch net-next] net_sched: properly check for empty skb array on error path
From: Cong Wang @ 2017-12-19  4:31 UTC (permalink / raw)
  To: John Fastabend; +Cc: Linux Kernel Network Developers
In-Reply-To: <4caae5f5-63e2-65c8-522e-0dfe736a7738@gmail.com>

On Mon, Dec 18, 2017 at 7:58 PM, John Fastabend
<john.fastabend@gmail.com> wrote:
> On 12/18/2017 06:20 PM, Cong Wang wrote:
>> On Mon, Dec 18, 2017 at 5:25 PM, John Fastabend
>> <john.fastabend@gmail.com> wrote:
>>> On 12/18/2017 02:34 PM, Cong Wang wrote:
>>>> First, the check of &q->ring.queue against NULL is wrong, it
>>>> is always false. We should check the value rather than the address.
>>>>
>>>
>>> Thanks.
>>>
>>>> Secondly, we need the same check in pfifo_fast_reset() too,
>>>> as both ->reset() and ->destroy() are called in qdisc_destroy().
>>>>
>>>
>>> not that it hurts to have the check here, but if init fails
>>> in qdisc_create it seems only ->destroy() is called without
>>> a ->reset().
>>>
>>> Is there another path for init() to fail that I'm missing.
>>
>> Pretty sure ->reset() is called in qdisc_destroy() and also before
>> ->destroy():
>>
>
> Except, the failed init path does not call qdisc_destroy.
>
> static struct Qdisc *qdisc_create(struct net_device *dev,
> [...]
>
>         if (ops->init) {
>                 err = ops->init(sch, tca[TCA_OPTIONS]);
>                 if (err != 0)
>                         goto err_out5;
>         }
> [...]
>
> err_out5:
>         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
>         if (ops->destroy)
>                 ops->destroy(sch);

Didn't I say qdisc_destroy() rather than ->destroy()? :-)


struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
                                const struct Qdisc_ops *ops,
                                unsigned int parentid)
{
        struct Qdisc *sch;

        if (!try_module_get(ops->owner))
                return NULL;

        sch = qdisc_alloc(dev_queue, ops);
        if (IS_ERR(sch)) {
                module_put(ops->owner);
                return NULL;
        }
        sch->parent = parentid;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
        return NULL;
}

^ permalink raw reply

* Re: [PATCH 3/3] trace: print address if symbol not found
From: Tobin C. Harding @ 2017-12-19  4:20 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: kernel-hardening, Tycho Andersen, Linus Torvalds, Kees Cook,
	Andrew Morton, Daniel Borkmann, Masahiro Yamada,
	Alexei Starovoitov, linux-kernel, Network Development
In-Reply-To: <20171218223738.49e563c0@vmware.local.home>

On Mon, Dec 18, 2017 at 10:37:38PM -0500, Steven Rostedt wrote:
> On Tue, 19 Dec 2017 14:00:11 +1100
> "Tobin C. Harding" <me@tobin.cc> wrote:
> 
> > I ran through these as outlined here for the new version (v4). This hits
> > the modified code but doesn't test symbol look up failure.
> 
> stacktrace shouldn't post non kernel values, unless there's a frame
> pointer that isn't handled by kallsyms.
> 
> As for the other two, we could probably force a failure, like:
> 
>  # echo 'hist:keys=hrtimer.sym' > \
>      events/timer/hrtimer_start/trigger
>  # cat events/timer/hrtimer_start/hist
> 
> And then just add sym-offset too.
>
> > I also configured kernel with 'Perform a startup test on ftrace' for
> > good luck.
> > 
> > Are you happy with this level of testing?
> 
> Can you try the above.

Did both and in both cases we get the addresses as hoped :)

thanks,
Tobin.

^ permalink raw reply

* [PATCH bpf] bpf: do not allow root to mangle valid pointers
From: Alexei Starovoitov @ 2017-12-19  4:15 UTC (permalink / raw)
  To: David S . Miller
  Cc: Daniel Borkmann, Jann Horn, Edward Cree, netdev, kernel-team

Do not allow root to convert valid pointers into unknown scalars.
In particular disallow:
 ptr &= reg
 ptr <<= reg
 ptr += ptr
and explicitly allow:
 ptr -= ptr
since pkt_end - pkt == length

1.
This minimizes amount of address leaks root can do.
In the future may need to further tighten the leaks with kptr_restrict.

2.
If program has such pointer math it's likely a user mistake and
when verifier complains about it right away instead of many instructions
later on invalid memory access it's easier for users to fix their progs.

3.
when register holding a pointer cannot change to scalar it allows JITs to
optimize better. Like 32-bit archs could use single register for pointers
instead of a pair required to hold 64-bit scalars.

4.
reduces architecture dependent behavior. Since code:
r1 = r10;
r1 &= 0xff;
if (r1 ...)
will behave differently arm64 vs x64 and offloaded vs native.

A significant chunk of ptr mangling was allowed by
commit f1174f77b50c ("bpf/verifier: rework value tracking")
yet some of it was allowed even earlier.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                       | 102 ++++++++++------------------
 tools/testing/selftests/bpf/test_verifier.c |  56 +++++++--------
 2 files changed, 63 insertions(+), 95 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 86dfe6b5c243..04b24876cd23 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1890,29 +1890,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 
 	if (BPF_CLASS(insn->code) != BPF_ALU64) {
 		/* 32-bit ALU ops on pointers produce (meaningless) scalars */
-		if (!env->allow_ptr_leaks)
-			verbose(env,
-				"R%d 32-bit pointer arithmetic prohibited\n",
-				dst);
+		verbose(env,
+			"R%d 32-bit pointer arithmetic prohibited\n",
+			dst);
 		return -EACCES;
 	}
 
 	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-		if (!env->allow_ptr_leaks)
-			verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
-				dst);
+		verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+			dst);
 		return -EACCES;
 	}
 	if (ptr_reg->type == CONST_PTR_TO_MAP) {
-		if (!env->allow_ptr_leaks)
-			verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
-				dst);
+		verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+			dst);
 		return -EACCES;
 	}
 	if (ptr_reg->type == PTR_TO_PACKET_END) {
-		if (!env->allow_ptr_leaks)
-			verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
-				dst);
+		verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+			dst);
 		return -EACCES;
 	}
 
@@ -1979,9 +1975,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case BPF_SUB:
 		if (dst_reg == off_reg) {
 			/* scalar -= pointer.  Creates an unknown scalar */
-			if (!env->allow_ptr_leaks)
-				verbose(env, "R%d tried to subtract pointer from scalar\n",
-					dst);
+			verbose(env, "R%d tried to subtract pointer from scalar\n",
+				dst);
 			return -EACCES;
 		}
 		/* We don't allow subtraction from FP, because (according to
@@ -1989,9 +1984,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		 * be able to deal with it.
 		 */
 		if (ptr_reg->type == PTR_TO_STACK) {
-			if (!env->allow_ptr_leaks)
-				verbose(env, "R%d subtraction from stack pointer prohibited\n",
-					dst);
+			verbose(env, "R%d subtraction from stack pointer prohibited\n",
+				dst);
 			return -EACCES;
 		}
 		if (known && (ptr_reg->off - smin_val ==
@@ -2040,19 +2034,14 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	case BPF_AND:
 	case BPF_OR:
 	case BPF_XOR:
-		/* bitwise ops on pointers are troublesome, prohibit for now.
-		 * (However, in principle we could allow some cases, e.g.
-		 * ptr &= ~3 which would reduce min_value by 3.)
-		 */
-		if (!env->allow_ptr_leaks)
-			verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
-				dst, bpf_alu_string[opcode >> 4]);
+		/* bitwise ops on pointers are troublesome, prohibit. */
+		verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
+			dst, bpf_alu_string[opcode >> 4]);
 		return -EACCES;
 	default:
 		/* other operators (e.g. MUL,LSH) produce non-pointer results */
-		if (!env->allow_ptr_leaks)
-			verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
-				dst, bpf_alu_string[opcode >> 4]);
+		verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
+			dst, bpf_alu_string[opcode >> 4]);
 		return -EACCES;
 	}
 
@@ -2308,7 +2297,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
 	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
 	u8 opcode = BPF_OP(insn->code);
-	int rc;
 
 	dst_reg = &regs[insn->dst_reg];
 	src_reg = NULL;
@@ -2319,43 +2307,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 		if (src_reg->type != SCALAR_VALUE) {
 			if (dst_reg->type != SCALAR_VALUE) {
 				/* Combining two pointers by any ALU op yields
-				 * an arbitrary scalar.
+				 * an arbitrary scalar. Disallow all math except
+				 * pointer subtraction
 				 */
-				if (!env->allow_ptr_leaks) {
-					verbose(env, "R%d pointer %s pointer prohibited\n",
-						insn->dst_reg,
-						bpf_alu_string[opcode >> 4]);
-					return -EACCES;
+				if (opcode == BPF_SUB){
+					mark_reg_unknown(env, regs, insn->dst_reg);
+					return 0;
 				}
-				mark_reg_unknown(env, regs, insn->dst_reg);
-				return 0;
+				verbose(env, "R%d pointer %s pointer prohibited\n",
+					insn->dst_reg,
+					bpf_alu_string[opcode >> 4]);
+				return -EACCES;
 			} else {
 				/* scalar += pointer
 				 * This is legal, but we have to reverse our
 				 * src/dest handling in computing the range
 				 */
-				rc = adjust_ptr_min_max_vals(env, insn,
-							     src_reg, dst_reg);
-				if (rc == -EACCES && env->allow_ptr_leaks) {
-					/* scalar += unknown scalar */
-					__mark_reg_unknown(&off_reg);
-					return adjust_scalar_min_max_vals(
-							env, insn,
-							dst_reg, off_reg);
-				}
-				return rc;
+				return adjust_ptr_min_max_vals(env, insn,
+							       src_reg, dst_reg);
 			}
 		} else if (ptr_reg) {
 			/* pointer += scalar */
-			rc = adjust_ptr_min_max_vals(env, insn,
-						     dst_reg, src_reg);
-			if (rc == -EACCES && env->allow_ptr_leaks) {
-				/* unknown scalar += scalar */
-				__mark_reg_unknown(dst_reg);
-				return adjust_scalar_min_max_vals(
-						env, insn, dst_reg, *src_reg);
-			}
-			return rc;
+			return adjust_ptr_min_max_vals(env, insn,
+						       dst_reg, src_reg);
 		}
 	} else {
 		/* Pretend the src is a reg with a known value, since we only
@@ -2364,17 +2338,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 		off_reg.type = SCALAR_VALUE;
 		__mark_reg_known(&off_reg, insn->imm);
 		src_reg = &off_reg;
-		if (ptr_reg) { /* pointer += K */
-			rc = adjust_ptr_min_max_vals(env, insn,
-						     ptr_reg, src_reg);
-			if (rc == -EACCES && env->allow_ptr_leaks) {
-				/* unknown scalar += K */
-				__mark_reg_unknown(dst_reg);
-				return adjust_scalar_min_max_vals(
-						env, insn, dst_reg, off_reg);
-			}
-			return rc;
-		}
+		if (ptr_reg) /* pointer += K */
+			return adjust_ptr_min_max_vals(env, insn,
+						       ptr_reg, src_reg);
 	}
 
 	/* Got here implies adding two SCALAR_VALUEs */
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 961c1426fbf2..b51017404c62 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -422,9 +422,7 @@ static struct bpf_test tests[] = {
 			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr_unpriv = "R1 subtraction from stack pointer",
-		.result_unpriv = REJECT,
-		.errstr = "R1 invalid mem access",
+		.errstr = "R1 subtraction from stack pointer",
 		.result = REJECT,
 	},
 	{
@@ -1859,9 +1857,8 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.result = ACCEPT,
-		.result_unpriv = REJECT,
-		.errstr_unpriv = "R1 pointer += pointer",
+		.result = REJECT,
+		.errstr = "R1 pointer += pointer",
 	},
 	{
 		"unpriv: neg pointer",
@@ -2589,7 +2586,8 @@ static struct bpf_test tests[] = {
 			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
 				    offsetof(struct __sk_buff, data)),
 			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_4),
-			BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, len)),
 			BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 49),
 			BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 49),
 			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2),
@@ -2896,7 +2894,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "invalid access to packet",
+		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
@@ -3882,9 +3880,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map2 = { 3, 11 },
-		.errstr_unpriv = "R0 pointer += pointer",
-		.errstr = "R0 invalid mem access 'inv'",
-		.result_unpriv = REJECT,
+		.errstr = "R0 pointer += pointer",
 		.result = REJECT,
 		.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 	},
@@ -3925,7 +3921,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 invalid mem access",
+		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -3946,7 +3942,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 invalid mem access",
+		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -3967,7 +3963,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R4 invalid mem access",
+		.errstr = "R4 pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS
 	},
@@ -5192,10 +5188,8 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map2 = { 3 },
-		.errstr_unpriv = "R0 bitwise operator &= on pointer",
-		.errstr = "invalid mem access 'inv'",
+		.errstr = "R0 bitwise operator &= on pointer",
 		.result = REJECT,
-		.result_unpriv = REJECT,
 	},
 	{
 		"map element value illegal alu op, 2",
@@ -5211,10 +5205,8 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map2 = { 3 },
-		.errstr_unpriv = "R0 32-bit pointer arithmetic prohibited",
-		.errstr = "invalid mem access 'inv'",
+		.errstr = "R0 32-bit pointer arithmetic prohibited",
 		.result = REJECT,
-		.result_unpriv = REJECT,
 	},
 	{
 		"map element value illegal alu op, 3",
@@ -5230,10 +5222,8 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map2 = { 3 },
-		.errstr_unpriv = "R0 pointer arithmetic with /= operator",
-		.errstr = "invalid mem access 'inv'",
+		.errstr = "R0 pointer arithmetic with /= operator",
 		.result = REJECT,
-		.result_unpriv = REJECT,
 	},
 	{
 		"map element value illegal alu op, 4",
@@ -6016,8 +6006,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map_in_map = { 3 },
-		.errstr = "R1 type=inv expected=map_ptr",
-		.errstr_unpriv = "R1 pointer arithmetic on CONST_PTR_TO_MAP prohibited",
+		.errstr = "R1 pointer arithmetic on CONST_PTR_TO_MAP prohibited",
 		.result = REJECT,
 	},
 	{
@@ -7645,6 +7634,19 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
+		"pkt_end - pkt_start is allowed",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
 		"XDP pkt read, pkt_end mangling, bad access 1",
 		.insns = {
 			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
@@ -7659,7 +7661,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "R1 offset is outside of the packet",
+		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
@@ -7678,7 +7680,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "R1 offset is outside of the packet",
+		.errstr = "R3 pointer arithmetic on PTR_TO_PACKET_END",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 9/9] selftests/bpf: add tests for recent bugfixes
From: Alexei Starovoitov @ 2017-12-19  4:12 UTC (permalink / raw)
  To: David S . Miller
  Cc: Daniel Borkmann, Jann Horn, Edward Cree, netdev, kernel-team
In-Reply-To: <20171219041201.1979983-1-ast@kernel.org>

From: Jann Horn <jannh@google.com>

These tests should cover the following cases:

 - MOV with both zero-extended and sign-extended immediates
 - implicit truncation of register contents via ALU32/MOV32
 - implicit 32-bit truncation of ALU32 output
 - oversized register source operand for ALU32 shift
 - right-shift of a number that could be positive or negative
 - map access where adding the operation size to the offset causes signed
   32-bit overflow
 - direct stack access at a ~4GiB offset

Also remove the F_LOAD_WITH_STRICT_ALIGNMENT flag from a bunch of tests
that should fail independent of what flags userspace passes.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_verifier.c | 549 +++++++++++++++++++++++++++-
 1 file changed, 533 insertions(+), 16 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index b03ecfd7185b..961c1426fbf2 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -606,7 +606,6 @@ static struct bpf_test tests[] = {
 		},
 		.errstr = "misaligned stack access",
 		.result = REJECT,
-		.flags = F_LOAD_WITH_STRICT_ALIGNMENT,
 	},
 	{
 		"invalid map_fd for function call",
@@ -1797,7 +1796,6 @@ static struct bpf_test tests[] = {
 		},
 		.result = REJECT,
 		.errstr = "misaligned stack access off (0x0; 0x0)+-8+2 size 8",
-		.flags = F_LOAD_WITH_STRICT_ALIGNMENT,
 	},
 	{
 		"PTR_TO_STACK store/load - bad alignment on reg",
@@ -1810,7 +1808,6 @@ static struct bpf_test tests[] = {
 		},
 		.result = REJECT,
 		.errstr = "misaligned stack access off (0x0; 0x0)+-10+8 size 8",
-		.flags = F_LOAD_WITH_STRICT_ALIGNMENT,
 	},
 	{
 		"PTR_TO_STACK store/load - out of bounds low",
@@ -6324,7 +6321,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6348,7 +6345,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6374,7 +6371,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R8 invalid mem access 'inv'",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6399,7 +6396,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R8 invalid mem access 'inv'",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6447,7 +6444,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6518,7 +6515,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6569,7 +6566,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6596,7 +6593,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6622,7 +6619,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6651,7 +6648,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6681,7 +6678,7 @@ static struct bpf_test tests[] = {
 			BPF_JMP_IMM(BPF_JA, 0, 0, -7),
 		},
 		.fixup_map1 = { 4 },
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 	},
 	{
@@ -6709,8 +6706,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.fixup_map1 = { 3 },
-		.errstr_unpriv = "R0 pointer comparison prohibited",
-		.errstr = "R0 min value is negative",
+		.errstr = "unbounded min value",
 		.result = REJECT,
 		.result_unpriv = REJECT,
 	},
@@ -6766,6 +6762,462 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 	},
 	{
+		"bounds check based on zero-extended MOV",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			/* r2 = 0x0000'0000'ffff'ffff */
+			BPF_MOV32_IMM(BPF_REG_2, 0xffffffff),
+			/* r2 = 0 */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 32),
+			/* no-op */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+			/* access at offset 0 */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT
+	},
+	{
+		"bounds check based on sign-extended MOV. test1",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			/* r2 = 0xffff'ffff'ffff'ffff */
+			BPF_MOV64_IMM(BPF_REG_2, 0xffffffff),
+			/* r2 = 0xffff'ffff */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 32),
+			/* r0 = <oob pointer> */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+			/* access to OOB pointer */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "map_value pointer and 4294967295",
+		.result = REJECT
+	},
+	{
+		"bounds check based on sign-extended MOV. test2",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			/* r2 = 0xffff'ffff'ffff'ffff */
+			BPF_MOV64_IMM(BPF_REG_2, 0xffffffff),
+			/* r2 = 0xfff'ffff */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36),
+			/* r0 = <oob pointer> */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+			/* access to OOB pointer */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "R0 min value is outside of the array range",
+		.result = REJECT
+	},
+	{
+		"bounds check based on reg_off + var_off + insn_off. test1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, (1 << 29) - 1),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, (1 << 29) - 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 3),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 4 },
+		.errstr = "value_size=8 off=1073741825",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"bounds check based on reg_off + var_off + insn_off. test2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, (1 << 30) - 1),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_6),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, (1 << 29) - 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 3),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 4 },
+		.errstr = "value 1073741823",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
+	{
+		"bounds check after truncation of non-boundary-crossing range",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+			/* r1 = [0x00, 0xff] */
+			BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_MOV64_IMM(BPF_REG_2, 1),
+			/* r2 = 0x10'0000'0000 */
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 36),
+			/* r1 = [0x10'0000'0000, 0x10'0000'00ff] */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2),
+			/* r1 = [0x10'7fff'ffff, 0x10'8000'00fe] */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+			/* r1 = [0x00, 0xff] */
+			BPF_ALU32_IMM(BPF_SUB, BPF_REG_1, 0x7fffffff),
+			/* r1 = 0 */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+			/* no-op */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* access at offset 0 */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT
+	},
+	{
+		"bounds check after truncation of boundary-crossing range (1)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+			/* r1 = [0x00, 0xff] */
+			BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0xffff'ff80, 0x1'0000'007f] */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0xffff'ff80, 0xffff'ffff] or
+			 *      [0x0000'0000, 0x0000'007f]
+			 */
+			BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0x00, 0xff] or
+			 *      [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff]
+			 */
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = 0 or
+			 *      [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff]
+			 */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+			/* no-op or OOB pointer computation */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* potentially OOB access */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		/* not actually fully unbounded, but the bound is very high */
+		.errstr = "R0 unbounded memory access",
+		.result = REJECT
+	},
+	{
+		"bounds check after truncation of boundary-crossing range (2)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+			/* r1 = [0x00, 0xff] */
+			BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0xffff'ff80, 0x1'0000'007f] */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0xffff'ff80, 0xffff'ffff] or
+			 *      [0x0000'0000, 0x0000'007f]
+			 * difference to previous test: truncation via MOV32
+			 * instead of ALU32.
+			 */
+			BPF_MOV32_REG(BPF_REG_1, BPF_REG_1),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = [0x00, 0xff] or
+			 *      [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff]
+			 */
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1),
+			/* r1 = 0 or
+			 *      [0x00ff'ffff'ff00'0000, 0x00ff'ffff'ffff'ffff]
+			 */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+			/* no-op or OOB pointer computation */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* potentially OOB access */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		/* not actually fully unbounded, but the bound is very high */
+		.errstr = "R0 unbounded memory access",
+		.result = REJECT
+	},
+	{
+		"bounds check after wrapping 32-bit addition",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+			/* r1 = 0x7fff'ffff */
+			BPF_MOV64_IMM(BPF_REG_1, 0x7fffffff),
+			/* r1 = 0xffff'fffe */
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+			/* r1 = 0 */
+			BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 2),
+			/* no-op */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* access at offset 0 */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT
+	},
+	{
+		"bounds check after shift with oversized count operand",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			BPF_MOV64_IMM(BPF_REG_2, 32),
+			BPF_MOV64_IMM(BPF_REG_1, 1),
+			/* r1 = (u32)1 << (u32)32 = ? */
+			BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_2),
+			/* r1 = [0x0000, 0xffff] */
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xffff),
+			/* computes unknown pointer, potentially OOB */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* potentially OOB access */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "R0 max value is outside of the array range",
+		.result = REJECT
+	},
+	{
+		"bounds check after right shift of maybe-negative number",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			/* r1 = [0x00, 0xff] */
+			BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			/* r1 = [-0x01, 0xfe] */
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1),
+			/* r1 = 0 or 0xff'ffff'ffff'ffff */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+			/* r1 = 0 or 0xffff'ffff'ffff */
+			BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+			/* computes unknown pointer, potentially OOB */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			/* potentially OOB access */
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+			/* exit */
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "R0 unbounded memory access",
+		.result = REJECT
+	},
+	{
+		"bounds check map access with off+size signed 32bit overflow. test1",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x7ffffffe),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+			BPF_JMP_A(0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "map_value pointer and 2147483646",
+		.result = REJECT
+	},
+	{
+		"bounds check map access with off+size signed 32bit overflow. test2",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+			BPF_JMP_A(0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "pointer offset 1073741822",
+		.result = REJECT
+	},
+	{
+		"bounds check map access with off+size signed 32bit overflow. test3",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 0x1fffffff),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 0x1fffffff),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 2),
+			BPF_JMP_A(0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "pointer offset -1073741822",
+		.result = REJECT
+	},
+	{
+		"bounds check map access with off+size signed 32bit overflow. test4",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_MOV64_IMM(BPF_REG_1, 1000000),
+			BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 1000000),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 2),
+			BPF_JMP_A(0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.errstr = "map_value pointer and 1000000000000",
+		.result = REJECT
+	},
+	{
+		"pointer/scalar confusion in state equality check (way 1)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+			BPF_JMP_A(1),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),
+			BPF_JMP_A(0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT,
+		.result_unpriv = REJECT,
+		.errstr_unpriv = "R0 leaks addr as return value"
+	},
+	{
+		"pointer/scalar confusion in state equality check (way 2)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),
+			BPF_JMP_A(1),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT,
+		.result_unpriv = REJECT,
+		.errstr_unpriv = "R0 leaks addr as return value"
+	},
+	{
 		"variable-offset ctx access",
 		.insns = {
 			/* Get an unknown value */
@@ -6807,6 +7259,71 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_LWT_IN,
 	},
 	{
+		"indirect variable-offset stack access",
+		.insns = {
+			/* Fill the top 8 bytes of the stack */
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			/* Get an unknown value */
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+			/* Make it small and 4-byte aligned */
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+			BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 8),
+			/* add it to fp.  We now have either fp-4 or fp-8, but
+			 * we don't know which
+			 */
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+			/* dereference it indirectly */
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 5 },
+		.errstr = "variable stack read R2",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_LWT_IN,
+	},
+	{
+		"direct stack access with 32-bit wraparound. test1",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_EXIT_INSN()
+		},
+		.errstr = "fp pointer and 2147483647",
+		.result = REJECT
+	},
+	{
+		"direct stack access with 32-bit wraparound. test2",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x3fffffff),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x3fffffff),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_EXIT_INSN()
+		},
+		.errstr = "fp pointer and 1073741823",
+		.result = REJECT
+	},
+	{
+		"direct stack access with 32-bit wraparound. test3",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x1fffffff),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x1fffffff),
+			BPF_MOV32_IMM(BPF_REG_0, 0),
+			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+			BPF_EXIT_INSN()
+		},
+		.errstr = "fp pointer offset 1073741822",
+		.result = REJECT
+	},
+	{
 		"liveness pruning and write screening",
 		.insns = {
 			/* Get an unknown value */
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 7/9] bpf: don't prune branches when a scalar is replaced with a pointer
From: Alexei Starovoitov @ 2017-12-19  4:11 UTC (permalink / raw)
  To: David S . Miller
  Cc: Daniel Borkmann, Jann Horn, Edward Cree, netdev, kernel-team
In-Reply-To: <20171219041201.1979983-1-ast@kernel.org>

From: Jann Horn <jannh@google.com>

This could be made safe by passing through a reference to env and checking
for env->allow_ptr_leaks, but it would only work one way and is probably
not worth the hassle - not doing it will not directly lead to program
rejection.

Fixes: f1174f77b50c ("bpf/verifier: rework value tracking")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 102c519836f6..982bd9ec721a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3467,15 +3467,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 			return range_within(rold, rcur) &&
 			       tnum_in(rold->var_off, rcur->var_off);
 		} else {
-			/* if we knew anything about the old value, we're not
-			 * equal, because we can't know anything about the
-			 * scalar value of the pointer in the new value.
+			/* We're trying to use a pointer in place of a scalar.
+			 * Even if the scalar was unbounded, this could lead to
+			 * pointer leaks because scalars are allowed to leak
+			 * while pointers are not. We could make this safe in
+			 * special cases if root is calling us, but it's
+			 * probably not worth the hassle.
 			 */
-			return rold->umin_value == 0 &&
-			       rold->umax_value == U64_MAX &&
-			       rold->smin_value == S64_MIN &&
-			       rold->smax_value == S64_MAX &&
-			       tnum_is_unknown(rold->var_off);
+			return false;
 		}
 	case PTR_TO_MAP_VALUE:
 		/* If the new min/max/var_off satisfy the old ones and
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 5/9] bpf: fix missing error return in check_stack_boundary()
From: Alexei Starovoitov @ 2017-12-19  4:11 UTC (permalink / raw)
  To: David S . Miller
  Cc: Daniel Borkmann, Jann Horn, Edward Cree, netdev, kernel-team
In-Reply-To: <20171219041201.1979983-1-ast@kernel.org>

From: Jann Horn <jannh@google.com>

Prevent indirect stack accesses at non-constant addresses, which would
permit reading and corrupting spilled pointers.

Fixes: f1174f77b50c ("bpf/verifier: rework value tracking")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ecdc265244ca..77e4b5223867 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1303,6 +1303,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
 		verbose(env, "invalid variable stack read R%d var_off=%s\n",
 			regno, tn_buf);
+		return -EACCES;
 	}
 	off = regs[regno].off + regs[regno].var_off.value;
 	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 8/9] bpf: fix integer overflows
From: Alexei Starovoitov @ 2017-12-19  4:12 UTC (permalink / raw)
  To: David S . Miller
  Cc: Daniel Borkmann, Jann Horn, Edward Cree, netdev, kernel-team
In-Reply-To: <20171219041201.1979983-1-ast@kernel.org>

There were various issues related to the limited size of integers used in
the verifier:
 - `off + size` overflow in __check_map_access()
 - `off + reg->off` overflow in check_mem_access()
 - `off + reg->var_off.value` overflow or 32-bit truncation of
   `reg->var_off.value` in check_mem_access()
 - 32-bit truncation in check_stack_boundary()

Make sure that any integer math cannot overflow by not allowing
pointer math with large values.

Also reduce the scope of "scalar op scalar" tracking.

Fixes: f1174f77b50c ("bpf/verifier: rework value tracking")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  4 ++--
 kernel/bpf/verifier.c        | 48 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c561b986bab0..1632bb13ad8a 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -15,11 +15,11 @@
  * In practice this is far bigger than any realistic pointer offset; this limit
  * ensures that umax_value + (int)off + (int)size cannot overflow a u64.
  */
-#define BPF_MAX_VAR_OFF	(1ULL << 31)
+#define BPF_MAX_VAR_OFF	(1 << 29)
 /* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO].  This ensures
  * that converting umax_value to int cannot overflow.
  */
-#define BPF_MAX_VAR_SIZ	INT_MAX
+#define BPF_MAX_VAR_SIZ	(1 << 29)
 
 /* Liveness marks, used for registers and spilled-regs (in stack slots).
  * Read marks propagate upwards until they find a write mark; they record that
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 982bd9ec721a..86dfe6b5c243 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1819,6 +1819,41 @@ static bool signed_sub_overflows(s64 a, s64 b)
 	return res > a;
 }
 
+static bool check_reg_sane_offset(struct bpf_verifier_env *env,
+				  const struct bpf_reg_state *reg,
+				  enum bpf_reg_type type)
+{
+	bool known = tnum_is_const(reg->var_off);
+	s64 val = reg->var_off.value;
+	s64 smin = reg->smin_value;
+
+	if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+		verbose(env, "math between %s pointer and %lld is not allowed\n",
+			reg_type_str[type], val);
+		return false;
+	}
+
+	if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
+		verbose(env, "%s pointer offset %d is not allowed\n",
+			reg_type_str[type], reg->off);
+		return false;
+	}
+
+	if (smin == S64_MIN) {
+		verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
+			reg_type_str[type]);
+		return false;
+	}
+
+	if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+		verbose(env, "value %lld makes %s pointer be out of bounds\n",
+			smin, reg_type_str[type]);
+		return false;
+	}
+
+	return true;
+}
+
 /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
  * Caller should also handle BPF_MOV case separately.
  * If we return -EACCES, caller may want to try again treating pointer as a
@@ -1887,6 +1922,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	dst_reg->type = ptr_reg->type;
 	dst_reg->id = ptr_reg->id;
 
+	if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
+	    !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
+		return -EINVAL;
+
 	switch (opcode) {
 	case BPF_ADD:
 		/* We can take a fixed offset as long as it doesn't overflow
@@ -2017,6 +2056,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		return -EACCES;
 	}
 
+	if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
+		return -EINVAL;
+
 	__update_reg_bounds(dst_reg);
 	__reg_deduce_bounds(dst_reg);
 	__reg_bound_offset(dst_reg);
@@ -2046,6 +2088,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 	src_known = tnum_is_const(src_reg.var_off);
 	dst_known = tnum_is_const(dst_reg->var_off);
 
+	if (!src_known &&
+	    opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+		__mark_reg_unknown(dst_reg);
+		return 0;
+	}
+
 	switch (opcode) {
 	case BPF_ADD:
 		if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 2/9] bpf: fix incorrect sign extension in check_alu_op()
From: Alexei Starovoitov @ 2017-12-19  4:11 UTC (permalink / raw)
  To: David S . Miller
  Cc: Daniel Borkmann, Jann Horn, Edward Cree, netdev, kernel-team
In-Reply-To: <20171219041201.1979983-1-ast@kernel.org>

From: Jann Horn <jannh@google.com>

Distinguish between
BPF_ALU64|BPF_MOV|BPF_K (load 32-bit immediate, sign-extended to 64-bit)
and BPF_ALU|BPF_MOV|BPF_K (load 32-bit immediate, zero-padded to 64-bit);
only perform sign extension in the first case.

Starting with v4.14, this is exploitable by unprivileged users as long as
the unprivileged_bpf_disabled sysctl isn't set.

Debian assigned CVE-2017-16995 for this issue.

v3:
 - add CVE number (Ben Hutchings)

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 625e358ca765..c086010ae51e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2408,7 +2408,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			 * remember the value we stored into this reg
 			 */
 			regs[insn->dst_reg].type = SCALAR_VALUE;
-			__mark_reg_known(regs + insn->dst_reg, insn->imm);
+			if (BPF_CLASS(insn->code) == BPF_ALU64) {
+				__mark_reg_known(regs + insn->dst_reg,
+						 insn->imm);
+			} else {
+				__mark_reg_known(regs + insn->dst_reg,
+						 (u32)insn->imm);
+			}
 		}

 	} else if (opcode > BPF_END) {
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 0/9] bpf: verifier security fixes
From: Alexei Starovoitov @ 2017-12-19  4:11 UTC (permalink / raw)
  To: David S . Miller
  Cc: Daniel Borkmann, Jann Horn, Edward Cree, netdev, kernel-team

This patch set addresses a set of security vulnerabilities
in bpf verifier logic discovered by Jann Horn.
All of the patches are candidates for 4.14 stable.

Alexei Starovoitov (1):
  bpf: fix integer overflows

Edward Cree (1):
  bpf/verifier: fix bounds calculation on BPF_RSH

Jann Horn (7):
  bpf: fix incorrect sign extension in check_alu_op()
  bpf: fix incorrect tracking of register size truncation
  bpf: fix 32-bit ALU op verification
  bpf: fix missing error return in check_stack_boundary()
  bpf: force strict alignment checks for stack pointers
  bpf: don't prune branches when a scalar is replaced with a pointer
  selftests/bpf: add tests for recent bugfixes

 include/linux/bpf_verifier.h                |   4 +-
 kernel/bpf/verifier.c                       | 175 ++++++---
 tools/testing/selftests/bpf/test_verifier.c | 549 +++++++++++++++++++++++++++-
 3 files changed, 661 insertions(+), 67 deletions(-)

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf 3/9] bpf: fix incorrect tracking of register size truncation
From: Alexei Starovoitov @ 2017-12-19  4:11 UTC (permalink / raw)
  To: David S . Miller
  Cc: Daniel Borkmann, Jann Horn, Edward Cree, netdev, kernel-team
In-Reply-To: <20171219041201.1979983-1-ast@kernel.org>

From: Jann Horn <jannh@google.com>

Properly handle register truncation to a smaller size.

The old code first mirrors the clearing of the high 32 bits in the bitwise
tristate representation, which is correct. But then, it computes the new
arithmetic bounds as the intersection between the old arithmetic bounds and
the bounds resulting from the bitwise tristate representation. Therefore,
when coerce_reg_to_32() is called on a number with bounds
[0xffff'fff8, 0x1'0000'0007], the verifier computes
[0xffff'fff8, 0xffff'ffff] as bounds of the truncated number.
This is incorrect: The truncated number could also be in the range [0, 7],
and no meaningful arithmetic bounds can be computed in that case apart from
the obvious [0, 0xffff'ffff].

Starting with v4.14, this is exploitable by unprivileged users as long as
the unprivileged_bpf_disabled sysctl isn't set.

Debian assigned CVE-2017-16996 for this issue.

v2:
 - flip the mask during arithmetic bounds calculation (Ben Hutchings)
v3:
 - add CVE number (Ben Hutchings)

Fixes: b03c9f9fdc37 ("bpf/verifier: track signed and unsigned min/max values")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 44 +++++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c086010ae51e..f716bdf29dd0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1067,6 +1067,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 					   strict);
 }
 
+/* truncate register to smaller size (in bytes)
+ * must be called with size < BPF_REG_SIZE
+ */
+static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
+{
+	u64 mask;
+
+	/* clear high bits in bit representation */
+	reg->var_off = tnum_cast(reg->var_off, size);
+
+	/* fix arithmetic bounds */
+	mask = ((u64)1 << (size * 8)) - 1;
+	if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
+		reg->umin_value &= mask;
+		reg->umax_value &= mask;
+	} else {
+		reg->umin_value = 0;
+		reg->umax_value = mask;
+	}
+	reg->smin_value = reg->umin_value;
+	reg->smax_value = reg->umax_value;
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -1200,9 +1223,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
 	    regs[value_regno].type == SCALAR_VALUE) {
 		/* b/h/w load zero-extends, mark upper bits as known 0 */
-		regs[value_regno].var_off =
-			tnum_cast(regs[value_regno].var_off, size);
-		__update_reg_bounds(&regs[value_regno]);
+		coerce_reg_to_size(&regs[value_regno], size);
 	}
 	return err;
 }
@@ -1772,14 +1793,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 	return 0;
 }
 
-static void coerce_reg_to_32(struct bpf_reg_state *reg)
-{
-	/* clear high 32 bits */
-	reg->var_off = tnum_cast(reg->var_off, 4);
-	/* Update bounds */
-	__update_reg_bounds(reg);
-}
-
 static bool signed_add_overflows(s64 a, s64 b)
 {
 	/* Do the add in u64, where overflow is well-defined */
@@ -2017,8 +2030,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 
 	if (BPF_CLASS(insn->code) != BPF_ALU64) {
 		/* 32-bit ALU ops are (32,32)->64 */
-		coerce_reg_to_32(dst_reg);
-		coerce_reg_to_32(&src_reg);
+		coerce_reg_to_size(dst_reg, 4);
+		coerce_reg_to_size(&src_reg, 4);
 	}
 	smin_val = src_reg.smin_value;
 	smax_val = src_reg.smax_value;
@@ -2398,10 +2411,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 					return -EACCES;
 				}
 				mark_reg_unknown(env, regs, insn->dst_reg);
-				/* high 32 bits are known zero. */
-				regs[insn->dst_reg].var_off = tnum_cast(
-						regs[insn->dst_reg].var_off, 4);
-				__update_reg_bounds(&regs[insn->dst_reg]);
+				coerce_reg_to_size(&regs[insn->dst_reg], 4);
 			}
 		} else {
 			/* case: R = imm
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf 4/9] bpf: fix 32-bit ALU op verification
From: Alexei Starovoitov @ 2017-12-19  4:11 UTC (permalink / raw)
  To: David S . Miller
  Cc: Daniel Borkmann, Jann Horn, Edward Cree, netdev, kernel-team
In-Reply-To: <20171219041201.1979983-1-ast@kernel.org>

From: Jann Horn <jannh@google.com>

32-bit ALU ops operate on 32-bit values and have 32-bit outputs.
Adjust the verifier accordingly.

Fixes: f1174f77b50c ("bpf/verifier: rework value tracking")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f716bdf29dd0..ecdc265244ca 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2017,6 +2017,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	return 0;
 }
 
+/* WARNING: This function does calculations on 64-bit values, but the actual
+ * execution may occur on 32-bit values. Therefore, things like bitshifts
+ * need extra checks in the 32-bit case.
+ */
 static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 				      struct bpf_insn *insn,
 				      struct bpf_reg_state *dst_reg,
@@ -2027,12 +2031,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 	bool src_known, dst_known;
 	s64 smin_val, smax_val;
 	u64 umin_val, umax_val;
+	u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
 
-	if (BPF_CLASS(insn->code) != BPF_ALU64) {
-		/* 32-bit ALU ops are (32,32)->64 */
-		coerce_reg_to_size(dst_reg, 4);
-		coerce_reg_to_size(&src_reg, 4);
-	}
 	smin_val = src_reg.smin_value;
 	smax_val = src_reg.smax_value;
 	umin_val = src_reg.umin_value;
@@ -2168,9 +2168,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		__update_reg_bounds(dst_reg);
 		break;
 	case BPF_LSH:
-		if (umax_val > 63) {
-			/* Shifts greater than 63 are undefined.  This includes
-			 * shifts by a negative number.
+		if (umax_val >= insn_bitness) {
+			/* Shifts greater than 31 or 63 are undefined.
+			 * This includes shifts by a negative number.
 			 */
 			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
@@ -2196,9 +2196,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		__update_reg_bounds(dst_reg);
 		break;
 	case BPF_RSH:
-		if (umax_val > 63) {
-			/* Shifts greater than 63 are undefined.  This includes
-			 * shifts by a negative number.
+		if (umax_val >= insn_bitness) {
+			/* Shifts greater than 31 or 63 are undefined.
+			 * This includes shifts by a negative number.
 			 */
 			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
@@ -2234,6 +2234,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		break;
 	}
 
+	if (BPF_CLASS(insn->code) != BPF_ALU64) {
+		/* 32-bit ALU ops are (32,32)->32 */
+		coerce_reg_to_size(dst_reg, 4);
+		coerce_reg_to_size(&src_reg, 4);
+	}
+
 	__reg_deduce_bounds(dst_reg);
 	__reg_bound_offset(dst_reg);
 	return 0;
-- 
2.9.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox