* [PATCH bpf 0/11] bpf: more sock_ops callbacks
@ 2017-12-19 6:21 Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent Lawrence Brakmo
` (10 more replies)
0 siblings, 11 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
This patchset adds support for:
- direct R or R/W access to many tcp_sock fields
- passing up to 4 arguments to sock_ops BPF functions
- tcp_sock field bpf_sock_ops_flags for controlling callbacks
- optionally calling sock_ops BPF program when RTO fires
- optionally calling sock_ops BPF program when packet is retransmitted
- optionally calling sock_ops BPF program when TCP state changes
- access to tclass and sk_txhash
- new selftest
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Consists of the following patches:
[PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent
[PATCH bpf 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent
[PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields
[PATCH bpf 04/11] bpf: Support passing args to sock_ops bpf function
[PATCH bpf 05/11] bpf: Adds field bpf_sock_ops_flags to tcp_sock
[PATCH bpf 06/11] bpf: Add sock_ops RTO callback
[PATCH bpf 07/11] bpf: Add support for reading sk_state and more
[PATCH bpf 08/11] bpf: Add sock_ops R/W access to tclass & sk_txhash
[PATCH bpf 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB
[PATCH bpf 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB
[PATCH bpf 11/11] bpf: add selftest for tcpbpf
include/linux/filter.h | 4 +
include/linux/tcp.h | 8 ++
include/net/tcp.h | 66 +++++++++-
include/uapi/linux/bpf.h | 39 +++++-
include/uapi/linux/tcp.h | 5 +
net/core/filter.c | 212 ++++++++++++++++++++++++++++++--
net/ipv4/tcp.c | 4 +-
net/ipv4/tcp_nv.c | 2 +-
net/ipv4/tcp_output.c | 5 +-
net/ipv4/tcp_timer.c | 9 ++
tools/include/uapi/linux/bpf.h | 45 ++++++-
tools/testing/selftests/bpf/Makefile | 5 +-
tools/testing/selftests/bpf/tcp_client.py | 57 +++++++++
tools/testing/selftests/bpf/tcp_server.py | 83 +++++++++++++
tools/testing/selftests/bpf/test_tcpbpf_kern.c | 133 ++++++++++++++++++++
tools/testing/selftests/bpf/test_tcpbpf_user.c | 119 ++++++++++++++++++
16 files changed, 772 insertions(+), 24 deletions(-)
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
@ 2017-12-19 6:21 ` Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent Lawrence Brakmo
` (9 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
Make SOCK_OPS_GET_TCP helper macro size independent (before it only worked
with 4-byte fields).
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
net/core/filter.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 754abe1..d47d126 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4448,9 +4448,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
/* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP32(FIELD_NAME) \
+#define SOCK_OPS_GET_TCP(FIELD_NAME) \
do { \
- BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) != 4); \
+ BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) > \
+ FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
@@ -4462,16 +4463,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
struct bpf_sock_ops_kern, sk),\
si->dst_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, sk));\
- *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, \
+ *insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock, \
+ FIELD_NAME), si->dst_reg, \
+ si->dst_reg, \
offsetof(struct tcp_sock, FIELD_NAME)); \
} while (0)
case offsetof(struct bpf_sock_ops, snd_cwnd):
- SOCK_OPS_GET_TCP32(snd_cwnd);
+ SOCK_OPS_GET_TCP(snd_cwnd);
break;
case offsetof(struct bpf_sock_ops, srtt_us):
- SOCK_OPS_GET_TCP32(srtt_us);
+ SOCK_OPS_GET_TCP(srtt_us);
break;
}
return insn - insn_buf;
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH bpf 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent Lawrence Brakmo
@ 2017-12-19 6:21 ` Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields Lawrence Brakmo
` (8 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
Changed SOCK_OPS_GET_TCP to SOCK_OPS_GET_FIELD and added a new
argument so now it can also work with struct sock fields.
Previous: SOCK_OPS_GET_TCP(FIELD_NAME)
New: SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ)
Where OBJ is either "struct tcp_sock" or "struct sock" (without
quotation). Assumes FIELD_NAME is a field in the struct
bpf_sock_ops and in the OBJ specified.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
net/core/filter.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index d47d126..f808269 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4447,10 +4447,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
is_fullsock));
break;
-/* Helper macro for adding read access to tcp_sock fields. */
-#define SOCK_OPS_GET_TCP(FIELD_NAME) \
+/* Helper macro for adding read access to tcp_sock or sock fields. */
+#define SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ) \
do { \
- BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD_NAME) > \
+ BUILD_BUG_ON(FIELD_SIZEOF(OBJ, FIELD_NAME) > \
FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
@@ -4463,18 +4463,18 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
struct bpf_sock_ops_kern, sk),\
si->dst_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, sk));\
- *insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock, \
- FIELD_NAME), si->dst_reg, \
- si->dst_reg, \
- offsetof(struct tcp_sock, FIELD_NAME)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \
+ FIELD_NAME), \
+ si->dst_reg, si->dst_reg, \
+ offsetof(OBJ, FIELD_NAME)); \
} while (0)
case offsetof(struct bpf_sock_ops, snd_cwnd):
- SOCK_OPS_GET_TCP(snd_cwnd);
+ SOCK_OPS_GET_FIELD(snd_cwnd, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, srtt_us):
- SOCK_OPS_GET_TCP(srtt_us);
+ SOCK_OPS_GET_FIELD(srtt_us, struct tcp_sock);
break;
}
return insn - insn_buf;
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent Lawrence Brakmo
@ 2017-12-19 6:21 ` Lawrence Brakmo
2017-12-20 0:51 ` Daniel Borkmann
2017-12-20 1:10 ` Alexei Starovoitov
2017-12-19 6:21 ` [PATCH bpf 04/11] bpf: Support passing args to sock_ops bpf function Lawrence Brakmo
` (7 subsequent siblings)
10 siblings, 2 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to
struct tcp_sock or struct sock fields. This required adding a new
field "temp" to struct bpf_sock_ops_kern for temporary storage that
is used by sock_ops_convert_ctx_access. It is used to store and recover
the contents of a register, so the register can be used to store the
address of the sk. Since we cannot overwrite the dst_reg because it
contains the pointer to ctx, nor the src_reg since it contains the value
we want to store, we need an extra register to contain the address
of the sk.
Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the
GET or SET macros depending on the value of the TYPE field.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/linux/filter.h | 3 +++
include/net/tcp.h | 2 +-
net/core/filter.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 50 insertions(+), 1 deletion(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5feb441..8929162 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -987,6 +987,9 @@ struct bpf_sock_ops_kern {
u32 replylong[4];
};
u32 is_fullsock;
+ u64 temp; /* Used by sock_ops_convert_ctx_access
+ * as temporary storage of a register
+ */
};
#endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6cc205c..e0213f1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2011,7 +2011,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
struct bpf_sock_ops_kern sock_ops;
int ret;
- memset(&sock_ops, 0, sizeof(sock_ops));
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, is_fullsock));
if (sk_fullsock(sk)) {
sock_ops.is_fullsock = 1;
sock_owned_by_me(sk);
diff --git a/net/core/filter.c b/net/core/filter.c
index f808269..97e65df 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4469,6 +4469,52 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
offsetof(OBJ, FIELD_NAME)); \
} while (0)
+/* Helper macro for adding write access to tcp_sock or sock fields.
+ * The macro is called with two registers, dst_reg which contains a pointer
+ * to ctx (context) and src_reg which contains the value that should be
+ * stored. However, we need an additional register since we cannot overwrite
+ * dst_reg because it may be used later in the program.
+ * Instead we "borrow" one of the other register. We first save its value
+ * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
+ * it at the end of the macro.
+ */
+#define SOCK_OPS_SET_FIELD(FIELD_NAME, OBJ) \
+ do { \
+ int reg = BPF_REG_9; \
+ BUILD_BUG_ON(FIELD_SIZEOF(OBJ, FIELD_NAME) > \
+ FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME)); \
+ while (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_fullsock), \
+ reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_fullsock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, FIELD_NAME), \
+ reg, si->src_reg, \
+ offsetof(OBJ, FIELD_NAME)); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ } while (0)
+
+#define SOCK_OPS_GET_OR_SET_FIELD(FIELD_NAME, OBJ, TYPE) \
+ do { \
+ if (TYPE == BPF_WRITE) \
+ SOCK_OPS_SET_FIELD(FIELD_NAME, OBJ); \
+ else \
+ SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ); \
+ } while (0)
+
case offsetof(struct bpf_sock_ops, snd_cwnd):
SOCK_OPS_GET_FIELD(snd_cwnd, struct tcp_sock);
break;
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH bpf 04/11] bpf: Support passing args to sock_ops bpf function
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
` (2 preceding siblings ...)
2017-12-19 6:21 ` [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields Lawrence Brakmo
@ 2017-12-19 6:21 ` Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 05/11] bpf: Adds field bpf_sock_ops_flags to tcp_sock Lawrence Brakmo
` (6 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
Adds support for passing up to 4 arguments to sock_ops bpf functions. It
reuses the reply union, so the bpf_sock_ops structures are not
increased in size.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/linux/filter.h | 1 +
include/net/tcp.h | 64 ++++++++++++++++++++++++++++++++++++++++++++----
include/uapi/linux/bpf.h | 5 ++--
net/ipv4/tcp.c | 2 +-
net/ipv4/tcp_nv.c | 2 +-
net/ipv4/tcp_output.c | 2 +-
6 files changed, 66 insertions(+), 10 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 8929162..2a09f27 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -983,6 +983,7 @@ struct bpf_sock_ops_kern {
struct sock *sk;
u32 op;
union {
+ u32 args[4];
u32 reply;
u32 replylong[4];
};
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e0213f1..c262be6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2006,7 +2006,7 @@ void tcp_cleanup_ulp(struct sock *sk);
* program loaded).
*/
#ifdef CONFIG_BPF
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
struct bpf_sock_ops_kern sock_ops;
int ret;
@@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
sock_ops.sk = sk;
sock_ops.op = op;
+ if (nargs > 0)
+ memcpy(sock_ops.args, args, nargs*sizeof(u32));
ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
if (ret == 0)
@@ -2027,18 +2029,70 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
ret = -1;
return ret;
}
+
+static inline int tcp_call_bpf_1arg(struct sock *sk, int op, u32 arg)
+{
+ return tcp_call_bpf(sk, op, 1, &arg);
+}
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+ u32 args[2] = {arg1, arg2};
+
+ return tcp_call_bpf(sk, op, 2, args);
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+ u32 arg3)
+{
+ u32 args[3] = {arg1, arg2, arg3};
+
+ return tcp_call_bpf(sk, op, 3, args);
+}
+
+static inline int tcp_call_bpf_4arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+ u32 arg3, u32 arg4)
+{
+ u32 args[4] = {arg1, arg2, arg3, arg4};
+
+ return tcp_call_bpf(sk, op, 4, args);
+}
+
#else
-static inline int tcp_call_bpf(struct sock *sk, int op)
+static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
{
return -EPERM;
}
+
+static inline int tcp_call_bpf_1arg(struct sock *sk, int op, u32 arg)
+{
+ return -EPERM;
+}
+
+static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2)
+{
+ return -EPERM;
+}
+
+static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+ u32 arg3)
+{
+ return -EPERM;
+}
+
+static inline int tcp_call_bpf_4arg(struct sock *sk, int op, u32 arg1, u32 arg2,
+ u32 arg3, u32 arg4)
+{
+ return -EPERM;
+}
+
#endif
static inline u32 tcp_timeout_init(struct sock *sk)
{
int timeout;
- timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT);
+ timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
if (timeout <= 0)
timeout = TCP_TIMEOUT_INIT;
@@ -2049,7 +2103,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
{
int rwnd;
- rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT);
+ rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
if (rwnd < 0)
rwnd = 0;
@@ -2058,7 +2112,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
{
- return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+ return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
}
#if IS_ENABLED(CONFIG_SMC)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 595bda1..addd849 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -936,8 +936,9 @@ struct bpf_map_info {
struct bpf_sock_ops {
__u32 op;
union {
- __u32 reply;
- __u32 replylong[4];
+ __u32 args[4]; /* Optionally passed to bpf program */
+ __u32 reply; /* Returned by bpf program */
+ __u32 replylong[4]; /* Optionally returned by bpf prog */
};
__u32 family;
__u32 remote_ip4; /* Stored in network byte order */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1803116..817df3f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -465,7 +465,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
tcp_mtup_init(sk);
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_metrics(sk);
- tcp_call_bpf(sk, bpf_op);
+ tcp_call_bpf(sk, bpf_op, 0, NULL);
tcp_init_congestion_control(sk);
tcp_init_buffer_space(sk);
}
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 0b5a05b..ddbce73 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -146,7 +146,7 @@ static void tcpnv_init(struct sock *sk)
* within a datacenter, where we have reasonable estimates of
* RTTs
*/
- base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT);
+ base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT, 0, NULL);
if (base_rtt > 0) {
ca->nv_base_rtt = base_rtt;
ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a4d214c..50cb242 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3471,7 +3471,7 @@ int tcp_connect(struct sock *sk)
struct sk_buff *buff;
int err;
- tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);
+ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH bpf 05/11] bpf: Adds field bpf_sock_ops_flags to tcp_sock
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
` (3 preceding siblings ...)
2017-12-19 6:21 ` [PATCH bpf 04/11] bpf: Support passing args to sock_ops bpf function Lawrence Brakmo
@ 2017-12-19 6:21 ` Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 06/11] bpf: Add sock_ops RTO callback Lawrence Brakmo
` (5 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
Adds field bpf_sock_ops_flags to tcp_sock and bpf_sock_ops. Its primary
use is to determine if there should be calls to sock_ops bpf program at
various points in the TCP code. The field is initialized to zero,
disabling the calls. A sock_ops BPF program can set, per connection and
as necessary, when the connection is established.
It also adds support for reading and writing the field within a
sock_ops BPF program.
Examples of where to call the bpf program:
1) When RTO fires
2) When a packet is retransmitted
3) When the connection terminates
4) When a packet is sent
5) When a packet is received
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/linux/tcp.h | 8 ++++++++
include/uapi/linux/bpf.h | 1 +
net/core/filter.c | 6 ++++++
3 files changed, 15 insertions(+)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index df5d97a..c46553f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -372,6 +372,14 @@ struct tcp_sock {
*/
struct request_sock *fastopen_rsk;
u32 *saved_syn;
+
+/* Sock_ops bpf program related variables */
+#ifdef CONFIG_BPF
+ u32 bpf_sock_ops_flags; /* values defined in uapi/linux/tcp.h */
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_flags & ARG)
+#else
+#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
+#endif
};
enum tsq_enum {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index addd849..dfbf43a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -953,6 +953,7 @@ struct bpf_sock_ops {
*/
__u32 snd_cwnd;
__u32 srtt_us; /* Averaged RTT << 3 in usecs */
+ __u32 bpf_sock_ops_flags; /* flags defined in uapi/linux/tcp.h */
};
/* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 97e65df..2692514 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3842,6 +3842,7 @@ static bool sock_ops_is_valid_access(int off, int size,
switch (off) {
case offsetof(struct bpf_sock_ops, op) ...
offsetof(struct bpf_sock_ops, replylong[3]):
+ case offsetof(struct bpf_sock_ops, bpf_sock_ops_flags):
break;
default:
return false;
@@ -4522,6 +4523,11 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct bpf_sock_ops, srtt_us):
SOCK_OPS_GET_FIELD(srtt_us, struct tcp_sock);
break;
+
+ case offsetof(struct bpf_sock_ops, bpf_sock_ops_flags):
+ SOCK_OPS_GET_OR_SET_FIELD(bpf_sock_ops_flags, struct tcp_sock,
+ type);
+ break;
}
return insn - insn_buf;
}
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH bpf 06/11] bpf: Add sock_ops RTO callback
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
` (4 preceding siblings ...)
2017-12-19 6:21 ` [PATCH bpf 05/11] bpf: Adds field bpf_sock_ops_flags to tcp_sock Lawrence Brakmo
@ 2017-12-19 6:21 ` Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 07/11] bpf: Add support for reading sk_state and more Lawrence Brakmo
` (4 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
Adds an optional call to sock_ops BPF program based on whether the
BPF_SOCK_OPS_RTO_CB_FLAG is set in bpf_sock_ops_flags.
The BPF program is passed 3 arguments: the value of icsk_retransmits, the
value of icsk_rto, and whether the RTO has expired.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/uapi/linux/bpf.h | 5 +++++
include/uapi/linux/tcp.h | 3 +++
net/ipv4/tcp_timer.c | 9 +++++++++
3 files changed, 17 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dfbf43a..1c36795 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -989,6 +989,11 @@ enum {
* a congestion threshold. RTTs above
* this indicate congestion
*/
+ BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered.
+ * Arg1: value of icsk_retransmits
+ * Arg2: value of icsk_rto
+ * Arg3: whether RTO has expired
+ */
};
#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index b4a4f64..089c19e 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -259,6 +259,9 @@ struct tcp_md5sig {
__u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */
};
+/* Definitions for bpf_sock_ops_flags */
+#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0)
+
/* INET_DIAG_MD5SIG */
struct tcp_diag_md5sig {
__u8 tcpm_family;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 16df6dd..e6afd93 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -230,9 +230,18 @@ static int tcp_write_timeout(struct sock *sk)
}
if (expired) {
/* Has it gone just too far? */
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
+ tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
+ icsk->icsk_retransmits,
+ icsk->icsk_rto, 1);
tcp_write_err(sk);
return 1;
}
+
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
+ tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
+ icsk->icsk_retransmits,
+ icsk->icsk_rto, 0);
return 0;
}
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH bpf 07/11] bpf: Add support for reading sk_state and more
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
` (5 preceding siblings ...)
2017-12-19 6:21 ` [PATCH bpf 06/11] bpf: Add sock_ops RTO callback Lawrence Brakmo
@ 2017-12-19 6:21 ` Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 08/11] bpf: Add sock_ops R/W access to tclass & sk_txhash Lawrence Brakmo
` (3 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
Add support for reading many more tcp_sock fields
state, same as sk->sk_state
rtt_min same as sk->rtt_min.s[0].v (current rtt_min)
snd_ssthresh
rcv_nxt
snd_nxt
snd_una
mss_cache
ecn_flags
rate_delivered
rate_interval_us
packets_out
retrans_out
total_retrans
segs_in
data_segs_in
segs_out
data_segs_out
bytes_received (__u64)
bytes_acked (__u64)
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/uapi/linux/bpf.h | 19 ++++++++++
net/core/filter.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 114 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1c36795..19a0b1b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -954,6 +954,25 @@ struct bpf_sock_ops {
__u32 snd_cwnd;
__u32 srtt_us; /* Averaged RTT << 3 in usecs */
__u32 bpf_sock_ops_flags; /* flags defined in uapi/linux/tcp.h */
+ __u32 state;
+ __u32 rtt_min;
+ __u32 snd_ssthresh;
+ __u32 rcv_nxt;
+ __u32 snd_nxt;
+ __u32 snd_una;
+ __u32 mss_cache;
+ __u32 ecn_flags;
+ __u32 rate_delivered;
+ __u32 rate_interval_us;
+ __u32 packets_out;
+ __u32 retrans_out;
+ __u32 total_retrans;
+ __u32 segs_in;
+ __u32 data_segs_in;
+ __u32 segs_out;
+ __u32 data_segs_out;
+ __u64 bytes_received;
+ __u64 bytes_acked;
};
/* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 2692514..2628077 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3828,7 +3828,7 @@ static bool __is_valid_sock_ops_access(int off, int size)
/* The verifier guarantees that size > 0. */
if (off % size != 0)
return false;
- if (size != sizeof(__u32))
+ if (size != sizeof(__u32) && size != sizeof(__u64))
return false;
return true;
@@ -4448,6 +4448,32 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
is_fullsock));
break;
+ case offsetof(struct bpf_sock_ops, state):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_state));
+ break;
+
+ case offsetof(struct bpf_sock_ops, rtt_min):
+ BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+ sizeof(struct minmax));
+ BUILD_BUG_ON(sizeof(struct minmax) <
+ sizeof(struct minmax_sample));
+
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct tcp_sock, rtt_min) +
+ FIELD_SIZEOF(struct minmax_sample, t));
+ break;
+
/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(FIELD_NAME, OBJ) \
do { \
@@ -4528,6 +4554,74 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_OR_SET_FIELD(bpf_sock_ops_flags, struct tcp_sock,
type);
break;
+
+ case offsetof(struct bpf_sock_ops, snd_ssthresh):
+ SOCK_OPS_GET_FIELD(snd_ssthresh, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, rcv_nxt):
+ SOCK_OPS_GET_FIELD(rcv_nxt, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, snd_nxt):
+ SOCK_OPS_GET_FIELD(snd_nxt, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, snd_una):
+ SOCK_OPS_GET_FIELD(snd_una, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, mss_cache):
+ SOCK_OPS_GET_FIELD(mss_cache, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, ecn_flags):
+ SOCK_OPS_GET_FIELD(ecn_flags, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, rate_delivered):
+ SOCK_OPS_GET_FIELD(rate_delivered, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, rate_interval_us):
+ SOCK_OPS_GET_FIELD(rate_interval_us, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, packets_out):
+ SOCK_OPS_GET_FIELD(packets_out, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, retrans_out):
+ SOCK_OPS_GET_FIELD(retrans_out, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, total_retrans):
+ SOCK_OPS_GET_FIELD(total_retrans, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, segs_in):
+ SOCK_OPS_GET_FIELD(segs_in, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, data_segs_in):
+ SOCK_OPS_GET_FIELD(data_segs_in, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, segs_out):
+ SOCK_OPS_GET_FIELD(segs_out, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, data_segs_out):
+ SOCK_OPS_GET_FIELD(data_segs_out, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, bytes_received):
+ SOCK_OPS_GET_FIELD(bytes_received, struct tcp_sock);
+ break;
+
+ case offsetof(struct bpf_sock_ops, bytes_acked):
+ SOCK_OPS_GET_FIELD(bytes_acked, struct tcp_sock);
+ break;
}
return insn - insn_buf;
}
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH bpf 08/11] bpf: Add sock_ops R/W access to tclass & sk_txhash
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
` (6 preceding siblings ...)
2017-12-19 6:21 ` [PATCH bpf 07/11] bpf: Add support for reading sk_state and more Lawrence Brakmo
@ 2017-12-19 6:21 ` Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB Lawrence Brakmo
` (2 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
Adds direct R/W access to sk_txhash and access to tclass for ipv6 flows
through getsockopt and setsockopt. Sample usage for tclass:
bpf_getsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v, sizeof(v))
where skops is a pointer to the ctx (struct bpf_sock_ops).
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/uapi/linux/bpf.h | 1 +
net/core/filter.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 47 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 19a0b1b..fe2b692 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -973,6 +973,7 @@ struct bpf_sock_ops {
__u32 data_segs_out;
__u64 bytes_received;
__u64 bytes_acked;
+ __u32 sk_txhash;
};
/* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 2628077..5cb2b70 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3229,6 +3229,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
ret = -EINVAL;
}
#ifdef CONFIG_INET
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (level == SOL_IPV6) {
+ if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+ return -EINVAL;
+
+ val = *((int *)optval);
+ /* Only some options are supported */
+ switch (optname) {
+ case IPV6_TCLASS:
+ if (val < -1 || val > 0xff) {
+ ret = -EINVAL;
+ } else {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (val == -1)
+ val = 0;
+ np->tclass = val;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+#endif
} else if (level == SOL_TCP &&
sk->sk_prot->setsockopt == tcp_setsockopt) {
if (optname == TCP_CONGESTION) {
@@ -3238,7 +3261,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
strncpy(name, optval, min_t(long, optlen,
TCP_CA_NAME_MAX-1));
name[TCP_CA_NAME_MAX-1] = 0;
- ret = tcp_set_congestion_control(sk, name, false, reinit);
+ ret = tcp_set_congestion_control(sk, name, false,
+ reinit);
} else {
struct tcp_sock *tp = tcp_sk(sk);
@@ -3304,6 +3328,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
} else {
goto err_clear;
}
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (level == SOL_IPV6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+ goto err_clear;
+
+ /* Only some options are supported */
+ switch (optname) {
+ case IPV6_TCLASS:
+ *((int *)optval) = (int)np->tclass;
+ break;
+ default:
+ goto err_clear;
+ }
+#endif
} else {
goto err_clear;
}
@@ -3843,6 +3883,7 @@ static bool sock_ops_is_valid_access(int off, int size,
case offsetof(struct bpf_sock_ops, op) ...
offsetof(struct bpf_sock_ops, replylong[3]):
case offsetof(struct bpf_sock_ops, bpf_sock_ops_flags):
+ case offsetof(struct bpf_sock_ops, sk_txhash):
break;
default:
return false;
@@ -4622,6 +4663,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct bpf_sock_ops, bytes_acked):
SOCK_OPS_GET_FIELD(bytes_acked, struct tcp_sock);
break;
+
+ case offsetof(struct bpf_sock_ops, sk_txhash):
+ SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, struct sock, type);
+ break;
}
return insn - insn_buf;
}
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH bpf 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
` (7 preceding siblings ...)
2017-12-19 6:21 ` [PATCH bpf 08/11] bpf: Add sock_ops R/W access to tclass & sk_txhash Lawrence Brakmo
@ 2017-12-19 6:21 ` Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB Lawrence Brakmo
2017-12-19 6:22 ` [PATCH bpf 11/11] bpf: add selftest for tcpbpf Lawrence Brakmo
10 siblings, 0 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
Adds support for calling sock_ops BPF program when there is a
retransmission. Two arguments are used; one for the sequence number and
the other for the number of segments retransmitted. Does not include syn-ack
retransmissions.
New op: BPF_SOCK_OPS_RETRANS_CB.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/uapi/linux/bpf.h | 4 ++++
include/uapi/linux/tcp.h | 1 +
net/ipv4/tcp_output.c | 3 +++
3 files changed, 8 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fe2b692..7165619 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1014,6 +1014,10 @@ enum {
* Arg2: value of icsk_rto
* Arg3: whether RTO has expired
*/
+ BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted.
+ * Arg1: sequence number of 1st byte
+ * Arg2: # segments
+ */
};
#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 089c19e..dc36d3c 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -261,6 +261,7 @@ struct tcp_md5sig {
/* Definitions for bpf_sock_ops_flags */
#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0)
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1)
/* INET_DIAG_MD5SIG */
struct tcp_diag_md5sig {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 50cb242..b8ad088 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2910,6 +2910,9 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (likely(!err)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
trace_tcp_retransmit_skb(sk, skb);
+ if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
+ tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RETRANS_CB,
+ TCP_SKB_CB(skb)->seq, segs);
} else if (err != -EBUSY) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH bpf 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
` (8 preceding siblings ...)
2017-12-19 6:21 ` [PATCH bpf 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB Lawrence Brakmo
@ 2017-12-19 6:21 ` Lawrence Brakmo
2017-12-19 6:22 ` [PATCH bpf 11/11] bpf: add selftest for tcpbpf Lawrence Brakmo
10 siblings, 0 replies; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:21 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
Adds support for calling sock_ops BPF program when there is a TCP state
change. Two arguments are used; one for the old state and another for
the new state.
New op: BPF_SOCK_OPS_STATE_CB.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
include/uapi/linux/bpf.h | 4 ++++
include/uapi/linux/tcp.h | 1 +
net/ipv4/tcp.c | 2 ++
3 files changed, 7 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7165619..b018d6f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1018,6 +1018,10 @@ enum {
* Arg1: sequence number of 1st byte
* Arg2: # segments
*/
+ BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state.
+ * Arg1: old_state
+ * Arg2: new_state
+ */
};
#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index dc36d3c..211322c 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -262,6 +262,7 @@ struct tcp_md5sig {
/* Definitions for bpf_sock_ops_flags */
#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0)
#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1)
+#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2)
/* INET_DIAG_MD5SIG */
struct tcp_diag_md5sig {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 817df3f..e70dd2f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2041,6 +2041,8 @@ void tcp_set_state(struct sock *sk, int state)
int oldstate = sk->sk_state;
trace_tcp_set_state(sk, oldstate, state);
+ if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+ tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
switch (state) {
case TCP_ESTABLISHED:
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH bpf 11/11] bpf: add selftest for tcpbpf
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
` (9 preceding siblings ...)
2017-12-19 6:21 ` [PATCH bpf 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB Lawrence Brakmo
@ 2017-12-19 6:22 ` Lawrence Brakmo
2017-12-20 1:34 ` Alexei Starovoitov
10 siblings, 1 reply; 15+ messages in thread
From: Lawrence Brakmo @ 2017-12-19 6:22 UTC (permalink / raw)
To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Daniel Borkmann
Added a selftest for tcpbpf (sock_ops) that checks that the appropriate
callbacks occurred, that it can access tcp_sock fields, and that their
values are correct.
Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
tools/include/uapi/linux/bpf.h | 45 ++++++++-
tools/testing/selftests/bpf/Makefile | 5 +-
tools/testing/selftests/bpf/tcp_client.py | 57 +++++++++++
tools/testing/selftests/bpf/tcp_server.py | 83 +++++++++++++++
tools/testing/selftests/bpf/test_tcpbpf_kern.c | 133 +++++++++++++++++++++++++
tools/testing/selftests/bpf/test_tcpbpf_user.c | 119 ++++++++++++++++++++++
6 files changed, 438 insertions(+), 4 deletions(-)
create mode 100755 tools/testing/selftests/bpf/tcp_client.py
create mode 100755 tools/testing/selftests/bpf/tcp_server.py
create mode 100644 tools/testing/selftests/bpf/test_tcpbpf_kern.c
create mode 100644 tools/testing/selftests/bpf/test_tcpbpf_user.c
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cf446c2..b018d6f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -936,8 +936,9 @@ struct bpf_map_info {
struct bpf_sock_ops {
__u32 op;
union {
- __u32 reply;
- __u32 replylong[4];
+ __u32 args[4]; /* Optionally passed to bpf program */
+ __u32 reply; /* Returned by bpf program */
+ __u32 replylong[4]; /* Optionally returned by bpf prog */
};
__u32 family;
__u32 remote_ip4; /* Stored in network byte order */
@@ -946,6 +947,33 @@ struct bpf_sock_ops {
__u32 local_ip6[4]; /* Stored in network byte order */
__u32 remote_port; /* Stored in network byte order */
__u32 local_port; /* stored in host byte order */
+ __u32 is_fullsock; /* Some TCP fields are only valid if
+ * there is a full socket. If not, the
+ * fields read as zero.
+ */
+ __u32 snd_cwnd;
+ __u32 srtt_us; /* Averaged RTT << 3 in usecs */
+ __u32 bpf_sock_ops_flags; /* flags defined in uapi/linux/tcp.h */
+ __u32 state;
+ __u32 rtt_min;
+ __u32 snd_ssthresh;
+ __u32 rcv_nxt;
+ __u32 snd_nxt;
+ __u32 snd_una;
+ __u32 mss_cache;
+ __u32 ecn_flags;
+ __u32 rate_delivered;
+ __u32 rate_interval_us;
+ __u32 packets_out;
+ __u32 retrans_out;
+ __u32 total_retrans;
+ __u32 segs_in;
+ __u32 data_segs_in;
+ __u32 segs_out;
+ __u32 data_segs_out;
+ __u64 bytes_received;
+ __u64 bytes_acked;
+ __u32 sk_txhash;
};
/* List of known BPF sock_ops operators.
@@ -981,6 +1009,19 @@ enum {
* a congestion threshold. RTTs above
* this indicate congestion
*/
+ BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered.
+ * Arg1: value of icsk_retransmits
+ * Arg2: value of icsk_rto
+ * Arg3: whether RTO has expired
+ */
+ BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted.
+ * Arg1: sequence number of 1st byte
+ * Arg2: # segments
+ */
+ BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state.
+ * Arg1: old_state
+ * Arg2: new_state
+ */
};
#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 255fb1f..f3632b2 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -13,11 +13,12 @@ CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../i
LDLIBS += -lcap -lelf
TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
- test_align test_verifier_log test_dev_cgroup
+ test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \
- sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o
+ sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
+ test_tcpbpf_kern.o
TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
test_offload.py
diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py
new file mode 100755
index 0000000..de5b0e6
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_client.py
@@ -0,0 +1,57 @@
+#!/usr/local/bin/python
+#
+# Copyright (c) 2017 Facebook
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):
+ buf = ''
+ while len(buf) < n:
+ rem = n - len(buf)
+ try: s = sock.recv(rem)
+ except (socket.error), e: return ''
+ buf += s
+ return buf
+
+def send(sock, s):
+ total = len(s)
+ count = 0
+ while count < total:
+ try: n = sock.send(s)
+ except (socket.error), e: n = 0
+ if n == 0:
+ return count;
+ count += n
+ return count
+
+
+serverPort = int(sys.argv[1])
+HostName = socket.gethostname()
+
+time.sleep(1)
+
+# create active socket
+sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+try:
+ sock.connect((HostName, serverPort))
+except socket.error as e:
+ sys.exit(1)
+
+buf = ''
+n = 0
+while n < 1000:
+ buf += '+'
+ n += 1
+
+n = send(sock, buf)
+n = read(sock, 500)
+sys.exit(0)
+
diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py
new file mode 100755
index 0000000..b9391f3
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_server.py
@@ -0,0 +1,83 @@
+#!/usr/local/bin/python
+#
+# Copyright (c) 2017 Facebook
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):
+ buf = ''
+ while len(buf) < n:
+ rem = n - len(buf)
+ try: s = sock.recv(rem)
+ except (socket.error), e: return ''
+ buf += s
+ return buf
+
+def send(sock, s):
+ total = len(s)
+ count = 0
+ while count < total:
+ try: n = sock.send(s)
+ except (socket.error), e: n = 0
+ if n == 0:
+ return count;
+ count += n
+ return count
+
+
+SERVER_PORT = 12877
+MAX_PORTS = 2
+
+serverPort = SERVER_PORT
+serverSocket = None
+
+HostName = socket.gethostname()
+
+# create passive socket
+serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+host = socket.gethostname()
+
+while serverPort < SERVER_PORT + 5:
+ try: serverSocket.bind((host, serverPort))
+ except socket.error as msg:
+ serverPort += 1
+ continue
+ break
+
+cmdStr = ("./tcp_client.py %d &") % (serverPort)
+os.system(cmdStr)
+
+buf = ''
+n = 0
+while n < 500:
+ buf += '.'
+ n += 1
+
+serverSocket.listen(MAX_PORTS)
+readList = [serverSocket]
+
+while True:
+ readyRead, readyWrite, inError = \
+ select.select(readList, [], [], 10)
+
+ if len(readyRead) > 0:
+ waitCount = 0
+ for sock in readyRead:
+ if sock == serverSocket:
+ (clientSocket, address) = serverSocket.accept()
+ address = str(address[0])
+ readList.append(clientSocket)
+ else:
+ s = read(sock, 1000)
+ n = send(sock, buf)
+ sock.close()
+ time.sleep(1)
+ sys.exit(0)
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
new file mode 100644
index 0000000..6b69105
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
@@ -0,0 +1,133 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/in6.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <netinet/in.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+
+struct globals {
+ __u32 event_map;
+ __u32 total_retrans;
+ __u32 data_segs_in;
+ __u32 data_segs_out;
+ __u64 bytes_received;
+ __u64 bytes_acked;
+};
+
+struct bpf_map_def SEC("maps") global_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct globals),
+ .max_entries = 2,
+};
+
+
+static inline void update_event_map(int event)
+{
+ __u32 key = 0;
+ struct globals g, *gp;
+
+ gp = bpf_map_lookup_elem(&global_map, &key);
+ if (gp == NULL) {
+ struct globals g = {0, 0, 0, 0, 0, 0};
+
+ g.event_map |= (1 << event);
+ bpf_map_update_elem(&global_map, &key, &g,
+ BPF_ANY);
+ } else {
+ g = *gp;
+ g.event_map |= (1 << event);
+ bpf_map_update_elem(&global_map, &key, &g,
+ BPF_ANY);
+ }
+}
+
+SEC("sockops")
+int bpf_testcb(struct bpf_sock_ops *skops)
+{
+ int rv = -1;
+ int op;
+ int init_seq = 0;
+ int ret = 0;
+ int v = 0;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if remote port number is in the range 12877..12887
+ * I.e. the active side of the connection
+ */
+ if ((bpf_ntohl(skops->remote_port) < 12877 ||
+ bpf_ntohl(skops->remote_port) >= 12887)) {
+ skops->reply = -1;
+ return 1;
+ }
+
+ op = (int) skops->op;
+
+ /* Check that both hosts are within same datacenter. For this example
+ * it is the case when the first 5.5 bytes of their IPv6 addresses are
+ * the same.
+ */
+ if (1) {
+ update_event_map(op);
+
+ switch (op) {
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ skops->bpf_sock_ops_flags = 0xfff;
+ init_seq = skops->snd_nxt;
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ init_seq = skops->snd_nxt;
+ skops->bpf_sock_ops_flags = 0xfff;
+ skops->sk_txhash = 0x12345f;
+ v = 0xff;
+ ret = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v,
+ sizeof(v));
+ break;
+ case BPF_SOCK_OPS_RTO_CB:
+ break;
+ case BPF_SOCK_OPS_RETRANS_CB:
+ break;
+ case BPF_SOCK_OPS_STATE_CB:
+ if (skops->args[1] == 7) {
+ __u32 key = 0;
+ struct globals g, *gp;
+
+ gp = bpf_map_lookup_elem(&global_map, &key);
+ if (gp == NULL) {
+ } else {
+ g = *gp;
+ g.total_retrans = skops->total_retrans;
+ g.data_segs_in = skops->data_segs_in;
+ g.data_segs_out = skops->data_segs_out;
+ g.bytes_received =
+ skops->bytes_received;
+ g.bytes_acked = skops->bytes_acked;
+ bpf_map_update_elem(&global_map, &key,
+ &g, BPF_ANY);
+ }
+ }
+ break;
+ default:
+ rv = -1;
+ }
+ } else {
+ rv = -1;
+ }
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c
new file mode 100644
index 0000000..8d941fd
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c
@@ -0,0 +1,119 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <assert.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+//#include "bpf_load.h"
+#include "bpf_util.h"
+#include <linux/perf_event.h>
+
+struct globals {
+ __u32 event_map;
+ __u32 total_retrans;
+ __u32 data_segs_in;
+ __u32 data_segs_out;
+ __u64 bytes_received;
+ __u64 bytes_acked;
+};
+
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+ const char *name)
+{
+ struct bpf_map *map;
+
+ map = bpf_object__find_map_by_name(obj, name);
+ if (!map) {
+ printf("%s:FAIL:map '%s' not found\n", test, name);
+ return -1;
+ }
+ return bpf_map__fd(map);
+}
+
+#define SYSTEM(CMD) \
+ do { \
+ if (system(CMD)) { \
+ printf("system(%s) FAILS!\n", CMD); \
+ } \
+ } while (0)
+
+int main(int argc, char **argv)
+{
+ struct globals g = {0, 0, 0, 0, 0, 0};
+ __u32 key = 0;
+ int rv;
+ int pid;
+ int error = EXIT_FAILURE;
+ int cg_fd, prog_fd, map_fd;
+ char cmd[100], *dir;
+ const char *file = "./test_tcpbpf_kern.o";
+ struct bpf_object *obj;
+ struct stat buffer;
+
+ dir = "/tmp/cgroupv2/foo";
+
+ if (stat(dir, &buffer) != 0) {
+ SYSTEM("mkdir -p /tmp/cgroupv2");
+ SYSTEM("mount -t cgroup2 none /tmp/cgroupv2");
+ SYSTEM("mkdir -p /tmp/cgroupv2/foo");
+ }
+ pid = (int) getpid();
+ sprintf(cmd, "echo %d >> /tmp/cgroupv2/foo/cgroup.procs", pid);
+ SYSTEM(cmd);
+
+ cg_fd = open(dir, O_DIRECTORY, O_RDONLY);
+ if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) {
+// if (load_bpf_file(prog)) {
+ printf("FAILED: load_bpf_file failed for: %s\n", file);
+// printf("%s", bpf_log_buf);
+ goto err;
+ }
+
+ rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+ if (rv) {
+ printf("FAILED: bpf_prog_attach: %d (%s)\n",
+ error, strerror(errno));
+ goto err;
+ }
+
+ SYSTEM("./tcp_server.py");
+
+ map_fd = bpf_find_map(__func__, obj, "global_map");
+ if (map_fd < 0)
+ goto err;
+
+ rv = bpf_map_lookup_elem(map_fd, &key, &g);
+ if (rv != 0) {
+ printf("FAILED: bpf_map_lookup_elem returns %d\n", rv);
+ goto err;
+ }
+
+ if (g.bytes_received != 501 || g.bytes_acked != 1002 ||
+ g.data_segs_in != 1 || g.data_segs_out != 1 ||
+ g.event_map != 0x45e) {
+ printf("FAILED: Wrong stats\n");
+ goto err;
+ }
+ printf("PASSED!\n");
+ error = 0;
+err:
+ bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+ return error;
+}
--
2.9.5
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields
2017-12-19 6:21 ` [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields Lawrence Brakmo
@ 2017-12-20 0:51 ` Daniel Borkmann
2017-12-20 1:10 ` Alexei Starovoitov
1 sibling, 0 replies; 15+ messages in thread
From: Daniel Borkmann @ 2017-12-20 0:51 UTC (permalink / raw)
To: Lawrence Brakmo, netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov
On 12/19/2017 07:21 AM, Lawrence Brakmo wrote:
> This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to
> struct tcp_sock or struct sock fields. This required adding a new
> field "temp" to struct bpf_sock_ops_kern for temporary storage that
> is used by sock_ops_convert_ctx_access. It is used to store and recover
> the contents of a register, so the register can be used to store the
> address of the sk. Since we cannot overwrite the dst_reg because it
> contains the pointer to ctx, nor the src_reg since it contains the value
> we want to store, we need an extra register to contain the address
> of the sk.
>
> Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the
> GET or SET macros depending on the value of the TYPE field.
>
> Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
> ---
> include/linux/filter.h | 3 +++
> include/net/tcp.h | 2 +-
> net/core/filter.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 50 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 5feb441..8929162 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -987,6 +987,9 @@ struct bpf_sock_ops_kern {
> u32 replylong[4];
> };
> u32 is_fullsock;
> + u64 temp; /* Used by sock_ops_convert_ctx_access
> + * as temporary storage of a register
> + */
> };
>
> #endif /* __LINUX_FILTER_H__ */
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 6cc205c..e0213f1 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -2011,7 +2011,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op)
> struct bpf_sock_ops_kern sock_ops;
> int ret;
>
> - memset(&sock_ops, 0, sizeof(sock_ops));
> + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, is_fullsock));
I don't think this is correct. sock_ops is on stack, so above you only
zero up to the offset of is_fullsock, but not including it, so when
you have !sk_fullsock(sk), then your BPF prog will still act as if the
sock_ops.is_fullsock was set in case prior stack garbage said so.
> if (sk_fullsock(sk)) {
> sock_ops.is_fullsock = 1;
> sock_owned_by_me(sk);
Thanks,
Daniel
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields
2017-12-19 6:21 ` [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields Lawrence Brakmo
2017-12-20 0:51 ` Daniel Borkmann
@ 2017-12-20 1:10 ` Alexei Starovoitov
1 sibling, 0 replies; 15+ messages in thread
From: Alexei Starovoitov @ 2017-12-20 1:10 UTC (permalink / raw)
To: Lawrence Brakmo, netdev; +Cc: Kernel Team, Blake Matheny, Daniel Borkmann
On 12/18/17 10:21 PM, Lawrence Brakmo wrote:
> +#define SOCK_OPS_SET_FIELD(FIELD_NAME, OBJ) \
> + do { \
> + int reg = BPF_REG_9; \
> + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, FIELD_NAME) > \
> + FIELD_SIZEOF(struct bpf_sock_ops, FIELD_NAME)); \
> + while (si->dst_reg == reg || si->src_reg == reg) \
> + reg--; \
> + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \
> + offsetof(struct bpf_sock_ops_kern, \
> + temp)); \
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
> + struct bpf_sock_ops_kern, \
> + is_fullsock), \
> + reg, si->dst_reg, \
> + offsetof(struct bpf_sock_ops_kern, \
> + is_fullsock)); \
> + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
> + struct bpf_sock_ops_kern, sk),\
> + reg, si->dst_reg, \
> + offsetof(struct bpf_sock_ops_kern, sk));\
> + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, FIELD_NAME), \
> + reg, si->src_reg, \
> + offsetof(OBJ, FIELD_NAME)); \
> + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \
> + offsetof(struct bpf_sock_ops_kern, \
> + temp)); \
> + } while (0)
that's neat. I like it.
I guess the prog can check is_fullsock on its own to see whether writes
will fail or not, so JEQ above is ok.
Only while() loop looks a bit scary.
May be replace with two 'if' ?
if (si->dst_reg == reg || si->src_reg == reg)
reg --;
if (si->dst_reg == reg || si->src_reg == reg)
reg --;
so it's clear that tmp reg will be reg_7, 8 or 9.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH bpf 11/11] bpf: add selftest for tcpbpf
2017-12-19 6:22 ` [PATCH bpf 11/11] bpf: add selftest for tcpbpf Lawrence Brakmo
@ 2017-12-20 1:34 ` Alexei Starovoitov
0 siblings, 0 replies; 15+ messages in thread
From: Alexei Starovoitov @ 2017-12-20 1:34 UTC (permalink / raw)
To: Lawrence Brakmo, netdev; +Cc: Kernel Team, Blake Matheny, Daniel Borkmann
On 12/18/17 10:22 PM, Lawrence Brakmo wrote:
> - sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o
> + sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
> + test_tcpbpf_kern.o
it won't apply. please base patches on bpf-next tree
> +#!/usr/local/bin/python
> +#
> +# Copyright (c) 2017 Facebook
> +#
> +# This program is free software; you can redistribute it and/or
> +# modify it under the terms of version 2 of the GNU General Public
> +# License as published by the Free Software Foundation.
the license should be in SPDX format.
> +++ b/tools/testing/selftests/bpf/test_tcpbpf_kern.c
> @@ -0,0 +1,133 @@
> +/* Copyright (c) 2017 Facebook
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + */
same here.
> + case BPF_SOCK_OPS_STATE_CB:
> + if (skops->args[1] == 7) {
> + __u32 key = 0;
> + struct globals g, *gp;
> +
> + gp = bpf_map_lookup_elem(&global_map, &key);
> + if (gp == NULL) {
> + } else {
> + g = *gp;
> + g.total_retrans = skops->total_retrans;
> + g.data_segs_in = skops->data_segs_in;
you can reduce indent by doing
if (!gp)
break;
g = *gp;
g.total_retrans = skops->total_retrans;
> +++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c
> @@ -0,0 +1,119 @@
> +/* Copyright (c) 2017 Facebook
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + */
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <stdio.h>
> +#include <unistd.h>
> +#include <errno.h>
> +#include <signal.h>
> +#include <string.h>
> +#include <assert.h>
> +#include <linux/perf_event.h>
> +#include <linux/ptrace.h>
> +#include <linux/bpf.h>
> +#include <sys/ioctl.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <bpf/bpf.h>
> +#include <bpf/libbpf.h>
> +//#include "bpf_load.h"
please remove left over comments.
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2017-12-20 1:34 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-12-19 6:21 [PATCH bpf 0/11] bpf: more sock_ops callbacks Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields Lawrence Brakmo
2017-12-20 0:51 ` Daniel Borkmann
2017-12-20 1:10 ` Alexei Starovoitov
2017-12-19 6:21 ` [PATCH bpf 04/11] bpf: Support passing args to sock_ops bpf function Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 05/11] bpf: Adds field bpf_sock_ops_flags to tcp_sock Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 06/11] bpf: Add sock_ops RTO callback Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 07/11] bpf: Add support for reading sk_state and more Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 08/11] bpf: Add sock_ops R/W access to tclass & sk_txhash Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB Lawrence Brakmo
2017-12-19 6:21 ` [PATCH bpf 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB Lawrence Brakmo
2017-12-19 6:22 ` [PATCH bpf 11/11] bpf: add selftest for tcpbpf Lawrence Brakmo
2017-12-20 1:34 ` Alexei Starovoitov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).