* [PATCH RFC net-next 0/3] Extensions to allow asynchronous TCP_INFO notifications based on congestion parameters
@ 2018-10-22 15:23 Sowmini Varadhan
2018-10-22 15:23 ` [PATCH RFC net-next 1/3] sock_diag: Refactor inet_sock_diag_destroy code Sowmini Varadhan
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Sowmini Varadhan @ 2018-10-22 15:23 UTC (permalink / raw)
To: sowmini.varadhan, netdev; +Cc: edumazet, brakmo
Problem statement:
We would like to monitor some subset of TCP sockets in user-space,
(the monitoring application would define 4-tuples it wants to monitor)
using TCP_INFO stats to analyze reported problems. The idea is to
use those stats to see where the bottlenecks are likely to be ("is it
application-limited?" or "is there evidence of BufferBloat in the
path?" etc)
Today we can do this by periodically polling for tcp_info, but this
could be made more efficient if the kernel would asynchronously
notify the application via tcp_info when some "interesting"
thresholds (e.g., "RTT variance > X", or "total_retrans > Y" etc)
are reached. And to make this effective, it is better if
we could apply the threshold check *before* constructing the
tcp_info netlink notification, so that we don't waste resources
constructing notifications that will be discarded by the filter.
One idea, implemented in this patchset, is to extend the tcp_call_bpf()
infra so that the BPF kernel module (the sock_ops filter/callback)
can examine the values in the sock_ops, apply any thresholds it wants,
and return some new status ("BPF_TCP_INFO_NOTIFY"). Use this status in
the tcp stack to queue up a tcp_info notification (similar to
sock_diag_broadcast_destroy() today..)
Patch 1 in this set refactors the existing sock_diag code so that
the functions can be reused for notifications from states other than CLOSE.
Patch 2 provides a minor extension to tcp_call_bpf() so that it
will queue a tcp_info notification if the BPF callout returns
BPF_TCP_INFO_NOTIFY.
Patch 3, provided strictly as a demonstration/PoC to aid in reviewing
this proposal, shows a simple sample/bpf example where we trigger the
tcp_info notification for an iperf connection if the number of
retransmits exceeds 16.
Sowmini Varadhan (3):
sock_diag: Refactor inet_sock_diag_destroy code
tcp: BPF_TCP_INFO_NOTIFY support
bpf: Added a sample for tcp_info_notify callback
include/linux/sock_diag.h | 18 +++++++---
include/net/tcp.h | 15 +++++++-
include/uapi/linux/bpf.h | 4 ++
include/uapi/linux/sock_diag.h | 2 +
net/core/sock.c | 4 +-
net/core/sock_diag.c | 11 +++---
samples/bpf/Makefile | 1 +
samples/bpf/tcp_notify_kern.c | 73 ++++++++++++++++++++++++++++++++++++++++
8 files changed, 114 insertions(+), 14 deletions(-)
create mode 100644 samples/bpf/tcp_notify_kern.c
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH RFC net-next 1/3] sock_diag: Refactor inet_sock_diag_destroy code
2018-10-22 15:23 [PATCH RFC net-next 0/3] Extensions to allow asynchronous TCP_INFO notifications based on congestion parameters Sowmini Varadhan
@ 2018-10-22 15:23 ` Sowmini Varadhan
2018-10-22 15:23 ` [PATCH RFC net-next 2/3] tcp: BPF_TCP_INFO_NOTIFY support Sowmini Varadhan
2018-10-22 15:24 ` [PATCH RFC net-next 3/3] bpf: Added a sample for tcp_info_notify callback Sowmini Varadhan
2 siblings, 0 replies; 4+ messages in thread
From: Sowmini Varadhan @ 2018-10-22 15:23 UTC (permalink / raw)
To: sowmini.varadhan, netdev; +Cc: edumazet, brakmo
We want to use the inet_sock_diag_destroy code to send notifications
for more types of TCP events than just socket_close(), so refactor
the code to allow this.
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
include/linux/sock_diag.h | 18 +++++++++++++-----
include/uapi/linux/sock_diag.h | 2 ++
net/core/sock.c | 4 ++--
net/core/sock_diag.c | 11 ++++++-----
4 files changed, 23 insertions(+), 12 deletions(-)
diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 15fe980..df85767 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -34,7 +34,7 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
struct sk_buff *skb, int attrtype);
static inline
-enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk)
+enum sknetlink_groups sock_diag_group(const struct sock *sk)
{
switch (sk->sk_family) {
case AF_INET:
@@ -43,7 +43,15 @@ enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk)
switch (sk->sk_protocol) {
case IPPROTO_TCP:
- return SKNLGRP_INET_TCP_DESTROY;
+ switch (sk->sk_state) {
+ case TCP_ESTABLISHED:
+ return SKNLGRP_INET_TCP_CONNECTED;
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ return SKNLGRP_INET_TCP_3WH;
+ default:
+ return SKNLGRP_INET_TCP_DESTROY;
+ }
case IPPROTO_UDP:
return SKNLGRP_INET_UDP_DESTROY;
default:
@@ -67,15 +75,15 @@ enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk)
}
static inline
-bool sock_diag_has_destroy_listeners(const struct sock *sk)
+bool sock_diag_has_listeners(const struct sock *sk)
{
const struct net *n = sock_net(sk);
- const enum sknetlink_groups group = sock_diag_destroy_group(sk);
+ const enum sknetlink_groups group = sock_diag_group(sk);
return group != SKNLGRP_NONE && n->diag_nlsk &&
netlink_has_listeners(n->diag_nlsk, group);
}
-void sock_diag_broadcast_destroy(struct sock *sk);
+void sock_diag_broadcast(struct sock *sk);
int sock_diag_destroy(struct sock *sk, int err);
#endif
diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h
index e592500..4252674 100644
--- a/include/uapi/linux/sock_diag.h
+++ b/include/uapi/linux/sock_diag.h
@@ -32,6 +32,8 @@ enum sknetlink_groups {
SKNLGRP_INET_UDP_DESTROY,
SKNLGRP_INET6_TCP_DESTROY,
SKNLGRP_INET6_UDP_DESTROY,
+ SKNLGRP_INET_TCP_3WH,
+ SKNLGRP_INET_TCP_CONNECTED,
__SKNLGRP_MAX,
};
#define SKNLGRP_MAX (__SKNLGRP_MAX - 1)
diff --git a/net/core/sock.c b/net/core/sock.c
index 7e8796a..6684840 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1600,8 +1600,8 @@ static void __sk_free(struct sock *sk)
if (likely(sk->sk_net_refcnt))
sock_inuse_add(sock_net(sk), -1);
- if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
- sock_diag_broadcast_destroy(sk);
+ if (unlikely(sk->sk_net_refcnt && sock_diag_has_listeners(sk)))
+ sock_diag_broadcast(sk);
else
sk_destruct(sk);
}
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 3312a58..dbd9e65 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -116,14 +116,14 @@ static size_t sock_diag_nlmsg_size(void)
+ nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
}
-static void sock_diag_broadcast_destroy_work(struct work_struct *work)
+static void sock_diag_broadcast_work(struct work_struct *work)
{
struct broadcast_sk *bsk =
container_of(work, struct broadcast_sk, work);
struct sock *sk = bsk->sk;
const struct sock_diag_handler *hndl;
struct sk_buff *skb;
- const enum sknetlink_groups group = sock_diag_destroy_group(sk);
+ const enum sknetlink_groups group = sock_diag_group(sk);
int err = -1;
WARN_ON(group == SKNLGRP_NONE);
@@ -144,11 +144,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work)
else
kfree_skb(skb);
out:
- sk_destruct(sk);
+ if (group <= SKNLGRP_INET6_UDP_DESTROY)
+ sk_destruct(sk);
kfree(bsk);
}
-void sock_diag_broadcast_destroy(struct sock *sk)
+void sock_diag_broadcast(struct sock *sk)
{
/* Note, this function is often called from an interrupt context. */
struct broadcast_sk *bsk =
@@ -156,7 +157,7 @@ void sock_diag_broadcast_destroy(struct sock *sk)
if (!bsk)
return sk_destruct(sk);
bsk->sk = sk;
- INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work);
+ INIT_WORK(&bsk->work, sock_diag_broadcast_work);
queue_work(broadcast_wq, &bsk->work);
}
--
1.7.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH RFC net-next 2/3] tcp: BPF_TCP_INFO_NOTIFY support
2018-10-22 15:23 [PATCH RFC net-next 0/3] Extensions to allow asynchronous TCP_INFO notifications based on congestion parameters Sowmini Varadhan
2018-10-22 15:23 ` [PATCH RFC net-next 1/3] sock_diag: Refactor inet_sock_diag_destroy code Sowmini Varadhan
@ 2018-10-22 15:23 ` Sowmini Varadhan
2018-10-22 15:24 ` [PATCH RFC net-next 3/3] bpf: Added a sample for tcp_info_notify callback Sowmini Varadhan
2 siblings, 0 replies; 4+ messages in thread
From: Sowmini Varadhan @ 2018-10-22 15:23 UTC (permalink / raw)
To: sowmini.varadhan, netdev; +Cc: edumazet, brakmo
We want to be able to set up the monitoring application so that it can
be asynchronously notified when "interesting" events happen, e.g., when
application-determined thresholds on parameters like RTT estimate, number
of retransmissions, RTO are reached.
The bpf_sock_ops infrastructure provided as part of Commit 40304b2a1567
("bpf: BPF support for sock_ops") provides an elegant way to trigger
this asynchronous notification. The BPF program can examine the
current TCP state reported in the bpf_sock_ops and conditionally
return a (new) status BPF_TCP_INFO_NOTIFY. The return status is used
by the caller to queue up a tcp_info notification for the application.
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
include/net/tcp.h | 15 +++++++++++++--
include/uapi/linux/bpf.h | 4 ++++
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0d29292..df06a9f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -47,6 +47,7 @@
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/bpf-cgroup.h>
+#include <linux/sock_diag.h>
extern struct inet_hashinfo tcp_hashinfo;
@@ -2065,6 +2066,12 @@ struct tcp_ulp_ops {
__MODULE_INFO(alias, alias_userspace, name); \
__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
+#define TCPDIAG_CB(sk) \
+do { \
+ if (unlikely(sk->sk_net_refcnt && sock_diag_has_listeners(sk))) \
+ sock_diag_broadcast(sk); \
+} while (0)
+
/* Call BPF_SOCK_OPS program that returns an int. If the return value
* is < 0, then the BPF op failed (for example if the loaded BPF
* program does not support the chosen operation or there is no BPF
@@ -2088,9 +2095,13 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
memcpy(sock_ops.args, args, nargs * sizeof(*args));
ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
- if (ret == 0)
+ if (ret == 0) {
ret = sock_ops.reply;
- else
+
+ /* XXX would be nice if we could use replylong[1] here */
+ if (ret == BPF_TCP_INFO_NOTIFY)
+ TCPDIAG_CB(sk);
+ } else
ret = -1;
return ret;
}
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa5ccd2..bc45e5e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2678,6 +2678,10 @@ enum {
BPF_TCP_MAX_STATES /* Leave at the end! */
};
+enum {
+ BPF_TCP_INFO_NOTIFY = 2
+};
+
#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */
#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */
--
1.7.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH RFC net-next 3/3] bpf: Added a sample for tcp_info_notify callback
2018-10-22 15:23 [PATCH RFC net-next 0/3] Extensions to allow asynchronous TCP_INFO notifications based on congestion parameters Sowmini Varadhan
2018-10-22 15:23 ` [PATCH RFC net-next 1/3] sock_diag: Refactor inet_sock_diag_destroy code Sowmini Varadhan
2018-10-22 15:23 ` [PATCH RFC net-next 2/3] tcp: BPF_TCP_INFO_NOTIFY support Sowmini Varadhan
@ 2018-10-22 15:24 ` Sowmini Varadhan
2 siblings, 0 replies; 4+ messages in thread
From: Sowmini Varadhan @ 2018-10-22 15:24 UTC (permalink / raw)
To: sowmini.varadhan, netdev; +Cc: edumazet, brakmo
Simple Proof-Of-Concept test program for BPF_TCP_INFO_NOTIFY
(will move this to tools/testing/selftests/net later)
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
samples/bpf/Makefile | 1 +
samples/bpf/tcp_notify_kern.c | 73 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+), 0 deletions(-)
create mode 100644 samples/bpf/tcp_notify_kern.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index be0a961..d937bd2 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -152,6 +152,7 @@ always += tcp_bufs_kern.o
always += tcp_cong_kern.o
always += tcp_iw_kern.o
always += tcp_clamp_kern.o
+always += tcp_notify_kern.o
always += tcp_basertt_kern.o
always += tcp_tos_reflect_kern.o
always += xdp_redirect_kern.o
diff --git a/samples/bpf/tcp_notify_kern.c b/samples/bpf/tcp_notify_kern.c
new file mode 100644
index 0000000..bc4efd8
--- /dev/null
+++ b/samples/bpf/tcp_notify_kern.c
@@ -0,0 +1,73 @@
+/* Sample BPF program to demonstrate how to trigger async tcp_info
+ * notification based on thresholds determined in the filter.
+ * The example here will trigger notification if skops->total_retrans > 16
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 0
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+SEC("sockops")
+int bpf_tcp_info_notify(struct bpf_sock_ops *skops)
+{
+ int bufsize = 150000;
+ int to_init = 10;
+ int clamp = 100;
+ int rv = 0;
+ int op;
+
+ /* For testing purposes, only execute rest of BPF program
+ * if either port number is 5001 (default iperf port)
+ */
+ if (bpf_ntohl(skops->remote_port) != 5001 &&
+ skops->local_port != 5001) {
+ skops->reply = -1;
+ return 0;
+ }
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ switch (op) {
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ bpf_sock_ops_cb_flags_set(skops,
+ (BPF_SOCK_OPS_RETRANS_CB_FLAG|
+ BPF_SOCK_OPS_RTO_CB_FLAG));
+ rv = 1;
+ break;
+ case BPF_SOCK_OPS_RETRANS_CB:
+ case BPF_SOCK_OPS_RTO_CB:
+ if (skops->total_retrans < 16)
+ rv = 1; /* skip */
+ else
+ rv = BPF_TCP_INFO_NOTIFY;
+ break;
+ default:
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
--
1.7.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2018-10-22 23:48 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz; follow: Atom feed)
-- links below jump to the message on this page --
2018-10-22 15:23 [PATCH RFC net-next 0/3] Extensions to allow asynchronous TCP_INFO notifications based on congestion parameters Sowmini Varadhan
2018-10-22 15:23 ` [PATCH RFC net-next 1/3] sock_diag: Refactor inet_sock_diag_destroy code Sowmini Varadhan
2018-10-22 15:23 ` [PATCH RFC net-next 2/3] tcp: BPF_TCP_INFO_NOTIFY support Sowmini Varadhan
2018-10-22 15:24 ` [PATCH RFC net-next 3/3] bpf: Added a sample for tcp_info_notify callback Sowmini Varadhan
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).