All of lore.kernel.org
 help / color / mirror / Atom feed
From: Lawrence Brakmo <brakmo@fb.com>
To: netdev <netdev@vger.kernel.org>
Cc: Kernel Team <kernel-team@fb.com>, Blake Matheny <bmatheny@fb.com>,
	Alexei Starovoitov <ast@fb.com>,
	Daniel Borkmann <daniel@iogearbox.net>,
	David Ahern <dsa@cumulusnetworks.com>
Subject: [RFC PATCH net-next v2 10/15] bpf: Add support for changing congestion control
Date: Thu, 15 Jun 2017 13:08:39 -0700	[thread overview]
Message-ID: <20170615200844.2752485-11-brakmo@fb.com> (raw)
In-Reply-To: <20170615200844.2752485-1-brakmo@fb.com>

Added support for changing congestion control for SOCKET_OPS bps
programs through the setsockopt bpf helper function. It also adds
a new SOCKET_OPS op, BPF_SOCKET_OPS_NEEDS_ECN, that is needed for
congestion controls, like dctcp, that need to enable ECN in the
SYN packets.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/net/tcp.h        |  9 ++++++++-
 include/uapi/linux/bpf.h |  3 +++
 net/core/filter.c        | 11 +++++++++--
 net/ipv4/tcp.c           |  2 +-
 net/ipv4/tcp_cong.c      | 15 ++++++++++-----
 net/ipv4/tcp_input.c     |  3 ++-
 net/ipv4/tcp_output.c    |  8 +++++---
 7 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 29c27dc..371b1bd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1001,7 +1001,9 @@ void tcp_get_default_congestion_control(char *name);
 void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
-int tcp_set_congestion_control(struct sock *sk, const char *name);
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load);
+void tcp_reinit_congestion_control(struct sock *sk,
+				   const struct tcp_congestion_ops *ca);
 u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
@@ -2039,4 +2041,9 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk, bool is_req_sock)
 		rwnd = 0;
 	return rwnd;
 }
+
+static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
+{
+	return (tcp_call_bpf(sk, true, BPF_SOCKET_OPS_NEEDS_ECN) == 1);
+}
 #endif	/* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c3490d3..8d1d2b7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -776,6 +776,9 @@ enum {
 						 * passive connection is
 						 * established
 						 */
+	BPF_SOCKET_OPS_NEEDS_ECN,	/* If connection's congestion control
+					 * needs ECN
+					 */
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index 9ff567c..4325aba 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2716,8 +2716,15 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_socket_ops_kern *, bpf_socket,
 		}
 	} else if (level == SOL_TCP &&
 		   bpf_socket->sk->sk_prot->setsockopt == tcp_setsockopt) {
-		/* Place holder */
-		ret = -EINVAL;
+		if (optname == TCP_CONGESTION) {
+			ret = tcp_set_congestion_control(sk, optval, false);
+			if (!ret && bpf_socket->op > BPF_SOCKET_OPS_NEEDS_ECN)
+				/* replacing an existing ca */
+				tcp_reinit_congestion_control(sk,
+					inet_csk(sk)->icsk_ca_ops);
+		} else {
+			ret = -EINVAL;
+		}
 	} else {
 		ret = -EINVAL;
 	}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index cc8fd8b..07984ea 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2478,7 +2478,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		name[val] = 0;
 
 		lock_sock(sk);
-		err = tcp_set_congestion_control(sk, name);
+		err = tcp_set_congestion_control(sk, name, true);
 		release_sock(sk);
 		return err;
 	}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 324c9bc..e6f3717 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk)
 		INET_ECN_dontxmit(sk);
 }
 
-static void tcp_reinit_congestion_control(struct sock *sk,
-					  const struct tcp_congestion_ops *ca)
+void tcp_reinit_congestion_control(struct sock *sk,
+				   const struct tcp_congestion_ops *ca)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
@@ -334,7 +334,7 @@ int tcp_set_allowed_congestion_control(char *val)
 }
 
 /* Change congestion control for socket */
-int tcp_set_congestion_control(struct sock *sk, const char *name)
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_congestion_ops *ca;
@@ -344,7 +344,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
 		return -EPERM;
 
 	rcu_read_lock();
-	ca = __tcp_ca_find_autoload(name);
+	if (!load)
+		ca = tcp_ca_find(name);
+	else
+		ca = __tcp_ca_find_autoload(name);
 	/* No change asking for existing value */
 	if (ca == icsk->icsk_ca_ops) {
 		icsk->icsk_ca_setsockopt = 1;
@@ -352,8 +355,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
 	}
 	if (!ca)
 		err = -ENOENT;
+	else if (!load)
+		icsk->icsk_ca_ops = ca;
 	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
-		   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
+			   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
 		err = -EPERM;
 	else if (!try_module_get(ca->owner))
 		err = -EBUSY;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e0d688a..14c889f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6192,7 +6192,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
 	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
 
 	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
-	    (ecn_ok_dst & DST_FEATURE_ECN_CA))
+	    (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
+	    tcp_bpf_ca_needs_ecn((struct sock *)req))
 		inet_rsk(req)->ecn_ok = 1;
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9124d3d..30660ee 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -316,7 +316,8 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
 	if (!(tp->ecn_flags & TCP_ECN_OK))
 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
-	else if (tcp_ca_needs_ecn(sk))
+	else if (tcp_ca_needs_ecn(sk) ||
+		 tcp_bpf_ca_needs_ecn(sk))
 		INET_ECN_xmit(sk);
 }
 
@@ -324,8 +325,9 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
 	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
-		       tcp_ca_needs_ecn(sk);
+		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
 
 	if (!use_ecn) {
 		const struct dst_entry *dst = __sk_dst_get(sk);
@@ -339,7 +341,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 	if (use_ecn) {
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
-		if (tcp_ca_needs_ecn(sk))
+		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
 			INET_ECN_xmit(sk);
 	}
 }
-- 
2.9.3

  parent reply	other threads:[~2017-06-15 20:08 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-06-15 20:08 [RFC PATCH net-next v2 00/15] bpf: BPF support for socket ops Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 01/15] " Lawrence Brakmo
2017-06-16 12:07   ` Daniel Borkmann
2017-06-16 23:41     ` Lawrence Brakmo
2017-06-19 18:44       ` Daniel Borkmann
2017-06-19 20:49         ` Lawrence Brakmo
2017-06-17 21:48     ` Lawrence Brakmo
2017-06-19 18:52       ` Daniel Borkmann
2017-06-19 20:49         ` Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 02/15] bpf: program to load socketops BPF programs Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 03/15] bpf: Support for per connection SYN/SYN-ACK RTOs Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 04/15] bpf: Sample bpf program to set " Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 05/15] bpf: Support for setting initial receive window Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 06/15] bpf: Sample bpf program to set initial window Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 07/15] bpf: Add setsockopt helper function to bpf Lawrence Brakmo
2017-06-16 13:27   ` Daniel Borkmann
2017-06-17 23:17     ` Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 08/15] bpf: Add TCP connection BPF callbacks Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 09/15] bpf: Sample BPF program to set buffer sizes Lawrence Brakmo
2017-06-15 20:08 ` Lawrence Brakmo [this message]
2017-06-16 13:58   ` [RFC PATCH net-next v2 10/15] bpf: Add support for changing congestion control Daniel Borkmann
2017-06-18  2:39     ` Lawrence Brakmo
2017-06-19 22:34       ` Daniel Borkmann
2017-06-20  0:35         ` Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 11/15] bpf: Sample BPF program to set " Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 12/15] bpf: Adds support for setting initial cwnd Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 13/15] bpf: Sample BPF program to set " Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 14/15] bpf: Adds support for setting sndcwnd clamp Lawrence Brakmo
2017-06-15 20:08 ` [RFC PATCH net-next v2 15/15] bpf: Sample bpf program to set " Lawrence Brakmo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170615200844.2752485-11-brakmo@fb.com \
    --to=brakmo@fb.com \
    --cc=ast@fb.com \
    --cc=bmatheny@fb.com \
    --cc=daniel@iogearbox.net \
    --cc=dsa@cumulusnetworks.com \
    --cc=kernel-team@fb.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.