MPTCP Linux Development
 help / color / mirror / Atom feed
From: Mat Martineau <mathew.j.martineau@linux.intel.com>
To: Florian Westphal <fw@strlen.de>
Cc: mptcp@lists.linux.dev
Subject: Re: [PATCH mptcp 5/4] mptcp: handle join requests early
Date: Wed, 2 Mar 2022 17:33:32 -0800 (PST)	[thread overview]
Message-ID: <bf2a68e3-e373-754-cea2-d3a32bb74f5a@linux.intel.com> (raw)
In-Reply-To: <20220302094527.12212-1-fw@strlen.de>

On Wed, 2 Mar 2022, Florian Westphal wrote:

> Relative patch to better explain whats happening, if we go this route
> then this would be squashed/replace patch #3.
>

Thanks for trying this out and explaining the tradeoffs. I think the 
runtime overhead for early join handling is less than I was originally 
thinking. This approach does seem to touch less TCP code but is a little 
more expensive at runtime.

> Main problem: bypasses TW infrastructure, i.e. SYN+JOIN with existing
> subflow quadruple won't work.
>
> This could be resolved by duplicating considerable code from
> tcp_ipv4(6).c to mptcp_handle_join, but I don't like that either.
>

Agreed on wanting to avoid that.

> IOW, I would prefer to keep the sequence as-is, first do the standard
> port-based demux and then, if no result was found, do the join handling.
>
> This means subflow to TCP-only listener will fail to establish, but that
> could be resolved by having a 'reserve the port' approach (mptcpd?) in
> the tooling that adds the required netlink commands.
>

I'm not sure what this would look like? Seems like a bit of kernel work in 
itself. Not sure if you want to explain here on this list or cover it in 
the meeting.

> I would prefer to minimize kernel work.
>

I hear you :)


-Mat


> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
> include/net/mptcp.h | 17 +++++++-------
> net/ipv4/tcp_ipv4.c | 12 ++++------
> net/ipv6/tcp_ipv6.c |  9 ++------
> net/mptcp/ctrl.c    | 54 ++++++++++++++++++++++++++++++++++++---------
> 4 files changed, 59 insertions(+), 33 deletions(-)
>
> diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> index b8939d7ea12e..cc95c279a196 100644
> --- a/include/net/mptcp.h
> +++ b/include/net/mptcp.h
> @@ -189,7 +189,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
> 				  struct sk_buff *skb);
>
> __be32 mptcp_get_reset_option(const struct sk_buff *skb);
> -struct sock *__mptcp_handle_join(int af, struct sk_buff *skb);
> +bool __mptcp_handle_join(int af, struct sk_buff *skb);
>
> static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
> {
> @@ -199,17 +199,17 @@ static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
> 	return htonl(0u);
> }
>
> -static inline struct sock *mptcp_handle_join(struct sk_buff *skb, int af)
> +static inline bool mptcp_handle_join(struct sk_buff *skb, int af)
> {
> 	const struct tcphdr *th = tcp_hdr(skb);
>
> -	if (th->syn && !th->ack && !th->rst && !th->fin)
> +	if (unlikely(th->syn && !th->ack && !th->rst && !th->fin))
> 		return __mptcp_handle_join(af, skb);
>
> -	return NULL;
> +	return true;
> }
>
> -static inline struct sock *mptcp_handle_join4(struct sk_buff *skb)
> +static inline bool mptcp_handle_join4(struct sk_buff *skb)
> {
> 	return mptcp_handle_join(skb, AF_INET);
> }
> @@ -290,7 +290,8 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req,
> }
>
> static inline __be32 mptcp_reset_option(const struct sk_buff *skb)  { return htonl(0u); }
> -static inline struct sock *mptcp_handle_join4(struct sk_buff *skb) { return NULL; }
> +static inline bool mptcp_handle_join4(int af, struct sk_buff *skb) { return true; }
> +
> #endif /* CONFIG_MPTCP */
>
> #if IS_ENABLED(CONFIG_MPTCP_IPV6)
> @@ -299,7 +300,7 @@ int mptcpv6_init_net(struct net *net);
> void mptcpv6_exit_net(struct net *net);
> void mptcpv6_handle_mapped(struct sock *sk, bool mapped);
>
> -static inline struct sock *mptcp_handle_join6(struct sk_buff *skb)
> +static inline bool mptcp_handle_join6(struct sk_buff *skb)
> {
> 	return mptcp_handle_join(skb, AF_INET6);
> }
> @@ -308,7 +309,7 @@ static inline int mptcpv6_init(void) { return 0; }
> static inline int mptcpv6_init_net(struct net *net) { return 0; }
> static inline void mptcpv6_exit_net(struct net *net) { }
> static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { }
> -static inline struct sock *mptcp_handle_join6(struct sk_buff *skb) { return NULL; }
> +static inline bool mptcp_handle_join6(struct sk_buff *skb) { return true; }
> #endif
>
> #endif /* __NET_MPTCP_H */
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index da9ed7b0b7f5..ba685f18a767 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1949,12 +1949,15 @@ int tcp_v4_rcv(struct sk_buff *skb)
>
> 	th = (const struct tcphdr *)skb->data;
> 	iph = ip_hdr(skb);
> +
> +	if (!mptcp_handle_join4(skb))
> +		goto no_tcp_socket;
> +
> lookup:
> 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
> 			       th->dest, sdif, &refcounted);
> 	if (!sk)
> 		goto no_tcp_socket;
> -
> process:
> 	if (sk->sk_state == TCP_TIME_WAIT)
> 		goto do_time_wait;
> @@ -2087,10 +2090,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
> 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
> 		goto discard_it;
>
> -	sk = mptcp_handle_join4(skb);
> -	if (sk)
> -		goto process;
> -
> 	tcp_v4_fill_cb(skb, iph, th);
>
> 	if (tcp_checksum_complete(skb)) {
> @@ -2137,9 +2136,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
> 							iph->daddr, th->dest,
> 							inet_iif(skb),
> 							sdif);
> -		if (!sk2)
> -			sk2 = mptcp_handle_join4(skb);
> -
> 		if (sk2) {
> 			inet_twsk_deschedule_put(inet_twsk(sk));
> 			sk = sk2;
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index a8aebfbb531e..39184d7654b1 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1615,6 +1615,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
> 	th = (const struct tcphdr *)skb->data;
> 	hdr = ipv6_hdr(skb);
>
> +	if (!mptcp_handle_join6(skb))
> +		goto no_tcp_socket;
> lookup:
> 	sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
> 				th->source, th->dest, inet6_iif(skb), sdif,
> @@ -1746,10 +1748,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
> 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
> 		goto discard_it;
>
> -	sk = mptcp_handle_join6(skb);
> -	if (sk)
> -		goto process;
> -
> 	tcp_v6_fill_cb(skb, hdr, th);
>
> 	if (tcp_checksum_complete(skb)) {
> @@ -1799,9 +1797,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
> 					    ntohs(th->dest),
> 					    tcp_v6_iif_l3_slave(skb),
> 					    sdif);
> -		if (!sk2)
> -			sk2 = mptcp_handle_join6(skb);
> -
> 		if (sk2) {
> 			struct inet_timewait_sock *tw = inet_twsk(sk);
> 			inet_twsk_deschedule_put(tw);
> diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
> index c7370c5147df..bf079bb50177 100644
> --- a/net/mptcp/ctrl.c
> +++ b/net/mptcp/ctrl.c
> @@ -10,6 +10,8 @@
>
> #include <net/net_namespace.h>
> #include <net/netns/generic.h>
> +#include <net/inet_hashtables.h>
> +#include <net/inet6_hashtables.h>
>
> #include "protocol.h"
> #include "mib.h"
> @@ -214,30 +216,54 @@ static void add_mptcp_rst(struct sk_buff *skb)
> 	}
> }
>
> -struct sock *__mptcp_handle_join(int af, struct sk_buff *skb)
> +/* return false if tcp must pretend no socket was found. */
> +bool __mptcp_handle_join(int af, struct sk_buff *skb)
> {
> 	struct mptcp_options_received mp_opt;
> +	const struct ipv6hdr *ip6h;
> +	const struct iphdr *iph;
> +	struct sock *lsk = NULL;
> +	const struct tcphdr *th;
> 	struct mptcp_pernet *pernet;
> 	struct mptcp_sock *msk;
> 	struct socket *ssock;
> -	struct sock *lsk;
> 	struct net *net;
>
> 	/* paranoia check: don't allow 0 destination port,
> 	 * else __inet_inherit_port will insert the child socket
> 	 * into the phony hash slot of the pernet listener.
> 	 */
> -	if (tcp_hdr(skb)->dest == 0)
> -		return NULL;
> +	if (tcp_hdr(skb)->dest == 0 || skb->sk)
> +		return true;
>
> 	mptcp_get_options(skb, &mp_opt);
>
> 	if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ))
> -		return NULL;
> +		return true;
>
> 	net = dev_net(skb_dst(skb)->dev);
> 	if (!mptcp_is_enabled(net))
> -		return NULL;
> +		return true;
> +
> +	lsk = NULL;
> +	th = tcp_hdr(skb);
> +	switch (af) {
> +	case AF_INET:
> +		iph = ip_hdr(skb);
> +		lsk = inet_lookup_listener(net, &tcp_hashinfo, skb, __tcp_hdrlen(th), iph->saddr,
> +					   th->source, iph->daddr, th->dest, inet_iif(skb), inet_sdif(skb));
> +		break;
> +#if IS_ENABLED(CONFIG_MPTCP_IPV6)
> +	case AF_INET6:
> +		ip6h = ipv6_hdr(skb);
> +		lsk = inet6_lookup_listener(net, &tcp_hashinfo, skb, __tcp_hdrlen(th), &ip6h->saddr,
> +					    th->source, &ip6h->daddr, th->dest, inet6_iif(skb), inet6_sdif(skb));
> +		break;
> +#endif
> +	}
> +
> +	if (lsk && sk_is_mptcp(lsk))
> +		goto assign_sk;
>
> 	/* RFC8684: If the token is unknown [..], the receiver will send
> 	 * back a reset (RST) signal, analogous to an unknown port in TCP,
> @@ -246,14 +272,14 @@ struct sock *__mptcp_handle_join(int af, struct sk_buff *skb)
> 	msk = mptcp_token_get_sock(net, mp_opt.token);
> 	if (!msk) {
> 		add_mptcp_rst(skb);
> -		return NULL;
> +		return false; /* suppress 4-tuple based lookups */
> 	}
>
> 	if (!mptcp_pm_sport_in_anno_list(msk, af, skb)) {
> 		sock_put((struct sock *)msk);
> 		MPTCP_INC_STATS(net, MPTCP_MIB_MISMATCHPORTSYNRX);
> 		add_mptcp_rst(skb);
> -		return NULL;
> +		return false;
> 	}
>
> 	sock_put((struct sock *)msk);
> @@ -270,14 +296,21 @@ struct sock *__mptcp_handle_join(int af, struct sk_buff *skb)
> #endif
> 	default:
> 		WARN_ON_ONCE(1);
> -		return NULL;
> +		return true;
> 	}
>
> 	ssock = __mptcp_nmpc_socket(mptcp_sk(lsk));
> 	if (WARN_ON(!ssock))
> 		return NULL;
>
> -	return ssock->sk;
> +	lsk = ssock->sk;
> +
> +assign_sk:
> +	WARN_ON_ONCE(sk_is_refcounted(lsk));
> +
> +	skb->sk = lsk;
> +	skb->destructor = sock_pfree;
> +	return true;
> }
>
> static struct socket *mptcp_create_join_listen_socket(struct net *net, int af)
> @@ -297,6 +330,7 @@ static struct socket *mptcp_create_join_listen_socket(struct net *net, int af)
>
> 	ssock->sk->sk_max_ack_backlog = SOMAXCONN;
> 	inet_sk_state_store(ssock->sk, TCP_LISTEN);
> +	sock_set_flag(ssock->sk, SOCK_RCU_FREE);
>
> 	s->sk->sk_max_ack_backlog = SOMAXCONN;
> 	inet_sk_state_store(s->sk, TCP_LISTEN);
> -- 
> 2.34.1
>
>
>

--
Mat Martineau
Intel

  reply	other threads:[~2022-03-03  1:33 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-02-24 15:50 [PATCH mptcp-next v4 0/4] mptcp: replace per-addr listener sockets Florian Westphal
2022-02-24 15:50 ` [PATCH mptcp-next 1/4] mptcp: prefer ip address in syn skb instead of listen sk bound address Florian Westphal
2022-02-24 15:50 ` [PATCH mptcp-next 2/4] tcp: add mptcp join demultiplex hooks Florian Westphal
2022-02-24 15:50 ` [PATCH mptcp-next 3/4] mptcp: handle join requests via pernet listen socket Florian Westphal
2022-03-04  7:36   ` Kishen Maloor
2022-03-08 18:45     ` Florian Westphal
2022-03-08 23:00       ` Kishen Maloor
2022-03-09 12:53         ` Florian Westphal
2022-03-09 17:40           ` Kishen Maloor
2022-03-09 21:37             ` Florian Westphal
2022-03-09 23:40               ` Kishen Maloor
2022-03-10  0:37                 ` Mat Martineau
2022-03-10  1:27                   ` Kishen Maloor
2022-03-11  1:16               ` Mat Martineau
2022-02-24 15:50 ` [PATCH mptcp-next 4/4] mptcp: remove per-address listening sockets Florian Westphal
2022-02-24 17:23   ` mptcp: remove per-address listening sockets: Tests Results MPTCP CI
2022-03-02  9:45   ` [PATCH mptcp 5/4] mptcp: handle join requests early Florian Westphal
2022-03-03  1:33     ` Mat Martineau [this message]
2022-03-03 16:28       ` Florian Westphal

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bf2a68e3-e373-754-cea2-d3a32bb74f5a@linux.intel.com \
    --to=mathew.j.martineau@linux.intel.com \
    --cc=fw@strlen.de \
    --cc=mptcp@lists.linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox