All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [MPTCP] [PATCH v2] tcp: Register handlers for extra TCP options
@ 2017-10-05  7:40 Christoph Paasch
  0 siblings, 0 replies; 5+ messages in thread
From: Christoph Paasch @ 2017-10-05  7:40 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 27482 bytes --]

Hello Mat,

On 04/10/17 - 13:36:16, Mat Martineau wrote:
> Allow additional TCP options to be handled by registered hook
> functions.
> 
> Registered options have a priority that determines the order in which
> options are prepared and written. Lower priority numbers are handled
> first.
> 
> Option parsing will call the provided 'parse' function when a TCP option
> number is not recognized by the normal option parsing code.
> 
> The 'prepare' function determines the required space for registered
> options and stores associated data. 'write' adds the option to the TCP
> header.
> 
> A static key and RCU synchronization are used to minimize the
> performance impact of these extensible TCP features.
> 
> Signed-off-by: Mat Martineau <mathew.j.martineau(a)linux.intel.com>
> ---
> 
> Changes from v1: One 'prepare' callback (no more special callback for
> request_sock), and add a few missing callback sites (like ipv6).

great, I like that we now have only one 'prepare' callback.

> 
> drivers/infiniband/hw/cxgb4/cm.c |   2 +-
>  include/linux/tcp.h              |  22 +++++++
>  include/net/tcp.h                |  40 +++++++++++-
>  net/ipv4/syncookies.c            |   2 +-
>  net/ipv4/tcp.c                   | 133 +++++++++++++++++++++++++++++++++++++++
>  net/ipv4/tcp_input.c             |  16 +++--
>  net/ipv4/tcp_ipv4.c              |  80 ++++++++++++++++++-----
>  net/ipv4/tcp_minisocks.c         |   4 +-
>  net/ipv4/tcp_output.c            |  43 +++++++------
>  net/ipv6/syncookies.c            |   2 +-
>  net/ipv6/tcp_ipv6.c              |  28 ++++++++-
>  11 files changed, 323 insertions(+), 49 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
> index daf7a56e5d7e..c3eb31611011 100644
> --- a/drivers/infiniband/hw/cxgb4/cm.c
> +++ b/drivers/infiniband/hw/cxgb4/cm.c
> @@ -3752,7 +3752,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
>  	 */
>  	memset(&tmp_opt, 0, sizeof(tmp_opt));
>  	tcp_clear_options(&tmp_opt);
> -	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
> +	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL, NULL);
>  
>  	req = __skb_push(skb, sizeof(*req));
>  	memset(req, 0, sizeof(*req));
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 4aa40ef02d32..0347e6ce99be 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -112,6 +112,23 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
>  	rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
>  }
>  
> +#define OPTION_SACK_ADVERTISE	(1 << 0)
> +#define OPTION_TS		(1 << 1)
> +#define OPTION_MD5		(1 << 2)
> +#define OPTION_WSCALE		(1 << 3)
> +#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
> +
> +struct tcp_out_options {
> +	u16 options;		/* bit field of OPTION_* */
> +	u16 mss;		/* 0 to disable */
> +	u8 ws;			/* window scale, 0 to disable */
> +	u8 num_sack_blocks;	/* number of SACK blocks to include */
> +	u8 hash_size;		/* bytes in hash_location */
> +	__u8 *hash_location;	/* temporary pointer, overloaded */
> +	__u32 tsval, tsecr;	/* need to include OPTION_TS */
> +	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
> +};
> +
>  /* This is the max number of SACKS that we'll generate and process. It's safe
>   * to increase this, although since:
>   *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
> @@ -389,6 +406,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
>  	return (struct tcp_sock *)sk;
>  }
>  
> +static inline struct sock *tcp_to_sk(const struct tcp_sock *tp)
> +{
> +	return (struct sock *)tp;
> +}

Nice little function :)

> +
>  struct tcp_timewait_sock {
>  	struct inet_timewait_sock tw_sk;
>  #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 3bc910a9bfc6..04f3dcecf592 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -433,7 +433,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
>  		int flags, int *addr_len);
>  void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
>  		       struct tcp_options_received *opt_rx,
> -		       int estab, struct tcp_fastopen_cookie *foc);
> +		       int estab, struct tcp_fastopen_cookie *foc,
> +		       struct tcp_sock *tp);
>  const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
>  
>  /*
> @@ -2109,4 +2110,41 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
>  {
>  	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
>  }
> +
> +extern struct static_key_false tcp_extra_options_enabled;
> +
> +struct tcp_extra_option_ops {
> +	struct list_head	list;
> +	unsigned char		option_kind;
> +	unsigned char		priority;
> +	void (*parse)(int opsize, const unsigned char *opptr,
> +		      const struct sk_buff *skb,
> +		      struct tcp_options_received *opt_rx,
> +		      struct sock *sk);
> +	/* Return the number of bytes consumed */
> +	unsigned int (*prepare)(struct sk_buff *skb, u8 flags,
> +				unsigned int remaining,
> +				struct tcp_out_options *opts,
> +				const struct sock *sk);
> +	void (*write)(__be32 *ptr, struct tcp_out_options *opts,
> +		      const struct sock *sk);
> +	struct module		*owner;
> +};
> +
> +void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
> +			     const struct sk_buff *skb,
> +			     struct tcp_options_received *opt_rx,
> +			     struct sock *sk);
> +
> +unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
> +				       unsigned int remaining,
> +				       struct tcp_out_options *opts,
> +				       const struct sock *sk);
> +
> +void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
> +			     const struct sock *sk);
> +
> +int tcp_register_extra_option(struct tcp_extra_option_ops *ops);
> +void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops);
> +
>  #endif	/* _TCP_H */
> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
> index b1bb1b3a1082..6c8d750a2243 100644
> --- a/net/ipv4/syncookies.c
> +++ b/net/ipv4/syncookies.c
> @@ -313,7 +313,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
>  
>  	/* check for timestamp cookie support */
>  	memset(&tcp_opt, 0, sizeof(tcp_opt));
> -	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
> +	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
>  
>  	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
>  		tsoff = secure_tcp_ts_off(sock_net(sk),
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 5091402720ab..8136857b992b 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -270,6 +270,7 @@
>  #include <linux/time.h>
>  #include <linux/slab.h>
>  #include <linux/errqueue.h>
> +#include <linux/static_key.h>
>  
>  #include <net/icmp.h>
>  #include <net/inet_common.h>
> @@ -306,6 +307,13 @@ EXPORT_SYMBOL(tcp_memory_allocated);
>  struct percpu_counter tcp_sockets_allocated;
>  EXPORT_SYMBOL(tcp_sockets_allocated);
>  
> +/*
> + * Optional TCP option handlers
> + */
> +static DEFINE_SPINLOCK(tcp_option_list_lock);
> +static LIST_HEAD(tcp_option_list);
> +DEFINE_STATIC_KEY_FALSE(tcp_extra_options_enabled);
> +
>  /*
>   * TCP splice context
>   */
> @@ -3375,6 +3383,130 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
>  
>  #endif
>  
> +/* Linear search, few entries are expected. The RCU read lock must
> + * be held before calling.
> + */
> +static struct tcp_extra_option_ops *tcp_extra_options_find_kind(unsigned char kind)
> +{
> +	struct tcp_extra_option_ops *entry;
> +
> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> +		if (entry->option_kind == kind)
> +			return entry;
> +	}
> +
> +	return NULL;
> +}
> +
> +void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
> +			     const struct sk_buff *skb,
> +			     struct tcp_options_received *opt_rx,
> +			     struct sock *sk)
> +{
> +	struct tcp_extra_option_ops *entry;
> +
> +	rcu_read_lock();
> +	entry = tcp_extra_options_find_kind(opcode);
> +	if (entry && entry->parse)
> +		entry->parse(opsize, opptr, skb, opt_rx, sk);
> +	rcu_read_unlock();
> +}
> +EXPORT_SYMBOL_GPL(tcp_extra_options_parse);
> +
> +/* The RCU read lock must be held before calling, and should span both
> + * the call to this function and tcp_extra_options_write to ensure that
> + * tcp_option_list does not change between the two calls. To preserve
> + * expected option alignment, always returns a multiple of 4 bytes.
> + */

The RCU read lock won't be able to protect the list from being altered. All
it will take care of is that the elements on the list won't get freed while
it is being traversed and that the next-pointer won't get changed. That way
the list-traversal is still fine.

If we move the extra-options to a mode where the list is on a per-TCP socket
basis, we can avoid handling this, because we can limit adding/removing
TCP-options to happen only when the socket is in TCP_CLOSE. So, once the
connection has started, the list will always remain the same.

> +unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
> +				       unsigned int remaining,
> +				       struct tcp_out_options *opts,
> +				       const struct sock *sk)
> +{
> +	struct tcp_extra_option_ops *entry;
> +	unsigned int used = 0;
> +
> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> +		if (unlikely(!entry->prepare))
> +			continue;
> +
> +		used += entry->prepare(skb, flags, remaining - used, opts, sk);
> +	}
> +
> +	return roundup(used, 4);
> +}
> +EXPORT_SYMBOL_GPL(tcp_extra_options_prepare);
> +
> +/* The RCU read lock must be held before calling, and should span both
> + * the call to tcp_extra_options_prepare and this function to ensure that
> + * tcp_option_list does not change between the two calls.
> + */
> +void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
> +			     const struct sock *sk)
> +{
> +	struct tcp_extra_option_ops *entry;
> +
> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> +		if (unlikely(!entry->write))
> +			continue;
> +
> +		entry->write(ptr, opts, sk);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(tcp_extra_options_write);
> +
> +int tcp_register_extra_option(struct tcp_extra_option_ops *ops)
> +{
> +	struct tcp_extra_option_ops *entry;
> +	struct list_head* add_before = &tcp_option_list;
> +	int ret = 0;
> +
> +	if (!ops->option_kind)
> +		return -EINVAL;
> +
> +	if (!try_module_get(ops->owner))
> +		return -ENOENT;
> +
> +	spin_lock(&tcp_option_list_lock);
> +
> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> +		if (entry->option_kind == ops->option_kind) {
> +			pr_notice("Option kind %u already registered\n",
> +				  ops->option_kind);
> +			spin_unlock(&tcp_option_list_lock);
> +			module_put(ops->owner);
> +			return -EEXIST;
> +		}
> +
> +		if (entry->priority <= ops->priority)
> +			add_before = &entry->list;
> +	}
> +
> +	list_add_tail_rcu(&ops->list, add_before);
> +	pr_debug("Option kind %u registered\n", ops->option_kind);
> +
> +	spin_unlock(&tcp_option_list_lock);
> +
> +	static_branch_inc(&tcp_extra_options_enabled);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(tcp_register_extra_option);
> +
> +void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops)
> +{
> +	spin_lock(&tcp_option_list_lock);
> +	list_del_rcu(&ops->list);
> +	spin_unlock(&tcp_option_list_lock);
> +
> +	synchronize_net();
> +
> +	static_branch_dec(&tcp_extra_options_enabled);
> +
> +	module_put(ops->owner);
> +}
> +EXPORT_SYMBOL_GPL(tcp_unregister_extra_option);
> +
>  void tcp_done(struct sock *sk)
>  {
>  	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
> @@ -3521,6 +3653,7 @@ void __init tcp_init(void)
>  		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
>  	}
>  
> +	INIT_LIST_HEAD(&tcp_option_list);
>  
>  	cnt = tcp_hashinfo.ehash_mask + 1;
>  	sysctl_tcp_max_orphans = cnt / 2;
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index c5d7656beeee..faf3c8d34cec 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3728,7 +3728,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
>  void tcp_parse_options(const struct net *net,
>  		       const struct sk_buff *skb,
>  		       struct tcp_options_received *opt_rx, int estab,
> -		       struct tcp_fastopen_cookie *foc)
> +		       struct tcp_fastopen_cookie *foc, struct tcp_sock *tp)
>  {
>  	const unsigned char *ptr;
>  	const struct tcphdr *th = tcp_hdr(skb);
> @@ -3830,6 +3830,12 @@ void tcp_parse_options(const struct net *net,
>  						ptr + 2, th->syn, foc, true);
>  				break;
>  
> +			default:
> +				tcp_extra_options_parse(opcode, opsize, ptr,
> +							skb, opt_rx,
> +							tcp_to_sk(tp));
> +				break;
> +
>  			}
>  			ptr += opsize-2;
>  			length -= opsize;
> @@ -3876,7 +3882,7 @@ static bool tcp_fast_parse_options(const struct net *net,
>  			return true;
>  	}
>  
> -	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
> +	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL, tp);
>  	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
>  		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
>  
> @@ -5569,7 +5575,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
>  		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
>  		tcp_clear_options(&opt);
>  		opt.user_mss = opt.mss_clamp = 0;
> -		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
> +		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL, tp);
>  		mss = opt.mss_clamp;
>  	}
>  
> @@ -5623,7 +5629,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
>  	int saved_clamp = tp->rx_opt.mss_clamp;
>  	bool fastopen_fail;
>  
> -	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
> +	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc, tp);
>  	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
>  		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
>  
> @@ -6299,7 +6305,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>  	tmp_opt.mss_clamp = af_ops->mss_clamp;
>  	tmp_opt.user_mss  = tp->rx_opt.user_mss;
>  	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
> -			  want_cookie ? NULL : &foc);
> +			  want_cookie ? NULL : &foc, tp);
>  
>  	if (want_cookie && !tmp_opt.saw_tstamp)
>  		tcp_clear_options(&tmp_opt);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index d9416b5162bc..537734e70317 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -598,9 +598,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>  	const struct tcphdr *th = tcp_hdr(skb);
>  	struct {
>  		struct tcphdr th;
> -#ifdef CONFIG_TCP_MD5SIG
> -		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
> -#endif
> +		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
>  	} rep;
>  	struct ip_reply_arg arg;
>  #ifdef CONFIG_TCP_MD5SIG
> @@ -611,6 +609,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>  	struct sock *sk1 = NULL;
>  #endif
>  	struct net *net;
> +	int offset = 0;
>  
>  	/* Never send a reset in response to a reset. */
>  	if (th->rst)
> @@ -676,17 +675,44 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>  			goto out;
>  
>  	}
> +#endif
> +
> +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
> +		unsigned int remaining;
> +		unsigned int used;
> +		struct tcp_out_options opts;
> +
> +		remaining = sizeof(rep.opt);
> +#ifdef CONFIG_TCP_MD5SIG
> +		if (key)
> +			remaining -= TCPOLEN_MD5SIG_ALIGNED;
> +#endif

We will break TCP_MD5 here with this patch if we move it inside the static
branch.
Only after the patch that makes TCP_MD5 adopt the framework can we move this
code here.


Cheers,
Christoph

> +
> +		memset(&opts, 0, sizeof(opts));
> +
> +		rcu_read_lock();
> +		used = tcp_extra_options_prepare(NULL, TCPHDR_RST, remaining,
> +						 &opts, sk);
> +
> +		tcp_extra_options_write(&rep.opt[0], &opts, sk);
> +		rcu_read_unlock();
> +
> +		arg.iov[0].iov_len += used;
> +		offset += used / 4;
> +		rep.th.doff = arg.iov[0].iov_len / 4;
> +	}
>  
> +#ifdef CONFIG_TCP_MD5SIG
>  	if (key) {
> -		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
> -				   (TCPOPT_NOP << 16) |
> -				   (TCPOPT_MD5SIG << 8) |
> -				   TCPOLEN_MD5SIG);
> +		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
> +					  (TCPOPT_NOP << 16) |
> +					  (TCPOPT_MD5SIG << 8) |
> +					  TCPOLEN_MD5SIG);
>  		/* Update length and the length the header thinks exists */
>  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
>  		rep.th.doff = arg.iov[0].iov_len / 4;
>  
> -		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
> +		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
>  				     key, ip_hdr(skb)->saddr,
>  				     ip_hdr(skb)->daddr, &rep.th);
>  	}
> @@ -738,14 +764,11 @@ static void tcp_v4_send_ack(const struct sock *sk,
>  	const struct tcphdr *th = tcp_hdr(skb);
>  	struct {
>  		struct tcphdr th;
> -		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
> -#ifdef CONFIG_TCP_MD5SIG
> -			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
> -#endif
> -			];
> +		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
>  	} rep;
>  	struct net *net = sock_net(sk);
>  	struct ip_reply_arg arg;
> +	int offset = 0;
>  
>  	memset(&rep.th, 0, sizeof(struct tcphdr));
>  	memset(&arg, 0, sizeof(arg));
> @@ -759,33 +782,56 @@ static void tcp_v4_send_ack(const struct sock *sk,
>  		rep.opt[1] = htonl(tsval);
>  		rep.opt[2] = htonl(tsecr);
>  		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
> +		offset += 3;
>  	}
>  
>  	/* Swap the send and the receive. */
>  	rep.th.dest    = th->source;
>  	rep.th.source  = th->dest;
> -	rep.th.doff    = arg.iov[0].iov_len / 4;
>  	rep.th.seq     = htonl(seq);
>  	rep.th.ack_seq = htonl(ack);
>  	rep.th.ack     = 1;
>  	rep.th.window  = htons(win);
>  
> +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
> +		unsigned int remaining;
> +		unsigned int used;
> +		struct tcp_out_options opts;
> +
> +		remaining = sizeof(rep.th) + sizeof(rep.opt) - arg.iov[0].iov_len;
>  #ifdef CONFIG_TCP_MD5SIG
> -	if (key) {
> -		int offset = (tsecr) ? 3 : 0;
> +		if (key)
> +			remaining -= TCPOLEN_MD5SIG_ALIGNED;
> +#endif
>  
> +		memset(&opts, 0, sizeof(opts));
> +		rcu_read_lock();
> +		used = tcp_extra_options_prepare(NULL, TCPHDR_ACK, remaining,
> +						 &opts, sk);
> +
> +		tcp_extra_options_write(&rep.opt[offset], &opts, sk);
> +		rcu_read_unlock();
> +
> +		arg.iov[0].iov_len += used;
> +		offset += used / 4;
> +	}
> +
> +	rep.th.doff = arg.iov[0].iov_len / 4;
> +
> +#ifdef CONFIG_TCP_MD5SIG
> +	if (key) {
>  		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
>  					  (TCPOPT_NOP << 16) |
>  					  (TCPOPT_MD5SIG << 8) |
>  					  TCPOLEN_MD5SIG);
>  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
> -		rep.th.doff = arg.iov[0].iov_len/4;
>  
>  		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
>  				    key, ip_hdr(skb)->saddr,
>  				    ip_hdr(skb)->daddr, &rep.th);
>  	}
>  #endif
> +
>  	arg.flags = reply_flags;
>  	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
>  				      ip_hdr(skb)->saddr, /* XXX */
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 188a6f31356d..1c3e91899dac 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -98,7 +98,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
>  
>  	tmp_opt.saw_tstamp = 0;
>  	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
> -		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
> +		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL, NULL);
>  
>  		if (tmp_opt.saw_tstamp) {
>  			if (tmp_opt.rcv_tsecr)
> @@ -565,7 +565,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
>  
>  	tmp_opt.saw_tstamp = 0;
>  	if (th->doff > (sizeof(struct tcphdr)>>2)) {
> -		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
> +		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL, NULL);
>  
>  		if (tmp_opt.saw_tstamp) {
>  			tmp_opt.ts_recent = req->ts_recent;
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 0bc9e46a5369..61eba3d0ae17 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -41,6 +41,7 @@
>  #include <linux/compiler.h>
>  #include <linux/gfp.h>
>  #include <linux/module.h>
> +#include <linux/static_key.h>
>  
>  /* People can turn this off for buggy TCP's found in printers etc. */
>  int sysctl_tcp_retrans_collapse __read_mostly = 1;
> @@ -413,23 +414,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
>  	return tp->snd_una != tp->snd_up;
>  }
>  
> -#define OPTION_SACK_ADVERTISE	(1 << 0)
> -#define OPTION_TS		(1 << 1)
> -#define OPTION_MD5		(1 << 2)
> -#define OPTION_WSCALE		(1 << 3)
> -#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
> -
> -struct tcp_out_options {
> -	u16 options;		/* bit field of OPTION_* */
> -	u16 mss;		/* 0 to disable */
> -	u8 ws;			/* window scale, 0 to disable */
> -	u8 num_sack_blocks;	/* number of SACK blocks to include */
> -	u8 hash_size;		/* bytes in hash_location */
> -	__u8 *hash_location;	/* temporary pointer, overloaded */
> -	__u32 tsval, tsecr;	/* need to include OPTION_TS */
> -	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
> -};
> -
>  /* Write previously computed TCP options to the packet.
>   *
>   * Beware: Something in the Internet is very sensitive to the ordering of
> @@ -536,6 +520,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
>  		}
>  		ptr += (len + 3) >> 2;
>  	}
> +
> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> +		tcp_extra_options_write(ptr, opts, tcp_to_sk(tp));
>  }
>  
>  /* Compute TCP options for SYN packets. This is not the final
> @@ -603,6 +590,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
>  		}
>  	}
>  
> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> +		remaining -= tcp_extra_options_prepare(skb, TCPHDR_SYN,
> +						       remaining, opts,
> +						       tcp_to_sk(tp));
> +
>  	return MAX_TCP_OPTION_SPACE - remaining;
>  }
>  
> @@ -663,6 +655,12 @@ static unsigned int tcp_synack_options(struct request_sock *req,
>  		}
>  	}
>  
> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> +		remaining -= tcp_extra_options_prepare(skb,
> +						       TCPHDR_SYN | TCPHDR_ACK,
> +						       remaining, opts,
> +						       req_to_sk(req));
> +
>  	return MAX_TCP_OPTION_SPACE - remaining;
>  }
>  
> @@ -696,6 +694,11 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
>  		size += TCPOLEN_TSTAMP_ALIGNED;
>  	}
>  
> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> +		size += tcp_extra_options_prepare(skb, 0,
> +						  MAX_TCP_OPTION_SPACE - size,
> +						  opts, tcp_to_sk(tp));
> +
>  	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
>  	if (unlikely(eff_sacks)) {
>  		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
> @@ -1016,6 +1019,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
>  	tcb = TCP_SKB_CB(skb);
>  	memset(&opts, 0, sizeof(opts));
>  
> +	rcu_read_lock();
>  	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
>  		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
>  	else
> @@ -1092,6 +1096,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
>  					       md5, sk, skb);
>  	}
>  #endif
> +	rcu_read_unlock();
>  
>  	icsk->icsk_af_ops->send_check(sk, skb);
>  
> @@ -3156,8 +3161,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
>  #endif
>  		skb->skb_mstamp = tcp_clock_us();
>  
> -#ifdef CONFIG_TCP_MD5SIG
>  	rcu_read_lock();
> +#ifdef CONFIG_TCP_MD5SIG
>  	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
>  #endif
>  	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
> @@ -3196,8 +3201,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
>  	if (md5)
>  		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
>  					       md5, req_to_sk(req), skb);
> -	rcu_read_unlock();
>  #endif
> +	rcu_read_unlock();
>  
>  	/* Do not fool tcpdump (if any), clean our debris */
>  	skb->tstamp = 0;
> diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
> index 4e7817abc0b9..407480366c73 100644
> --- a/net/ipv6/syncookies.c
> +++ b/net/ipv6/syncookies.c
> @@ -162,7 +162,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
>  
>  	/* check for timestamp cookie support */
>  	memset(&tcp_opt, 0, sizeof(tcp_opt));
> -	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
> +	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
>  
>  	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
>  		tsoff = secure_tcpv6_ts_off(sock_net(sk),
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 64d94afa427f..4a3fba1ef3a2 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -784,9 +784,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>  	struct flowi6 fl6;
>  	struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
>  	struct sock *ctl_sk = net->ipv6.tcp_sk;
> -	unsigned int tot_len = sizeof(struct tcphdr);
> +	unsigned int tot_len = 0;
>  	struct dst_entry *dst;
>  	__be32 *topt;
> +	struct tcp_out_options extraopts;
>  
>  	if (tsecr)
>  		tot_len += TCPOLEN_TSTAMP_ALIGNED;
> @@ -795,10 +796,28 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>  		tot_len += TCPOLEN_MD5SIG_ALIGNED;
>  #endif
>  
> +	rcu_read_lock();
> +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
> +		unsigned int remaining = MAX_TCP_OPTION_SPACE - tot_len;
> +		u8 extraflags = rst ? TCPHDR_RST : 0;
> +
> +		if (!rst || !th->ack)
> +			extraflags |= TCPHDR_ACK;
> +
> +		memset(&extraopts, 0, sizeof(extraopts));
> +
> +		tot_len += tcp_extra_options_prepare(skb, extraflags, remaining,
> +						     &extraopts, sk);
> +	}
> +
> +	tot_len += sizeof(struct tcphdr);
> +
>  	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
>  			 GFP_ATOMIC);
> -	if (!buff)
> +	if (!buff) {
> +		rcu_read_unlock();
>  		return;
> +	}
>  
>  	skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
>  
> @@ -835,6 +854,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>  	}
>  #endif
>  
> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> +		tcp_extra_options_write(topt, &extraopts, sk);
> +
> +	rcu_read_unlock();
> +
>  	memset(&fl6, 0, sizeof(fl6));
>  	fl6.daddr = ipv6_hdr(skb)->saddr;
>  	fl6.saddr = ipv6_hdr(skb)->daddr;
> -- 
> 2.14.2
> 
> _______________________________________________
> mptcp mailing list
> mptcp(a)lists.01.org
> https://lists.01.org/mailman/listinfo/mptcp

^ permalink raw reply	[flat|nested] 5+ messages in thread
* Re: [MPTCP] [PATCH v2] tcp: Register handlers for extra TCP options
@ 2017-10-12 19:56 Christoph Paasch
  0 siblings, 0 replies; 5+ messages in thread
From: Christoph Paasch @ 2017-10-12 19:56 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 27006 bytes --]

On 04/10/17 - 13:36:16, Mat Martineau wrote:
> Allow additional TCP options to be handled by registered hook
> functions.
> 
> Registered options have a priority that determines the order in which
> options are prepared and written. Lower priority numbers are handled
> first.
> 
> Option parsing will call the provided 'parse' function when a TCP option
> number is not recognized by the normal option parsing code.
> 
> The 'prepare' function determines the required space for registered
> options and stores associated data. 'write' adds the option to the TCP
> header.
> 
> A static key and RCU synchronization are used to minimize the
> performance impact of these extensible TCP features.
> 
> Signed-off-by: Mat Martineau <mathew.j.martineau(a)linux.intel.com>
> ---
> 
> Changes from v1: One 'prepare' callback (no more special callback for
> request_sock), and add a few missing callback sites (like ipv6).
> 
> drivers/infiniband/hw/cxgb4/cm.c |   2 +-
>  include/linux/tcp.h              |  22 +++++++
>  include/net/tcp.h                |  40 +++++++++++-
>  net/ipv4/syncookies.c            |   2 +-
>  net/ipv4/tcp.c                   | 133 +++++++++++++++++++++++++++++++++++++++
>  net/ipv4/tcp_input.c             |  16 +++--
>  net/ipv4/tcp_ipv4.c              |  80 ++++++++++++++++++-----
>  net/ipv4/tcp_minisocks.c         |   4 +-
>  net/ipv4/tcp_output.c            |  43 +++++++------
>  net/ipv6/syncookies.c            |   2 +-
>  net/ipv6/tcp_ipv6.c              |  28 ++++++++-
>  11 files changed, 323 insertions(+), 49 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
> index daf7a56e5d7e..c3eb31611011 100644
> --- a/drivers/infiniband/hw/cxgb4/cm.c
> +++ b/drivers/infiniband/hw/cxgb4/cm.c
> @@ -3752,7 +3752,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
>  	 */
>  	memset(&tmp_opt, 0, sizeof(tmp_opt));
>  	tcp_clear_options(&tmp_opt);
> -	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
> +	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL, NULL);
>  
>  	req = __skb_push(skb, sizeof(*req));
>  	memset(req, 0, sizeof(*req));
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 4aa40ef02d32..0347e6ce99be 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -112,6 +112,23 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
>  	rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
>  }
>  
> +#define OPTION_SACK_ADVERTISE	(1 << 0)
> +#define OPTION_TS		(1 << 1)
> +#define OPTION_MD5		(1 << 2)
> +#define OPTION_WSCALE		(1 << 3)
> +#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
> +
> +struct tcp_out_options {
> +	u16 options;		/* bit field of OPTION_* */
> +	u16 mss;		/* 0 to disable */
> +	u8 ws;			/* window scale, 0 to disable */
> +	u8 num_sack_blocks;	/* number of SACK blocks to include */
> +	u8 hash_size;		/* bytes in hash_location */
> +	__u8 *hash_location;	/* temporary pointer, overloaded */
> +	__u32 tsval, tsecr;	/* need to include OPTION_TS */
> +	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
> +};
> +
>  /* This is the max number of SACKS that we'll generate and process. It's safe
>   * to increase this, although since:
>   *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
> @@ -389,6 +406,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
>  	return (struct tcp_sock *)sk;
>  }
>  
> +static inline struct sock *tcp_to_sk(const struct tcp_sock *tp)
> +{
> +	return (struct sock *)tp;
> +}
> +
>  struct tcp_timewait_sock {
>  	struct inet_timewait_sock tw_sk;
>  #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 3bc910a9bfc6..04f3dcecf592 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -433,7 +433,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
>  		int flags, int *addr_len);
>  void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
>  		       struct tcp_options_received *opt_rx,
> -		       int estab, struct tcp_fastopen_cookie *foc);
> +		       int estab, struct tcp_fastopen_cookie *foc,
> +		       struct tcp_sock *tp);
>  const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
>  
>  /*
> @@ -2109,4 +2110,41 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
>  {
>  	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
>  }
> +
> +extern struct static_key_false tcp_extra_options_enabled;
> +
> +struct tcp_extra_option_ops {
> +	struct list_head	list;
> +	unsigned char		option_kind;
> +	unsigned char		priority;
> +	void (*parse)(int opsize, const unsigned char *opptr,
> +		      const struct sk_buff *skb,
> +		      struct tcp_options_received *opt_rx,
> +		      struct sock *sk);
> +	/* Return the number of bytes consumed */
> +	unsigned int (*prepare)(struct sk_buff *skb, u8 flags,
> +				unsigned int remaining,
> +				struct tcp_out_options *opts,
> +				const struct sock *sk);
> +	void (*write)(__be32 *ptr, struct tcp_out_options *opts,
> +		      const struct sock *sk);
> +	struct module		*owner;
> +};
> +
> +void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
> +			     const struct sk_buff *skb,
> +			     struct tcp_options_received *opt_rx,
> +			     struct sock *sk);
> +
> +unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
> +				       unsigned int remaining,
> +				       struct tcp_out_options *opts,
> +				       const struct sock *sk);
> +
> +void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
> +			     const struct sock *sk);
> +
> +int tcp_register_extra_option(struct tcp_extra_option_ops *ops);
> +void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops);
> +
>  #endif	/* _TCP_H */
> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
> index b1bb1b3a1082..6c8d750a2243 100644
> --- a/net/ipv4/syncookies.c
> +++ b/net/ipv4/syncookies.c
> @@ -313,7 +313,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
>  
>  	/* check for timestamp cookie support */
>  	memset(&tcp_opt, 0, sizeof(tcp_opt));
> -	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
> +	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
>  
>  	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
>  		tsoff = secure_tcp_ts_off(sock_net(sk),
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 5091402720ab..8136857b992b 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -270,6 +270,7 @@
>  #include <linux/time.h>
>  #include <linux/slab.h>
>  #include <linux/errqueue.h>
> +#include <linux/static_key.h>
>  
>  #include <net/icmp.h>
>  #include <net/inet_common.h>
> @@ -306,6 +307,13 @@ EXPORT_SYMBOL(tcp_memory_allocated);
>  struct percpu_counter tcp_sockets_allocated;
>  EXPORT_SYMBOL(tcp_sockets_allocated);
>  
> +/*
> + * Optional TCP option handlers
> + */
> +static DEFINE_SPINLOCK(tcp_option_list_lock);
> +static LIST_HEAD(tcp_option_list);
> +DEFINE_STATIC_KEY_FALSE(tcp_extra_options_enabled);
> +
>  /*
>   * TCP splice context
>   */
> @@ -3375,6 +3383,130 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
>  
>  #endif
>  
> +/* Linear search, few entries are expected. The RCU read lock must
> + * be held before calling.
> + */
> +static struct tcp_extra_option_ops *tcp_extra_options_find_kind(unsigned char kind)
> +{
> +	struct tcp_extra_option_ops *entry;
> +
> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> +		if (entry->option_kind == kind)
> +			return entry;
> +	}
> +
> +	return NULL;
> +}
> +
> +void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
> +			     const struct sk_buff *skb,
> +			     struct tcp_options_received *opt_rx,
> +			     struct sock *sk)
> +{
> +	struct tcp_extra_option_ops *entry;
> +
> +	rcu_read_lock();
> +	entry = tcp_extra_options_find_kind(opcode);
> +	if (entry && entry->parse)
> +		entry->parse(opsize, opptr, skb, opt_rx, sk);
> +	rcu_read_unlock();
> +}
> +EXPORT_SYMBOL_GPL(tcp_extra_options_parse);
> +
> +/* The RCU read lock must be held before calling, and should span both
> + * the call to this function and tcp_extra_options_write to ensure that
> + * tcp_option_list does not change between the two calls. To preserve
> + * expected option alignment, always returns a multiple of 4 bytes.
> + */
> +unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
> +				       unsigned int remaining,
> +				       struct tcp_out_options *opts,
> +				       const struct sock *sk)
> +{
> +	struct tcp_extra_option_ops *entry;
> +	unsigned int used = 0;
> +
> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> +		if (unlikely(!entry->prepare))
> +			continue;
> +
> +		used += entry->prepare(skb, flags, remaining - used, opts, sk);
> +	}
> +
> +	return roundup(used, 4);
> +}
> +EXPORT_SYMBOL_GPL(tcp_extra_options_prepare);
> +
> +/* The RCU read lock must be held before calling, and should span both
> + * the call to tcp_extra_options_prepare and this function to ensure that
> + * tcp_option_list does not change between the two calls.
> + */
> +void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
> +			     const struct sock *sk)
> +{
> +	struct tcp_extra_option_ops *entry;
> +
> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> +		if (unlikely(!entry->write))
> +			continue;
> +
> +		entry->write(ptr, opts, sk);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(tcp_extra_options_write);
> +
> +int tcp_register_extra_option(struct tcp_extra_option_ops *ops)
> +{
> +	struct tcp_extra_option_ops *entry;
> +	struct list_head* add_before = &tcp_option_list;
> +	int ret = 0;
> +
> +	if (!ops->option_kind)
> +		return -EINVAL;
> +
> +	if (!try_module_get(ops->owner))
> +		return -ENOENT;
> +
> +	spin_lock(&tcp_option_list_lock);
> +
> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> +		if (entry->option_kind == ops->option_kind) {
> +			pr_notice("Option kind %u already registered\n",
> +				  ops->option_kind);
> +			spin_unlock(&tcp_option_list_lock);
> +			module_put(ops->owner);
> +			return -EEXIST;
> +		}
> +
> +		if (entry->priority <= ops->priority)
> +			add_before = &entry->list;
> +	}
> +
> +	list_add_tail_rcu(&ops->list, add_before);
> +	pr_debug("Option kind %u registered\n", ops->option_kind);
> +
> +	spin_unlock(&tcp_option_list_lock);
> +
> +	static_branch_inc(&tcp_extra_options_enabled);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(tcp_register_extra_option);
> +
> +void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops)
> +{
> +	spin_lock(&tcp_option_list_lock);
> +	list_del_rcu(&ops->list);
> +	spin_unlock(&tcp_option_list_lock);
> +
> +	synchronize_net();
> +
> +	static_branch_dec(&tcp_extra_options_enabled);
> +
> +	module_put(ops->owner);
> +}
> +EXPORT_SYMBOL_GPL(tcp_unregister_extra_option);
> +
>  void tcp_done(struct sock *sk)
>  {
>  	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
> @@ -3521,6 +3653,7 @@ void __init tcp_init(void)
>  		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
>  	}
>  
> +	INIT_LIST_HEAD(&tcp_option_list);
>  
>  	cnt = tcp_hashinfo.ehash_mask + 1;
>  	sysctl_tcp_max_orphans = cnt / 2;
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index c5d7656beeee..faf3c8d34cec 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3728,7 +3728,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
>  void tcp_parse_options(const struct net *net,
>  		       const struct sk_buff *skb,
>  		       struct tcp_options_received *opt_rx, int estab,
> -		       struct tcp_fastopen_cookie *foc)
> +		       struct tcp_fastopen_cookie *foc, struct tcp_sock *tp)
>  {
>  	const unsigned char *ptr;
>  	const struct tcphdr *th = tcp_hdr(skb);
> @@ -3830,6 +3830,12 @@ void tcp_parse_options(const struct net *net,
>  						ptr + 2, th->syn, foc, true);
>  				break;
>  
> +			default:
> +				tcp_extra_options_parse(opcode, opsize, ptr,
> +							skb, opt_rx,
> +							tcp_to_sk(tp));
> +				break;
> +
>  			}
>  			ptr += opsize-2;
>  			length -= opsize;
> @@ -3876,7 +3882,7 @@ static bool tcp_fast_parse_options(const struct net *net,
>  			return true;
>  	}
>  
> -	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
> +	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL, tp);
>  	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
>  		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
>  
> @@ -5569,7 +5575,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
>  		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
>  		tcp_clear_options(&opt);
>  		opt.user_mss = opt.mss_clamp = 0;
> -		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
> +		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL, tp);
>  		mss = opt.mss_clamp;
>  	}
>  
> @@ -5623,7 +5629,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
>  	int saved_clamp = tp->rx_opt.mss_clamp;
>  	bool fastopen_fail;
>  
> -	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
> +	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc, tp);
>  	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
>  		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
>  
> @@ -6299,7 +6305,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>  	tmp_opt.mss_clamp = af_ops->mss_clamp;
>  	tmp_opt.user_mss  = tp->rx_opt.user_mss;
>  	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
> -			  want_cookie ? NULL : &foc);
> +			  want_cookie ? NULL : &foc, tp);
>  
>  	if (want_cookie && !tmp_opt.saw_tstamp)
>  		tcp_clear_options(&tmp_opt);
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index d9416b5162bc..537734e70317 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -598,9 +598,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>  	const struct tcphdr *th = tcp_hdr(skb);
>  	struct {
>  		struct tcphdr th;
> -#ifdef CONFIG_TCP_MD5SIG
> -		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
> -#endif
> +		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
>  	} rep;
>  	struct ip_reply_arg arg;
>  #ifdef CONFIG_TCP_MD5SIG
> @@ -611,6 +609,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>  	struct sock *sk1 = NULL;
>  #endif
>  	struct net *net;
> +	int offset = 0;
>  
>  	/* Never send a reset in response to a reset. */
>  	if (th->rst)
> @@ -676,17 +675,44 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>  			goto out;
>  
>  	}
> +#endif
> +
> +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
> +		unsigned int remaining;
> +		unsigned int used;
> +		struct tcp_out_options opts;
> +
> +		remaining = sizeof(rep.opt);
> +#ifdef CONFIG_TCP_MD5SIG
> +		if (key)
> +			remaining -= TCPOLEN_MD5SIG_ALIGNED;
> +#endif
> +
> +		memset(&opts, 0, sizeof(opts));
> +
> +		rcu_read_lock();
> +		used = tcp_extra_options_prepare(NULL, TCPHDR_RST, remaining,
> +						 &opts, sk);
> +
> +		tcp_extra_options_write(&rep.opt[0], &opts, sk);
> +		rcu_read_unlock();
> +
> +		arg.iov[0].iov_len += used;
> +		offset += used / 4;
> +		rep.th.doff = arg.iov[0].iov_len / 4;
> +	}
>  
> +#ifdef CONFIG_TCP_MD5SIG
>  	if (key) {
> -		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
> -				   (TCPOPT_NOP << 16) |
> -				   (TCPOPT_MD5SIG << 8) |
> -				   TCPOLEN_MD5SIG);
> +		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
> +					  (TCPOPT_NOP << 16) |
> +					  (TCPOPT_MD5SIG << 8) |
> +					  TCPOLEN_MD5SIG);
>  		/* Update length and the length the header thinks exists */
>  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
>  		rep.th.doff = arg.iov[0].iov_len / 4;
>  
> -		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
> +		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
>  				     key, ip_hdr(skb)->saddr,
>  				     ip_hdr(skb)->daddr, &rep.th);
>  	}
> @@ -738,14 +764,11 @@ static void tcp_v4_send_ack(const struct sock *sk,
>  	const struct tcphdr *th = tcp_hdr(skb);
>  	struct {
>  		struct tcphdr th;
> -		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
> -#ifdef CONFIG_TCP_MD5SIG
> -			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
> -#endif
> -			];
> +		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
>  	} rep;
>  	struct net *net = sock_net(sk);
>  	struct ip_reply_arg arg;
> +	int offset = 0;
>  
>  	memset(&rep.th, 0, sizeof(struct tcphdr));
>  	memset(&arg, 0, sizeof(arg));
> @@ -759,33 +782,56 @@ static void tcp_v4_send_ack(const struct sock *sk,
>  		rep.opt[1] = htonl(tsval);
>  		rep.opt[2] = htonl(tsecr);
>  		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
> +		offset += 3;
>  	}
>  
>  	/* Swap the send and the receive. */
>  	rep.th.dest    = th->source;
>  	rep.th.source  = th->dest;
> -	rep.th.doff    = arg.iov[0].iov_len / 4;
>  	rep.th.seq     = htonl(seq);
>  	rep.th.ack_seq = htonl(ack);
>  	rep.th.ack     = 1;
>  	rep.th.window  = htons(win);
>  
> +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
> +		unsigned int remaining;
> +		unsigned int used;
> +		struct tcp_out_options opts;
> +
> +		remaining = sizeof(rep.th) + sizeof(rep.opt) - arg.iov[0].iov_len;
>  #ifdef CONFIG_TCP_MD5SIG
> -	if (key) {
> -		int offset = (tsecr) ? 3 : 0;
> +		if (key)
> +			remaining -= TCPOLEN_MD5SIG_ALIGNED;
> +#endif
>  
> +		memset(&opts, 0, sizeof(opts));
> +		rcu_read_lock();
> +		used = tcp_extra_options_prepare(NULL, TCPHDR_ACK, remaining,
> +						 &opts, sk);
> +
> +		tcp_extra_options_write(&rep.opt[offset], &opts, sk);
> +		rcu_read_unlock();
> +
> +		arg.iov[0].iov_len += used;
> +		offset += used / 4;
> +	}
> +
> +	rep.th.doff = arg.iov[0].iov_len / 4;
> +
> +#ifdef CONFIG_TCP_MD5SIG
> +	if (key) {
>  		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
>  					  (TCPOPT_NOP << 16) |
>  					  (TCPOPT_MD5SIG << 8) |
>  					  TCPOLEN_MD5SIG);
>  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
> -		rep.th.doff = arg.iov[0].iov_len/4;
>  
>  		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
>  				    key, ip_hdr(skb)->saddr,
>  				    ip_hdr(skb)->daddr, &rep.th);
>  	}
>  #endif
> +
>  	arg.flags = reply_flags;
>  	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
>  				      ip_hdr(skb)->saddr, /* XXX */
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 188a6f31356d..1c3e91899dac 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -98,7 +98,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
>  
>  	tmp_opt.saw_tstamp = 0;
>  	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
> -		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
> +		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL, NULL);
>  
>  		if (tmp_opt.saw_tstamp) {
>  			if (tmp_opt.rcv_tsecr)
> @@ -565,7 +565,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
>  
>  	tmp_opt.saw_tstamp = 0;
>  	if (th->doff > (sizeof(struct tcphdr)>>2)) {
> -		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
> +		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL, NULL);
>  
>  		if (tmp_opt.saw_tstamp) {
>  			tmp_opt.ts_recent = req->ts_recent;
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 0bc9e46a5369..61eba3d0ae17 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -41,6 +41,7 @@
>  #include <linux/compiler.h>
>  #include <linux/gfp.h>
>  #include <linux/module.h>
> +#include <linux/static_key.h>
>  
>  /* People can turn this off for buggy TCP's found in printers etc. */
>  int sysctl_tcp_retrans_collapse __read_mostly = 1;
> @@ -413,23 +414,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
>  	return tp->snd_una != tp->snd_up;
>  }
>  
> -#define OPTION_SACK_ADVERTISE	(1 << 0)
> -#define OPTION_TS		(1 << 1)
> -#define OPTION_MD5		(1 << 2)
> -#define OPTION_WSCALE		(1 << 3)
> -#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
> -
> -struct tcp_out_options {
> -	u16 options;		/* bit field of OPTION_* */
> -	u16 mss;		/* 0 to disable */
> -	u8 ws;			/* window scale, 0 to disable */
> -	u8 num_sack_blocks;	/* number of SACK blocks to include */
> -	u8 hash_size;		/* bytes in hash_location */
> -	__u8 *hash_location;	/* temporary pointer, overloaded */
> -	__u32 tsval, tsecr;	/* need to include OPTION_TS */
> -	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
> -};
> -
>  /* Write previously computed TCP options to the packet.
>   *
>   * Beware: Something in the Internet is very sensitive to the ordering of
> @@ -536,6 +520,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
>  		}
>  		ptr += (len + 3) >> 2;
>  	}
> +
> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> +		tcp_extra_options_write(ptr, opts, tcp_to_sk(tp));
>  }
>  
>  /* Compute TCP options for SYN packets. This is not the final
> @@ -603,6 +590,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
>  		}
>  	}
>  
> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> +		remaining -= tcp_extra_options_prepare(skb, TCPHDR_SYN,
> +						       remaining, opts,
> +						       tcp_to_sk(tp));
> +
>  	return MAX_TCP_OPTION_SPACE - remaining;
>  }
>  
> @@ -663,6 +655,12 @@ static unsigned int tcp_synack_options(struct request_sock *req,
>  		}
>  	}
>  
> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> +		remaining -= tcp_extra_options_prepare(skb,
> +						       TCPHDR_SYN | TCPHDR_ACK,
> +						       remaining, opts,
> +						       req_to_sk(req));
> +
>  	return MAX_TCP_OPTION_SPACE - remaining;
>  }
>  
> @@ -696,6 +694,11 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
>  		size += TCPOLEN_TSTAMP_ALIGNED;
>  	}
>  
> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> +		size += tcp_extra_options_prepare(skb, 0,
> +						  MAX_TCP_OPTION_SPACE - size,
> +						  opts, tcp_to_sk(tp));
> +
>  	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
>  	if (unlikely(eff_sacks)) {
>  		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
> @@ -1016,6 +1019,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
>  	tcb = TCP_SKB_CB(skb);
>  	memset(&opts, 0, sizeof(opts));
>  
> +	rcu_read_lock();
>  	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
>  		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
>  	else
> @@ -1092,6 +1096,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
>  					       md5, sk, skb);
>  	}
>  #endif
> +	rcu_read_unlock();
>  
>  	icsk->icsk_af_ops->send_check(sk, skb);
>  
> @@ -3156,8 +3161,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
>  #endif
>  		skb->skb_mstamp = tcp_clock_us();
>  
> -#ifdef CONFIG_TCP_MD5SIG
>  	rcu_read_lock();
> +#ifdef CONFIG_TCP_MD5SIG
>  	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
>  #endif
>  	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
> @@ -3196,8 +3201,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
>  	if (md5)
>  		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
>  					       md5, req_to_sk(req), skb);
> -	rcu_read_unlock();
>  #endif
> +	rcu_read_unlock();
>  
>  	/* Do not fool tcpdump (if any), clean our debris */
>  	skb->tstamp = 0;
> diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
> index 4e7817abc0b9..407480366c73 100644
> --- a/net/ipv6/syncookies.c
> +++ b/net/ipv6/syncookies.c
> @@ -162,7 +162,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
>  
>  	/* check for timestamp cookie support */
>  	memset(&tcp_opt, 0, sizeof(tcp_opt));
> -	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
> +	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
>  
>  	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
>  		tsoff = secure_tcpv6_ts_off(sock_net(sk),
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 64d94afa427f..4a3fba1ef3a2 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -784,9 +784,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>  	struct flowi6 fl6;
>  	struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
>  	struct sock *ctl_sk = net->ipv6.tcp_sk;
> -	unsigned int tot_len = sizeof(struct tcphdr);
> +	unsigned int tot_len = 0;
>  	struct dst_entry *dst;
>  	__be32 *topt;
> +	struct tcp_out_options extraopts;
>  
>  	if (tsecr)
>  		tot_len += TCPOLEN_TSTAMP_ALIGNED;
> @@ -795,10 +796,28 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>  		tot_len += TCPOLEN_MD5SIG_ALIGNED;
>  #endif
>  
> +	rcu_read_lock();
> +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
> +		unsigned int remaining = MAX_TCP_OPTION_SPACE - tot_len;
> +		u8 extraflags = rst ? TCPHDR_RST : 0;
> +
> +		if (!rst || !th->ack)
> +			extraflags |= TCPHDR_ACK;
> +
> +		memset(&extraopts, 0, sizeof(extraopts));
> +
> +		tot_len += tcp_extra_options_prepare(skb, extraflags, remaining,
> +						     &extraopts, sk);
> +	}
> +
> +	tot_len += sizeof(struct tcphdr);
> +
>  	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
>  			 GFP_ATOMIC);
> -	if (!buff)
> +	if (!buff) {
> +		rcu_read_unlock();
>  		return;
> +	}
>  
>  	skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
>  
> @@ -835,6 +854,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>  	}
>  #endif
>  
> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> +		tcp_extra_options_write(topt, &extraopts, sk);

I'm currently rebasing my changes on top of this patch here. We will have to
see how to handle this here correctly.

That is because MD5 needs some more info to generate the hash correctly,
notably the IP addresses and the TCP header.

tcp_extra_options_write done in this way here won't suffice.

I will see how to handle this and resubmit with a v2 of my series.



Christoph


> +
> +	rcu_read_unlock();
> +
>  	memset(&fl6, 0, sizeof(fl6));
>  	fl6.daddr = ipv6_hdr(skb)->saddr;
>  	fl6.saddr = ipv6_hdr(skb)->daddr;
> -- 
> 2.14.2
> 
> _______________________________________________
> mptcp mailing list
> mptcp(a)lists.01.org
> https://lists.01.org/mailman/listinfo/mptcp

^ permalink raw reply	[flat|nested] 5+ messages in thread
* Re: [MPTCP] [PATCH v2] tcp: Register handlers for extra TCP options
@ 2017-10-06  5:28 Christoph Paasch
  0 siblings, 0 replies; 5+ messages in thread
From: Christoph Paasch @ 2017-10-06  5:28 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 32449 bytes --]

Hello,

On 05/10/17 - 11:56:48, Mat Martineau wrote:
> On Thu, 5 Oct 2017, Christoph Paasch wrote:
> > On 04/10/17 - 13:36:16, Mat Martineau wrote:
> > > Allow additional TCP options to be handled by registered hook
> > > functions.
> > > 
> > > Registered options have a priority that determines the order in which
> > > options are prepared and written. Lower priority numbers are handled
> > > first.
> > > 
> > > Option parsing will call the provided 'parse' function when a TCP option
> > > number is not recognized by the normal option parsing code.
> > > 
> > > The 'prepare' function determines the required space for registered
> > > options and store associated data. 'write' adds the option to the TCP
> > > header.
> > > 
> > > A static key and RCU synchronization are used to minimize the
> > > performance impact of these extensible TCP features.
> > > 
> > > Signed-off-by: Mat Martineau <mathew.j.martineau(a)linux.intel.com>
> > > ---
> > > 
> > > Changes from v1: One 'prepare' callback (no more special callback for
> > > request_sock), and add a few missing callback sites (like ipv6).
> > 
> > great, I like that we now have only one 'prepare' callback.
> > 
> > > 
> > > drivers/infiniband/hw/cxgb4/cm.c |   2 +-
> > >  include/linux/tcp.h              |  22 +++++++
> > >  include/net/tcp.h                |  40 +++++++++++-
> > >  net/ipv4/syncookies.c            |   2 +-
> > >  net/ipv4/tcp.c                   | 133 +++++++++++++++++++++++++++++++++++++++
> > >  net/ipv4/tcp_input.c             |  16 +++--
> > >  net/ipv4/tcp_ipv4.c              |  80 ++++++++++++++++++-----
> > >  net/ipv4/tcp_minisocks.c         |   4 +-
> > >  net/ipv4/tcp_output.c            |  43 +++++++------
> > >  net/ipv6/syncookies.c            |   2 +-
> > >  net/ipv6/tcp_ipv6.c              |  28 ++++++++-
> > >  11 files changed, 323 insertions(+), 49 deletions(-)
> > > 
> > > diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
> > > index daf7a56e5d7e..c3eb31611011 100644
> > > --- a/drivers/infiniband/hw/cxgb4/cm.c
> > > +++ b/drivers/infiniband/hw/cxgb4/cm.c
> > > @@ -3752,7 +3752,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
> > >  	 */
> > >  	memset(&tmp_opt, 0, sizeof(tmp_opt));
> > >  	tcp_clear_options(&tmp_opt);
> > > -	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
> > > +	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL, NULL);
> > > 
> > >  	req = __skb_push(skb, sizeof(*req));
> > >  	memset(req, 0, sizeof(*req));
> > > diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> > > index 4aa40ef02d32..0347e6ce99be 100644
> > > --- a/include/linux/tcp.h
> > > +++ b/include/linux/tcp.h
> > > @@ -112,6 +112,23 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
> > >  	rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
> > >  }
> > > 
> > > +#define OPTION_SACK_ADVERTISE	(1 << 0)
> > > +#define OPTION_TS		(1 << 1)
> > > +#define OPTION_MD5		(1 << 2)
> > > +#define OPTION_WSCALE		(1 << 3)
> > > +#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
> > > +
> > > +struct tcp_out_options {
> > > +	u16 options;		/* bit field of OPTION_* */
> > > +	u16 mss;		/* 0 to disable */
> > > +	u8 ws;			/* window scale, 0 to disable */
> > > +	u8 num_sack_blocks;	/* number of SACK blocks to include */
> > > +	u8 hash_size;		/* bytes in hash_location */
> > > +	__u8 *hash_location;	/* temporary pointer, overloaded */
> > > +	__u32 tsval, tsecr;	/* need to include OPTION_TS */
> > > +	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
> > > +};
> > > +
> > >  /* This is the max number of SACKS that we'll generate and process. It's safe
> > >   * to increase this, although since:
> > >   *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
> > > @@ -389,6 +406,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
> > >  	return (struct tcp_sock *)sk;
> > >  }
> > > 
> > > +static inline struct sock *tcp_to_sk(const struct tcp_sock *tp)
> > > +{
> > > +	return (struct sock *)tp;
> > > +}
> > 
> > Nice little function :)
> > 
> > > +
> > >  struct tcp_timewait_sock {
> > >  	struct inet_timewait_sock tw_sk;
> > >  #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
> > > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > > index 3bc910a9bfc6..04f3dcecf592 100644
> > > --- a/include/net/tcp.h
> > > +++ b/include/net/tcp.h
> > > @@ -433,7 +433,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
> > >  		int flags, int *addr_len);
> > >  void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
> > >  		       struct tcp_options_received *opt_rx,
> > > -		       int estab, struct tcp_fastopen_cookie *foc);
> > > +		       int estab, struct tcp_fastopen_cookie *foc,
> > > +		       struct tcp_sock *tp);
> > >  const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
> > > 
> > >  /*
> > > @@ -2109,4 +2110,41 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
> > >  {
> > >  	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
> > >  }
> > > +
> > > +extern struct static_key_false tcp_extra_options_enabled;
> > > +
> > > +struct tcp_extra_option_ops {
> > > +	struct list_head	list;
> > > +	unsigned char		option_kind;
> > > +	unsigned char		priority;
> > > +	void (*parse)(int opsize, const unsigned char *opptr,
> > > +		      const struct sk_buff *skb,
> > > +		      struct tcp_options_received *opt_rx,
> > > +		      struct sock *sk);
> > > +	/* Return the number of bytes consumed */
> > > +	unsigned int (*prepare)(struct sk_buff *skb, u8 flags,
> > > +				unsigned int remaining,
> > > +				struct tcp_out_options *opts,
> > > +				const struct sock *sk);
> > > +	void (*write)(__be32 *ptr, struct tcp_out_options *opts,
> > > +		      const struct sock *sk);
> > > +	struct module		*owner;
> > > +};
> > > +
> > > +void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
> > > +			     const struct sk_buff *skb,
> > > +			     struct tcp_options_received *opt_rx,
> > > +			     struct sock *sk);
> > > +
> > > +unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
> > > +				       unsigned int remaining,
> > > +				       struct tcp_out_options *opts,
> > > +				       const struct sock *sk);
> > > +
> > > +void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
> > > +			     const struct sock *sk);
> > > +
> > > +int tcp_register_extra_option(struct tcp_extra_option_ops *ops);
> > > +void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops);
> > > +
> > >  #endif	/* _TCP_H */
> > > diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
> > > index b1bb1b3a1082..6c8d750a2243 100644
> > > --- a/net/ipv4/syncookies.c
> > > +++ b/net/ipv4/syncookies.c
> > > @@ -313,7 +313,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
> > > 
> > >  	/* check for timestamp cookie support */
> > >  	memset(&tcp_opt, 0, sizeof(tcp_opt));
> > > -	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
> > > +	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
> > > 
> > >  	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
> > >  		tsoff = secure_tcp_ts_off(sock_net(sk),
> > > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> > > index 5091402720ab..8136857b992b 100644
> > > --- a/net/ipv4/tcp.c
> > > +++ b/net/ipv4/tcp.c
> > > @@ -270,6 +270,7 @@
> > >  #include <linux/time.h>
> > >  #include <linux/slab.h>
> > >  #include <linux/errqueue.h>
> > > +#include <linux/static_key.h>
> > > 
> > >  #include <net/icmp.h>
> > >  #include <net/inet_common.h>
> > > @@ -306,6 +307,13 @@ EXPORT_SYMBOL(tcp_memory_allocated);
> > >  struct percpu_counter tcp_sockets_allocated;
> > >  EXPORT_SYMBOL(tcp_sockets_allocated);
> > > 
> > > +/*
> > > + * Optional TCP option handlers
> > > + */
> > > +static DEFINE_SPINLOCK(tcp_option_list_lock);
> > > +static LIST_HEAD(tcp_option_list);
> > > +DEFINE_STATIC_KEY_FALSE(tcp_extra_options_enabled);
> > > +
> > >  /*
> > >   * TCP splice context
> > >   */
> > > @@ -3375,6 +3383,130 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
> > > 
> > >  #endif
> > > 
> > > +/* Linear search, few entries are expected. The RCU read lock must
> > > + * be held before calling.
> > > + */
> > > +static struct tcp_extra_option_ops *tcp_extra_options_find_kind(unsigned char kind)
> > > +{
> > > +	struct tcp_extra_option_ops *entry;
> > > +
> > > +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> > > +		if (entry->option_kind == kind)
> > > +			return entry;
> > > +	}
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > > +void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
> > > +			     const struct sk_buff *skb,
> > > +			     struct tcp_options_received *opt_rx,
> > > +			     struct sock *sk)
> > > +{
> > > +	struct tcp_extra_option_ops *entry;
> > > +
> > > +	rcu_read_lock();
> > > +	entry = tcp_extra_options_find_kind(opcode);
> > > +	if (entry && entry->parse)
> > > +		entry->parse(opsize, opptr, skb, opt_rx, sk);
> > > +	rcu_read_unlock();
> > > +}
> > > +EXPORT_SYMBOL_GPL(tcp_extra_options_parse);
> > > +
> > > +/* The RCU read lock must be held before calling, and should span both
> > > + * the call to this function and tcp_extra_options_write to ensure that
> > > + * tcp_option_list does not change between the two calls. To preserve
> > > + * expected option alignment, always returns a multiple of 4 bytes.
> > > + */
> > 
> > The RCU read lock won't be able to protect the list from being altered. All
> > it will take care of is that the elements on the list won't get free'd while
> > traversing it and that the next-pointer won't get changed. That way the
> > list-traversal is still fine.
> 
> Thanks for the description. If we switch to per-socket lists then RCU won't
> be necessary, but I think it's possible to coax RCU into the behavior I
> want if:
> 
> * the list is copied/rebuilt every time an option is registered or
> unregistered
> 
> and
> 
> * tcp_extra_options_prepare returns a pointer to the version of the list for
> tcp_extra_options_write to use
> 
> But then it might as well be an array. The list should be short enough and
> rarely modified, so it wouldn't be a lot of overhead.
> 
> > If we move the extra-options to a mode where the list is on a per-TCP socket
> > basis, we can avoid handling this. Because we can limit adding/removing
> > TCP-options to happen only when the socket is in TCP_CLOSE. So, once the
> > connection started, the list will always remain the same.
> 
> I was thinking a global list might still be useful to avoid complicating
> request_sock, but now realize that incoming connections would use the extra
> options list from the listening socket.

I actually think we should copy the option-list over to the request-sock
when it gets created. For MD5 it's not absolutely necessary (the keys are
anyways already stored in the listener for each peer-IP address).

However, for MPTCP it would be useful, because MPTCP requires state per
request-sock. If the list is in the request-sock, we can store the state
in there (thus avoiding the allocation of a struct mptcp_request_sock).

> > > +unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
> > > +				       unsigned int remaining,
> > > +				       struct tcp_out_options *opts,
> > > +				       const struct sock *sk)
> > > +{
> > > +	struct tcp_extra_option_ops *entry;
> > > +	unsigned int used = 0;
> > > +
> > > +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> > > +		if (unlikely(!entry->prepare))
> > > +			continue;
> > > +
> > > +		used += entry->prepare(skb, flags, remaining - used, opts, sk);
> > > +	}
> > > +
> > > +	return roundup(used, 4);
> > > +}
> > > +EXPORT_SYMBOL_GPL(tcp_extra_options_prepare);
> > > +
> > > +/* The RCU read lock must be held before calling, and should span both
> > > + * the call to tcp_extra_options_prepare and this function to ensure that
> > > + * tcp_option_list does not change between the two calls.
> > > + */
> > > +void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
> > > +			     const struct sock *sk)
> > > +{
> > > +	struct tcp_extra_option_ops *entry;
> > > +
> > > +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> > > +		if (unlikely(!entry->write))
> > > +			continue;
> > > +
> > > +		entry->write(ptr, opts, sk);
> > > +	}
> > > +}
> > > +EXPORT_SYMBOL_GPL(tcp_extra_options_write);
> > > +
> > > +int tcp_register_extra_option(struct tcp_extra_option_ops *ops)
> > > +{
> > > +	struct tcp_extra_option_ops *entry;
> > > +	struct list_head* add_before = &tcp_option_list;
> > > +	int ret = 0;
> > > +
> > > +	if (!ops->option_kind)
> > > +		return -EINVAL;
> > > +
> > > +	if (!try_module_get(ops->owner))
> > > +		return -ENOENT;
> > > +
> > > +	spin_lock(&tcp_option_list_lock);
> > > +
> > > +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
> > > +		if (entry->option_kind == ops->option_kind) {
> > > +			pr_notice("Option kind %u already registered\n",
> > > +				  ops->option_kind);
> > > +			spin_unlock(&tcp_option_list_lock);
> > > +			module_put(ops->owner);
> > > +			return -EEXIST;
> > > +		}
> > > +
> > > +		if (entry->priority <= ops->priority)
> > > +			add_before = &entry->list;
> > > +	}
> > > +
> > > +	list_add_tail_rcu(&ops->list, add_before);
> > > +	pr_debug("Option kind %u registered\n", ops->option_kind);
> > > +
> > > +	spin_unlock(&tcp_option_list_lock);
> > > +
> > > +	static_branch_inc(&tcp_extra_options_enabled);
> > > +
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL_GPL(tcp_register_extra_option);
> > > +
> > > +void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops)
> > > +{
> > > +	spin_lock(&tcp_option_list_lock);
> > > +	list_del_rcu(&ops->list);
> > > +	spin_unlock(&tcp_option_list_lock);
> > > +
> > > +	synchronize_net();
> > > +
> > > +	static_branch_dec(&tcp_extra_options_enabled);
> > > +
> > > +	module_put(ops->owner);
> > > +}
> > > +EXPORT_SYMBOL_GPL(tcp_unregister_extra_option);
> > > +
> > >  void tcp_done(struct sock *sk)
> > >  {
> > >  	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
> > > @@ -3521,6 +3653,7 @@ void __init tcp_init(void)
> > >  		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
> > >  	}
> > > 
> > > +	INIT_LIST_HEAD(&tcp_option_list);
> > > 
> > >  	cnt = tcp_hashinfo.ehash_mask + 1;
> > >  	sysctl_tcp_max_orphans = cnt / 2;
> > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > index c5d7656beeee..faf3c8d34cec 100644
> > > --- a/net/ipv4/tcp_input.c
> > > +++ b/net/ipv4/tcp_input.c
> > > @@ -3728,7 +3728,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
> > >  void tcp_parse_options(const struct net *net,
> > >  		       const struct sk_buff *skb,
> > >  		       struct tcp_options_received *opt_rx, int estab,
> > > -		       struct tcp_fastopen_cookie *foc)
> > > +		       struct tcp_fastopen_cookie *foc, struct tcp_sock *tp)
> > >  {
> > >  	const unsigned char *ptr;
> > >  	const struct tcphdr *th = tcp_hdr(skb);
> > > @@ -3830,6 +3830,12 @@ void tcp_parse_options(const struct net *net,
> > >  						ptr + 2, th->syn, foc, true);
> > >  				break;
> > > 
> > > +			default:
> > > +				tcp_extra_options_parse(opcode, opsize, ptr,
> > > +							skb, opt_rx,
> > > +							tcp_to_sk(tp));
> > > +				break;
> > > +
> > >  			}
> > >  			ptr += opsize-2;
> > >  			length -= opsize;
> > > @@ -3876,7 +3882,7 @@ static bool tcp_fast_parse_options(const struct net *net,
> > >  			return true;
> > >  	}
> > > 
> > > -	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
> > > +	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL, tp);
> > >  	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
> > >  		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
> > > 
> > > @@ -5569,7 +5575,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
> > >  		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
> > >  		tcp_clear_options(&opt);
> > >  		opt.user_mss = opt.mss_clamp = 0;
> > > -		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
> > > +		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL, tp);
> > >  		mss = opt.mss_clamp;
> > >  	}
> > > 
> > > @@ -5623,7 +5629,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
> > >  	int saved_clamp = tp->rx_opt.mss_clamp;
> > >  	bool fastopen_fail;
> > > 
> > > -	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
> > > +	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc, tp);
> > >  	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
> > >  		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
> > > 
> > > @@ -6299,7 +6305,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
> > >  	tmp_opt.mss_clamp = af_ops->mss_clamp;
> > >  	tmp_opt.user_mss  = tp->rx_opt.user_mss;
> > >  	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
> > > -			  want_cookie ? NULL : &foc);
> > > +			  want_cookie ? NULL : &foc, tp);
> > > 
> > >  	if (want_cookie && !tmp_opt.saw_tstamp)
> > >  		tcp_clear_options(&tmp_opt);
> > > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> > > index d9416b5162bc..537734e70317 100644
> > > --- a/net/ipv4/tcp_ipv4.c
> > > +++ b/net/ipv4/tcp_ipv4.c
> > > @@ -598,9 +598,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
> > >  	const struct tcphdr *th = tcp_hdr(skb);
> > >  	struct {
> > >  		struct tcphdr th;
> > > -#ifdef CONFIG_TCP_MD5SIG
> > > -		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
> > > -#endif
> > > +		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
> > >  	} rep;
> > >  	struct ip_reply_arg arg;
> > >  #ifdef CONFIG_TCP_MD5SIG
> > > @@ -611,6 +609,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
> > >  	struct sock *sk1 = NULL;
> > >  #endif
> > >  	struct net *net;
> > > +	int offset = 0;
> > > 
> > >  	/* Never send a reset in response to a reset. */
> > >  	if (th->rst)
> > > @@ -676,17 +675,44 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
> > >  			goto out;
> > > 
> > >  	}
> > > +#endif
> > > +
> > > +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
> > > +		unsigned int remaining;
> > > +		unsigned int used;
> > > +		struct tcp_out_options opts;
> > > +
> > > +		remaining = sizeof(rep.opt);
> > > +#ifdef CONFIG_TCP_MD5SIG
> > > +		if (key)
> > > +			remaining -= TCPOLEN_MD5SIG_ALIGNED;
> > > +#endif
> > 
> > We will break TCP_MD5 here with this patch if we move it inside the static
> > branch.
> > Only after the patch that makes TCP_MD5 adopt the framework we can move this
> > code to here.
> 
> The MD5 option is handled outside the static branch (below), this bit of
> conditional code adjusts the space available to the extra options. The TCP
> options are not part of the MD5 digest, so it should be ok to write the
> extra options early.

Ok, I see. Now I understand. Yes, it's fine then :)


Christoph

> 
> > > +
> > > +		memset(&opts, 0, sizeof(opts));
> > > +
> > > +		rcu_read_lock();
> > > +		used = tcp_extra_options_prepare(NULL, TCPHDR_RST, remaining,
> > > +						 &opts, sk);
> > > +
> > > +		tcp_extra_options_write(&rep.opt[0], &opts, sk);
> > > +		rcu_read_unlock();
> > > +
> > > +		arg.iov[0].iov_len += used;
> > > +		offset += used / 4;
> 
> If any extra options were written, the MD5 option offset is adjusted here.
> 
> > > +		rep.th.doff = arg.iov[0].iov_len / 4;
> > > +	}
> 
> (End of static branch)
> 
> > > +#ifdef CONFIG_TCP_MD5SIG
> > >  	if (key) {
> > > -		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
> > > -				   (TCPOPT_NOP << 16) |
> > > -				   (TCPOPT_MD5SIG << 8) |
> > > -				   TCPOLEN_MD5SIG);
> > > +		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
> 
> The MD5 option will be written at the appropriate offset with or without the
> static branch above. Does that option need to be first? That could be
> accommodated.
> 
> 
> Mat
> 
> 
> > > +					  (TCPOPT_NOP << 16) |
> > > +					  (TCPOPT_MD5SIG << 8) |
> > > +					  TCPOLEN_MD5SIG);
> > >  		/* Update length and the length the header thinks exists */
> > >  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
> > >  		rep.th.doff = arg.iov[0].iov_len / 4;
> > > 
> > > -		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
> > > +		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
> > >  				     key, ip_hdr(skb)->saddr,
> > >  				     ip_hdr(skb)->daddr, &rep.th);
> > >  	}
> > > @@ -738,14 +764,11 @@ static void tcp_v4_send_ack(const struct sock *sk,
> > >  	const struct tcphdr *th = tcp_hdr(skb);
> > >  	struct {
> > >  		struct tcphdr th;
> > > -		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
> > > -#ifdef CONFIG_TCP_MD5SIG
> > > -			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
> > > -#endif
> > > -			];
> > > +		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
> > >  	} rep;
> > >  	struct net *net = sock_net(sk);
> > >  	struct ip_reply_arg arg;
> > > +	int offset = 0;
> > > 
> > >  	memset(&rep.th, 0, sizeof(struct tcphdr));
> > >  	memset(&arg, 0, sizeof(arg));
> > > @@ -759,33 +782,56 @@ static void tcp_v4_send_ack(const struct sock *sk,
> > >  		rep.opt[1] = htonl(tsval);
> > >  		rep.opt[2] = htonl(tsecr);
> > >  		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
> > > +		offset += 3;
> > >  	}
> > > 
> > >  	/* Swap the send and the receive. */
> > >  	rep.th.dest    = th->source;
> > >  	rep.th.source  = th->dest;
> > > -	rep.th.doff    = arg.iov[0].iov_len / 4;
> > >  	rep.th.seq     = htonl(seq);
> > >  	rep.th.ack_seq = htonl(ack);
> > >  	rep.th.ack     = 1;
> > >  	rep.th.window  = htons(win);
> > > 
> > > +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
> > > +		unsigned int remaining;
> > > +		unsigned int used;
> > > +		struct tcp_out_options opts;
> > > +
> > > +		remaining = sizeof(rep.th) + sizeof(rep.opt) - arg.iov[0].iov_len;
> > >  #ifdef CONFIG_TCP_MD5SIG
> > > -	if (key) {
> > > -		int offset = (tsecr) ? 3 : 0;
> > > +		if (key)
> > > +			remaining -= TCPOLEN_MD5SIG_ALIGNED;
> > > +#endif
> > > 
> > > +		memset(&opts, 0, sizeof(opts));
> > > +		rcu_read_lock();
> > > +		used = tcp_extra_options_prepare(NULL, TCPHDR_ACK, remaining,
> > > +						 &opts, sk);
> > > +
> > > +		tcp_extra_options_write(&rep.opt[offset], &opts, sk);
> > > +		rcu_read_unlock();
> > > +
> > > +		arg.iov[0].iov_len += used;
> > > +		offset += used / 4;
> > > +	}
> > > +
> > > +	rep.th.doff = arg.iov[0].iov_len / 4;
> > > +
> > > +#ifdef CONFIG_TCP_MD5SIG
> > > +	if (key) {
> > >  		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
> > >  					  (TCPOPT_NOP << 16) |
> > >  					  (TCPOPT_MD5SIG << 8) |
> > >  					  TCPOLEN_MD5SIG);
> > >  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
> > > -		rep.th.doff = arg.iov[0].iov_len/4;
> > > 
> > >  		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
> > >  				    key, ip_hdr(skb)->saddr,
> > >  				    ip_hdr(skb)->daddr, &rep.th);
> > >  	}
> > >  #endif
> > > +
> > >  	arg.flags = reply_flags;
> > >  	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
> > >  				      ip_hdr(skb)->saddr, /* XXX */
> > > diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> > > index 188a6f31356d..1c3e91899dac 100644
> > > --- a/net/ipv4/tcp_minisocks.c
> > > +++ b/net/ipv4/tcp_minisocks.c
> > > @@ -98,7 +98,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
> > > 
> > >  	tmp_opt.saw_tstamp = 0;
> > >  	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
> > > -		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
> > > +		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL, NULL);
> > > 
> > >  		if (tmp_opt.saw_tstamp) {
> > >  			if (tmp_opt.rcv_tsecr)
> > > @@ -565,7 +565,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
> > > 
> > >  	tmp_opt.saw_tstamp = 0;
> > >  	if (th->doff > (sizeof(struct tcphdr)>>2)) {
> > > -		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
> > > +		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL, NULL);
> > > 
> > >  		if (tmp_opt.saw_tstamp) {
> > >  			tmp_opt.ts_recent = req->ts_recent;
> > > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> > > index 0bc9e46a5369..61eba3d0ae17 100644
> > > --- a/net/ipv4/tcp_output.c
> > > +++ b/net/ipv4/tcp_output.c
> > > @@ -41,6 +41,7 @@
> > >  #include <linux/compiler.h>
> > >  #include <linux/gfp.h>
> > >  #include <linux/module.h>
> > > +#include <linux/static_key.h>
> > > 
> > >  /* People can turn this off for buggy TCP's found in printers etc. */
> > >  int sysctl_tcp_retrans_collapse __read_mostly = 1;
> > > @@ -413,23 +414,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
> > >  	return tp->snd_una != tp->snd_up;
> > >  }
> > > 
> > > -#define OPTION_SACK_ADVERTISE	(1 << 0)
> > > -#define OPTION_TS		(1 << 1)
> > > -#define OPTION_MD5		(1 << 2)
> > > -#define OPTION_WSCALE		(1 << 3)
> > > -#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
> > > -
> > > -struct tcp_out_options {
> > > -	u16 options;		/* bit field of OPTION_* */
> > > -	u16 mss;		/* 0 to disable */
> > > -	u8 ws;			/* window scale, 0 to disable */
> > > -	u8 num_sack_blocks;	/* number of SACK blocks to include */
> > > -	u8 hash_size;		/* bytes in hash_location */
> > > -	__u8 *hash_location;	/* temporary pointer, overloaded */
> > > -	__u32 tsval, tsecr;	/* need to include OPTION_TS */
> > > -	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
> > > -};
> > > -
> > >  /* Write previously computed TCP options to the packet.
> > >   *
> > >   * Beware: Something in the Internet is very sensitive to the ordering of
> > > @@ -536,6 +520,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
> > >  		}
> > >  		ptr += (len + 3) >> 2;
> > >  	}
> > > +
> > > +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> > > +		tcp_extra_options_write(ptr, opts, tcp_to_sk(tp));
> > >  }
> > > 
> > >  /* Compute TCP options for SYN packets. This is not the final
> > > @@ -603,6 +590,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
> > >  		}
> > >  	}
> > > 
> > > +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> > > +		remaining -= tcp_extra_options_prepare(skb, TCPHDR_SYN,
> > > +						       remaining, opts,
> > > +						       tcp_to_sk(tp));
> > > +
> > >  	return MAX_TCP_OPTION_SPACE - remaining;
> > >  }
> > > 
> > > @@ -663,6 +655,12 @@ static unsigned int tcp_synack_options(struct request_sock *req,
> > >  		}
> > >  	}
> > > 
> > > +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> > > +		remaining -= tcp_extra_options_prepare(skb,
> > > +						       TCPHDR_SYN | TCPHDR_ACK,
> > > +						       remaining, opts,
> > > +						       req_to_sk(req));
> > > +
> > >  	return MAX_TCP_OPTION_SPACE - remaining;
> > >  }
> > > 
> > > @@ -696,6 +694,11 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
> > >  		size += TCPOLEN_TSTAMP_ALIGNED;
> > >  	}
> > > 
> > > +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> > > +		size += tcp_extra_options_prepare(skb, 0,
> > > +						  MAX_TCP_OPTION_SPACE - size,
> > > +						  opts, tcp_to_sk(tp));
> > > +
> > >  	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
> > >  	if (unlikely(eff_sacks)) {
> > >  		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
> > > @@ -1016,6 +1019,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
> > >  	tcb = TCP_SKB_CB(skb);
> > >  	memset(&opts, 0, sizeof(opts));
> > > 
> > > +	rcu_read_lock();
> > >  	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
> > >  		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
> > >  	else
> > > @@ -1092,6 +1096,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
> > >  					       md5, sk, skb);
> > >  	}
> > >  #endif
> > > +	rcu_read_unlock();
> > > 
> > >  	icsk->icsk_af_ops->send_check(sk, skb);
> > > 
> > > @@ -3156,8 +3161,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
> > >  #endif
> > >  		skb->skb_mstamp = tcp_clock_us();
> > > 
> > > -#ifdef CONFIG_TCP_MD5SIG
> > >  	rcu_read_lock();
> > > +#ifdef CONFIG_TCP_MD5SIG
> > >  	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
> > >  #endif
> > >  	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
> > > @@ -3196,8 +3201,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
> > >  	if (md5)
> > >  		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
> > >  					       md5, req_to_sk(req), skb);
> > > -	rcu_read_unlock();
> > >  #endif
> > > +	rcu_read_unlock();
> > > 
> > >  	/* Do not fool tcpdump (if any), clean our debris */
> > >  	skb->tstamp = 0;
> > > diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
> > > index 4e7817abc0b9..407480366c73 100644
> > > --- a/net/ipv6/syncookies.c
> > > +++ b/net/ipv6/syncookies.c
> > > @@ -162,7 +162,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
> > > 
> > >  	/* check for timestamp cookie support */
> > >  	memset(&tcp_opt, 0, sizeof(tcp_opt));
> > > -	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
> > > +	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
> > > 
> > >  	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
> > >  		tsoff = secure_tcpv6_ts_off(sock_net(sk),
> > > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> > > index 64d94afa427f..4a3fba1ef3a2 100644
> > > --- a/net/ipv6/tcp_ipv6.c
> > > +++ b/net/ipv6/tcp_ipv6.c
> > > @@ -784,9 +784,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
> > >  	struct flowi6 fl6;
> > >  	struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
> > >  	struct sock *ctl_sk = net->ipv6.tcp_sk;
> > > -	unsigned int tot_len = sizeof(struct tcphdr);
> > > +	unsigned int tot_len = 0;
> > >  	struct dst_entry *dst;
> > >  	__be32 *topt;
> > > +	struct tcp_out_options extraopts;
> > > 
> > >  	if (tsecr)
> > >  		tot_len += TCPOLEN_TSTAMP_ALIGNED;
> > > @@ -795,10 +796,28 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
> > >  		tot_len += TCPOLEN_MD5SIG_ALIGNED;
> > >  #endif
> > > 
> > > +	rcu_read_lock();
> > > +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
> > > +		unsigned int remaining = MAX_TCP_OPTION_SPACE - tot_len;
> > > +		u8 extraflags = rst ? TCPHDR_RST : 0;
> > > +
> > > +		if (!rst || !th->ack)
> > > +			extraflags |= TCPHDR_ACK;
> > > +
> > > +		memset(&extraopts, 0, sizeof(extraopts));
> > > +
> > > +		tot_len += tcp_extra_options_prepare(skb, extraflags, remaining,
> > > +						     &extraopts, sk);
> > > +	}
> > > +
> > > +	tot_len += sizeof(struct tcphdr);
> > > +
> > >  	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
> > >  			 GFP_ATOMIC);
> > > -	if (!buff)
> > > +	if (!buff) {
> > > +		rcu_read_unlock();
> > >  		return;
> > > +	}
> > > 
> > >  	skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
> > > 
> > > @@ -835,6 +854,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
> > >  	}
> > >  #endif
> > > 
> > > +	if (static_branch_unlikely(&tcp_extra_options_enabled))
> > > +		tcp_extra_options_write(topt, &extraopts, sk);
> > > +
> > > +	rcu_read_unlock();
> > > +
> > >  	memset(&fl6, 0, sizeof(fl6));
> > >  	fl6.daddr = ipv6_hdr(skb)->saddr;
> > >  	fl6.saddr = ipv6_hdr(skb)->daddr;
> > > --
> > > 2.14.2
> > > 
> > > _______________________________________________
> > > mptcp mailing list
> > > mptcp(a)lists.01.org
> > > https://lists.01.org/mailman/listinfo/mptcp
> > 
> 
> --
> Mat Martineau
> Intel OTC

^ permalink raw reply	[flat|nested] 5+ messages in thread
* Re: [MPTCP] [PATCH v2] tcp: Register handlers for extra TCP options
@ 2017-10-05 18:56 Mat Martineau
  0 siblings, 0 replies; 5+ messages in thread
From: Mat Martineau @ 2017-10-05 18:56 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 29473 bytes --]


Hi Christoph,

On Thu, 5 Oct 2017, Christoph Paasch wrote:

> Hello Mat,
>
> On 04/10/17 - 13:36:16, Mat Martineau wrote:
>> Allow additional TCP options to be handled by registered hook
>> functions.
>>
>> Registered options have a priority that determines the order in which
>> options are prepared and written. Lower priority numbers are handled
>> first.
>>
>> Option parsing will call the provided 'parse' function when a TCP option
>> number is not recognized by the normal option parsing code.
>>
>> The 'prepare' function determines the required space for registered
>> options and stores associated data. 'write' adds the option to the TCP
>> header.
>>
>> A static key and RCU synchronization are used to minimize the
>> performance impact of these extensible TCP features.
>>
>> Signed-off-by: Mat Martineau <mathew.j.martineau(a)linux.intel.com>
>> ---
>>
>> Changes from v1: One 'prepare' callback (no more special callback for
>> request_sock), and add a few missing callback sites (like ipv6).
>
> great, I like that we now have only one 'prepare' callback.
>
>>
>> drivers/infiniband/hw/cxgb4/cm.c |   2 +-
>>  include/linux/tcp.h              |  22 +++++++
>>  include/net/tcp.h                |  40 +++++++++++-
>>  net/ipv4/syncookies.c            |   2 +-
>>  net/ipv4/tcp.c                   | 133 +++++++++++++++++++++++++++++++++++++++
>>  net/ipv4/tcp_input.c             |  16 +++--
>>  net/ipv4/tcp_ipv4.c              |  80 ++++++++++++++++++-----
>>  net/ipv4/tcp_minisocks.c         |   4 +-
>>  net/ipv4/tcp_output.c            |  43 +++++++------
>>  net/ipv6/syncookies.c            |   2 +-
>>  net/ipv6/tcp_ipv6.c              |  28 ++++++++-
>>  11 files changed, 323 insertions(+), 49 deletions(-)
>>
>> diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
>> index daf7a56e5d7e..c3eb31611011 100644
>> --- a/drivers/infiniband/hw/cxgb4/cm.c
>> +++ b/drivers/infiniband/hw/cxgb4/cm.c
>> @@ -3752,7 +3752,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
>>  	 */
>>  	memset(&tmp_opt, 0, sizeof(tmp_opt));
>>  	tcp_clear_options(&tmp_opt);
>> -	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
>> +	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL, NULL);
>>
>>  	req = __skb_push(skb, sizeof(*req));
>>  	memset(req, 0, sizeof(*req));
>> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
>> index 4aa40ef02d32..0347e6ce99be 100644
>> --- a/include/linux/tcp.h
>> +++ b/include/linux/tcp.h
>> @@ -112,6 +112,23 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
>>  	rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
>>  }
>>
>> +#define OPTION_SACK_ADVERTISE	(1 << 0)
>> +#define OPTION_TS		(1 << 1)
>> +#define OPTION_MD5		(1 << 2)
>> +#define OPTION_WSCALE		(1 << 3)
>> +#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
>> +
>> +struct tcp_out_options {
>> +	u16 options;		/* bit field of OPTION_* */
>> +	u16 mss;		/* 0 to disable */
>> +	u8 ws;			/* window scale, 0 to disable */
>> +	u8 num_sack_blocks;	/* number of SACK blocks to include */
>> +	u8 hash_size;		/* bytes in hash_location */
>> +	__u8 *hash_location;	/* temporary pointer, overloaded */
>> +	__u32 tsval, tsecr;	/* need to include OPTION_TS */
>> +	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
>> +};
>> +
>>  /* This is the max number of SACKS that we'll generate and process. It's safe
>>   * to increase this, although since:
>>   *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
>> @@ -389,6 +406,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
>>  	return (struct tcp_sock *)sk;
>>  }
>>
>> +static inline struct sock *tcp_to_sk(const struct tcp_sock *tp)
>> +{
>> +	return (struct sock *)tp;
>> +}
>
> Nice little function :)
>
>> +
>>  struct tcp_timewait_sock {
>>  	struct inet_timewait_sock tw_sk;
>>  #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 3bc910a9bfc6..04f3dcecf592 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -433,7 +433,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
>>  		int flags, int *addr_len);
>>  void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
>>  		       struct tcp_options_received *opt_rx,
>> -		       int estab, struct tcp_fastopen_cookie *foc);
>> +		       int estab, struct tcp_fastopen_cookie *foc,
>> +		       struct tcp_sock *tp);
>>  const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
>>
>>  /*
>> @@ -2109,4 +2110,41 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
>>  {
>>  	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
>>  }
>> +
>> +extern struct static_key_false tcp_extra_options_enabled;
>> +
>> +struct tcp_extra_option_ops {
>> +	struct list_head	list;
>> +	unsigned char		option_kind;
>> +	unsigned char		priority;
>> +	void (*parse)(int opsize, const unsigned char *opptr,
>> +		      const struct sk_buff *skb,
>> +		      struct tcp_options_received *opt_rx,
>> +		      struct sock *sk);
>> +	/* Return the number of bytes consumed */
>> +	unsigned int (*prepare)(struct sk_buff *skb, u8 flags,
>> +				unsigned int remaining,
>> +				struct tcp_out_options *opts,
>> +				const struct sock *sk);
>> +	void (*write)(__be32 *ptr, struct tcp_out_options *opts,
>> +		      const struct sock *sk);
>> +	struct module		*owner;
>> +};
>> +
>> +void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
>> +			     const struct sk_buff *skb,
>> +			     struct tcp_options_received *opt_rx,
>> +			     struct sock *sk);
>> +
>> +unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
>> +				       unsigned int remaining,
>> +				       struct tcp_out_options *opts,
>> +				       const struct sock *sk);
>> +
>> +void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
>> +			     const struct sock *sk);
>> +
>> +int tcp_register_extra_option(struct tcp_extra_option_ops *ops);
>> +void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops);
>> +
>>  #endif	/* _TCP_H */
>> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
>> index b1bb1b3a1082..6c8d750a2243 100644
>> --- a/net/ipv4/syncookies.c
>> +++ b/net/ipv4/syncookies.c
>> @@ -313,7 +313,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
>>
>>  	/* check for timestamp cookie support */
>>  	memset(&tcp_opt, 0, sizeof(tcp_opt));
>> -	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
>> +	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
>>
>>  	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
>>  		tsoff = secure_tcp_ts_off(sock_net(sk),
>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>> index 5091402720ab..8136857b992b 100644
>> --- a/net/ipv4/tcp.c
>> +++ b/net/ipv4/tcp.c
>> @@ -270,6 +270,7 @@
>>  #include <linux/time.h>
>>  #include <linux/slab.h>
>>  #include <linux/errqueue.h>
>> +#include <linux/static_key.h>
>>
>>  #include <net/icmp.h>
>>  #include <net/inet_common.h>
>> @@ -306,6 +307,13 @@ EXPORT_SYMBOL(tcp_memory_allocated);
>>  struct percpu_counter tcp_sockets_allocated;
>>  EXPORT_SYMBOL(tcp_sockets_allocated);
>>
>> +/*
>> + * Optional TCP option handlers
>> + */
>> +static DEFINE_SPINLOCK(tcp_option_list_lock);
>> +static LIST_HEAD(tcp_option_list);
>> +DEFINE_STATIC_KEY_FALSE(tcp_extra_options_enabled);
>> +
>>  /*
>>   * TCP splice context
>>   */
>> @@ -3375,6 +3383,130 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
>>
>>  #endif
>>
>> +/* Linear search, few entries are expected. The RCU read lock must
>> + * be held before calling.
>> + */
>> +static struct tcp_extra_option_ops *tcp_extra_options_find_kind(unsigned char kind)
>> +{
>> +	struct tcp_extra_option_ops *entry;
>> +
>> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
>> +		if (entry->option_kind == kind)
>> +			return entry;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
>> +			     const struct sk_buff *skb,
>> +			     struct tcp_options_received *opt_rx,
>> +			     struct sock *sk)
>> +{
>> +	struct tcp_extra_option_ops *entry;
>> +
>> +	rcu_read_lock();
>> +	entry = tcp_extra_options_find_kind(opcode);
>> +	if (entry && entry->parse)
>> +		entry->parse(opsize, opptr, skb, opt_rx, sk);
>> +	rcu_read_unlock();
>> +}
>> +EXPORT_SYMBOL_GPL(tcp_extra_options_parse);
>> +
>> +/* The RCU read lock must be held before calling, and should span both
>> + * the call to this function and tcp_extra_options_write to ensure that
>> + * tcp_option_list does not change between the two calls. To preserve
>> + * expected option alignment, always returns a multiple of 4 bytes.
>> + */
>
> The RCU read lock won't be able to protect the list from being altered. All
> it will take care of is that the elements on the list won't get free'd while
> traversing it and that the next-pointer won't get changed. That way the
> list-traversal is still fine.

Thanks for the description. If we switch to per-socket lists then RCU 
won't be necessary, but I think it's possible to coax RCU into the 
behavior I want if:

* the list is copied/rebuilt every time an option is registered or 
unregistered

and

* tcp_extra_options_prepare returns a pointer to the version of the list 
for tcp_extra_options_write to use

But then it might as well be an array. The list should be short enough and 
rarely modified, so it wouldn't be a lot of overhead.

> If we move the extra-options to a mode where the list is on a per-TCP socket
> basis, we can avoid handling this. Because we can limit adding/removing
> TCP-options to happen only when the socket is in TCP_CLOSE. So, once the
> connection started, the list will always remain the same.

I was thinking a global list might still be useful to avoid complicating 
request_sock, but I now realize that incoming connections would use the 
extra options list from the listening socket.

>> +unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
>> +				       unsigned int remaining,
>> +				       struct tcp_out_options *opts,
>> +				       const struct sock *sk)
>> +{
>> +	struct tcp_extra_option_ops *entry;
>> +	unsigned int used = 0;
>> +
>> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
>> +		if (unlikely(!entry->prepare))
>> +			continue;
>> +
>> +		used += entry->prepare(skb, flags, remaining - used, opts, sk);
>> +	}
>> +
>> +	return roundup(used, 4);
>> +}
>> +EXPORT_SYMBOL_GPL(tcp_extra_options_prepare);
>> +
>> +/* The RCU read lock must be held before calling, and should span both
>> + * the call to tcp_extra_options_prepare and this function to ensure that
>> + * tcp_option_list does not change between the two calls.
>> + */
>> +void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
>> +			     const struct sock *sk)
>> +{
>> +	struct tcp_extra_option_ops *entry;
>> +
>> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
>> +		if (unlikely(!entry->write))
>> +			continue;
>> +
>> +		entry->write(ptr, opts, sk);
>> +	}
>> +}
>> +EXPORT_SYMBOL_GPL(tcp_extra_options_write);
>> +
>> +int tcp_register_extra_option(struct tcp_extra_option_ops *ops)
>> +{
>> +	struct tcp_extra_option_ops *entry;
>> +	struct list_head* add_before = &tcp_option_list;
>> +	int ret = 0;
>> +
>> +	if (!ops->option_kind)
>> +		return -EINVAL;
>> +
>> +	if (!try_module_get(ops->owner))
>> +		return -ENOENT;
>> +
>> +	spin_lock(&tcp_option_list_lock);
>> +
>> +	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
>> +		if (entry->option_kind == ops->option_kind) {
>> +			pr_notice("Option kind %u already registered\n",
>> +				  ops->option_kind);
>> +			spin_unlock(&tcp_option_list_lock);
>> +			module_put(ops->owner);
>> +			return -EEXIST;
>> +		}
>> +
>> +		if (entry->priority <= ops->priority)
>> +			add_before = &entry->list;
>> +	}
>> +
>> +	list_add_tail_rcu(&ops->list, add_before);
>> +	pr_debug("Option kind %u registered\n", ops->option_kind);
>> +
>> +	spin_unlock(&tcp_option_list_lock);
>> +
>> +	static_branch_inc(&tcp_extra_options_enabled);
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(tcp_register_extra_option);
>> +
>> +void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops)
>> +{
>> +	spin_lock(&tcp_option_list_lock);
>> +	list_del_rcu(&ops->list);
>> +	spin_unlock(&tcp_option_list_lock);
>> +
>> +	synchronize_net();
>> +
>> +	static_branch_dec(&tcp_extra_options_enabled);
>> +
>> +	module_put(ops->owner);
>> +}
>> +EXPORT_SYMBOL_GPL(tcp_unregister_extra_option);
>> +
>>  void tcp_done(struct sock *sk)
>>  {
>>  	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
>> @@ -3521,6 +3653,7 @@ void __init tcp_init(void)
>>  		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
>>  	}
>>
>> +	INIT_LIST_HEAD(&tcp_option_list);
>>
>>  	cnt = tcp_hashinfo.ehash_mask + 1;
>>  	sysctl_tcp_max_orphans = cnt / 2;
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index c5d7656beeee..faf3c8d34cec 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -3728,7 +3728,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
>>  void tcp_parse_options(const struct net *net,
>>  		       const struct sk_buff *skb,
>>  		       struct tcp_options_received *opt_rx, int estab,
>> -		       struct tcp_fastopen_cookie *foc)
>> +		       struct tcp_fastopen_cookie *foc, struct tcp_sock *tp)
>>  {
>>  	const unsigned char *ptr;
>>  	const struct tcphdr *th = tcp_hdr(skb);
>> @@ -3830,6 +3830,12 @@ void tcp_parse_options(const struct net *net,
>>  						ptr + 2, th->syn, foc, true);
>>  				break;
>>
>> +			default:
>> +				tcp_extra_options_parse(opcode, opsize, ptr,
>> +							skb, opt_rx,
>> +							tcp_to_sk(tp));
>> +				break;
>> +
>>  			}
>>  			ptr += opsize-2;
>>  			length -= opsize;
>> @@ -3876,7 +3882,7 @@ static bool tcp_fast_parse_options(const struct net *net,
>>  			return true;
>>  	}
>>
>> -	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
>> +	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL, tp);
>>  	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
>>  		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
>>
>> @@ -5569,7 +5575,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
>>  		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
>>  		tcp_clear_options(&opt);
>>  		opt.user_mss = opt.mss_clamp = 0;
>> -		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
>> +		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL, tp);
>>  		mss = opt.mss_clamp;
>>  	}
>>
>> @@ -5623,7 +5629,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
>>  	int saved_clamp = tp->rx_opt.mss_clamp;
>>  	bool fastopen_fail;
>>
>> -	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
>> +	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc, tp);
>>  	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
>>  		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
>>
>> @@ -6299,7 +6305,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
>>  	tmp_opt.mss_clamp = af_ops->mss_clamp;
>>  	tmp_opt.user_mss  = tp->rx_opt.user_mss;
>>  	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
>> -			  want_cookie ? NULL : &foc);
>> +			  want_cookie ? NULL : &foc, tp);
>>
>>  	if (want_cookie && !tmp_opt.saw_tstamp)
>>  		tcp_clear_options(&tmp_opt);
>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>> index d9416b5162bc..537734e70317 100644
>> --- a/net/ipv4/tcp_ipv4.c
>> +++ b/net/ipv4/tcp_ipv4.c
>> @@ -598,9 +598,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>>  	const struct tcphdr *th = tcp_hdr(skb);
>>  	struct {
>>  		struct tcphdr th;
>> -#ifdef CONFIG_TCP_MD5SIG
>> -		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
>> -#endif
>> +		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
>>  	} rep;
>>  	struct ip_reply_arg arg;
>>  #ifdef CONFIG_TCP_MD5SIG
>> @@ -611,6 +609,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>>  	struct sock *sk1 = NULL;
>>  #endif
>>  	struct net *net;
>> +	int offset = 0;
>>
>>  	/* Never send a reset in response to a reset. */
>>  	if (th->rst)
>> @@ -676,17 +675,44 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
>>  			goto out;
>>
>>  	}
>> +#endif
>> +
>> +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
>> +		unsigned int remaining;
>> +		unsigned int used;
>> +		struct tcp_out_options opts;
>> +
>> +		remaining = sizeof(rep.opt);
>> +#ifdef CONFIG_TCP_MD5SIG
>> +		if (key)
>> +			remaining -= TCPOLEN_MD5SIG_ALIGNED;
>> +#endif
>
> We will break TCP_MD5 here with this patch if we move it inside the static
> branch.
> Only after the patch that makes TCP_MD5 adopt the framework we can move this
> code to here.

The MD5 option is handled outside the static branch (below); this bit of 
conditional code adjusts the space available to the extra options. The 
TCP options are not part of the MD5 digest, so it should be ok to write 
the extra options early.

>> +
>> +		memset(&opts, 0, sizeof(opts));
>> +
>> +		rcu_read_lock();
>> +		used = tcp_extra_options_prepare(NULL, TCPHDR_RST, remaining,
>> +						 &opts, sk);
>> +
>> +		tcp_extra_options_write(&rep.opt[0], &opts, sk);
>> +		rcu_read_unlock();
>> +
>> +		arg.iov[0].iov_len += used;
>> +		offset += used / 4;

If any extra options were written, the MD5 option offset is adjusted here.

>> +		rep.th.doff = arg.iov[0].iov_len / 4;
>> +	}

(End of static branch)

>> +#ifdef CONFIG_TCP_MD5SIG
>>  	if (key) {
>> -		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
>> -				   (TCPOPT_NOP << 16) |
>> -				   (TCPOPT_MD5SIG << 8) |
>> -				   TCPOLEN_MD5SIG);
>> +		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |

The MD5 option will be written at the appropriate offset with or without 
the static branch above. Does that option need to be first? That could be 
accommodated.


Mat


>> +					  (TCPOPT_NOP << 16) |
>> +					  (TCPOPT_MD5SIG << 8) |
>> +					  TCPOLEN_MD5SIG);
>>  		/* Update length and the length the header thinks exists */
>>  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
>>  		rep.th.doff = arg.iov[0].iov_len / 4;
>>
>> -		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
>> +		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
>>  				     key, ip_hdr(skb)->saddr,
>>  				     ip_hdr(skb)->daddr, &rep.th);
>>  	}
>> @@ -738,14 +764,11 @@ static void tcp_v4_send_ack(const struct sock *sk,
>>  	const struct tcphdr *th = tcp_hdr(skb);
>>  	struct {
>>  		struct tcphdr th;
>> -		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
>> -#ifdef CONFIG_TCP_MD5SIG
>> -			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
>> -#endif
>> -			];
>> +		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
>>  	} rep;
>>  	struct net *net = sock_net(sk);
>>  	struct ip_reply_arg arg;
>> +	int offset = 0;
>>
>>  	memset(&rep.th, 0, sizeof(struct tcphdr));
>>  	memset(&arg, 0, sizeof(arg));
>> @@ -759,33 +782,56 @@ static void tcp_v4_send_ack(const struct sock *sk,
>>  		rep.opt[1] = htonl(tsval);
>>  		rep.opt[2] = htonl(tsecr);
>>  		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
>> +		offset += 3;
>>  	}
>>
>>  	/* Swap the send and the receive. */
>>  	rep.th.dest    = th->source;
>>  	rep.th.source  = th->dest;
>> -	rep.th.doff    = arg.iov[0].iov_len / 4;
>>  	rep.th.seq     = htonl(seq);
>>  	rep.th.ack_seq = htonl(ack);
>>  	rep.th.ack     = 1;
>>  	rep.th.window  = htons(win);
>>
>> +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
>> +		unsigned int remaining;
>> +		unsigned int used;
>> +		struct tcp_out_options opts;
>> +
>> +		remaining = sizeof(rep.th) + sizeof(rep.opt) - arg.iov[0].iov_len;
>>  #ifdef CONFIG_TCP_MD5SIG
>> -	if (key) {
>> -		int offset = (tsecr) ? 3 : 0;
>> +		if (key)
>> +			remaining -= TCPOLEN_MD5SIG_ALIGNED;
>> +#endif
>>
>> +		memset(&opts, 0, sizeof(opts));
>> +		rcu_read_lock();
>> +		used = tcp_extra_options_prepare(NULL, TCPHDR_ACK, remaining,
>> +						 &opts, sk);
>> +
>> +		tcp_extra_options_write(&rep.opt[offset], &opts, sk);
>> +		rcu_read_unlock();
>> +
>> +		arg.iov[0].iov_len += used;
>> +		offset += used / 4;
>> +	}
>> +
>> +	rep.th.doff = arg.iov[0].iov_len / 4;
>> +
>> +#ifdef CONFIG_TCP_MD5SIG
>> +	if (key) {
>>  		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
>>  					  (TCPOPT_NOP << 16) |
>>  					  (TCPOPT_MD5SIG << 8) |
>>  					  TCPOLEN_MD5SIG);
>>  		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
>> -		rep.th.doff = arg.iov[0].iov_len/4;
>>
>>  		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
>>  				    key, ip_hdr(skb)->saddr,
>>  				    ip_hdr(skb)->daddr, &rep.th);
>>  	}
>>  #endif
>> +
>>  	arg.flags = reply_flags;
>>  	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
>>  				      ip_hdr(skb)->saddr, /* XXX */
>> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
>> index 188a6f31356d..1c3e91899dac 100644
>> --- a/net/ipv4/tcp_minisocks.c
>> +++ b/net/ipv4/tcp_minisocks.c
>> @@ -98,7 +98,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
>>
>>  	tmp_opt.saw_tstamp = 0;
>>  	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
>> -		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
>> +		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL, NULL);
>>
>>  		if (tmp_opt.saw_tstamp) {
>>  			if (tmp_opt.rcv_tsecr)
>> @@ -565,7 +565,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
>>
>>  	tmp_opt.saw_tstamp = 0;
>>  	if (th->doff > (sizeof(struct tcphdr)>>2)) {
>> -		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
>> +		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL, NULL);
>>
>>  		if (tmp_opt.saw_tstamp) {
>>  			tmp_opt.ts_recent = req->ts_recent;
>> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
>> index 0bc9e46a5369..61eba3d0ae17 100644
>> --- a/net/ipv4/tcp_output.c
>> +++ b/net/ipv4/tcp_output.c
>> @@ -41,6 +41,7 @@
>>  #include <linux/compiler.h>
>>  #include <linux/gfp.h>
>>  #include <linux/module.h>
>> +#include <linux/static_key.h>
>>
>>  /* People can turn this off for buggy TCP's found in printers etc. */
>>  int sysctl_tcp_retrans_collapse __read_mostly = 1;
>> @@ -413,23 +414,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
>>  	return tp->snd_una != tp->snd_up;
>>  }
>>
>> -#define OPTION_SACK_ADVERTISE	(1 << 0)
>> -#define OPTION_TS		(1 << 1)
>> -#define OPTION_MD5		(1 << 2)
>> -#define OPTION_WSCALE		(1 << 3)
>> -#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
>> -
>> -struct tcp_out_options {
>> -	u16 options;		/* bit field of OPTION_* */
>> -	u16 mss;		/* 0 to disable */
>> -	u8 ws;			/* window scale, 0 to disable */
>> -	u8 num_sack_blocks;	/* number of SACK blocks to include */
>> -	u8 hash_size;		/* bytes in hash_location */
>> -	__u8 *hash_location;	/* temporary pointer, overloaded */
>> -	__u32 tsval, tsecr;	/* need to include OPTION_TS */
>> -	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
>> -};
>> -
>>  /* Write previously computed TCP options to the packet.
>>   *
>>   * Beware: Something in the Internet is very sensitive to the ordering of
>> @@ -536,6 +520,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
>>  		}
>>  		ptr += (len + 3) >> 2;
>>  	}
>> +
>> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
>> +		tcp_extra_options_write(ptr, opts, tcp_to_sk(tp));
>>  }
>>
>>  /* Compute TCP options for SYN packets. This is not the final
>> @@ -603,6 +590,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
>>  		}
>>  	}
>>
>> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
>> +		remaining -= tcp_extra_options_prepare(skb, TCPHDR_SYN,
>> +						       remaining, opts,
>> +						       tcp_to_sk(tp));
>> +
>>  	return MAX_TCP_OPTION_SPACE - remaining;
>>  }
>>
>> @@ -663,6 +655,12 @@ static unsigned int tcp_synack_options(struct request_sock *req,
>>  		}
>>  	}
>>
>> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
>> +		remaining -= tcp_extra_options_prepare(skb,
>> +						       TCPHDR_SYN | TCPHDR_ACK,
>> +						       remaining, opts,
>> +						       req_to_sk(req));
>> +
>>  	return MAX_TCP_OPTION_SPACE - remaining;
>>  }
>>
>> @@ -696,6 +694,11 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
>>  		size += TCPOLEN_TSTAMP_ALIGNED;
>>  	}
>>
>> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
>> +		size += tcp_extra_options_prepare(skb, 0,
>> +						  MAX_TCP_OPTION_SPACE - size,
>> +						  opts, tcp_to_sk(tp));
>> +
>>  	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
>>  	if (unlikely(eff_sacks)) {
>>  		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
>> @@ -1016,6 +1019,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
>>  	tcb = TCP_SKB_CB(skb);
>>  	memset(&opts, 0, sizeof(opts));
>>
>> +	rcu_read_lock();
>>  	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
>>  		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
>>  	else
>> @@ -1092,6 +1096,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
>>  					       md5, sk, skb);
>>  	}
>>  #endif
>> +	rcu_read_unlock();
>>
>>  	icsk->icsk_af_ops->send_check(sk, skb);
>>
>> @@ -3156,8 +3161,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
>>  #endif
>>  		skb->skb_mstamp = tcp_clock_us();
>>
>> -#ifdef CONFIG_TCP_MD5SIG
>>  	rcu_read_lock();
>> +#ifdef CONFIG_TCP_MD5SIG
>>  	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
>>  #endif
>>  	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
>> @@ -3196,8 +3201,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
>>  	if (md5)
>>  		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
>>  					       md5, req_to_sk(req), skb);
>> -	rcu_read_unlock();
>>  #endif
>> +	rcu_read_unlock();
>>
>>  	/* Do not fool tcpdump (if any), clean our debris */
>>  	skb->tstamp = 0;
>> diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
>> index 4e7817abc0b9..407480366c73 100644
>> --- a/net/ipv6/syncookies.c
>> +++ b/net/ipv6/syncookies.c
>> @@ -162,7 +162,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
>>
>>  	/* check for timestamp cookie support */
>>  	memset(&tcp_opt, 0, sizeof(tcp_opt));
>> -	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
>> +	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
>>
>>  	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
>>  		tsoff = secure_tcpv6_ts_off(sock_net(sk),
>> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
>> index 64d94afa427f..4a3fba1ef3a2 100644
>> --- a/net/ipv6/tcp_ipv6.c
>> +++ b/net/ipv6/tcp_ipv6.c
>> @@ -784,9 +784,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>>  	struct flowi6 fl6;
>>  	struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
>>  	struct sock *ctl_sk = net->ipv6.tcp_sk;
>> -	unsigned int tot_len = sizeof(struct tcphdr);
>> +	unsigned int tot_len = 0;
>>  	struct dst_entry *dst;
>>  	__be32 *topt;
>> +	struct tcp_out_options extraopts;
>>
>>  	if (tsecr)
>>  		tot_len += TCPOLEN_TSTAMP_ALIGNED;
>> @@ -795,10 +796,28 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>>  		tot_len += TCPOLEN_MD5SIG_ALIGNED;
>>  #endif
>>
>> +	rcu_read_lock();
>> +	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
>> +		unsigned int remaining = MAX_TCP_OPTION_SPACE - tot_len;
>> +		u8 extraflags = rst ? TCPHDR_RST : 0;
>> +
>> +		if (!rst || !th->ack)
>> +			extraflags |= TCPHDR_ACK;
>> +
>> +		memset(&extraopts, 0, sizeof(extraopts));
>> +
>> +		tot_len += tcp_extra_options_prepare(skb, extraflags, remaining,
>> +						     &extraopts, sk);
>> +	}
>> +
>> +	tot_len += sizeof(struct tcphdr);
>> +
>>  	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
>>  			 GFP_ATOMIC);
>> -	if (!buff)
>> +	if (!buff) {
>> +		rcu_read_unlock();
>>  		return;
>> +	}
>>
>>  	skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
>>
>> @@ -835,6 +854,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
>>  	}
>>  #endif
>>
>> +	if (static_branch_unlikely(&tcp_extra_options_enabled))
>> +		tcp_extra_options_write(topt, &extraopts, sk);
>> +
>> +	rcu_read_unlock();
>> +
>>  	memset(&fl6, 0, sizeof(fl6));
>>  	fl6.daddr = ipv6_hdr(skb)->saddr;
>>  	fl6.saddr = ipv6_hdr(skb)->daddr;
>> --
>> 2.14.2
>>
>> _______________________________________________
>> mptcp mailing list
>> mptcp(a)lists.01.org
>> https://lists.01.org/mailman/listinfo/mptcp
>

--
Mat Martineau
Intel OTC

^ permalink raw reply	[flat|nested] 5+ messages in thread
* [MPTCP] [PATCH v2] tcp: Register handlers for extra TCP options
@ 2017-10-04 20:36 Mat Martineau
  0 siblings, 0 replies; 5+ messages in thread
From: Mat Martineau @ 2017-10-04 20:36 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 24927 bytes --]

Allow additional TCP options to be handled by registered hook
functions.

Registered options have a priority that determines the order in which
options are prepared and written. Lower priority numbers are handled
first.

Option parsing will call the provided 'parse' function when a TCP option
number is not recognized by the normal option parsing code.

The 'prepare' function determines the required space for registered
options and stores associated data. 'write' adds the option to the TCP
header.

A static key and RCU synchronization are used to minimize the
performance impact of these extensible TCP features.

Signed-off-by: Mat Martineau <mathew.j.martineau(a)linux.intel.com>
---

Changes from v1: One 'prepare' callback (no more special callback for
request_sock), and add a few missing callback sites (like ipv6).

drivers/infiniband/hw/cxgb4/cm.c |   2 +-
 include/linux/tcp.h              |  22 +++++++
 include/net/tcp.h                |  40 +++++++++++-
 net/ipv4/syncookies.c            |   2 +-
 net/ipv4/tcp.c                   | 133 +++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c             |  16 +++--
 net/ipv4/tcp_ipv4.c              |  80 ++++++++++++++++++-----
 net/ipv4/tcp_minisocks.c         |   4 +-
 net/ipv4/tcp_output.c            |  43 +++++++------
 net/ipv6/syncookies.c            |   2 +-
 net/ipv6/tcp_ipv6.c              |  28 ++++++++-
 11 files changed, 323 insertions(+), 49 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index daf7a56e5d7e..c3eb31611011 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -3752,7 +3752,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
 	 */
 	memset(&tmp_opt, 0, sizeof(tmp_opt));
 	tcp_clear_options(&tmp_opt);
-	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
+	tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL, NULL);
 
 	req = __skb_push(skb, sizeof(*req));
 	memset(req, 0, sizeof(*req));
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4aa40ef02d32..0347e6ce99be 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -112,6 +112,23 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 	rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
 }
 
+#define OPTION_SACK_ADVERTISE	(1 << 0)
+#define OPTION_TS		(1 << 1)
+#define OPTION_MD5		(1 << 2)
+#define OPTION_WSCALE		(1 << 3)
+#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
+
+struct tcp_out_options {
+	u16 options;		/* bit field of OPTION_* */
+	u16 mss;		/* 0 to disable */
+	u8 ws;			/* window scale, 0 to disable */
+	u8 num_sack_blocks;	/* number of SACK blocks to include */
+	u8 hash_size;		/* bytes in hash_location */
+	__u8 *hash_location;	/* temporary pointer, overloaded */
+	__u32 tsval, tsecr;	/* need to include OPTION_TS */
+	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
+};
+
 /* This is the max number of SACKS that we'll generate and process. It's safe
  * to increase this, although since:
  *   size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8)
@@ -389,6 +406,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 	return (struct tcp_sock *)sk;
 }
 
+static inline struct sock *tcp_to_sk(const struct tcp_sock *tp)
+{
+	return (struct sock *)tp;
+}
+
 struct tcp_timewait_sock {
 	struct inet_timewait_sock tw_sk;
 #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3bc910a9bfc6..04f3dcecf592 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -433,7 +433,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		int flags, int *addr_len);
 void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
 		       struct tcp_options_received *opt_rx,
-		       int estab, struct tcp_fastopen_cookie *foc);
+		       int estab, struct tcp_fastopen_cookie *foc,
+		       struct tcp_sock *tp);
 const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
 
 /*
@@ -2109,4 +2110,41 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 {
 	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
 }
+
+extern struct static_key_false tcp_extra_options_enabled;
+
+struct tcp_extra_option_ops {
+	struct list_head	list;
+	unsigned char		option_kind;
+	unsigned char		priority;
+	void (*parse)(int opsize, const unsigned char *opptr,
+		      const struct sk_buff *skb,
+		      struct tcp_options_received *opt_rx,
+		      struct sock *sk);
+	/* Return the number of bytes consumed */
+	unsigned int (*prepare)(struct sk_buff *skb, u8 flags,
+				unsigned int remaining,
+				struct tcp_out_options *opts,
+				const struct sock *sk);
+	void (*write)(__be32 *ptr, struct tcp_out_options *opts,
+		      const struct sock *sk);
+	struct module		*owner;
+};
+
+void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
+			     const struct sk_buff *skb,
+			     struct tcp_options_received *opt_rx,
+			     struct sock *sk);
+
+unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
+				       unsigned int remaining,
+				       struct tcp_out_options *opts,
+				       const struct sock *sk);
+
+void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
+			     const struct sock *sk);
+
+int tcp_register_extra_option(struct tcp_extra_option_ops *ops);
+void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops);
+
 #endif	/* _TCP_H */
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b1bb1b3a1082..6c8d750a2243 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -313,7 +313,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 
 	/* check for timestamp cookie support */
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
+	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
 
 	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
 		tsoff = secure_tcp_ts_off(sock_net(sk),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5091402720ab..8136857b992b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
 #include <linux/time.h>
 #include <linux/slab.h>
 #include <linux/errqueue.h>
+#include <linux/static_key.h>
 
 #include <net/icmp.h>
 #include <net/inet_common.h>
@@ -306,6 +307,13 @@ EXPORT_SYMBOL(tcp_memory_allocated);
 struct percpu_counter tcp_sockets_allocated;
 EXPORT_SYMBOL(tcp_sockets_allocated);
 
+/*
+ * Optional TCP option handlers
+ */
+static DEFINE_SPINLOCK(tcp_option_list_lock);
+static LIST_HEAD(tcp_option_list);
+DEFINE_STATIC_KEY_FALSE(tcp_extra_options_enabled);
+
 /*
  * TCP splice context
  */
@@ -3375,6 +3383,130 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
 
 #endif
 
+/* Linear search, few entries are expected. The RCU read lock must
+ * be held before calling.
+ */
+static struct tcp_extra_option_ops *tcp_extra_options_find_kind(unsigned char kind)
+{
+	struct tcp_extra_option_ops *entry;
+
+	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
+		if (entry->option_kind == kind)
+			return entry;
+	}
+
+	return NULL;
+}
+
+void tcp_extra_options_parse(int opcode, int opsize, const unsigned char *opptr,
+			     const struct sk_buff *skb,
+			     struct tcp_options_received *opt_rx,
+			     struct sock *sk)
+{
+	struct tcp_extra_option_ops *entry;
+
+	rcu_read_lock();
+	entry = tcp_extra_options_find_kind(opcode);
+	if (entry && entry->parse)
+		entry->parse(opsize, opptr, skb, opt_rx, sk);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(tcp_extra_options_parse);
+
+/* The RCU read lock must be held before calling, and should span both
+ * the call to this function and tcp_extra_options_write to ensure that
+ * tcp_option_list does not change between the two calls. To preserve
+ * expected option alignment, always returns a multiple of 4 bytes.
+ */
+unsigned int tcp_extra_options_prepare(struct sk_buff *skb, u8 flags,
+				       unsigned int remaining,
+				       struct tcp_out_options *opts,
+				       const struct sock *sk)
+{
+	struct tcp_extra_option_ops *entry;
+	unsigned int used = 0;
+
+	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
+		if (unlikely(!entry->prepare))
+			continue;
+
+		used += entry->prepare(skb, flags, remaining - used, opts, sk);
+	}
+
+	return roundup(used, 4);
+}
+EXPORT_SYMBOL_GPL(tcp_extra_options_prepare);
+
+/* The RCU read lock must be held before calling, and should span both
+ * the call to tcp_extra_options_prepare and this function to ensure that
+ * tcp_option_list does not change between the two calls.
+ */
+void tcp_extra_options_write(__be32 *ptr, struct tcp_out_options *opts,
+			     const struct sock *sk)
+{
+	struct tcp_extra_option_ops *entry;
+
+	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
+		if (unlikely(!entry->write))
+			continue;
+
+		entry->write(ptr, opts, sk);
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_extra_options_write);
+
+int tcp_register_extra_option(struct tcp_extra_option_ops *ops)
+{
+	struct tcp_extra_option_ops *entry;
+	struct list_head* add_before = &tcp_option_list;
+	int ret = 0;
+
+	if (!ops->option_kind)
+		return -EINVAL;
+
+	if (!try_module_get(ops->owner))
+		return -ENOENT;
+
+	spin_lock(&tcp_option_list_lock);
+
+	list_for_each_entry_rcu(entry, &tcp_option_list, list) {
+		if (entry->option_kind == ops->option_kind) {
+			pr_notice("Option kind %u already registered\n",
+				  ops->option_kind);
+			spin_unlock(&tcp_option_list_lock);
+			module_put(ops->owner);
+			return -EEXIST;
+		}
+
+		if (entry->priority <= ops->priority)
+			add_before = &entry->list;
+	}
+
+	list_add_tail_rcu(&ops->list, add_before);
+	pr_debug("Option kind %u registered\n", ops->option_kind);
+
+	spin_unlock(&tcp_option_list_lock);
+
+	static_branch_inc(&tcp_extra_options_enabled);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_extra_option);
+
+void tcp_unregister_extra_option(struct tcp_extra_option_ops *ops)
+{
+	spin_lock(&tcp_option_list_lock);
+	list_del_rcu(&ops->list);
+	spin_unlock(&tcp_option_list_lock);
+
+	synchronize_net();
+
+	static_branch_dec(&tcp_extra_options_enabled);
+
+	module_put(ops->owner);
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_extra_option);
+
 void tcp_done(struct sock *sk)
 {
 	struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
@@ -3521,6 +3653,7 @@ void __init tcp_init(void)
 		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
 	}
 
+	INIT_LIST_HEAD(&tcp_option_list);
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
 	sysctl_tcp_max_orphans = cnt / 2;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c5d7656beeee..faf3c8d34cec 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3728,7 +3728,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
 void tcp_parse_options(const struct net *net,
 		       const struct sk_buff *skb,
 		       struct tcp_options_received *opt_rx, int estab,
-		       struct tcp_fastopen_cookie *foc)
+		       struct tcp_fastopen_cookie *foc, struct tcp_sock *tp)
 {
 	const unsigned char *ptr;
 	const struct tcphdr *th = tcp_hdr(skb);
@@ -3830,6 +3830,12 @@ void tcp_parse_options(const struct net *net,
 						ptr + 2, th->syn, foc, true);
 				break;
 
+			default:
+				tcp_extra_options_parse(opcode, opsize, ptr,
+							skb, opt_rx,
+							tcp_to_sk(tp));
+				break;
+
 			}
 			ptr += opsize-2;
 			length -= opsize;
@@ -3876,7 +3882,7 @@ static bool tcp_fast_parse_options(const struct net *net,
 			return true;
 	}
 
-	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
+	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL, tp);
 	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
 		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
 
@@ -5569,7 +5575,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 		/* Get original SYNACK MSS value if user MSS sets mss_clamp */
 		tcp_clear_options(&opt);
 		opt.user_mss = opt.mss_clamp = 0;
-		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
+		tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL, tp);
 		mss = opt.mss_clamp;
 	}
 
@@ -5623,7 +5629,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 	int saved_clamp = tp->rx_opt.mss_clamp;
 	bool fastopen_fail;
 
-	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
+	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc, tp);
 	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
 		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
 
@@ -6299,7 +6305,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tmp_opt.mss_clamp = af_ops->mss_clamp;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
 	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
-			  want_cookie ? NULL : &foc);
+			  want_cookie ? NULL : &foc, tp);
 
 	if (want_cookie && !tmp_opt.saw_tstamp)
 		tcp_clear_options(&tmp_opt);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9416b5162bc..537734e70317 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -598,9 +598,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct {
 		struct tcphdr th;
-#ifdef CONFIG_TCP_MD5SIG
-		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
-#endif
+		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
 	} rep;
 	struct ip_reply_arg arg;
 #ifdef CONFIG_TCP_MD5SIG
@@ -611,6 +609,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	struct sock *sk1 = NULL;
 #endif
 	struct net *net;
+	int offset = 0;
 
 	/* Never send a reset in response to a reset. */
 	if (th->rst)
@@ -676,17 +675,44 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 			goto out;
 
 	}
+#endif
+
+	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
+		unsigned int remaining;
+		unsigned int used;
+		struct tcp_out_options opts;
+
+		remaining = sizeof(rep.opt);
+#ifdef CONFIG_TCP_MD5SIG
+		if (key)
+			remaining -= TCPOLEN_MD5SIG_ALIGNED;
+#endif
+
+		memset(&opts, 0, sizeof(opts));
+
+		rcu_read_lock();
+		used = tcp_extra_options_prepare(NULL, TCPHDR_RST, remaining,
+						 &opts, sk);
+
+		tcp_extra_options_write(&rep.opt[0], &opts, sk);
+		rcu_read_unlock();
+
+		arg.iov[0].iov_len += used;
+		offset += used / 4;
+		rep.th.doff = arg.iov[0].iov_len / 4;
+	}
 
+#ifdef CONFIG_TCP_MD5SIG
 	if (key) {
-		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
-				   (TCPOPT_NOP << 16) |
-				   (TCPOPT_MD5SIG << 8) |
-				   TCPOLEN_MD5SIG);
+		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
+					  (TCPOPT_NOP << 16) |
+					  (TCPOPT_MD5SIG << 8) |
+					  TCPOLEN_MD5SIG);
 		/* Update length and the length the header thinks exists */
 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 		rep.th.doff = arg.iov[0].iov_len / 4;
 
-		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
+		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 				     key, ip_hdr(skb)->saddr,
 				     ip_hdr(skb)->daddr, &rep.th);
 	}
@@ -738,14 +764,11 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct {
 		struct tcphdr th;
-		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
-#ifdef CONFIG_TCP_MD5SIG
-			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
-#endif
-			];
+		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
 	} rep;
 	struct net *net = sock_net(sk);
 	struct ip_reply_arg arg;
+	int offset = 0;
 
 	memset(&rep.th, 0, sizeof(struct tcphdr));
 	memset(&arg, 0, sizeof(arg));
@@ -759,33 +782,56 @@ static void tcp_v4_send_ack(const struct sock *sk,
 		rep.opt[1] = htonl(tsval);
 		rep.opt[2] = htonl(tsecr);
 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
+		offset += 3;
 	}
 
 	/* Swap the send and the receive. */
 	rep.th.dest    = th->source;
 	rep.th.source  = th->dest;
-	rep.th.doff    = arg.iov[0].iov_len / 4;
 	rep.th.seq     = htonl(seq);
 	rep.th.ack_seq = htonl(ack);
 	rep.th.ack     = 1;
 	rep.th.window  = htons(win);
 
+	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
+		unsigned int remaining;
+		unsigned int used;
+		struct tcp_out_options opts;
+
+		remaining = sizeof(rep.th) + sizeof(rep.opt) - arg.iov[0].iov_len;
 #ifdef CONFIG_TCP_MD5SIG
-	if (key) {
-		int offset = (tsecr) ? 3 : 0;
+		if (key)
+			remaining -= TCPOLEN_MD5SIG_ALIGNED;
+#endif
 
+		memset(&opts, 0, sizeof(opts));
+		rcu_read_lock();
+		used = tcp_extra_options_prepare(NULL, TCPHDR_ACK, remaining,
+						 &opts, sk);
+
+		tcp_extra_options_write(&rep.opt[offset], &opts, sk);
+		rcu_read_unlock();
+
+		arg.iov[0].iov_len += used;
+		offset += used / 4;
+	}
+
+	rep.th.doff = arg.iov[0].iov_len / 4;
+
+#ifdef CONFIG_TCP_MD5SIG
+	if (key) {
 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 					  (TCPOPT_NOP << 16) |
 					  (TCPOPT_MD5SIG << 8) |
 					  TCPOLEN_MD5SIG);
 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
-		rep.th.doff = arg.iov[0].iov_len/4;
 
 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 				    key, ip_hdr(skb)->saddr,
 				    ip_hdr(skb)->daddr, &rep.th);
 	}
 #endif
+
 	arg.flags = reply_flags;
 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 				      ip_hdr(skb)->saddr, /* XXX */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 188a6f31356d..1c3e91899dac 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -98,7 +98,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
+		tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			if (tmp_opt.rcv_tsecr)
@@ -565,7 +565,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
-		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
+		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL, NULL);
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0bc9e46a5369..61eba3d0ae17 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -41,6 +41,7 @@
 #include <linux/compiler.h>
 #include <linux/gfp.h>
 #include <linux/module.h>
+#include <linux/static_key.h>
 
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse __read_mostly = 1;
@@ -413,23 +414,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 	return tp->snd_una != tp->snd_up;
 }
 
-#define OPTION_SACK_ADVERTISE	(1 << 0)
-#define OPTION_TS		(1 << 1)
-#define OPTION_MD5		(1 << 2)
-#define OPTION_WSCALE		(1 << 3)
-#define OPTION_FAST_OPEN_COOKIE	(1 << 8)
-
-struct tcp_out_options {
-	u16 options;		/* bit field of OPTION_* */
-	u16 mss;		/* 0 to disable */
-	u8 ws;			/* window scale, 0 to disable */
-	u8 num_sack_blocks;	/* number of SACK blocks to include */
-	u8 hash_size;		/* bytes in hash_location */
-	__u8 *hash_location;	/* temporary pointer, overloaded */
-	__u32 tsval, tsecr;	/* need to include OPTION_TS */
-	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
-};
-
 /* Write previously computed TCP options to the packet.
  *
  * Beware: Something in the Internet is very sensitive to the ordering of
@@ -536,6 +520,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 		}
 		ptr += (len + 3) >> 2;
 	}
+
+	if (static_branch_unlikely(&tcp_extra_options_enabled))
+		tcp_extra_options_write(ptr, opts, tcp_to_sk(tp));
 }
 
 /* Compute TCP options for SYN packets. This is not the final
@@ -603,6 +590,11 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
+	if (static_branch_unlikely(&tcp_extra_options_enabled))
+		remaining -= tcp_extra_options_prepare(skb, TCPHDR_SYN,
+						       remaining, opts,
+						       tcp_to_sk(tp));
+
 	return MAX_TCP_OPTION_SPACE - remaining;
 }
 
@@ -663,6 +655,12 @@ static unsigned int tcp_synack_options(struct request_sock *req,
 		}
 	}
 
+	if (static_branch_unlikely(&tcp_extra_options_enabled))
+		remaining -= tcp_extra_options_prepare(skb,
+						       TCPHDR_SYN | TCPHDR_ACK,
+						       remaining, opts,
+						       req_to_sk(req));
+
 	return MAX_TCP_OPTION_SPACE - remaining;
 }
 
@@ -696,6 +694,11 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
 
+	if (static_branch_unlikely(&tcp_extra_options_enabled))
+		size += tcp_extra_options_prepare(skb, 0,
+						  MAX_TCP_OPTION_SPACE - size,
+						  opts, tcp_to_sk(tp));
+
 	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
 	if (unlikely(eff_sacks)) {
 		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
@@ -1016,6 +1019,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
+	rcu_read_lock();
 	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
 	else
@@ -1092,6 +1096,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 					       md5, sk, skb);
 	}
 #endif
+	rcu_read_unlock();
 
 	icsk->icsk_af_ops->send_check(sk, skb);
 
@@ -3156,8 +3161,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 #endif
 		skb->skb_mstamp = tcp_clock_us();
 
-#ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
+#ifdef CONFIG_TCP_MD5SIG
 	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
 #endif
 	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
@@ -3196,8 +3201,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	if (md5)
 		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
 					       md5, req_to_sk(req), skb);
-	rcu_read_unlock();
 #endif
+	rcu_read_unlock();
 
 	/* Do not fool tcpdump (if any), clean our debris */
 	skb->tstamp = 0;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 4e7817abc0b9..407480366c73 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -162,7 +162,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	/* check for timestamp cookie support */
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
-	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
+	tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL, tp);
 
 	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
 		tsoff = secure_tcpv6_ts_off(sock_net(sk),
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 64d94afa427f..4a3fba1ef3a2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -784,9 +784,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	struct flowi6 fl6;
 	struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
 	struct sock *ctl_sk = net->ipv6.tcp_sk;
-	unsigned int tot_len = sizeof(struct tcphdr);
+	unsigned int tot_len = 0;
 	struct dst_entry *dst;
 	__be32 *topt;
+	struct tcp_out_options extraopts;
 
 	if (tsecr)
 		tot_len += TCPOLEN_TSTAMP_ALIGNED;
@@ -795,10 +796,28 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 		tot_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif
 
+	rcu_read_lock();
+	if (static_branch_unlikely(&tcp_extra_options_enabled)) {
+		unsigned int remaining = MAX_TCP_OPTION_SPACE - tot_len;
+		u8 extraflags = rst ? TCPHDR_RST : 0;
+
+		if (!rst || !th->ack)
+			extraflags |= TCPHDR_ACK;
+
+		memset(&extraopts, 0, sizeof(extraopts));
+
+		tot_len += tcp_extra_options_prepare(skb, extraflags, remaining,
+						     &extraopts, sk);
+	}
+
+	tot_len += sizeof(struct tcphdr);
+
 	buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
 			 GFP_ATOMIC);
-	if (!buff)
+	if (!buff) {
+		rcu_read_unlock();
 		return;
+	}
 
 	skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
 
@@ -835,6 +854,11 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	}
 #endif
 
+	if (static_branch_unlikely(&tcp_extra_options_enabled))
+		tcp_extra_options_write(topt, &extraopts, sk);
+
+	rcu_read_unlock();
+
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.daddr = ipv6_hdr(skb)->saddr;
 	fl6.saddr = ipv6_hdr(skb)->daddr;
-- 
2.14.2


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-10-12 19:56 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-05  7:40 [MPTCP] [PATCH v2] tcp: Register handlers for extra TCP options Christoph Paasch
  -- strict thread matches above, loose matches on Subject: below --
2017-10-12 19:56 Christoph Paasch
2017-10-06  5:28 Christoph Paasch
2017-10-05 18:56 Mat Martineau
2017-10-04 20:36 Mat Martineau

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.