* [PATCH net-next 1/2] tcp: TCP experimental option for SMC
2017-10-16 15:42 [PATCH net-next 0/2] TCP experimental option for SMC rendezvous Ursula Braun
@ 2017-10-16 15:42 ` Ursula Braun
2017-10-16 16:13 ` Eric Dumazet
2017-10-16 15:42 ` [PATCH net-next 2/2] smc: add SMC rendezvous protocol Ursula Braun
1 sibling, 1 reply; 4+ messages in thread
From: Ursula Braun @ 2017-10-16 15:42 UTC (permalink / raw)
To: davem; +Cc: netdev, linux-s390, jwi, schwidefsky, heiko.carstens, raspl,
ubraun
The SMC protocol [1] relies on the use of a new TCP experimental
option [2, 3]. With this option, SMC capabilities are exchanged
between peers during the TCP three way handshake. This patch adds
support for this experimental option to TCP.
References:
[1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609
[2] Shared Use of TCP Experimental Options RFC 6994:
https://tools.ietf.org/rfc/rfc6994.txt
[3] IANA ExID SMCR:
http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-exids
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
---
include/linux/tcp.h | 9 ++++--
include/net/inet_sock.h | 3 +-
include/net/tcp.h | 8 ++++++
net/ipv4/tcp.c | 6 ++++
net/ipv4/tcp_input.c | 45 ++++++++++++++++++++++++++++++
net/ipv4/tcp_minisocks.c | 20 ++++++++++++++
net/ipv4/tcp_output.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++--
7 files changed, 156 insertions(+), 6 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1d2c44e09e31..3fb28954a1b7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -98,7 +98,8 @@ struct tcp_options_received {
tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */
dsack : 1, /* D-SACK is scheduled */
wscale_ok : 1, /* Wscale seen on SYN packet */
- sack_ok : 4, /* SACK seen on SYN packet */
+ sack_ok : 3, /* SACK seen on SYN packet */
+ smc_ok : 1, /* SMC seen on SYN packet */
snd_wscale : 4, /* Window scaling received from sender */
rcv_wscale : 4; /* Window scaling to send to receiver */
u8 num_sacks; /* Number of SACK blocks */
@@ -110,6 +111,9 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
{
rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
+#if IS_ENABLED(CONFIG_SMC)
+ rx_opt->smc_ok = 0;
+#endif
}
/* This is the max number of SACKS that we'll generate and process. It's safe
@@ -228,7 +232,8 @@ struct tcp_sock {
syn_fastopen_ch:1, /* Active TFO re-enabling probe */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
save_syn:1, /* Save headers of SYN packet */
- is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
+ is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
+ syn_smc:1; /* SYN includes SMC */
u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */
/* RTT measurement */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index aa95053dfc78..600a7626eba0 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -92,7 +92,8 @@ struct inet_request_sock {
wscale_ok : 1,
ecn_ok : 1,
acked : 1,
- no_srccheck: 1;
+ no_srccheck: 1,
+ smc_ok : 1;
kmemcheck_bitfield_end(flags);
u32 ir_mark;
union {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3b3b9b968e2d..d8a82a35c1b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -45,6 +45,7 @@
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
+#include <linux/unaligned/access_ok.h>
#include <linux/bpf.h>
#include <linux/filter.h>
@@ -191,6 +192,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
* experimental options. See draft-ietf-tcpm-experimental-options-00.txt
*/
#define TCPOPT_FASTOPEN_MAGIC 0xF989
+#define TCPOPT_SMC_MAGIC 0xE2D4C3D9
/*
* TCP option lengths
@@ -203,6 +205,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCPOLEN_MD5SIG 18
#define TCPOLEN_FASTOPEN_BASE 2
#define TCPOLEN_EXP_FASTOPEN_BASE 4
+#define TCPOLEN_EXP_SMC_BASE 6
/* But this is what stacks really send out. */
#define TCPOLEN_TSTAMP_ALIGNED 12
@@ -213,6 +216,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCPOLEN_SACK_PERBLOCK 8
#define TCPOLEN_MD5SIG_ALIGNED 20
#define TCPOLEN_MSS_ALIGNED 4
+#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
@@ -2101,4 +2105,8 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
{
return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
}
+
+#if IS_ENABLED(CONFIG_SMC)
+extern struct static_key tcp_have_smc;
+#endif
#endif /* _TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3b34850d361f..d2d7e1ad0897 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,6 +270,7 @@
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
+#include <linux/static_key.h>
#include <net/icmp.h>
#include <net/inet_common.h>
@@ -300,6 +301,11 @@ EXPORT_SYMBOL(sysctl_tcp_wmem);
atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
+#if IS_ENABLED(CONFIG_SMC)
+struct static_key tcp_have_smc __read_mostly = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(tcp_have_smc);
+#endif
+
/*
* Current number of TCP sockets.
*/
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d0682ce2a5d6..28d03af2c7b1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -75,6 +75,7 @@
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>
+#include <linux/static_key.h>
int sysctl_tcp_fack __read_mostly;
int sysctl_tcp_max_reordering __read_mostly = 300;
@@ -3735,6 +3736,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
foc->exp = exp_opt;
}
+static void smc_parse_options(const struct tcphdr *th,
+ struct tcp_options_received *opt_rx,
+ const unsigned char *ptr,
+ int opsize)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (!static_key_false(&tcp_have_smc))
+ return;
+ if (th->syn && !(opsize & 1) &&
+ opsize >= TCPOLEN_EXP_SMC_BASE &&
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
+ opt_rx->smc_ok = 1;
+#endif
+}
+
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
@@ -3842,6 +3858,9 @@ void tcp_parse_options(const struct net *net,
tcp_parse_fastopen_option(opsize -
TCPOLEN_EXP_FASTOPEN_BASE,
ptr + 2, th->syn, foc, true);
+ else
+ smc_parse_options(th, opt_rx, ptr,
+ opsize);
break;
}
@@ -5594,6 +5613,15 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
return false;
}
+static void smc_check_reset_syn(struct tcp_sock *tp)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (static_key_false(&tcp_have_smc))
+ if (tp->syn_smc && !tp->rx_opt.smc_ok)
+ tp->syn_smc = 0;
+#endif
+}
+
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th)
{
@@ -5700,6 +5728,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* is initialized. */
tp->copied_seq = tp->rcv_nxt;
+ smc_check_reset_syn(tp);
+
smp_mb();
tcp_finish_connect(sk, skb);
@@ -6129,6 +6159,20 @@ static void tcp_ecn_create_request(struct request_sock *req,
inet_rsk(req)->ecn_ok = 1;
}
+static void smc_set_capability(struct inet_request_sock *ireq,
+ const struct tcp_options_received *rx_opt)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (!static_key_false(&tcp_have_smc))
+ return;
+
+ if (rx_opt->smc_ok)
+ ireq->smc_ok = 1;
+ else
+ ireq->smc_ok = 0;
+#endif
+}
+
static void tcp_openreq_init(struct request_sock *req,
const struct tcp_options_received *rx_opt,
struct sk_buff *skb, const struct sock *sk)
@@ -6152,6 +6196,7 @@ static void tcp_openreq_init(struct request_sock *req,
ireq->ir_rmt_port = tcp_hdr(skb)->source;
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);
+ smc_set_capability(ireq, rx_opt);
}
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 2341b9f857b6..2b1bff09a8c3 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -23,6 +23,7 @@
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
+#include <linux/static_key.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
@@ -417,6 +418,22 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
}
EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
+ struct request_sock *req,
+ struct tcp_sock *newtp)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ struct inet_request_sock *ireq;
+
+ if (!static_key_false(&tcp_have_smc))
+ return;
+
+ ireq = inet_rsk(req);
+ if (oldtp->syn_smc && !ireq->smc_ok)
+ newtp->syn_smc = 0;
+#endif
+}
+
/* This is not only more efficient than what we used to do, it eliminates
* a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
*
@@ -434,6 +451,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct tcp_sock *newtp = tcp_sk(newsk);
+ struct tcp_sock *oldtp = tcp_sk(sk);
+
+ smc_check_reset_syn_req(oldtp, req, newtp);
/* Now setup tcp_sock */
newtp->pred_flags = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6c74f2a39778..36b5e45f02ca 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -41,6 +41,7 @@
#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
+#include <linux/static_key.h>
#include <trace/events/tcp.h>
@@ -422,6 +423,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
+#define OPTION_SMC (1 << 9)
+
+static void smc_options_write(__be32 *ptr, u16 *options)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (!static_key_false(&tcp_have_smc))
+ return;
+ if (unlikely(OPTION_SMC & *options)) {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_EXP << 8) |
+ (TCPOLEN_EXP_SMC_BASE));
+ *ptr++ = htonl(TCPOPT_SMC_MAGIC);
+ }
+#endif
+}
struct tcp_out_options {
u16 options; /* bit field of OPTION_* */
@@ -540,6 +557,49 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
}
ptr += (len + 3) >> 2;
}
+
+ smc_options_write(ptr, &options);
+}
+
+static void smc_set_option(const struct tcp_sock *tp,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ if (!static_key_false(&tcp_have_smc))
+ return;
+ if (tp->syn_smc) {
+ u32 need = TCPOLEN_EXP_SMC_BASE_ALIGNED;
+
+ if (*remaining >= need) {
+ opts->options |= OPTION_SMC;
+ *remaining -= need;
+ }
+ }
+#endif
+}
+
+static void smc_set_option_cond(const struct tcp_sock *tp,
+ const struct request_sock *req,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+ struct inet_request_sock *ireq;
+
+ if (!static_key_false(&tcp_have_smc))
+ return;
+
+ ireq = inet_rsk(req);
+ if (tp->syn_smc && ireq->smc_ok) {
+ u32 need = TCPOLEN_EXP_SMC_BASE_ALIGNED;
+
+ if (*remaining >= need) {
+ opts->options |= OPTION_SMC;
+ *remaining -= need;
+ }
+ }
+#endif
}
/* Compute TCP options for SYN packets. This is not the final
@@ -607,11 +667,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ smc_set_option(tp, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
/* Set up TCP options for SYN-ACKs. */
-static unsigned int tcp_synack_options(struct request_sock *req,
+static unsigned int tcp_synack_options(const struct sock *sk,
+ struct request_sock *req,
unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
const struct tcp_md5sig_key *md5,
@@ -667,6 +730,8 @@ static unsigned int tcp_synack_options(struct request_sock *req,
}
}
+ smc_set_option_cond(tcp_sk(sk), req, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -3193,8 +3258,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
- tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +
- sizeof(*th);
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
+ foc) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
--
2.13.5
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH net-next 2/2] smc: add SMC rendezvous protocol
2017-10-16 15:42 [PATCH net-next 0/2] TCP experimental option for SMC rendezvous Ursula Braun
2017-10-16 15:42 ` [PATCH net-next 1/2] tcp: TCP experimental option for SMC Ursula Braun
@ 2017-10-16 15:42 ` Ursula Braun
1 sibling, 0 replies; 4+ messages in thread
From: Ursula Braun @ 2017-10-16 15:42 UTC (permalink / raw)
To: davem; +Cc: netdev, linux-s390, jwi, schwidefsky, heiko.carstens, raspl,
ubraun
The SMC protocol [1] uses a rendezvous protocol to negotiate SMC
capability between peers. The current Linux implementation does not yet
use this rendezvous protocol and, thus, is not compliant to RFC7609 and
incompatible with other SMC implementations like in zOS.
This patch adds support for the SMC rendezvous protocol. It uses a new
TCP experimental option. With this option, SMC capabilities are
exchanged between the peers during the TCP three way handshake.
[1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
---
net/smc/af_smc.c | 21 ++++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 745f145d4c4d..f1842da9d90e 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -390,6 +390,12 @@ static int smc_connect_rdma(struct smc_sock *smc)
int rc = 0;
u8 ibport;
+ if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
+ /* peer has not signalled SMC-capability */
+ smc->use_fallback = true;
+ goto out_connected;
+ }
+
/* IPSec connections opt out of SMC-R optimizations */
if (using_ipsec(smc)) {
reason_code = SMC_CLC_DECL_IPSEC;
@@ -555,6 +561,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
}
smc_copy_sock_settings_to_clc(smc);
+ tcp_sk(smc->clcsock->sk)->syn_smc = 1;
rc = kernel_connect(smc->clcsock, addr, alen, flags);
if (rc)
goto out;
@@ -752,6 +759,7 @@ static void smc_listen_work(struct work_struct *work)
struct smc_clc_msg_proposal pclc;
struct smc_ib_device *smcibdev;
struct sockaddr_in peeraddr;
+ bool lgr_lock_taken = false;
struct smc_link *link;
int reason_code = 0;
int rc = 0, len;
@@ -759,6 +767,12 @@ static void smc_listen_work(struct work_struct *work)
u8 prefix_len;
u8 ibport;
+ /* check if peer is smc capable */
+ if (!tcp_sk(newclcsock->sk)->syn_smc) {
+ new_smc->use_fallback = true;
+ goto out_connected;
+ }
+
/* do inband token exchange -
*wait for and receive SMC Proposal CLC message
*/
@@ -802,6 +816,7 @@ static void smc_listen_work(struct work_struct *work)
/* allocate connection / link group */
mutex_lock(&smc_create_lgr_pending);
+ lgr_lock_taken = true;
local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
smcibdev, ibport, &pclc.lcl, 0);
if (local_contact < 0) {
@@ -882,7 +897,8 @@ static void smc_listen_work(struct work_struct *work)
if (newsmcsk->sk_state == SMC_INIT)
newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
- mutex_unlock(&smc_create_lgr_pending);
+ if (lgr_lock_taken)
+ mutex_unlock(&smc_create_lgr_pending);
lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
if (lsmc->sk.sk_state == SMC_LISTEN) {
smc_accept_enqueue(&lsmc->sk, newsmcsk);
@@ -963,6 +979,7 @@ static int smc_listen(struct socket *sock, int backlog)
* them to the clc socket -- copy smc socket options to clc socket
*/
smc_copy_sock_settings_to_clc(smc);
+ tcp_sk(smc->clcsock->sk)->syn_smc = 1;
rc = kernel_listen(smc->clcsock, backlog);
if (rc)
@@ -1405,6 +1422,7 @@ static int __init smc_init(void)
goto out_sock;
}
+ static_key_slow_inc(&tcp_have_smc);
return 0;
out_sock:
@@ -1429,6 +1447,7 @@ static void __exit smc_exit(void)
list_del_init(&lgr->list);
smc_lgr_free(lgr); /* free link group */
}
+ static_key_slow_dec(&tcp_have_smc);
smc_ib_unregister_client();
sock_unregister(PF_SMC);
proto_unregister(&smc_proto);
--
2.13.5
^ permalink raw reply related [flat|nested] 4+ messages in thread