Netdev List
 help / color / mirror / Atom feed
* [RFC 06/14] tcp_smc: Make SMC use TCP extra-option framework
From: Christoph Paasch @ 2017-12-18 21:51 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Mat Martineau, Alexei Starovoitov, Ursula Braun
In-Reply-To: <20171218215109.38700-1-cpaasch@apple.com>

Adopt the extra-option framework for SMC.
It allows us to entirely remove SMC-code out of the TCP-stack.

The static key is gone, as this is now covered by the static key of the
extra-option framework.

We allocate state (struct tcp_smc_opt) that indicates whether SMC was
successfully negotiated or not and check this state in the relevant
functions.

Cc: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 include/linux/tcp.h      |   3 +-
 include/net/inet_sock.h  |   3 +-
 include/net/tcp.h        |   4 -
 net/ipv4/tcp.c           |   5 --
 net/ipv4/tcp_input.c     |  36 ---------
 net/ipv4/tcp_minisocks.c |  18 -----
 net/ipv4/tcp_output.c    |  54 --------------
 net/smc/af_smc.c         | 190 +++++++++++++++++++++++++++++++++++++++++++++--
 8 files changed, 186 insertions(+), 127 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4756bd2c4b54..231b352f587f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -257,8 +257,7 @@ struct tcp_sock {
 		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
 		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
 		save_syn:1,	/* Save headers of SYN packet */
-		is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
-		syn_smc:1;	/* SYN includes SMC */
+		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
 
 /* RTT measurement */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 39efb968b7a4..8e51b4a69088 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -90,8 +90,7 @@ struct inet_request_sock {
 				wscale_ok  : 1,
 				ecn_ok	   : 1,
 				acked	   : 1,
-				no_srccheck: 1,
-				smc_ok	   : 1;
+				no_srccheck: 1;
 	u32                     ir_mark;
 	union {
 		struct ip_options_rcu __rcu	*ireq_opt;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ac62ceff9815..a5c4856e25c7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2062,10 +2062,6 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
 }
 
-#if IS_ENABLED(CONFIG_SMC)
-extern struct static_key_false tcp_have_smc;
-#endif
-
 struct tcp_extopt_store;
 
 struct tcp_extopt_ops {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 17f38afb4212..0a1cabee6d5e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -294,11 +294,6 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
 atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
 
-#if IS_ENABLED(CONFIG_SMC)
-DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
-EXPORT_SYMBOL(tcp_have_smc);
-#endif
-
 /*
  * Current number of TCP sockets.
  */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1950ff80fb3f..af8f4f9fd098 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3671,24 +3671,6 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
 	foc->exp = exp_opt;
 }
 
-static int smc_parse_options(const struct tcphdr *th,
-			     struct tcp_options_received *opt_rx,
-			     const unsigned char *ptr,
-			     int opsize)
-{
-#if IS_ENABLED(CONFIG_SMC)
-	if (static_branch_unlikely(&tcp_have_smc)) {
-		if (th->syn && !(opsize & 1) &&
-		    opsize >= TCPOLEN_EXP_SMC_BASE &&
-		    get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
-			opt_rx->smc_ok = 1;
-			return 1;
-		}
-	}
-#endif
-	return 0;
-}
-
 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
  * But, this can also be called on packets in the established flow when
  * the fast version below fails.
@@ -3796,9 +3778,6 @@ void tcp_parse_options(const struct net *net,
 					tcp_parse_fastopen_option(opsize -
 						TCPOLEN_EXP_FASTOPEN_BASE,
 						ptr + 2, th->syn, foc, true);
-				else if (smc_parse_options(th, opt_rx, ptr,
-							   opsize))
-					break;
 				else if (opsize >= TCPOLEN_EXP_BASE)
 					tcp_extopt_parse(get_unaligned_be32(ptr),
 							 opsize, ptr, skb,
@@ -5572,16 +5551,6 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	return false;
 }
 
-static void smc_check_reset_syn(struct tcp_sock *tp)
-{
-#if IS_ENABLED(CONFIG_SMC)
-	if (static_branch_unlikely(&tcp_have_smc)) {
-		if (tp->syn_smc && !tp->rx_opt.smc_ok)
-			tp->syn_smc = 0;
-	}
-#endif
-}
-
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 const struct tcphdr *th)
 {
@@ -5692,8 +5661,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 * is initialized. */
 		tp->copied_seq = tp->rcv_nxt;
 
-		smc_check_reset_syn(tp);
-
 		smp_mb();
 
 		tcp_finish_connect(sk, skb);
@@ -6150,9 +6117,6 @@ static void tcp_openreq_init(struct request_sock *req,
 	ireq->ir_rmt_port = tcp_hdr(skb)->source;
 	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
 	ireq->ir_mark = inet_request_mark(sk, skb);
-#if IS_ENABLED(CONFIG_SMC)
-	ireq->smc_ok = rx_opt->smc_ok;
-#endif
 }
 
 struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 676ad7ca13ad..aa2ff9aadad0 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -435,21 +435,6 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
 }
 EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
 
-static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
-				    struct request_sock *req,
-				    struct tcp_sock *newtp)
-{
-#if IS_ENABLED(CONFIG_SMC)
-	struct inet_request_sock *ireq;
-
-	if (static_branch_unlikely(&tcp_have_smc)) {
-		ireq = inet_rsk(req);
-		if (oldtp->syn_smc && !ireq->smc_ok)
-			newtp->syn_smc = 0;
-	}
-#endif
-}
-
 /* This is not only more efficient than what we used to do, it eliminates
  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
  *
@@ -467,9 +452,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 		struct tcp_request_sock *treq = tcp_rsk(req);
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 		struct tcp_sock *newtp = tcp_sk(newsk);
-		struct tcp_sock *oldtp = tcp_sk(sk);
-
-		smc_check_reset_syn_req(oldtp, req, newtp);
 
 		/* Now setup tcp_sock */
 		newtp->pred_flags = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6804a9325107..baf1c913ca7f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -398,21 +398,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 	return tp->snd_una != tp->snd_up;
 }
 
-static void smc_options_write(__be32 *ptr, u16 *options)
-{
-#if IS_ENABLED(CONFIG_SMC)
-	if (static_branch_unlikely(&tcp_have_smc)) {
-		if (unlikely(OPTION_SMC & *options)) {
-			*ptr++ = htonl((TCPOPT_NOP  << 24) |
-				       (TCPOPT_NOP  << 16) |
-				       (TCPOPT_EXP <<  8) |
-				       (TCPOLEN_EXP_SMC_BASE));
-			*ptr++ = htonl(TCPOPT_SMC_MAGIC);
-		}
-	}
-#endif
-}
-
 /* Write previously computed TCP options to the packet.
  *
  * Beware: Something in the Internet is very sensitive to the ordering of
@@ -527,45 +512,10 @@ static void tcp_options_write(__be32 *ptr, struct sk_buff *skb, struct sock *sk,
 		ptr += (len + 3) >> 2;
 	}
 
-	smc_options_write(ptr, &options);
-
 	if (unlikely(!hlist_empty(extopt_list)))
 		tcp_extopt_write(ptr, skb, opts, sk);
 }
 
-static void smc_set_option(const struct tcp_sock *tp,
-			   struct tcp_out_options *opts,
-			   unsigned int *remaining)
-{
-#if IS_ENABLED(CONFIG_SMC)
-	if (static_branch_unlikely(&tcp_have_smc)) {
-		if (tp->syn_smc) {
-			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
-				opts->options |= OPTION_SMC;
-				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
-			}
-		}
-	}
-#endif
-}
-
-static void smc_set_option_cond(const struct tcp_sock *tp,
-				const struct inet_request_sock *ireq,
-				struct tcp_out_options *opts,
-				unsigned int *remaining)
-{
-#if IS_ENABLED(CONFIG_SMC)
-	if (static_branch_unlikely(&tcp_have_smc)) {
-		if (tp->syn_smc && ireq->smc_ok) {
-			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
-				opts->options |= OPTION_SMC;
-				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
-			}
-		}
-	}
-#endif
-}
-
 /* Compute TCP options for SYN packets. This is not the final
  * network wire format yet.
  */
@@ -631,8 +581,6 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
-	smc_set_option(tp, opts, &remaining);
-
 	if (unlikely(!hlist_empty(&tp->tcp_option_list)))
 		remaining -= tcp_extopt_prepare(skb, TCPHDR_SYN, remaining,
 						opts, tcp_to_sk(tp));
@@ -698,8 +646,6 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 		}
 	}
 
-	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
-
 	if (unlikely(!hlist_empty(&tcp_rsk(req)->tcp_option_list)))
 		remaining -= tcp_extopt_prepare(skb, TCPHDR_SYN | TCPHDR_ACK,
 						remaining, opts,
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index daf8075f5a4c..14bb84f81a50 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -44,6 +44,149 @@
 #include "smc_rx.h"
 #include "smc_close.h"
 
+static unsigned int tcp_smc_opt_prepare(struct sk_buff *skb, u8 flags,
+					unsigned int remaining,
+					struct tcp_out_options *opts,
+					const struct sock *sk,
+					struct tcp_extopt_store *store);
+static __be32 *tcp_smc_opt_write(__be32 *ptr, struct sk_buff *skb,
+				 struct tcp_out_options *opts,
+				 struct sock *sk,
+				 struct tcp_extopt_store *store);
+static void tcp_smc_opt_parse(int opsize, const unsigned char *opptr,
+			      const struct sk_buff *skb,
+			      struct tcp_options_received *opt_rx,
+			      struct sock *sk,
+			      struct tcp_extopt_store *store);
+static void tcp_smc_opt_post_process(struct sock *sk,
+				     struct tcp_options_received *opt,
+				     struct tcp_extopt_store *store);
+static struct tcp_extopt_store *tcp_smc_opt_copy(struct sock *listener,
+						 struct request_sock *req,
+						 struct tcp_options_received *opt,
+						 struct tcp_extopt_store *store);
+static void tcp_smc_opt_destroy(struct tcp_extopt_store *store);
+
+struct tcp_smc_opt {
+	struct tcp_extopt_store	store;
+	int			smc_ok:1; /* SMC supported on this connection */
+	struct rcu_head		rcu;
+};
+
+static const struct tcp_extopt_ops tcp_smc_extra_ops = {
+	.option_kind	= TCPOPT_SMC_MAGIC,
+	.parse		= tcp_smc_opt_parse,
+	.post_process	= tcp_smc_opt_post_process,
+	.prepare	= tcp_smc_opt_prepare,
+	.write		= tcp_smc_opt_write,
+	.copy		= tcp_smc_opt_copy,
+	.destroy	= tcp_smc_opt_destroy,
+	.owner		= THIS_MODULE,
+};
+
+static struct tcp_smc_opt *tcp_extopt_to_smc(struct tcp_extopt_store *store)
+{
+	return container_of(store, struct tcp_smc_opt, store);
+}
+
+static struct tcp_smc_opt *tcp_smc_opt_find(struct sock *sk)
+{
+	struct tcp_extopt_store *ext_opt;
+
+	ext_opt = tcp_extopt_find_kind(TCPOPT_SMC_MAGIC, sk);
+
+	return tcp_extopt_to_smc(ext_opt);
+}
+
+static unsigned int tcp_smc_opt_prepare(struct sk_buff *skb, u8 flags,
+					unsigned int remaining,
+					struct tcp_out_options *opts,
+					const struct sock *sk,
+					struct tcp_extopt_store *store)
+{
+	if (!(flags & TCPHDR_SYN))
+		return 0;
+
+	if (remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+		opts->options |= OPTION_SMC;
+		return TCPOLEN_EXP_SMC_BASE_ALIGNED;
+	}
+
+	return 0;
+}
+
+static __be32 *tcp_smc_opt_write(__be32 *ptr, struct sk_buff *skb,
+				 struct tcp_out_options *opts,
+				 struct sock *sk,
+				 struct tcp_extopt_store *store)
+{
+	if (unlikely(OPTION_SMC & opts->options)) {
+		*ptr++ = htonl((TCPOPT_NOP  << 24) |
+			       (TCPOPT_NOP  << 16) |
+			       (TCPOPT_EXP <<  8) |
+			       (TCPOLEN_EXP_SMC_BASE));
+		*ptr++ = htonl(TCPOPT_SMC_MAGIC);
+	}
+
+	return ptr;
+}
+
+static void tcp_smc_opt_parse(int opsize, const unsigned char *opptr,
+			      const struct sk_buff *skb,
+			      struct tcp_options_received *opt_rx,
+			      struct sock *sk,
+			      struct tcp_extopt_store *store)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+
+	if (th->syn && !(opsize & 1) && opsize >= TCPOLEN_EXP_SMC_BASE)
+		opt_rx->smc_ok = 1;
+}
+
+static void tcp_smc_opt_post_process(struct sock *sk,
+				     struct tcp_options_received *opt,
+				     struct tcp_extopt_store *store)
+{
+	struct tcp_smc_opt *smc_opt = tcp_extopt_to_smc(store);
+
+	if (sk->sk_state != TCP_SYN_SENT)
+		return;
+
+	if (opt->smc_ok)
+		smc_opt->smc_ok = 1;
+	else
+		smc_opt->smc_ok = 0;
+}
+
+static struct tcp_extopt_store *tcp_smc_opt_copy(struct sock *listener,
+						 struct request_sock *req,
+						 struct tcp_options_received *opt,
+						 struct tcp_extopt_store *store)
+{
+	struct tcp_smc_opt *smc_opt;
+
+	/* First, check if the peer sent us the smc-opt */
+	if (!opt->smc_ok)
+		return NULL;
+
+	smc_opt = kzalloc(sizeof(*smc_opt), GFP_ATOMIC);
+	if (!smc_opt)
+		return NULL;
+
+	smc_opt->store.ops = &tcp_smc_extra_ops;
+
+	smc_opt->smc_ok = 1;
+
+	return (struct tcp_extopt_store *)smc_opt;
+}
+
+static void tcp_smc_opt_destroy(struct tcp_extopt_store *store)
+{
+	struct tcp_smc_opt *smc_opt = tcp_extopt_to_smc(store);
+
+	kfree_rcu(smc_opt, rcu);
+}
+
 static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
 						 * creation
 						 */
@@ -384,13 +527,15 @@ static int smc_connect_rdma(struct smc_sock *smc)
 	struct smc_clc_msg_accept_confirm aclc;
 	int local_contact = SMC_FIRST_CONTACT;
 	struct smc_ib_device *smcibdev;
+	struct tcp_smc_opt *smc_opt;
 	struct smc_link *link;
 	u8 srv_first_contact;
 	int reason_code = 0;
 	int rc = 0;
 	u8 ibport;
 
-	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
+	smc_opt = tcp_smc_opt_find(smc->clcsock->sk);
+	if (!smc_opt || !smc_opt->smc_ok) {
 		/* peer has not signalled SMC-capability */
 		smc->use_fallback = true;
 		goto out_connected;
@@ -535,6 +680,7 @@ static int smc_connect_rdma(struct smc_sock *smc)
 static int smc_connect(struct socket *sock, struct sockaddr *addr,
 		       int alen, int flags)
 {
+	struct tcp_smc_opt *smc_opt;
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
 	int rc = -EINVAL;
@@ -548,9 +694,17 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 		goto out_err;
 	smc->addr = addr;	/* needed for nonblocking connect */
 
+	smc_opt = kzalloc(sizeof(*smc_opt), GFP_KERNEL);
+	if (!smc_opt) {
+		rc = -ENOMEM;
+		goto out_err;
+	}
+	smc_opt->store.ops = &tcp_smc_extra_ops;
+
 	lock_sock(sk);
 	switch (sk->sk_state) {
 	default:
+		rc = -EINVAL;
 		goto out;
 	case SMC_ACTIVE:
 		rc = -EISCONN;
@@ -560,8 +714,15 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 		break;
 	}
 
+	/* We are the only owner of smc->clcsock->sk, so we can be lockless */
+	rc = tcp_register_extopt(&smc_opt->store, smc->clcsock->sk);
+	if (rc) {
+		release_sock(smc->clcsock->sk);
+		kfree(smc_opt);
+		goto out_err;
+	}
+
 	smc_copy_sock_settings_to_clc(smc);
-	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
 	if (rc)
 		goto out;
@@ -760,6 +921,7 @@ static void smc_listen_work(struct work_struct *work)
 	struct smc_clc_msg_proposal *pclc;
 	struct smc_ib_device *smcibdev;
 	struct sockaddr_in peeraddr;
+	struct tcp_smc_opt *smc_opt;
 	u8 buf[SMC_CLC_MAX_LEN];
 	struct smc_link *link;
 	int reason_code = 0;
@@ -769,7 +931,8 @@ static void smc_listen_work(struct work_struct *work)
 	u8 ibport;
 
 	/* check if peer is smc capable */
-	if (!tcp_sk(newclcsock->sk)->syn_smc) {
+	smc_opt = tcp_smc_opt_find(newclcsock->sk);
+	if (!smc_opt || !smc_opt->smc_ok) {
 		new_smc->use_fallback = true;
 		goto out_connected;
 	}
@@ -962,10 +1125,18 @@ static void smc_tcp_listen_work(struct work_struct *work)
 
 static int smc_listen(struct socket *sock, int backlog)
 {
+	struct tcp_smc_opt *smc_opt;
 	struct sock *sk = sock->sk;
 	struct smc_sock *smc;
 	int rc;
 
+	smc_opt = kzalloc(sizeof(*smc_opt), GFP_KERNEL);
+	if (!smc_opt) {
+		rc = -ENOMEM;
+		goto out_err;
+	}
+	smc_opt->store.ops = &tcp_smc_extra_ops;
+
 	smc = smc_sk(sk);
 	lock_sock(sk);
 
@@ -978,11 +1149,19 @@ static int smc_listen(struct socket *sock, int backlog)
 		sk->sk_max_ack_backlog = backlog;
 		goto out;
 	}
+
+	/* We are the only owner of smc->clcsock->sk, so we can be lockless */
+	rc = tcp_register_extopt(&smc_opt->store, smc->clcsock->sk);
+	if (rc) {
+		release_sock(smc->clcsock->sk);
+		kfree(smc_opt);
+		goto out_err;
+	}
+
 	/* some socket options are handled in core, so we could not apply
 	 * them to the clc socket -- copy smc socket options to clc socket
 	 */
 	smc_copy_sock_settings_to_clc(smc);
-	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 
 	rc = kernel_listen(smc->clcsock, backlog);
 	if (rc)
@@ -995,6 +1174,7 @@ static int smc_listen(struct socket *sock, int backlog)
 
 out:
 	release_sock(sk);
+out_err:
 	return rc;
 }
 
@@ -1425,7 +1605,6 @@ static int __init smc_init(void)
 		goto out_sock;
 	}
 
-	static_branch_enable(&tcp_have_smc);
 	return 0;
 
 out_sock:
@@ -1450,7 +1629,6 @@ static void __exit smc_exit(void)
 		list_del_init(&lgr->list);
 		smc_lgr_free(lgr); /* free link group */
 	}
-	static_branch_disable(&tcp_have_smc);
 	smc_ib_unregister_client();
 	sock_unregister(PF_SMC);
 	proto_unregister(&smc_proto);
-- 
2.15.0

^ permalink raw reply related

* [RFC 04/14] tcp_smc: Make smc_parse_options return 1 on success
From: Christoph Paasch @ 2017-12-18 21:50 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Mat Martineau, Alexei Starovoitov, Ursula Braun
In-Reply-To: <20171218215109.38700-1-cpaasch@apple.com>

As we allow a generic TCP-option parser that also parses experimental
TCP options, we need to add a return-value to smc_parse_options() that
indicates whether the option actually matched or not.

Cc: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 net/ipv4/tcp_input.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eb97ee24c601..5c35fd568b13 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3671,19 +3671,22 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
 	foc->exp = exp_opt;
 }
 
-static void smc_parse_options(const struct tcphdr *th,
-			      struct tcp_options_received *opt_rx,
-			      const unsigned char *ptr,
-			      int opsize)
+static int smc_parse_options(const struct tcphdr *th,
+			     struct tcp_options_received *opt_rx,
+			     const unsigned char *ptr,
+			     int opsize)
 {
 #if IS_ENABLED(CONFIG_SMC)
 	if (static_branch_unlikely(&tcp_have_smc)) {
 		if (th->syn && !(opsize & 1) &&
 		    opsize >= TCPOLEN_EXP_SMC_BASE &&
-		    get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
+		    get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
 			opt_rx->smc_ok = 1;
+			return 1;
+		}
 	}
 #endif
+	return 0;
 }
 
 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
-- 
2.15.0

^ permalink raw reply related

* [RFC 07/14] tcp_md5: Don't pass along md5-key
From: Christoph Paasch @ 2017-12-18 21:51 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Mat Martineau, Alexei Starovoitov
In-Reply-To: <20171218215109.38700-1-cpaasch@apple.com>

It is much cleaner to store the key-pointer in tcp_out_options. It
allows to remove some MD5-specific code out of the function-arguments
and paves the way to adopting the TCP-option framework with TCP-MD5.

Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 include/linux/tcp.h   |  1 +
 net/ipv4/tcp_output.c | 46 +++++++++++++++++++---------------------------
 2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 231b352f587f..b0b38f7100a4 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -131,6 +131,7 @@ struct tcp_out_options {
 	__u8 *hash_location;	/* temporary pointer, overloaded */
 	__u32 tsval, tsecr;	/* need to include OPTION_TS */
 	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
+	struct tcp_md5sig_key *md5; /* TCP_MD5 signature key */
 };
 
 /* This is the max number of SACKS that we'll generate and process. It's safe
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index baf1c913ca7f..43849ed73b03 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -520,21 +520,18 @@ static void tcp_options_write(__be32 *ptr, struct sk_buff *skb, struct sock *sk,
  * network wire format yet.
  */
 static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
-				struct tcp_out_options *opts,
-				struct tcp_md5sig_key **md5)
+				struct tcp_out_options *opts)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
 	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
 
 #ifdef CONFIG_TCP_MD5SIG
-	*md5 = tp->af_specific->md5_lookup(sk, sk);
-	if (*md5) {
+	opts->md5 = tp->af_specific->md5_lookup(sk, sk);
+	if (opts->md5) {
 		opts->options |= OPTION_MD5;
 		remaining -= TCPOLEN_MD5SIG_ALIGNED;
 	}
-#else
-	*md5 = NULL;
 #endif
 
 	/* We always get an MSS option.  The option bytes which will be seen in
@@ -549,7 +546,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	opts->mss = tcp_advertise_mss(sk);
 	remaining -= TCPOLEN_MSS_ALIGNED;
 
-	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
+	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !opts->md5)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
 		opts->tsecr = tp->rx_opt.ts_recent;
@@ -593,14 +590,13 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 				       struct request_sock *req,
 				       unsigned int mss, struct sk_buff *skb,
 				       struct tcp_out_options *opts,
-				       const struct tcp_md5sig_key *md5,
 				       struct tcp_fastopen_cookie *foc)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
 
 #ifdef CONFIG_TCP_MD5SIG
-	if (md5) {
+	if (opts->md5) {
 		opts->options |= OPTION_MD5;
 		remaining -= TCPOLEN_MD5SIG_ALIGNED;
 
@@ -658,8 +654,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
  * final wire format yet.
  */
 static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
-					struct tcp_out_options *opts,
-					struct tcp_md5sig_key **md5)
+					struct tcp_out_options *opts)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int size = 0;
@@ -668,13 +663,13 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	opts->options = 0;
 
 #ifdef CONFIG_TCP_MD5SIG
-	*md5 = tp->af_specific->md5_lookup(sk, sk);
-	if (unlikely(*md5)) {
+	opts->md5 = tp->af_specific->md5_lookup(sk, sk);
+	if (unlikely(opts->md5)) {
 		opts->options |= OPTION_MD5;
 		size += TCPOLEN_MD5SIG_ALIGNED;
 	}
 #else
-	*md5 = NULL;
+	opts->md5 = NULL;
 #endif
 
 	if (likely(tp->rx_opt.tstamp_ok)) {
@@ -992,7 +987,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	struct tcp_out_options opts;
 	unsigned int tcp_options_size, tcp_header_size;
 	struct sk_buff *oskb = NULL;
-	struct tcp_md5sig_key *md5;
 	struct tcphdr *th;
 	int err;
 
@@ -1021,10 +1015,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	memset(&opts, 0, sizeof(opts));
 
 	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
-		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
+		tcp_options_size = tcp_syn_options(sk, skb, &opts);
 	else
-		tcp_options_size = tcp_established_options(sk, skb, &opts,
-							   &md5);
+		tcp_options_size = tcp_established_options(sk, skb, &opts);
 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
 
 	/* if no packet is in qdisc/device queue, then allow XPS to select
@@ -1090,10 +1083,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcp_options_write((__be32 *)(th + 1), skb, sk, &opts);
 #ifdef CONFIG_TCP_MD5SIG
 	/* Calculate the MD5 hash, as we have all we need now */
-	if (md5) {
+	if (opts.md5) {
 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
 		tp->af_specific->calc_md5_hash(opts.hash_location,
-					       md5, sk, skb);
+					       opts.md5, sk, skb);
 	}
 #endif
 
@@ -1537,7 +1530,6 @@ unsigned int tcp_current_mss(struct sock *sk)
 	u32 mss_now;
 	unsigned int header_len;
 	struct tcp_out_options opts;
-	struct tcp_md5sig_key *md5;
 
 	mss_now = tp->mss_cache;
 
@@ -1547,7 +1539,7 @@ unsigned int tcp_current_mss(struct sock *sk)
 			mss_now = tcp_sync_mss(sk, mtu);
 	}
 
-	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
+	header_len = tcp_established_options(sk, NULL, &opts) +
 		     sizeof(struct tcphdr);
 	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
 	 * some common options. If this is an odd packet (because we have SACK
@@ -3123,7 +3115,6 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_md5sig_key *md5 = NULL;
 	struct tcp_out_options opts;
 	struct sk_buff *skb;
 	int tcp_header_size;
@@ -3169,10 +3160,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 
 #ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
-	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
+	opts.md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
 #endif
 	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
-	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
+	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts,
 					     foc) + sizeof(*th);
 
 	skb_push(skb, tcp_header_size);
@@ -3199,9 +3190,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Okay, we have all we need - do the md5 hash if needed */
-	if (md5)
+	if (opts.md5)
 		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
-					       md5, req_to_sk(req), skb);
+							 opts.md5,
+							 req_to_sk(req), skb);
 	rcu_read_unlock();
 #endif
 
-- 
2.15.0

^ permalink raw reply related

* [RFC 03/14] tcp: Allow tcp_fast_parse_options to drop segments
From: Christoph Paasch @ 2017-12-18 21:50 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Mat Martineau, Alexei Starovoitov
In-Reply-To: <20171218215109.38700-1-cpaasch@apple.com>

After parsing the TCP-options, some option-kinds might trigger a drop of
the segment (e.g., as is the case for TCP_MD5). As we are moving to
consolidate the TCP_MD5-code in follow-up patches, we need to add the
capability to drop a segment right after parsing the options in
tcp_fast_parse_options().

Originally, tcp_fast_parse_options() returned false, when there is no
timestamp option, except in the case of the slow-path processing through
tcp_parse_options() where it always returns true.

So, the return-value of tcp_fast_parse_options() was kind of
inconsistent. With this patch, we make it return true when the segment
should get dropped based on the parsed options, and false otherwise.

In tcp_validate_incoming, we will then just check for
tp->rx_opt.saw_tstamp to see if we should verify PAWS.

The goto will be used in a follow-up patch to check whether one of the
options triggers a drop of the segment.

Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 net/ipv4/tcp_input.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4d55c4b338ee..eb97ee24c601 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3827,6 +3827,8 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
 
 /* Fast parse options. This hopes to only see timestamps.
  * If it is wrong it falls back on tcp_parse_options().
+ *
+ * Returns true if we should drop this packet based on present TCP-options.
  */
 static bool tcp_fast_parse_options(const struct net *net,
 				   const struct sk_buff *skb,
@@ -3837,18 +3839,19 @@ static bool tcp_fast_parse_options(const struct net *net,
 	 */
 	if (th->doff == (sizeof(*th) / 4)) {
 		tp->rx_opt.saw_tstamp = 0;
-		return false;
+		goto extra_opt_check;
 	} else if (tp->rx_opt.tstamp_ok &&
 		   th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
 		if (tcp_parse_aligned_timestamp(tp, th))
-			return true;
+			goto extra_opt_check;
 	}
 
 	tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
 	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
 		tp->rx_opt.rcv_tsecr -= tp->tsoffset;
 
-	return true;
+extra_opt_check:
+	return false;
 }
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -5168,9 +5171,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	bool rst_seq_match = false;
 
+	if (tcp_fast_parse_options(sock_net(sk), skb, th, tp))
+		goto discard;
+
 	/* RFC1323: H1. Apply PAWS check first. */
-	if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
-	    tp->rx_opt.saw_tstamp &&
+	if (tp->rx_opt.saw_tstamp &&
 	    tcp_paws_discard(sk, skb)) {
 		if (!th->rst) {
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
-- 
2.15.0

^ permalink raw reply related

* [RFC 02/14] tcp: Pass sock and skb to tcp_options_write
From: Christoph Paasch @ 2017-12-18 21:50 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Mat Martineau, Alexei Starovoitov
In-Reply-To: <20171218215109.38700-1-cpaasch@apple.com>

An upcoming patch adds a configurable, per-socket list of TCP options to
populate in the TCP header. This requires tcp_options_write() to know the
socket (to use the options list) and the skb (to provide visibility to the
packet data for options like TCP_MD5SIG).

Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 net/ipv4/tcp_output.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0f66d101d0ca..efe599a41e36 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -444,10 +444,14 @@ struct tcp_out_options {
  * At least SACK_PERM as the first option is known to lead to a disaster
  * (but it may well be that other scenarios fail similarly).
  */
-static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+static void tcp_options_write(__be32 *ptr, struct sk_buff *skb, struct sock *sk,
 			      struct tcp_out_options *opts)
 {
 	u16 options = opts->options;	/* mungable copy */
+	struct tcp_sock *tp = NULL;
+
+	if (sk_fullsock(sk))
+		tp = tcp_sk(sk);
 
 	if (unlikely(OPTION_MD5 & options)) {
 		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
@@ -1136,7 +1140,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		 */
 		th->window	= htons(min(tp->rcv_wnd, 65535U));
 	}
-	tcp_options_write((__be32 *)(th + 1), tp, &opts);
+	tcp_options_write((__be32 *)(th + 1), skb, sk, &opts);
 #ifdef CONFIG_TCP_MD5SIG
 	/* Calculate the MD5 hash, as we have all we need now */
 	if (md5) {
@@ -3243,7 +3247,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
 	th->doff = (tcp_header_size >> 2);
-	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
+	tcp_options_write((__be32 *)(th + 1), skb, req_to_sk(req), &opts);
 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
 #ifdef CONFIG_TCP_MD5SIG
-- 
2.15.0

^ permalink raw reply related

* [RFC 01/14] tcp: Write options after the header has been fully done
From: Christoph Paasch @ 2017-12-18 21:50 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Mat Martineau, Alexei Starovoitov
In-Reply-To: <20171218215109.38700-1-cpaasch@apple.com>

The generic TCP-option framework will need to have access to the full
TCP-header (e.g., if we want to compute a checksum for TCP-MD5).

Thus, we move the call to tcp_options_write() to after all the fields in
the header have been filled out.

Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 net/ipv4/tcp_output.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 04be9f833927..0f66d101d0ca 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1126,7 +1126,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		}
 	}
 
-	tcp_options_write((__be32 *)(th + 1), tp, &opts);
 	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
 	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
 		th->window      = htons(tcp_select_window(sk));
@@ -1137,6 +1136,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		 */
 		th->window	= htons(min(tp->rcv_wnd, 65535U));
 	}
+	tcp_options_write((__be32 *)(th + 1), tp, &opts);
 #ifdef CONFIG_TCP_MD5SIG
 	/* Calculate the MD5 hash, as we have all we need now */
 	if (md5) {
@@ -3242,8 +3242,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
-	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
 	th->doff = (tcp_header_size >> 2);
+	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
 #ifdef CONFIG_TCP_MD5SIG
-- 
2.15.0

^ permalink raw reply related

* [RFC 00/14] Generic TCP-option framework and adoption for TCP-SMC and TCP-MD5
From: Christoph Paasch @ 2017-12-18 21:50 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Mat Martineau, Alexei Starovoitov

This patchset introduces a generic framework for handling TCP-options.

TCP-options like TCP_MD5 and SMC are rather rare use-cases, but their
implementation is rather intrusive to the TCP-stack. Other, more recent
TCP extensions like TCP-crypt, MPTCP or TCP-AO would make this situation
even worse.

This new framework allows to add these TCP-options in a modular way. Writing,
reading and acting upon these options is done through callbacks that get
registered to a TCP-socket. A TCP-socket has a list of "extra" TCP-options
that it will use.

We make TCP-SMC and TCP-MD5SIG adopt this new framework. As can be seen, there
is now no more TCP-SMC code in the TCP-files and the TCP-MD5 code has been
reduced to a bare minimum.

This patchset is admittedly rather big, but we wanted to show where the
framework will lead to and what it enables. Suggestions as to how to better
structure the patchset is appreciated.

One point of discussion might be whether or not we should use static-keys
before accessing the tcp_option_list to avoid branching and the additional
access to the tcp_sock, request-sock, time-wait-sock structure.
The way static keys could be used is that they get incremented each time a
socket starts using one of the new TCP-options and decremented when the socket
eventually gets destroyed. A caveat of this design would be that if a host keeps
on creating/closing these sockets in a sequence, each time we go into the slow
path of the static keys occuring potentially a big overhead to update all the
jump-labels.
For now we opted for a simple if (unlikely(!hlist_empty(...)) check.

Feedback is very welcome!


Thanks,
Mat & Christoph


Christoph Paasch (13):
  tcp: Write options after the header has been fully done
  tcp: Pass sock and skb to tcp_options_write
  tcp: Allow tcp_fast_parse_options to drop segments
  tcp_smc: Make smc_parse_options return 1 on success
  tcp_smc: Make SMC use TCP extra-option framework
  tcp_md5: Don't pass along md5-key
  tcp_md5: Detect key inside tcp_v4_send_ack instead of passing it as an
    argument
  tcp_md5: Detect key inside tcp_v6_send_response instead of passing it
    as an argument
  tcp_md5: Check for TCP_MD5 after TCP Timestamps in
    tcp_established_options
  tcp_md5: Move TCP-MD5 code out of TCP itself
  tcp_md5: Use tcp_extra_options in output path
  tcp_md5: Cleanup TCP-code
  tcp_md5: Use TCP extra-options on the input path

Mat Martineau (1):
  tcp: Register handlers for extra TCP options

 drivers/infiniband/hw/cxgb4/cm.c |    2 +-
 include/linux/inet_diag.h        |    1 +
 include/linux/tcp.h              |   43 +-
 include/linux/tcp_md5.h          |   39 ++
 include/net/inet_sock.h          |    3 +-
 include/net/tcp.h                |  213 +++---
 net/ipv4/Makefile                |    1 +
 net/ipv4/syncookies.c            |    6 +-
 net/ipv4/tcp.c                   |  391 ++++++++---
 net/ipv4/tcp_diag.c              |   81 +--
 net/ipv4/tcp_input.c             |  137 ++--
 net/ipv4/tcp_ipv4.c              |  556 ++--------------
 net/ipv4/tcp_md5.c               | 1359 ++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_minisocks.c         |   77 +--
 net/ipv4/tcp_output.c            |  182 +----
 net/ipv6/syncookies.c            |    6 +-
 net/ipv6/tcp_ipv6.c              |  390 ++---------
 net/smc/af_smc.c                 |  190 +++++-
 18 files changed, 2227 insertions(+), 1450 deletions(-)
 create mode 100644 include/linux/tcp_md5.h
 create mode 100644 net/ipv4/tcp_md5.c

-- 
2.15.0

^ permalink raw reply

* Re: [PATCH v7 2/3] sock: Move the socket inuse to namespace.
From: Cong Wang @ 2017-12-18 21:38 UTC (permalink / raw)
  To: David Miller; +Cc: Tonghao Zhang, Linux Kernel Network Developers
In-Reply-To: <20171218.143016.1035157635015719813.davem@davemloft.net>

On Mon, Dec 18, 2017 at 11:30 AM, David Miller <davem@davemloft.net> wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> Date: Thu, 14 Dec 2017 05:51:58 -0800
>
>> In some case, we want to know how many sockets are in use in
>> different _net_ namespaces. It's a key resource metric.
>
> Useful or not, you're not exporting this value.
>
> All this patch series does is convert the existing export of the
> global tally to add up the per-net values.
>
> So if you're not exporting the per-net value on it's own in any way,
> this patch series isn't achieving the stated goal.
>
> I'm not applying this series, sorry.


This value is already exported via procfs:
sockstat_seq_show() -> socket_seq_show().

And the proc file itself should already be per-net:

static int sockstat_seq_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, sockstat_seq_show);
}


This patch just makes that value to be per-net too.

^ permalink raw reply

* [PATCH net-next] dev: Correctly get length of alias string in dev_set_alias()
From: Serhey Popovych @ 2017-12-18 21:38 UTC (permalink / raw)
  To: netdev

We supply number of bytes available in @alias via @len
parameter to dev_set_alias() which is not the same
as zero terminated string length that can be shorter.

Both dev_set_alias() users (rtnetlink and sysfs) can
submit number of bytes up to IFALIASZ with actual string
length slightly shorter by putting '\0' not at @len - 1.

Use strnlen() to get length of zero terminated string
and not access beyond @len. Correct comment about @len
and explain how to unset alias (i.e. use zero for @len).

Signed-off-by: Serhey Popovych <serhe.popovych@gmail.com>
---
 net/core/dev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index b0eee49..d362fe6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1243,7 +1243,7 @@ int dev_change_name(struct net_device *dev, const char *newname)
  *	dev_set_alias - change ifalias of a device
  *	@dev: device
  *	@alias: name up to IFALIASZ
- *	@len: limit of bytes to copy from info
+ *	@len: number of bytes available in @alias, zero to unset current alias
  *
  *	Set ifalias for a device,
  */
@@ -1255,6 +1255,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 		return -EINVAL;
 
 	if (len) {
+		len = strnlen(alias, len);
+
 		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
 		if (!new_alias)
 			return -ENOMEM;
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH iproute2 1/3] iplink: Improve index parameter handling
From: Serhey Popovich @ 2017-12-18 21:37 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20171218132231.6dcf7b54@xeon-e3>


[-- Attachment #1.1: Type: text/plain, Size: 2230 bytes --]

Stephen Hemminger wrote:
> On Mon, 18 Dec 2017 23:02:07 +0200
> Serhey Popovich <serhe.popovych@gmail.com> wrote:
> 
>> Stephen Hemminger wrote:
>>> On Mon, 18 Dec 2017 20:54:06 +0200
>>> Serhey Popovych <serhe.popovych@gmail.com> wrote:
>>>   
>>>> diff --git a/ip/iplink.c b/ip/iplink.c
>>>> index 1e685cc..4f9c169 100644
>>>> --- a/ip/iplink.c
>>>> +++ b/ip/iplink.c
>>>> @@ -586,8 +586,10 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req,
>>>>  			*name = *argv;
>>>>  		} else if (strcmp(*argv, "index") == 0) {
>>>>  			NEXT_ARG();
>>>> +			if (*index)
>>>> +				duparg("index", *argv);
>>>>  			*index = atoi(*argv);
>>>> -			if (*index < 0)
>>>> +			if (*index <= 0)  
>>>
>>> Why not use strtoul instead of atoi?  
>> Do not see reason for strtoul() instead atoi():
>>
>>   1) main arg: indexes in kernel represented as "int", which is
>>      signed. <= 0 values are reserved for various special purposes
>>      (see net/core/fib_rules.c on how device matching implemented).
>>
>>      Configuring network device manually with index <= 0 is not correct
>>      (however possible). Kernel itself never chooses ifindex <= 0.
>>
>>      Having unsigned int > 0x7fffffff actually means index <= 0.
>>
>>   2) this is not single place in iproute2 where it is used: not
>>      going to remove last user.
>>
>>   3) make changes clear and transparent for review.
> 
> I would rather all of iproute2 correctly handles unsigned values.
> Too much code is old K&R style C "the world is an int" and "who needs
> to check for negative".

You are right :(. I'm just trying to improve things a bit.

> 
> There already is get_unsigned() in iproute2 util functions.
This is good one based on strtoul(). But do we want to submit say
index = (unsigned int)2147483648(0x7fffffff) to the kernel that is
illegal from it's perspective?

Or do you mean I can prepare treewide change to replace atoi() with
get_unsigned()/get_integer() where appropriate?

We already check if (*index < 0) since commit 3c682146aeff
(iplink: forbid negative ifindex and modifying ifindex), and I just
put index == 0 in the same range of invalid indexes.

> Why not use that?
> 



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* Re: [PATCH iproute2 1/3] iplink: Improve index parameter handling
From: Stephen Hemminger @ 2017-12-18 21:22 UTC (permalink / raw)
  To: Serhey Popovich; +Cc: netdev
In-Reply-To: <7f83992a-90e0-4f15-2be4-7348a6742e6c@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1630 bytes --]

On Mon, 18 Dec 2017 23:02:07 +0200
Serhey Popovich <serhe.popovych@gmail.com> wrote:

> Stephen Hemminger wrote:
> > On Mon, 18 Dec 2017 20:54:06 +0200
> > Serhey Popovych <serhe.popovych@gmail.com> wrote:
> >   
> >> diff --git a/ip/iplink.c b/ip/iplink.c
> >> index 1e685cc..4f9c169 100644
> >> --- a/ip/iplink.c
> >> +++ b/ip/iplink.c
> >> @@ -586,8 +586,10 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req,
> >>  			*name = *argv;
> >>  		} else if (strcmp(*argv, "index") == 0) {
> >>  			NEXT_ARG();
> >> +			if (*index)
> >> +				duparg("index", *argv);
> >>  			*index = atoi(*argv);
> >> -			if (*index < 0)
> >> +			if (*index <= 0)  
> > 
> > Why not use strtoul instead of atoi?  
> Do not see reason for strtoul() instead atoi():
> 
>   1) main arg: indexes in kernel represented as "int", which is
>      signed. <= 0 values are reserved for various special purposes
>      (see net/core/fib_rules.c on how device matching implemented).
> 
>      Configuring network device manually with index <= 0 is not correct
>      (however possible). Kernel itself never chooses ifindex <= 0.
> 
>      Having unsigned int > 0x7fffffff actually means index <= 0.
> 
>   2) this is not single place in iproute2 where it is used: not
>      going to remove last user.
> 
>   3) make changes clear and transparent for review.

I would rather all of iproute2 correctly handles unsigned values.
Too much code is old K&R style C "the world is an int" and "who needs
to check for negative".

There already is get_unsigned() in iproute2 util functions.
Why not use that?


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH 3/3] trace: print address if symbol not found
From: Tobin C. Harding @ 2017-12-18 21:16 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: kernel-hardening, Tycho Andersen, Linus Torvalds, Kees Cook,
	Andrew Morton, Daniel Borkmann, Masahiro Yamada,
	Alexei Starovoitov, linux-kernel, Network Development
In-Reply-To: <20171218114947.2c11211a@gandalf.local.home>

On Mon, Dec 18, 2017 at 11:49:47AM -0500, Steven Rostedt wrote:
> On Mon, 18 Dec 2017 10:53:32 +1100
> "Tobin C. Harding" <me@tobin.cc> wrote:
> 
> > Fixes behaviour modified by: commit bd6b239cdbb2 ("kallsyms: don't leak
> > address when symbol not found")
> > 
> > Previous patch changed behaviour of kallsyms function sprint_symbol() to
> > return an error code instead of printing the address if a symbol was not
> > found. Ftrace relies on the original behaviour. We should not break
> > tracing when applying the previous patch. We can maintain the original
> > behaviour by checking the return code on calls to sprint_symbol() and
> > friends.
> > 
> > Check return code and print actual address on error (i.e symbol not
> > found).
> > 
> > Signed-off-by: Tobin C. Harding <me@tobin.cc>
> > ---
> >  kernel/trace/trace.h             | 24 ++++++++++++++++++++++++
> >  kernel/trace/trace_events_hist.c |  6 +++---
> >  2 files changed, 27 insertions(+), 3 deletions(-)
> > 
> > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> > index 2a6d0325a761..881b1a577d75 100644
> > --- a/kernel/trace/trace.h
> > +++ b/kernel/trace/trace.h
> > @@ -1814,4 +1814,28 @@ static inline void trace_event_eval_update(struct trace_eval_map **map, int len)
> >  
> >  extern struct trace_iterator *tracepoint_print_iter;
> >  
> > +static inline int
> > +trace_sprint_symbol(char *buffer, unsigned long address)
> > +{
> > +	int ret;
> > +
> > +	ret = sprint_symbol(buffer, address);
> > +	if (ret == -1)
> > +		ret = sprintf(buffer, "0x%lx", address);
> > +
> > +	return ret;
> > +}
> > +
> > +static inline int
> > +trace_sprint_symbol_no_offset(char *buffer, unsigned long address)
> > +{
> > +	int ret;
> > +
> > +	ret = sprint_symbol_no_offset(buffer, address);
> > +	if (ret == -1)
> > +		ret = sprintf(buffer, "0x%lx", address);
> > +
> > +	return ret;
> > +}
> > +
> >  #endif /* _LINUX_KERNEL_TRACE_H */
> > diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
> > index 1e1558c99d56..3e28522a76f4 100644
> > --- a/kernel/trace/trace_events_hist.c
> > +++ b/kernel/trace/trace_events_hist.c
> > @@ -982,7 +982,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
> >  			return;
> >  
> >  		seq_printf(m, "%*c", 1 + spaces, ' ');
> > -		sprint_symbol(str, stacktrace_entries[i]);
> > +		trace_sprint_symbol_addr(str, stacktrace_entries[i]);
> 
> Hmm, where is trace_sprint_symbol_addr() defined?

Gee, seems I can't build a kernel and I can't read a diff - epic
fail. Sorry for wasting your time Steve I'll try to be more careful.

FTR This should have been 

-		sprint_symbol(str, stacktrace_entries[i]);
+		trace_sprint_symbol(str, stacktrace_entries[i]);

If you have the time to give me some brief pointers on how I should go
about testing this I'd love to test it before the next version. I know
very little about ftrace.

thanks,
Tobin.

^ permalink raw reply

* Re: [PATCH iproute2 1/3] iplink: Improve index parameter handling
From: Serhey Popovich @ 2017-12-18 21:02 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20171218112359.1ef8db23@xeon-e3>


[-- Attachment #1.1: Type: text/plain, Size: 1239 bytes --]

Stephen Hemminger wrote:
> On Mon, 18 Dec 2017 20:54:06 +0200
> Serhey Popovych <serhe.popovych@gmail.com> wrote:
> 
>> diff --git a/ip/iplink.c b/ip/iplink.c
>> index 1e685cc..4f9c169 100644
>> --- a/ip/iplink.c
>> +++ b/ip/iplink.c
>> @@ -586,8 +586,10 @@ int iplink_parse(int argc, char **argv, struct iplink_req *req,
>>  			*name = *argv;
>>  		} else if (strcmp(*argv, "index") == 0) {
>>  			NEXT_ARG();
>> +			if (*index)
>> +				duparg("index", *argv);
>>  			*index = atoi(*argv);
>> -			if (*index < 0)
>> +			if (*index <= 0)
> 
> Why not use strtoul instead of atoi?
Do not see reason for strtoul() instead atoi():

  1) main arg: indexes in kernel represented as "int", which is
     signed. <= 0 values are reserved for various special purposes
     (see net/core/fib_rules.c on how device matching implemented).

     Configuring network device manually with index <= 0 is not correct
     (however possible). Kernel itself never chooses ifindex <= 0.

     Having unsigned int > 0x7fffffff actually means index <= 0.

  2) this is not single place in iproute2 where it is used: not
     going to remove last user.

  3) make changes clear and transparent for review.

Thanks.

> 



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 490 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 2/2 v9] net: ethernet: Add a driver for Gemini gigabit ethernet
From: Linus Walleij @ 2017-12-18 20:55 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Michał Mirosław, Tobias Waldvogel, Florian Fainelli,
	Paulius Zaleckas, netdev, Hans Ulli Kroll, Janos Laube,
	David S . Miller, Linux ARM
In-Reply-To: <20171218145403.GE10595@n2100.armlinux.org.uk>

On Mon, Dec 18, 2017 at 3:54 PM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Mon, Dec 18, 2017 at 03:48:17PM +0100, Michał Mirosław wrote:
>> On Mon, Dec 18, 2017 at 02:57:37PM +0100, Linus Walleij wrote:
>> > On Sat, Dec 16, 2017 at 8:39 PM, Linus Walleij <linus.walleij@linaro.org> wrote:
>> >
>> > > The Gemini ethernet has been around for years as an out-of-tree
>> > > patch used with the NAS boxen and routers built on StorLink
>> > > SL3512 and SL3516, later Storm Semiconductor, later Cortina
>> > > Systems. These ASICs are still being deployed and brand new
>> > > off-the-shelf systems using it can easily be acquired.
>> [...]
>> > > ---
>> > > Changes from v8:
>> > > - Remove dependency guards in Kconfig to get a wider compile
>> > >   coverage for the driver to detect broken APIs etc.
>> >
>> > I guess we need to hold this off for a while, the code does
>> > some weird stuff using the ARM-internal page DMA mapping
>> > API.
>> >
>> > I *think* what happens is that the driver allocates a global queue
>> > used for RX and TX on both interfaces, then initializes that with
>> > page pointers and gives that to the hardware to play with.
>> >
>> > When an RX packet comes in, the RX routine needs to figure
>> > out from the DMA (physical) address which remapped
>> > page/address this random physical address pointer
>> > corresponds to.
>> >
>> > The Linux DMA API assumption is that the driver keeps track
>> > of this mapping, not the hardware. So we need to figure out
>> > a way to reverse-map this. Preferably quickly, and without
>> > using any ARM-internal mapping APIs.
>>
>> IIRC, the hardware copies descriptors from free queue (FREEQ)
>> to RX queues. FREEQ is shared among the two ethernet ports.

Seems like that to me too. I will try to refactor and break it
apart a bit.

The way freeq works is undocumented, even in the official
datasheet for CS3516 (the memory area is just "reserved"),
so the code is the only documentation of it.

>> This platform is CPU bound, so every additional lookup will
>> hit performance here. In my version I had an #ifdef for
>> COMPILE_TEST that replaced ARM-specific calls with stubs.
>> Since the driver is not expected to work on other platforms,
>> this seemed like the best workaround to make it compile
>> on other arches.
>
> Really.  No.  Stop going beneath the covers and using ARM private
> implementation APIs in drivers.
>
> Take that as a big NAK to that.

Don't worry, it won't happen. I am already thinking about better
approaches that stay with the public DMA-API.

> (I don't seem have the patch in question here to look at though.)

I'll put you on CC in future postings.

Yours,
Linus Walleij

^ permalink raw reply

* linux-next: Signed-off-by missing for commits in the net-next tree
From: Stephen Rothwell @ 2017-12-18 20:41 UTC (permalink / raw)
  To: David Miller, Networking
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, Bert Kenward,
	Edward Cree

Hi all,

Commits

  d8d8ccf27741 ("sfc: update EF10 register definitions")
  0bc959a95e8c ("sfc: populate the timer reload field")

are missing a Signed-off-by from their author.

-- 
Cheers,
Stephen Rothwell

^ permalink raw reply

* Re: [PATCH] qed: Remove unused QED_RDMA_DEV_CAP_* symbols and dev->dev_caps
From: David Miller @ 2017-12-18 20:25 UTC (permalink / raw)
  To: helgaas; +Cc: netdev, linux-pci, Ariel.Elior, everest-linux-l2
In-Reply-To: <20171218.151354.978438255508508144.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Mon, 18 Dec 2017 15:13:54 -0500 (EST)

> From: Bjorn Helgaas <helgaas@kernel.org>
> Date: Fri, 15 Dec 2017 17:03:01 -0600
> 
>> From: Bjorn Helgaas <bhelgaas@google.com>
>> 
>> The QED_RDMA_DEV_CAP_* symbols are only used to set bits in dev->dev_caps.
>> Nobody ever looks at those bits.  Remove the symbols and dev_caps itself.
>> 
>> Note that if these are ever used and added back, it looks incorrect to set
>> QED_RDMA_DEV_CAP_ATOMIC_OP based on PCI_EXP_DEVCTL2_LTR_EN.  LTR is the
>> Latency Tolerance Reporting mechanism, which has nothing to do with Atomic
>> Ops.
>> 
>> No functional change intended.
>> 
>> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
> 
> Applied to net-next.

Actually, this doesn't build, reverted:

drivers/infiniband/hw/qedr/main.c: In function ‘qedr_set_device_attr’:
drivers/infiniband/hw/qedr/main.c:682:27: error: ‘struct qed_rdma_device’ has no member named ‘dev_caps’
  attr->dev_caps = qed_attr->dev_caps;

^ permalink raw reply

* Re: [bpf-next V1-RFC PATCH 08/14] nfp: setup xdp_rxq_info
From: Jesper Dangaard Brouer @ 2017-12-18 20:25 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Daniel Borkmann, Alexei Starovoitov, dsahern, oss-drivers,
	Simon Horman, netdev, gospo, bjorn.topel, michael.chan, brouer
In-Reply-To: <20171213183427.213f6206@cakuba.netronome.com>

On Wed, 13 Dec 2017 18:34:27 -0800
Jakub Kicinski <jakub.kicinski@netronome.com> wrote:

> On Wed, 13 Dec 2017 12:20:01 +0100, Jesper Dangaard Brouer wrote:
> > Driver hook points for xdp_rxq_info:
> >  * init+reg: nfp_net_rx_ring_alloc
> >  * unreg   : nfp_net_rx_ring_free
> > 
> > In struct nfp_net_rx_ring moved member @size into a hole on 64-bit.
> > Thus, the size remaines the same after adding member @xdp_rxq.
> > 
> > Cc: oss-drivers@netronome.com
> > Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
> > Cc: Simon Horman <simon.horman@netronome.com>
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>  
> 
> > diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
> > index 3801c52098d5..0e564cfabe7e 100644
> > --- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
> > +++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
> > @@ -47,6 +47,7 @@
> >  #include <linux/netdevice.h>
> >  #include <linux/pci.h>
> >  #include <linux/io-64-nonatomic-hi-lo.h>
> > +#include <net/xdp.h>
> >  
> >  #include "nfp_net_ctrl.h"
> >  
> > @@ -350,6 +351,7 @@ struct nfp_net_rx_buf {
> >   * @rxds:       Virtual address of FL/RX ring in host memory
> >   * @dma:        DMA address of the FL/RX ring
> >   * @size:       Size, in bytes, of the FL/RX ring (needed to free)
> > + * @xdp_rxq:    RX-ring info avail for XDP
> >   */
> >  struct nfp_net_rx_ring {
> >  	struct nfp_net_r_vector *r_vec;
> > @@ -361,13 +363,14 @@ struct nfp_net_rx_ring {
> >  	u32 idx;
> >  
> >  	int fl_qcidx;
> > +	unsigned int size;
> >  	u8 __iomem *qcp_fl;
> >  
> >  	struct nfp_net_rx_buf *rxbufs;
> >  	struct nfp_net_rx_desc *rxds;
> >  
> >  	dma_addr_t dma;
> > -	unsigned int size;
> > +	struct xdp_rxq_info xdp_rxq;
> >  } ____cacheline_aligned;  
> 
> The @size member is not in the hole on purpose.  IIRC all the members
> up to @dma are in the first cacheline.  All things which are not
> needed on the fast path are after @dma.  IOW @size is not used on the
> fast path and the hole is for fast path stuff :)

Yes, I did notice @size was not used on fast-path, but it didn't hurt
to move it up.  I was just excited to see I could add this without
increasing the rx_ring struct size.

I'm more and more considering Ahern's suggestion of returning an err,
and if I do so, I also want to do proper allocation of xdp_rxq_info,
which means this will be converted into a pointer instead (and thus
much smaller effect on rx_ring size).


> >  /**
> > diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
> > index ad3e9f6a61e5..6474aecd0451 100644
> > --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
> > +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
> > @@ -2252,6 +2253,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
> >  	struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
> >  	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
> >  
> > +	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
> >  	kfree(rx_ring->rxbufs);
> >  
> >  	if (rx_ring->rxds)
> > @@ -2277,6 +2279,12 @@ nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
> >  {
> >  	int sz;
> >  
> > +	/* XDP RX-queue info */
> > +	xdp_rxq_info_init(&rx_ring->xdp_rxq);
> > +	rx_ring->xdp_rxq.dev = dp->netdev;
> > +	rx_ring->xdp_rxq.queue_index = rx_ring->idx;
> > +	xdp_rxq_info_reg(&rx_ring->xdp_rxq);
> > +
> >  	rx_ring->cnt = dp->rxd_cnt;
> >  	rx_ring->size = sizeof(*rx_ring->rxds) * rx_ring->cnt;
> >  	rx_ring->rxds = dma_zalloc_coherent(dp->dev, rx_ring->size,  
> 
> The nfp driver implements the prepare/commit for reallocating rings.  
> I don't think it matters now, but there can be 2 sets of rings with the
> same ID allocated during reconfiguration (see nfp_net_ring_reconfig()).
> Maybe place the register/unregister in nfp_net_open_stack() and
> nfp_net_close_stack() respectively?

Going over the your driver code again, I do think I handle this
correctly in nfp_net_rx_ring_free() / nfp_net_rx_ring_alloc().

Your calls nfp_net_open_stack() / nfp_net_close_stack(), doesn't
support failing, which conflicts with Ahern's suggestion.

As I explained, in another reply, I do want to support having 2 sets
of rings during reconfiguration, as many drivers do this.  This is also
the reason I cannot use net_device->_rx[] area.

> Perhaps that won't be necessary, only cleaner :)  I'm not sure how is
> the redirect between drivers intended to work WRT freeing rings and
> unloading drivers while packets fly...

I do have a plan for handling in-flight packets when driver is being
unloaded... that is the reason for having the unreg call. (Sorry, I
should have included you in that offlist discussion).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH] qed: Remove unused QED_RDMA_DEV_CAP_* symbols and dev->dev_caps
From: David Miller @ 2017-12-18 20:13 UTC (permalink / raw)
  To: helgaas; +Cc: netdev, linux-pci, Ariel.Elior, everest-linux-l2
In-Reply-To: <20171215230301.177993.80284.stgit@bhelgaas-glaptop.roam.corp.google.com>

From: Bjorn Helgaas <helgaas@kernel.org>
Date: Fri, 15 Dec 2017 17:03:01 -0600

> From: Bjorn Helgaas <bhelgaas@google.com>
> 
> The QED_RDMA_DEV_CAP_* symbols are only used to set bits in dev->dev_caps.
> Nobody ever looks at those bits.  Remove the symbols and dev_caps itself.
> 
> Note that if these are ever used and added back, it looks incorrect to set
> QED_RDMA_DEV_CAP_ATOMIC_OP based on PCI_EXP_DEVCTL2_LTR_EN.  LTR is the
> Latency Tolerance Reporting mechanism, which has nothing to do with Atomic
> Ops.
> 
> No functional change intended.
> 
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH] cxgb4: Simplify PCIe Completion Timeout setting
From: David Miller @ 2017-12-18 20:13 UTC (permalink / raw)
  To: helgaas; +Cc: netdev, linux-pci, ganeshgr
In-Reply-To: <20171215230150.177674.9821.stgit@bhelgaas-glaptop.roam.corp.google.com>

From: Bjorn Helgaas <helgaas@kernel.org>
Date: Fri, 15 Dec 2017 17:01:50 -0600

> From: Bjorn Helgaas <bhelgaas@google.com>
> 
> Simplify PCIe Completion Timeout setting by using the
> pcie_capability_clear_and_set_word() interface.  No functional change
> intended.
> 
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH net-next 0/2] net: erspan: a couple fixes
From: David Miller @ 2017-12-18 20:12 UTC (permalink / raw)
  To: u9012063; +Cc: netdev
In-Reply-To: <1513376864-33777-1-git-send-email-u9012063@gmail.com>

From: William Tu <u9012063@gmail.com>
Date: Fri, 15 Dec 2017 14:27:42 -0800

> Haishuang Yan reports a couple of issues (wrong return value, 
> pskb_may_pull) on erspan V1.  Since erspan V2 is in net-next,
> this series fix the similar issues on v2.

Series applied, thank you.

^ permalink raw reply

* Re: [PATCH] net: phy: xgene: disable clk on error paths
From: David Miller @ 2017-12-18 20:10 UTC (permalink / raw)
  To: khoroshilov
  Cc: isubramanian, kchudgar, qnguyen, netdev, linux-kernel,
	ldv-project
In-Reply-To: <1513374759-21384-1-git-send-email-khoroshilov@ispras.ru>

From: Alexey Khoroshilov <khoroshilov@ispras.ru>
Date: Sat, 16 Dec 2017 00:52:39 +0300

> There are several error paths in xgene_mdio_probe(),
> where clk is left undisabled. The patch fixes them.
> 
> Found by Linux Driver Verification project (linuxtesting.org).
> 
> Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru>

Applied, thank you.

^ permalink raw reply

* Re: [PATCH v16 3/4] hinic: Replace PCI pool old API
From: David Miller @ 2017-12-18 20:07 UTC (permalink / raw)
  To: romain.perier
  Cc: axboe, akpm, dan.j.williams, vinod.koul, jeffrey.t.kirsher,
	aviad.krawczyk, jejb, martin.petersen, linux-scsi, bhelgaas,
	linux-pci, dmaengine, netdev, linux-kernel, gregkh, romain.perier
In-Reply-To: <20171215193123.13395-4-romain.perier@gmail.com>

From: Romain Perier <romain.perier@gmail.com>
Date: Fri, 15 Dec 2017 20:31:22 +0100

> From: Romain Perier <romain.perier@collabora.com>
> 
> The PCI pool API is deprecated. This commit replaces the PCI pool old
> API by the appropriate function with the DMA pool API.
> 
> Signed-off-by: Romain Perier <romain.perier@collabora.com>

Acked-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply

* Re: [PATCH v16 2/4] net: e100: Replace PCI pool old API
From: David Miller @ 2017-12-18 20:06 UTC (permalink / raw)
  To: romain.perier
  Cc: axboe, akpm, dan.j.williams, vinod.koul, jeffrey.t.kirsher,
	aviad.krawczyk, jejb, martin.petersen, linux-scsi, bhelgaas,
	linux-pci, dmaengine, netdev, linux-kernel, gregkh, romain.perier
In-Reply-To: <20171215193123.13395-3-romain.perier@gmail.com>

From: Romain Perier <romain.perier@gmail.com>
Date: Fri, 15 Dec 2017 20:31:21 +0100

> From: Romain Perier <romain.perier@collabora.com>
> 
> The PCI pool API is deprecated. This commit replaces the PCI pool old
> API by the appropriate function with the DMA pool API.
> 
> Signed-off-by: Romain Perier <romain.perier@collabora.com>
> Acked-by: Peter Senna Tschudin <peter.senna@collabora.com>
> Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Tested-by: Peter Senna Tschudin <peter.senna@collabora.com>

Acked-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply

* Re: [PATCH] net: phy: marvell: avoid pause mode on SGMII-to-Copper for 88e151x
From: David Miller @ 2017-12-18 20:04 UTC (permalink / raw)
  To: rmk+kernel; +Cc: andrew, f.fainelli, netdev
In-Reply-To: <E1ePsZ2-0007rr-F3@rmk-PC.armlinux.org.uk>

From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 15 Dec 2017 16:10:20 +0000

> Observed on the 88e1512 in SGMII-to-Copper mode, negotiating pause
> is unreliable.  While the pause bits can be set in the advertisment
> register, they clear shortly after negotiation with a link partner
> commences irrespective of the cause of the negotiation.
> 
> While these bits may be correctly conveyed to the link partner on the
> first negotiation, a subsequent negotiation (eg, due to negotiation
> restart by the link partner, or reconnection of the cable) will result
> in the link partner seeing these bits as zero, while the kernel
> believes that it has advertised pause modes.
> 
> This leads to the local kernel evaluating (eg) symmetric pause mode,
> while the remote end evaluates that we have no pause mode capability.
> 
> Since we can't guarantee the advertisment, disable pause mode support
> with this PHY when used in SGMII-to-Copper mode.
> 
> The 88e1510 in RGMII-to-Copper mode appears to behave correctly.
> 
> Reviewed-by: Andrew Lunn <andrew@lunn.ch>
> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>

Applied.

^ permalink raw reply

* Re: [PATCH 0/3] More SFP/phylink fixes
From: David Miller @ 2017-12-18 19:58 UTC (permalink / raw)
  To: linux; +Cc: andrew, f.fainelli, netdev
In-Reply-To: <20171215160344.GU10595@n2100.armlinux.org.uk>

From: Russell King - ARM Linux <linux@armlinux.org.uk>
Date: Fri, 15 Dec 2017 16:03:44 +0000

> This series fixes a few more bits with sfp/phylink, particularly
> confusion with the right way to test for the RTNL mutex being
> held, a change in 2016 to the mdiobus_scan() behaviour that wasn't
> noticed, and a fix for reading module EEPROMs.

Series applied to net-next, because that's the tree this actually
applies cleanly to.

Please be explicit about which tree of mine you are targetting
in the future.

Thank you.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox