Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 net] tcp: randomize timestamps on syncookies
From: Eric Dumazet @ 2017-05-05 13:56 UTC (permalink / raw)
  To: Florian Westphal; +Cc: David Miller, netdev, Yuchung Cheng
In-Reply-To: <1493950957.7796.36.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <edumazet@google.com>

Whole point of randomization was to hide server uptime, but an attacker
can simply start a syn flood and TCP generates 'old style' timestamps,
directly revealing server jiffies value.

Also, TSval sent by the server to a particular remote address vary
depending on syncookies being sent or not, potentially triggering PAWS
drops for innocent clients.

Lets implement proper randomization, including for SYNcookies.

Also we do not need to export sysctl_tcp_timestamps, since it is not
used from a module.

In v2, I added Florian feedback and contribution, adding tsoff to
tcp_get_cookie_sock().

v3 removed one unused variable in tcp_v4_connect() as Florian spotted.

Fixes: 95a22caee396c ("tcp: randomize tcp timestamp offsets for each connection")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Florian Westphal <fw@strlen.de>
Tested-by: Florian Westphal <fw@strlen.de>
Cc: Yuchung Cheng <ycheng@google.com>
---
 include/net/secure_seq.h |   10 ++++++----
 include/net/tcp.h        |    5 +++--
 net/core/secure_seq.c    |   31 +++++++++++++++++++------------
 net/ipv4/syncookies.c    |   12 ++++++++++--
 net/ipv4/tcp_input.c     |    8 +++-----
 net/ipv4/tcp_ipv4.c      |   32 +++++++++++++++++++-------------
 net/ipv6/syncookies.c    |   10 +++++++++-
 net/ipv6/tcp_ipv6.c      |   32 +++++++++++++++++++-------------
 8 files changed, 88 insertions(+), 52 deletions(-)

diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index fe236b3429f0d8caeb1adc367b5b4a20591c848b..b94006f6fbdde0d78fe33b9c2d86159e291c30cf 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -6,10 +6,12 @@
 u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
 u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport);
-u32 secure_tcp_seq_and_tsoff(__be32 saddr, __be32 daddr,
-			     __be16 sport, __be16 dport, u32 *tsoff);
-u32 secure_tcpv6_seq_and_tsoff(const __be32 *saddr, const __be32 *daddr,
-			       __be16 sport, __be16 dport, u32 *tsoff);
+u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
+		   __be16 sport, __be16 dport);
+u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr);
+u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
+		     __be16 sport, __be16 dport);
+u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr);
 u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport);
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 270e5cc43c99e7030e95af218095cf9f283950bc..8c0e5a901d6424fbd01233cd3adfdce52076f7a9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -470,7 +470,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
 /* From syncookies.c */
 struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 				 struct request_sock *req,
-				 struct dst_entry *dst);
+				 struct dst_entry *dst, u32 tsoff);
 int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 		      u32 cookie);
 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
@@ -1822,7 +1822,8 @@ struct tcp_request_sock_ops {
 #endif
 	struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
 				       const struct request_sock *req);
-	__u32 (*init_seq_tsoff)(const struct sk_buff *skb, u32 *tsoff);
+	u32 (*init_seq)(const struct sk_buff *skb);
+	u32 (*init_ts_off)(const struct sk_buff *skb);
 	int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
 			   struct flowi *fl, struct request_sock *req,
 			   struct tcp_fastopen_cookie *foc,
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 6bd2f8fb0476baabf507557fc0d06b6787511c70..ae35cce3a40d70387bee815798933aa43a0e6d84 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -24,9 +24,13 @@ static siphash_key_t ts_secret __read_mostly;
 
 static __always_inline void net_secret_init(void)
 {
-	net_get_random_once(&ts_secret, sizeof(ts_secret));
 	net_get_random_once(&net_secret, sizeof(net_secret));
 }
+
+static __always_inline void ts_secret_init(void)
+{
+	net_get_random_once(&ts_secret, sizeof(ts_secret));
+}
 #endif
 
 #ifdef CONFIG_INET
@@ -47,7 +51,7 @@ static u32 seq_scale(u32 seq)
 #endif
 
 #if IS_ENABLED(CONFIG_IPV6)
-static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
+u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
 {
 	const struct {
 		struct in6_addr saddr;
@@ -60,12 +64,14 @@ static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
 	if (sysctl_tcp_timestamps != 1)
 		return 0;
 
+	ts_secret_init();
 	return siphash(&combined, offsetofend(typeof(combined), daddr),
 		       &ts_secret);
 }
+EXPORT_SYMBOL(secure_tcpv6_ts_off);
 
-u32 secure_tcpv6_seq_and_tsoff(const __be32 *saddr, const __be32 *daddr,
-			       __be16 sport, __be16 dport, u32 *tsoff)
+u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
+		     __be16 sport, __be16 dport)
 {
 	const struct {
 		struct in6_addr saddr;
@@ -78,14 +84,14 @@ u32 secure_tcpv6_seq_and_tsoff(const __be32 *saddr, const __be32 *daddr,
 		.sport = sport,
 		.dport = dport
 	};
-	u64 hash;
+	u32 hash;
+
 	net_secret_init();
 	hash = siphash(&combined, offsetofend(typeof(combined), dport),
 		       &net_secret);
-	*tsoff = secure_tcpv6_ts_off(saddr, daddr);
 	return seq_scale(hash);
 }
-EXPORT_SYMBOL(secure_tcpv6_seq_and_tsoff);
+EXPORT_SYMBOL(secure_tcpv6_seq);
 
 u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport)
@@ -107,11 +113,12 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 #endif
 
 #ifdef CONFIG_INET
-static u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)
+u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)
 {
 	if (sysctl_tcp_timestamps != 1)
 		return 0;
 
+	ts_secret_init();
 	return siphash_2u32((__force u32)saddr, (__force u32)daddr,
 			    &ts_secret);
 }
@@ -121,15 +128,15 @@ static u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)
  * it would be easy enough to have the former function use siphash_4u32, passing
  * the arguments as separate u32.
  */
-u32 secure_tcp_seq_and_tsoff(__be32 saddr, __be32 daddr,
-			     __be16 sport, __be16 dport, u32 *tsoff)
+u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
+		   __be16 sport, __be16 dport)
 {
-	u64 hash;
+	u32 hash;
+
 	net_secret_init();
 	hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
 			    (__force u32)sport << 16 | (__force u32)dport,
 			    &net_secret);
-	*tsoff = secure_tcp_ts_off(saddr, daddr);
 	return seq_scale(hash);
 }
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 496b97e17aaf7ed2cf41cef303cb0696927f66ac..0257d965f11119acf8c55888d6e672d171ef5f08 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -16,6 +16,7 @@
 #include <linux/siphash.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
+#include <net/secure_seq.h>
 #include <net/tcp.h>
 #include <net/route.h>
 
@@ -203,7 +204,7 @@ EXPORT_SYMBOL_GPL(__cookie_v4_check);
 
 struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 				 struct request_sock *req,
-				 struct dst_entry *dst)
+				 struct dst_entry *dst, u32 tsoff)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sock *child;
@@ -213,6 +214,7 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 						 NULL, &own_req);
 	if (child) {
 		atomic_set(&req->rsk_refcnt, 1);
+		tcp_sk(child)->tsoffset = tsoff;
 		sock_rps_save_rxhash(child, skb);
 		inet_csk_reqsk_queue_add(sk, req, child);
 	} else {
@@ -292,6 +294,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	struct rtable *rt;
 	__u8 rcv_wscale;
 	struct flowi4 fl4;
+	u32 tsoff = 0;
 
 	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
 		goto out;
@@ -311,6 +314,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
 	tcp_parse_options(skb, &tcp_opt, 0, NULL);
 
+	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
+		tsoff = secure_tcp_ts_off(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
+		tcp_opt.rcv_tsecr -= tsoff;
+	}
+
 	if (!cookie_timestamp_decode(&tcp_opt))
 		goto out;
 
@@ -381,7 +389,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	ireq->rcv_wscale  = rcv_wscale;
 	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
 
-	ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
+	ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst, tsoff);
 	/* ip_queue_xmit() depends on our flow being setup
 	 * Normal sockets get it right from inet_csk_route_child_sock()
 	 */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9739962bfb3fd2d39cb13f643def223f4f17fcb6..5a3ad09e2786fb41ad12681d09938c645b69866d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -85,7 +85,6 @@ int sysctl_tcp_dsack __read_mostly = 1;
 int sysctl_tcp_app_win __read_mostly = 31;
 int sysctl_tcp_adv_win_scale __read_mostly = 1;
 EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
-EXPORT_SYMBOL(sysctl_tcp_timestamps);
 
 /* rfc5961 challenge ack rate limiting */
 int sysctl_tcp_challenge_ack_limit = 1000;
@@ -6347,8 +6346,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
-	if (isn && tmp_opt.tstamp_ok)
-		af_ops->init_seq_tsoff(skb, &tcp_rsk(req)->ts_off);
+	if (tmp_opt.tstamp_ok)
+		tcp_rsk(req)->ts_off = af_ops->init_ts_off(skb);
 
 	if (!want_cookie && !isn) {
 		/* Kill the following clause, if you dislike this way. */
@@ -6368,7 +6367,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			goto drop_and_release;
 		}
 
-		isn = af_ops->init_seq_tsoff(skb, &tcp_rsk(req)->ts_off);
+		isn = af_ops->init_seq(skb);
 	}
 	if (!dst) {
 		dst = af_ops->route_req(sk, &fl, req);
@@ -6380,7 +6379,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	if (want_cookie) {
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
-		tcp_rsk(req)->ts_off = 0;
 		req->cookie_ts = tmp_opt.tstamp_ok;
 		if (!tmp_opt.tstamp_ok)
 			inet_rsk(req)->ecn_ok = 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cbbafe546c0f5c5f43531eaf24f5b460264785c6..3a51582bef551f0dffd4e6b3968e23d29fcda6d1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -94,12 +94,18 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 struct inet_hashinfo tcp_hashinfo;
 EXPORT_SYMBOL(tcp_hashinfo);
 
-static u32 tcp_v4_init_seq_and_tsoff(const struct sk_buff *skb, u32 *tsoff)
+static u32 tcp_v4_init_seq(const struct sk_buff *skb)
 {
-	return secure_tcp_seq_and_tsoff(ip_hdr(skb)->daddr,
-					ip_hdr(skb)->saddr,
-					tcp_hdr(skb)->dest,
-					tcp_hdr(skb)->source, tsoff);
+	return secure_tcp_seq(ip_hdr(skb)->daddr,
+			      ip_hdr(skb)->saddr,
+			      tcp_hdr(skb)->dest,
+			      tcp_hdr(skb)->source);
+}
+
+static u32 tcp_v4_init_ts_off(const struct sk_buff *skb)
+{
+	return secure_tcp_ts_off(ip_hdr(skb)->daddr,
+				 ip_hdr(skb)->saddr);
 }
 
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -145,7 +151,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	struct flowi4 *fl4;
 	struct rtable *rt;
 	int err;
-	u32 seq;
 	struct ip_options_rcu *inet_opt;
 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
@@ -232,13 +237,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	rt = NULL;
 
 	if (likely(!tp->repair)) {
-		seq = secure_tcp_seq_and_tsoff(inet->inet_saddr,
-					       inet->inet_daddr,
-					       inet->inet_sport,
-					       usin->sin_port,
-					       &tp->tsoffset);
 		if (!tp->write_seq)
-			tp->write_seq = seq;
+			tp->write_seq = secure_tcp_seq(inet->inet_saddr,
+						       inet->inet_daddr,
+						       inet->inet_sport,
+						       usin->sin_port);
+		tp->tsoffset = secure_tcp_ts_off(inet->inet_saddr,
+						 inet->inet_daddr);
 	}
 
 	inet->inet_id = tp->write_seq ^ jiffies;
@@ -1239,7 +1244,8 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 	.cookie_init_seq =	cookie_v4_init_sequence,
 #endif
 	.route_req	=	tcp_v4_route_req,
-	.init_seq_tsoff	=	tcp_v4_init_seq_and_tsoff,
+	.init_seq	=	tcp_v4_init_seq,
+	.init_ts_off	=	tcp_v4_init_ts_off,
 	.send_synack	=	tcp_v4_send_synack,
 };
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 895ff650db43017ef39344679771d94ad6eaaf00..5abc3692b9011b140816dc4ce6223e79e5defddb 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -18,6 +18,7 @@
 #include <linux/random.h>
 #include <linux/siphash.h>
 #include <linux/kernel.h>
+#include <net/secure_seq.h>
 #include <net/ipv6.h>
 #include <net/tcp.h>
 
@@ -143,6 +144,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	int mss;
 	struct dst_entry *dst;
 	__u8 rcv_wscale;
+	u32 tsoff = 0;
 
 	if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
 		goto out;
@@ -162,6 +164,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	memset(&tcp_opt, 0, sizeof(tcp_opt));
 	tcp_parse_options(skb, &tcp_opt, 0, NULL);
 
+	if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
+		tsoff = secure_tcpv6_ts_off(ipv6_hdr(skb)->daddr.s6_addr32,
+					    ipv6_hdr(skb)->saddr.s6_addr32);
+		tcp_opt.rcv_tsecr -= tsoff;
+	}
+
 	if (!cookie_timestamp_decode(&tcp_opt))
 		goto out;
 
@@ -242,7 +250,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	ireq->rcv_wscale = rcv_wscale;
 	ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
 
-	ret = tcp_get_cookie_sock(sk, skb, req, dst);
+	ret = tcp_get_cookie_sock(sk, skb, req, dst, tsoff);
 out:
 	return ret;
 out_free:
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8e42e8f54b705ed8780890c7434feeff1055599a..aeb9497b5bb754f2277dec4a4dec02bf25bdbbe5 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -101,12 +101,18 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 	}
 }
 
-static u32 tcp_v6_init_seq_and_tsoff(const struct sk_buff *skb, u32 *tsoff)
+static u32 tcp_v6_init_seq(const struct sk_buff *skb)
 {
-	return secure_tcpv6_seq_and_tsoff(ipv6_hdr(skb)->daddr.s6_addr32,
-					  ipv6_hdr(skb)->saddr.s6_addr32,
-					  tcp_hdr(skb)->dest,
-					  tcp_hdr(skb)->source, tsoff);
+	return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32,
+				ipv6_hdr(skb)->saddr.s6_addr32,
+				tcp_hdr(skb)->dest,
+				tcp_hdr(skb)->source);
+}
+
+static u32 tcp_v6_init_ts_off(const struct sk_buff *skb)
+{
+	return secure_tcpv6_ts_off(ipv6_hdr(skb)->daddr.s6_addr32,
+				   ipv6_hdr(skb)->saddr.s6_addr32);
 }
 
 static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
@@ -122,7 +128,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	struct flowi6 fl6;
 	struct dst_entry *dst;
 	int addr_type;
-	u32 seq;
 	int err;
 	struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
@@ -282,13 +287,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	sk_set_txhash(sk);
 
 	if (likely(!tp->repair)) {
-		seq = secure_tcpv6_seq_and_tsoff(np->saddr.s6_addr32,
-						 sk->sk_v6_daddr.s6_addr32,
-						 inet->inet_sport,
-						 inet->inet_dport,
-						 &tp->tsoffset);
 		if (!tp->write_seq)
-			tp->write_seq = seq;
+			tp->write_seq = secure_tcpv6_seq(np->saddr.s6_addr32,
+							 sk->sk_v6_daddr.s6_addr32,
+							 inet->inet_sport,
+							 inet->inet_dport);
+		tp->tsoffset = secure_tcpv6_ts_off(np->saddr.s6_addr32,
+						   sk->sk_v6_daddr.s6_addr32);
 	}
 
 	if (tcp_fastopen_defer_connect(sk, &err))
@@ -749,7 +754,8 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.cookie_init_seq =	cookie_v6_init_sequence,
 #endif
 	.route_req	=	tcp_v6_route_req,
-	.init_seq_tsoff	=	tcp_v6_init_seq_and_tsoff,
+	.init_seq	=	tcp_v6_init_seq,
+	.init_ts_off	=	tcp_v6_init_ts_off,
 	.send_synack	=	tcp_v6_send_synack,
 };
 

^ permalink raw reply related

* Re: [PATCH net-next ] tcp: add new parameter tcp_inherit_buffsize to control the initial buffer size for new passive connetion
From: Eric Dumazet @ 2017-05-05 13:52 UTC (permalink / raw)
  To: 单卫; +Cc: netdev, David Miller
In-Reply-To: <CAPYxyxJBPO6W_wjYwXZQ5DS0sFjLh+A_GPSz1qn6qRZkqhMFQQ@mail.gmail.com>

On Fri, 2017-05-05 at 18:22 +0800, 单卫 wrote:
> 
> 
> 2017-05-02 23:05 GMT+08:00 Eric Dumazet <eric.dumazet@gmail.com>:
>         
>         
>         Hi Shan
>         
>         1) Your patch never reached netdev, because it was sent in
>         HTML format.
>         
>         2) During Linus merge window, net-next is closed
>         
>         I am not really convinced that we need this with TCP
>         autotuning anyway.
>         Initial value of sk_sndbuf and sk_rcvbuf is really a hint.
>         
>         How often do you really tweak /proc/sys/net/ipv4 files in
>         production.
> 
> 
> 
Again you posted your reply in HTML, so netdev did not get it.

> 
> Historically, most products have not adjusted any wmem/rmem
> parameters.
> In our experience to adjust the parameters, you can get a better visit
> experience, faster. 
> 
> Now more and more products want to change this default value.
> But for a product, once we adjust the parameters, it is rarely
> adjusted.
> 
> 
> We hope that this patch can provide a capacity: adjust the parameters
> online and quickly take effect.
> 
> Without this patch, it is time consuming for us to adjust the
> parameters and need to migrate flows.
> 
> 
> 
>          
>         Please provide more information, like what actual values you
>         change back
>         and forth.
>         
> 
> 
> There is no a fixed value here. big value, will take up more memory
> resources.
> But we are also willing to sacrifice memory for performance.
> 
> Select the parameter value based on the characteristics of the
> product.
> 
> 
> For example, for 1MB files, we will set buffsize to 1611646 = (1MB /
> 1460) * 2244
> Mss = 1460, SKB_TRUESIZE = 2244
> 
> 
> With TCP autotuning, reach the buffer size until the cwnd equal 359
> from 10.

Looks like a problem with autotuning because you might have HZ=100 or
HZ=250 maybe ?

We want to make autotuning better, so that tweaks like that are no
longer needed.



> 
> 

^ permalink raw reply

* [PATCH v4 4/4] can: m_can: add deep Suspend/Resume support
From: Quentin Schulz @ 2017-05-05 13:50 UTC (permalink / raw)
  To: wg, mkl, mario.huettel, socketcan
  Cc: Quentin Schulz, linux-can, netdev, linux-kernel,
	alexandre.belloni, thomas.petazzoni
In-Reply-To: <20170505135033.8349-1-quentin.schulz@free-electrons.com>

This adds Power Management deep Suspend/Resume support for Bosch M_CAN
chip.

When entering deep sleep, the clocks are gated, the interrupts are
disabled. When resuming from deep sleep, the chip needs to be
reinitialized, the clocks ungated and the interrupts enabled.

Signed-off-by: Quentin Schulz <quentin.schulz@free-electrons.com>
---

v4:
 - added a TODO comment for efficient runtime PM support,
 - used m_can_clk_start/stop functions added in v4 to have symmetric
 suspend and resume functions.

v3:
 - do not close/reopen the can interface (which was previously done when
 calling m_can_close), basically do the same routine as in probe but
 it does not close/open the can device,
 - update commit log,

v2:
 - fix erroneous commit log,

 drivers/net/can/m_can/m_can.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 653b304d7091..f4947a74b65f 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1668,6 +1668,8 @@ static int m_can_plat_probe(struct platform_device *pdev)
 	return ret;
 }
 
+/* TODO: runtime PM with power down or sleep mode  */
+
 static __maybe_unused int m_can_suspend(struct device *dev)
 {
 	struct net_device *ndev = dev_get_drvdata(dev);
@@ -1676,10 +1678,10 @@ static __maybe_unused int m_can_suspend(struct device *dev)
 	if (netif_running(ndev)) {
 		netif_stop_queue(ndev);
 		netif_device_detach(ndev);
+		m_can_stop(ndev);
+		m_can_clk_stop(priv);
 	}
 
-	/* TODO: enter low power */
-
 	priv->can.state = CAN_STATE_SLEEPING;
 
 	return 0;
@@ -1690,11 +1692,18 @@ static __maybe_unused int m_can_resume(struct device *dev)
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct m_can_priv *priv = netdev_priv(ndev);
 
-	/* TODO: exit low power */
+	m_can_init_ram(priv);
 
 	priv->can.state = CAN_STATE_ERROR_ACTIVE;
 
 	if (netif_running(ndev)) {
+		int ret;
+
+		ret = m_can_clk_start(priv);
+		if (ret)
+			return ret;
+
+		m_can_start(ndev);
 		netif_device_attach(ndev);
 		netif_start_queue(ndev);
 	}
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 3/4] can: m_can: factorize clock gating and ungating
From: Quentin Schulz @ 2017-05-05 13:50 UTC (permalink / raw)
  To: wg, mkl, mario.huettel, socketcan
  Cc: Quentin Schulz, linux-can, netdev, linux-kernel,
	alexandre.belloni, thomas.petazzoni
In-Reply-To: <20170505135033.8349-1-quentin.schulz@free-electrons.com>

This creates a function to ungate M_CAN clocks and another to gate the
same clocks, then swaps all gating/ungating code with their respective
function.

Signed-off-by: Quentin Schulz <quentin.schulz@free-electrons.com>
---

added in v4

 drivers/net/can/m_can/m_can.c | 45 +++++++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 6115dede671e..653b304d7091 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -621,10 +621,8 @@ static int __m_can_get_berr_counter(const struct net_device *dev,
 	return 0;
 }
 
-static int m_can_get_berr_counter(const struct net_device *dev,
-				  struct can_berr_counter *bec)
+static int m_can_clk_start(struct m_can_priv *priv)
 {
-	struct m_can_priv *priv = netdev_priv(dev);
 	int err;
 
 	err = clk_prepare_enable(priv->hclk);
@@ -632,15 +630,31 @@ static int m_can_get_berr_counter(const struct net_device *dev,
 		return err;
 
 	err = clk_prepare_enable(priv->cclk);
-	if (err) {
+	if (err)
 		clk_disable_unprepare(priv->hclk);
-		return err;
-	}
 
-	__m_can_get_berr_counter(dev, bec);
+	return err;
+}
 
+static void m_can_clk_stop(struct m_can_priv *priv)
+{
 	clk_disable_unprepare(priv->cclk);
 	clk_disable_unprepare(priv->hclk);
+}
+
+static int m_can_get_berr_counter(const struct net_device *dev,
+				  struct can_berr_counter *bec)
+{
+	struct m_can_priv *priv = netdev_priv(dev);
+	int err;
+
+	err = m_can_clk_start(priv);
+	if (err)
+		return err;
+
+	__m_can_get_berr_counter(dev, bec);
+
+	m_can_clk_stop(priv);
 
 	return 0;
 }
@@ -1276,19 +1290,15 @@ static int m_can_open(struct net_device *dev)
 	struct m_can_priv *priv = netdev_priv(dev);
 	int err;
 
-	err = clk_prepare_enable(priv->hclk);
+	err = m_can_clk_start(priv);
 	if (err)
 		return err;
 
-	err = clk_prepare_enable(priv->cclk);
-	if (err)
-		goto exit_disable_hclk;
-
 	/* open the can device */
 	err = open_candev(dev);
 	if (err) {
 		netdev_err(dev, "failed to open can device\n");
-		goto exit_disable_cclk;
+		goto exit_disable_clks;
 	}
 
 	/* register interrupt handler */
@@ -1310,10 +1320,8 @@ static int m_can_open(struct net_device *dev)
 
 exit_irq_fail:
 	close_candev(dev);
-exit_disable_cclk:
-	clk_disable_unprepare(priv->cclk);
-exit_disable_hclk:
-	clk_disable_unprepare(priv->hclk);
+exit_disable_clks:
+	m_can_clk_stop(priv);
 	return err;
 }
 
@@ -1335,8 +1343,7 @@ static int m_can_close(struct net_device *dev)
 	netif_stop_queue(dev);
 	napi_disable(&priv->napi);
 	m_can_stop(dev);
-	clk_disable_unprepare(priv->hclk);
-	clk_disable_unprepare(priv->cclk);
+	m_can_clk_stop(priv);
 	free_irq(dev->irq, dev);
 	close_candev(dev);
 	can_led_event(dev, CAN_LED_EVENT_STOP);
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 2/4] can: m_can: make m_can_start and m_can_stop symmetric
From: Quentin Schulz @ 2017-05-05 13:50 UTC (permalink / raw)
  To: wg, mkl, mario.huettel, socketcan
  Cc: Quentin Schulz, linux-can, netdev, linux-kernel,
	alexandre.belloni, thomas.petazzoni
In-Reply-To: <20170505135033.8349-1-quentin.schulz@free-electrons.com>

This moves clocks gating outside of the m_can_stop function as the
m_can_start function does not (and cannot, at least in current
implementation) ungate clocks. This way, both functions can now be used
symmetrically.

Signed-off-by: Quentin Schulz <quentin.schulz@free-electrons.com>
---

added in v4

 drivers/net/can/m_can/m_can.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 5da1bdb202a3..6115dede671e 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1324,9 +1324,6 @@ static void m_can_stop(struct net_device *dev)
 	/* disable all interrupts */
 	m_can_disable_all_interrupts(priv);
 
-	clk_disable_unprepare(priv->hclk);
-	clk_disable_unprepare(priv->cclk);
-
 	/* set the state as STOPPED */
 	priv->can.state = CAN_STATE_STOPPED;
 }
@@ -1338,6 +1335,8 @@ static int m_can_close(struct net_device *dev)
 	netif_stop_queue(dev);
 	napi_disable(&priv->napi);
 	m_can_stop(dev);
+	clk_disable_unprepare(priv->hclk);
+	clk_disable_unprepare(priv->cclk);
 	free_irq(dev->irq, dev);
 	close_candev(dev);
 	can_led_event(dev, CAN_LED_EVENT_STOP);
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 1/4] can: m_can: move Message RAM initialization to function
From: Quentin Schulz @ 2017-05-05 13:50 UTC (permalink / raw)
  To: wg, mkl, mario.huettel, socketcan
  Cc: Quentin Schulz, linux-can, netdev, linux-kernel,
	alexandre.belloni, thomas.petazzoni

To avoid possible ECC/parity checksum errors when reading an
uninitialized buffer, the entire Message RAM is initialized when probing
the driver. This initialization is done in the same function reading the
Device Tree properties.

This patch moves the RAM initialization to a separate function so it can
be called separately from device initialization from Device Tree.

Signed-off-by: Quentin Schulz <quentin.schulz@free-electrons.com>
---

v4:
  - remove unused variables from m_can_of_parse_mram,

 drivers/net/can/m_can/m_can.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index bf8fdaeb955e..5da1bdb202a3 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1489,11 +1489,23 @@ static int register_m_can_dev(struct net_device *dev)
 	return register_candev(dev);
 }
 
+static void m_can_init_ram(struct m_can_priv *priv)
+{
+	int end, i, start;
+
+	/* initialize the entire Message RAM in use to avoid possible
+	 * ECC/parity checksum errors when reading an uninitialized buffer
+	 */
+	start = priv->mcfg[MRAM_SIDF].off;
+	end = priv->mcfg[MRAM_TXB].off +
+		priv->mcfg[MRAM_TXB].num * TXB_ELEMENT_SIZE;
+	for (i = start; i < end; i += 4)
+		writel(0x0, priv->mram_base + i);
+}
+
 static void m_can_of_parse_mram(struct m_can_priv *priv,
 				const u32 *mram_config_vals)
 {
-	int i, start, end;
-
 	priv->mcfg[MRAM_SIDF].off = mram_config_vals[0];
 	priv->mcfg[MRAM_SIDF].num = mram_config_vals[1];
 	priv->mcfg[MRAM_XIDF].off = priv->mcfg[MRAM_SIDF].off +
@@ -1529,15 +1541,7 @@ static void m_can_of_parse_mram(struct m_can_priv *priv,
 		priv->mcfg[MRAM_TXE].off, priv->mcfg[MRAM_TXE].num,
 		priv->mcfg[MRAM_TXB].off, priv->mcfg[MRAM_TXB].num);
 
-	/* initialize the entire Message RAM in use to avoid possible
-	 * ECC/parity checksum errors when reading an uninitialized buffer
-	 */
-	start = priv->mcfg[MRAM_SIDF].off;
-	end = priv->mcfg[MRAM_TXB].off +
-		priv->mcfg[MRAM_TXB].num * TXB_ELEMENT_SIZE;
-	for (i = start; i < end; i += 4)
-		writel(0x0, priv->mram_base + i);
-
+	m_can_init_ram(priv);
 }
 
 static int m_can_plat_probe(struct platform_device *pdev)
-- 
2.11.0


^ permalink raw reply related

* Re: [PATCH v2 net] tcp: randomize timestamps on syncookies
From: Eric Dumazet @ 2017-05-05 13:46 UTC (permalink / raw)
  To: Florian Westphal; +Cc: David Miller, netdev, Yuchung Cheng
In-Reply-To: <20170505093630.GA3233@breakpoint.cc>

On Fri, 2017-05-05 at 11:36 +0200, Florian Westphal wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > From: Eric Dumazet <edumazet@google.com>
> > 
> > Whole point of randomization was to hide server uptime, but an attacker
> > can simply start a syn flood and TCP generates 'old style' timestamps,
> > directly revealing server jiffies value.
> > 
> > Also, TSval sent by the server to a particular remote address vary
> > depending on syncookies being sent or not, potentially triggering PAWS
> > drops for innocent clients.
> > 
> > Lets implement proper randomization, including for SYNcookies.
> 
> 
> Thanks a lot Eric, this works for me (I also tested ipv6 this time ;) )
> 
> Minor nit:
> net/ipv4/tcp_ipv4.c:154:6: warning: unused variable 'seq' [-Wunused-variable]
> 

Thanks Florian, I will remove this in v3, and add your tags.

> Other than this:
> Reviewed-by: Florian Westphal <fw@strlen.de>
> Tested-by: Florian Westphal <fw@strlen.de>

^ permalink raw reply

* Re: [RFC iproute2 0/8] RDMA tool
From: Leon Romanovsky @ 2017-05-05 13:17 UTC (permalink / raw)
  To: Jiri Benc
  Cc: Stephen Hemminger, Doug Ledford, Jiri Pirko, Ariel Almog,
	Dennis Dalessandro, Ram Amrani, Bart Van Assche, Sagi Grimberg,
	Jason Gunthorpe, Christoph Hellwig, Or Gerlitz, Linux RDMA,
	Linux Netdev
In-Reply-To: <20170505085457.0029edc9@griffin>

[-- Attachment #1: Type: text/plain, Size: 1131 bytes --]

On Fri, May 05, 2017 at 08:54:57AM +0200, Jiri Benc wrote:
> On Thu,  4 May 2017 21:02:08 +0300, Leon Romanovsky wrote:
> > In order to close object model, ensure reuse of existing code and make this
> > tool usable from day one, we decided to implement wrappers over legacy sysfs
> > prior to implementing netlink functionality. As a nice bonus, it will allow
> > to use this tool with old kernels too.
>
> This sounds wrong. We don't support legacy ioctl interface for the 'ip'
> command, either. I think rdma should be converted to netlink first and
> the new tool should only use netlink.

RDMA in slightly different situation than "ip" tool was. "ip" was implemented
when tools like ifconfig existed. It allowed to old and new systems to be
configured to some degree. In RDMA community, there are no similar tools like
"ifconfig". Implementation in netlink-only interface will leave old systems without
common tool at all.

As an upstream-oriented person, I personally fine with that, but anyway would
like to get wider agreement/disagreement on that, before removing sysfs
parsing logic from the rdmatool.

Thanks

>
>  Jiri

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* [PATCH/RFC iproute2/net-next v2 2/2] tc: flower: allow control of tree traversal on packet parse errors
From: Simon Horman @ 2017-05-05 12:51 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jiri Pirko, Jamal Hadi Salim, Cong Wang, Dinan Gunawardena,
	netdev, oss-drivers, Simon Horman
In-Reply-To: <1493988685-7891-1-git-send-email-simon.horman@netronome.com>

Allow control how the tree of qdisc, classes and filters is further
traversed if an error is encountered when parsing the packet in order to
match the cls_flower filters at a particular prio.

By default continue to the next filter, the behaviour without this patch.

A use-case for this is to allow configuration of dropping of packets with
truncated headers.

For example, the following drops IPv4 packets that cannot be parsed by the
flow dissector up to the end of the UDP ports - e.g. because they are
truncated, and instantiates a continue action based on the port for packets
that can be parsed.

 # tc qdisc del dev eth0 ingress; tc qdisc add dev eth0 ingress
 # tc filter add dev eth0 protocol ip parent ffff: flower \
       indev eth0 ip_proto udp dst_port 80 truncated drop action continue

Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
 man/man8/tc-flower.8 | 29 +++++++++++++++++++++++++++--
 tc/f_flower.c        | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index ba290657c224..23b450b193d8 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -7,6 +7,8 @@ flower \- flow based traffic control filter
 .ti -8
 .BR tc " " filter " ... " flower " [ "
 .IR MATCH_LIST " ] [ "
+.B truncated
+.IR CONTROL " ] [ "
 .B action
 .IR ACTION_SPEC " ] [ "
 .B classid
@@ -64,6 +66,28 @@ action from the generic action framework may be called.
 .BI action " ACTION_SPEC"
 Apply an action from the generic actions framework on matching packets.
 .TP
+.BI truncated " CONTROL"
+Control how the tree of qdisc, classes and filters is further traversed if
+an truncated header is encountered when parsing the packet in order to match
+against the \fIMATCH_LIST\fR.
+.RS
+.TP
+.B drop
+.TQ
+.B shot
+Drop the packet.
+.TP
+.B continue
+Continue classification with the next filter in line.
+.TP
+.B pass
+Finish classification process and return to calling qdisc for further packet
+processing. This is the default.
+.P
+All filters with the same prio must have the same truncated value - drop
+and shot are considered to be the same value.
+.RE
+.TP
 .BI classid " CLASSID"
 Specify a class to pass matching packets on to.
 .I CLASSID
@@ -219,8 +243,9 @@ and finally ICMP matches (\fBcode\fR and \fBtype\fR) depend on
 being set to
 .BR icmp " or " icmpv6.
 .P
-There can be only used one mask per one prio. If user needs to specify different
-mask, he has to use different prio.
+There can be only used one mask and truncated value per one prio.  If user
+needs to specify different mask or truncated value, he has to use different
+prio.
 .SH SEE ALSO
 .BR tc (8),
 .BR tc-flow (8)
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 5aac4a0837f4..7f8a386c1444 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -43,6 +43,7 @@ static void explain(void)
 	fprintf(stderr,
 		"Usage: ... flower [ MATCH-LIST ]\n"
 		"                  [ skip_sw | skip_hw ]\n"
+		"                  [ truncated CONTROL ]\n"
 		"                  [ action ACTION-SPEC ] [ classid CLASSID ]\n"
 		"\n"
 		"Where: MATCH-LIST := [ MATCH-LIST ] MATCH\n"
@@ -72,6 +73,7 @@ static void explain(void)
 		"       FILTERID := X:Y:Z\n"
 		"       MASKED_LLADDR := { LLADDR | LLADDR/MASK | LLADDR/BITS }\n"
 		"       ACTION-SPEC := ... look at individual actions\n"
+		"       CONTROL := ... drop | shot | continue | pass\n"
 		"\n"
 		"NOTE: CLASSID, IP-PROTO are parsed as hexadecimal input.\n"
 		"NOTE: There can be only used one mask per one prio. If user needs\n"
@@ -507,12 +509,14 @@ static int flower_parse_opt(struct filter_util *qu, char *handle,
 	struct tcmsg *t = NLMSG_DATA(n);
 	struct rtattr *tail;
 	__be16 eth_type = TC_H_MIN(t->tcm_info);
+	int err_action = TC_ACT_UNSPEC;
 	__be16 vlan_ethtype = 0;
 	__u8 ip_proto = 0xff;
 	__u32 flags = 0;
 	__u32 mtf = 0;
 	__u32 mtf_mask = 0;
 
+
 	if (handle) {
 		ret = get_u32(&t->tcm_handle, handle, 0);
 		if (ret) {
@@ -788,6 +792,23 @@ static int flower_parse_opt(struct filter_util *qu, char *handle,
 				return -1;
 			}
 			continue;
+		} else if (matches(*argv, "truncated") == 0) {
+			NEXT_ARG();
+
+			if (!argc || action_a2n(*argv, &err_action, false)) {
+				fprintf(stderr, "Illegal \"truncated\"\n");
+				return -1;
+			}
+
+			switch (err_action) {
+			case TC_ACT_UNSPEC:
+			case TC_ACT_OK:
+			case TC_ACT_SHOT:
+				break;
+			default:
+				fprintf(stderr, "Illegal \"truncated\"\n");
+				return -1;
+			}
 		} else if (strcmp(*argv, "help") == 0) {
 			explain();
 			return -1;
@@ -820,6 +841,12 @@ parse_done:
 			return ret;
 	}
 
+	ret = addattr32(n, MAX_MSG, TCA_FLOWER_META_TRUNCATED,
+			err_action);
+	if (ret)
+		return ret;
+
+
 	tail->rta_len = (((void *)n)+n->nlmsg_len) - (void *)tail;
 
 	return 0;
@@ -1173,6 +1200,12 @@ static int flower_print_opt(struct filter_util *qu, FILE *f,
 			fprintf(f, "\n  skip_sw");
 	}
 
+	if (tb[TCA_FLOWER_META_TRUNCATED]) {
+		int act = rta_getattr_u32(tb[TCA_FLOWER_META_TRUNCATED]);
+
+		fprintf(f, "\n  truncated %s", action_n2a(act));
+	}
+
 	if (tb[TCA_FLOWER_ACT])
 		tc_print_action(f, tb[TCA_FLOWER_ACT]);
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH/RFC iproute2/net-next v2 1/2] tc: flower: update headers for TCA_FLOWER_META_TRUNCATED
From: Simon Horman @ 2017-05-05 12:51 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jiri Pirko, Jamal Hadi Salim, Cong Wang, Dinan Gunawardena,
	netdev, oss-drivers, Simon Horman
In-Reply-To: <1493988685-7891-1-git-send-email-simon.horman@netronome.com>

This change is proposed for net-next.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
 include/linux/pkt_cls.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index f1129e383b2a..dfbd5137e275 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -437,6 +437,8 @@ enum {
 	TCA_FLOWER_KEY_MPLS_TC,		/* u8 - 3 bits */
 	TCA_FLOWER_KEY_MPLS_LABEL,	/* be32 - 20 bits */
 
+	TCA_FLOWER_META_TRUNCATED,	/* u32 */
+
 	__TCA_FLOWER_MAX,
 };
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH/RFC iproute2/net-next v2 0/2] tc: flower: allow control of tree traversal on packet parse errors
From: Simon Horman @ 2017-05-05 12:51 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jiri Pirko, Jamal Hadi Salim, Cong Wang, Dinan Gunawardena,
	netdev, oss-drivers, Simon Horman

Hi,

this series is intended to allow control how the tree of qdisc, classes and
filters is further traversed if an error is encountered when parsing the
packet in order to match the cls_flower filters at a particular prio.

Please see the changelog of the last patch of this series for a more
detailed description.

Changes between RFCv1 and RFCv2:
* Rename new attribute in last path TCA_FLOWER_META_TRUNCATED
* Drop patch to add TCA_FLOWER_KEY_MPLS*: it is in net-next now


Simon Horman (2):
  tc: flower: update headers for TCA_FLOWER_META_TRUNCATED
  tc: flower: allow control of tree traversal on packet parse errors

 include/linux/pkt_cls.h |  2 ++
 man/man8/tc-flower.8    | 29 +++++++++++++++++++++++++++--
 tc/f_flower.c           | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 2 deletions(-)

^ permalink raw reply

* [PATCH/RFC net-next v2 4/4] net/sched: cls_flower: allow control of tree traversal on packet parse errors
From: Simon Horman @ 2017-05-05 12:47 UTC (permalink / raw)
  To: Jiri Pirko, Jamal Hadi Salim, Cong Wang
  Cc: Dinan Gunawardena, netdev, oss-drivers, Benjamin LaHaise,
	Simon Horman
In-Reply-To: <1493988426-22854-1-git-send-email-simon.horman@netronome.com>

Allow control how the tree of qdisc, classes and filters is further
traversed if an error is encountered when parsing the packet in order to
match the cls_flower filters at a particular prio.

By default continue to the next filter, the behaviour without this patch.

A use-case for this is to allow configuration of dropping of packets with
truncated headers.

For example, the following drops IPv4 packets that cannot be parsed by the
flow dissector up to the end of the UDP ports - e.g. because they are
truncated, and instantiates a continue action based on the port for packets
that can be parsed.

 # tc qdisc del dev eth0 ingress; tc qdisc add dev eth0 ingress
 # tc filter add dev eth0 protocol ip parent ffff: flower \
       indev eth0 ip_proto udp dst_port 80 truncated drop action continue

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Benjamin LaHaise <benjamin.lahaise@netronome.com>
---
rfc2
* s/TCA_FLOWER_HEADER_PARSE_ERR_ACT/TCA_FLOWER_META_TRUNCATED/
  after discussion with Jamal
* Clean up whitespace
---
 include/uapi/linux/pkt_cls.h |  2 ++
 net/sched/cls_flower.c       | 45 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index d613be3b3239..6474d6e55110 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -450,6 +450,8 @@ enum {
 	TCA_FLOWER_KEY_MPLS_TC,		/* u8 - 3 bits */
 	TCA_FLOWER_KEY_MPLS_LABEL,	/* be32 - 20 bits */
 
+	TCA_FLOWER_META_TRUNCATED,
+
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 90bfd003176b..000d0e3f44f9 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -67,13 +67,14 @@ struct cls_fl_head {
 	struct fl_flow_mask mask;
 	struct flow_dissector dissector;
 	u32 hgen;
-	bool mask_assigned;
+	bool assigned;
 	struct list_head filters;
 	struct rhashtable_params ht_params;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
 	};
+	int err_action;
 };
 
 struct cls_fl_filter {
@@ -188,7 +189,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	 */
 	skb_key.basic.n_proto = skb->protocol;
 	if (!skb_flow_dissect(skb, &head->dissector, &skb_key, 0))
-		return -1;
+		return head->err_action;
 
 	fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
 
@@ -317,7 +318,7 @@ static void fl_destroy_sleepable(struct work_struct *work)
 {
 	struct cls_fl_head *head = container_of(work, struct cls_fl_head,
 						work);
-	if (head->mask_assigned)
+	if (head->assigned)
 		rhashtable_destroy(&head->ht);
 	kfree(head);
 	module_put(THIS_MODULE);
@@ -425,6 +426,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
 	[TCA_FLOWER_KEY_MPLS_BOS]	= { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_MPLS_TC]	= { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_MPLS_LABEL]	= { .type = NLA_U32 },
+	[TCA_FLOWER_META_TRUNCATED]	= { .type = NLA_U32 },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -791,13 +793,15 @@ static void fl_init_dissector(struct cls_fl_head *head,
 	skb_flow_dissector_init(&head->dissector, keys, cnt);
 }
 
-static int fl_check_assign_mask(struct cls_fl_head *head,
-				struct fl_flow_mask *mask)
+static int fl_check_assign_mask_and_err_action(struct cls_fl_head *head,
+					       struct fl_flow_mask *mask,
+					       int err_action)
 {
 	int err;
 
-	if (head->mask_assigned) {
-		if (!fl_mask_eq(&head->mask, mask))
+	if (head->assigned) {
+		if (!fl_mask_eq(&head->mask, mask) ||
+		    head->err_action != err_action)
 			return -EINVAL;
 		else
 			return 0;
@@ -810,7 +814,8 @@ static int fl_check_assign_mask(struct cls_fl_head *head,
 	if (err)
 		return err;
 	memcpy(&head->mask, mask, sizeof(head->mask));
-	head->mask_assigned = true;
+	head->assigned = true;
+	head->err_action = err_action;
 
 	fl_init_dissector(head, mask);
 
@@ -883,7 +888,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	struct cls_fl_filter *fnew;
 	struct nlattr **tb;
 	struct fl_flow_mask mask = {};
-	int err;
+	int err, err_action;
 
 	if (!tca[TCA_OPTIONS])
 		return -EINVAL;
@@ -930,11 +935,27 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		}
 	}
 
+	if (tb[TCA_FLOWER_META_TRUNCATED]) {
+		err_action = nla_get_u32(tb[TCA_FLOWER_META_TRUNCATED]);
+
+		switch (err_action) {
+		case TC_ACT_UNSPEC:
+		case TC_ACT_OK:
+		case TC_ACT_SHOT:
+			break;
+		default:
+			err = -EINVAL;
+			goto errout;
+		}
+	} else {
+		err_action = TC_ACT_UNSPEC;
+	}
+
 	err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
 	if (err)
 		goto errout;
 
-	err = fl_check_assign_mask(head, &mask);
+	err = fl_check_assign_mask_and_err_action(head, &mask, err_action);
 	if (err)
 		goto errout;
 
@@ -1321,6 +1342,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 	if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
 		goto nla_put_failure;
 
+	if (head->err_action != TC_ACT_UNSPEC &&
+	    nla_put_u32(skb, TCA_FLOWER_META_TRUNCATED, head->err_action))
+		goto nla_put_failure;
+
 	if (tcf_exts_dump(skb, &f->exts))
 		goto nla_put_failure;
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH/RFC net-next v2 3/4] net/sched: cls_flower: do not match if dissection fails
From: Simon Horman @ 2017-05-05 12:47 UTC (permalink / raw)
  To: Jiri Pirko, Jamal Hadi Salim, Cong Wang
  Cc: Dinan Gunawardena, netdev, oss-drivers, Benjamin LaHaise,
	Simon Horman
In-Reply-To: <1493988426-22854-1-git-send-email-simon.horman@netronome.com>

If the flow skb_flow_dissect() returns an error it indicates that
dissection was incomplete for some reason. Matching using the result of an
incomplete dissection may cause unexpected results. For example:

* A match on zero layer 4 ports will also match packets truncated at
  the end of the IP header; that is packets where ports are missing are
  treated the same way as packets with zero ports.
* Likewise, a match on zero ICMP code or type will also match packets
  truncated at the end of the IP header; that is packets where the ICMP
  type and code are missing will be treated the same way as packets with
  zero ICMP code and type.

Separate patches to the flow dissector are required in order for it to
return errors in the above cases.

Fixes: 77b9900ef53a ("tc: introduce Flower classifier")
Signed-off-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Benjamin LaHaise <benjamin.lahaise@netronome.com>
---
 net/sched/cls_flower.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index ca526c0881bd..90bfd003176b 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -187,7 +187,8 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	 * so do it rather here.
 	 */
 	skb_key.basic.n_proto = skb->protocol;
-	skb_flow_dissect(skb, &head->dissector, &skb_key, 0);
+	if (!skb_flow_dissect(skb, &head->dissector, &skb_key, 0))
+		return -1;
 
 	fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH/RFC net-next v2 2/4] flow dissector: return error on icmp dissection under-run
From: Simon Horman @ 2017-05-05 12:47 UTC (permalink / raw)
  To: Jiri Pirko, Jamal Hadi Salim, Cong Wang
  Cc: Dinan Gunawardena, netdev, oss-drivers, Benjamin LaHaise,
	Simon Horman
In-Reply-To: <1493988426-22854-1-git-send-email-simon.horman@netronome.com>

Return an error from __skb_flow_dissect() if insufficient packet data is
present when dissecting icmp type and code.

Without this patch the absence of the ICMP type and code in truncated
ICMPv4 or IPVPv6 packets is treated the way same as the presence of a code
and type of value of zero.  And without this patch the flower classifier is
unable to differentiate between these two cases which may lead to
unexpected matching of truncated packets.

With this patch the flow dissector and in turn the flower classifier can
differentiate between packets with zero ICMP type and code, and truncated
packets.

The approach taken here is to return an error if the IP protocol indicates
ICMP but the type and code data is not present in the packet - an error
return value from __skb_header_pointer().

This should only effect the flower classifier as it is the only user of
W_DISSECTOR_KEY_ICMP.  The behavioural update for flower only takes effect
with a separate patch to have it refuse to match if dissection fails.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Benjamin LaHaise <benjamin.lahaise@netronome.com>
---
rfc2
* Update changelog
---
 net/core/flow_dissector.c | 65 +++++++++++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 31 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index b3bf4886f71f..496afd7b3051 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -58,28 +58,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 EXPORT_SYMBOL(skb_flow_dissector_init);
 
 /**
- * skb_flow_get_be16 - extract be16 entity
- * @skb: sk_buff to extract from
- * @poff: offset to extract at
- * @data: raw buffer pointer to the packet
- * @hlen: packet header length
- *
- * The function will try to retrieve a be32 entity at
- * offset poff
- */
-static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
-				void *data, int hlen)
-{
-	__be16 *u, _u;
-
-	u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
-	if (u)
-		return *u;
-
-	return 0;
-}
-
-/**
  * __skb_flow_get_ports - extract the upper layer ports and return them
  * @skb: sk_buff to extract the ports from
  * @thoff: transport header offset
@@ -353,6 +331,29 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
 	return FLOW_DISSECT_RET_OUT_PROTO_AGAIN;
 }
 
+static enum flow_dissect_ret
+__skb_flow_dissect_icmp(const struct sk_buff *skb,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data, int nhoff, int hlen)
+{
+	struct flow_dissector_key_icmp *key_icmp;
+	__be16 *u, _u;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP))
+		return FLOW_DISSECT_RET_OUT_GOOD;
+
+	u = __skb_header_pointer(skb, nhoff, sizeof(_u), data, hlen, &_u);
+	if (!u)
+		return FLOW_DISSECT_RET_OUT_BAD;
+
+	key_icmp = skb_flow_dissector_target(flow_dissector,
+					     FLOW_DISSECTOR_KEY_ICMP,
+					     target_container);
+	key_icmp->icmp = *u;
+
+	return FLOW_DISSECT_RET_OUT_GOOD;
+}
+
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
@@ -379,7 +380,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	struct flow_dissector_key_basic *key_basic;
 	struct flow_dissector_key_addrs *key_addrs;
 	struct flow_dissector_key_ports *key_ports;
-	struct flow_dissector_key_icmp *key_icmp;
 	struct flow_dissector_key_tags *key_tags;
 	struct flow_dissector_key_vlan *key_vlan;
 	bool skip_vlan = false;
@@ -694,6 +694,17 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	case IPPROTO_MPLS:
 		proto = htons(ETH_P_MPLS_UC);
 		goto mpls;
+	case IPPROTO_ICMP:
+	case NEXTHDR_ICMP:
+		switch (__skb_flow_dissect_icmp(skb, flow_dissector,
+						target_container, data,
+						nhoff, hlen)) {
+		case FLOW_DISSECT_RET_OUT_GOOD:
+			goto out_good;
+		case FLOW_DISSECT_RET_OUT_BAD:
+		default:
+			goto out_bad;
+		}
 	default:
 		break;
 	}
@@ -708,14 +719,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			goto out_bad;
 	}
 
-	if (dissector_uses_key(flow_dissector,
-			       FLOW_DISSECTOR_KEY_ICMP)) {
-		key_icmp = skb_flow_dissector_target(flow_dissector,
-						     FLOW_DISSECTOR_KEY_ICMP,
-						     target_container);
-		key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
-	}
-
 out_good:
 	ret = true;
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH/RFC net-next v2 1/4] flow dissector: return error on port dissection under-run
From: Simon Horman @ 2017-05-05 12:47 UTC (permalink / raw)
  To: Jiri Pirko, Jamal Hadi Salim, Cong Wang
  Cc: Dinan Gunawardena, netdev, oss-drivers, Benjamin LaHaise,
	Simon Horman
In-Reply-To: <1493988426-22854-1-git-send-email-simon.horman@netronome.com>

Return an error from __skb_flow_dissect() if insufficient packet data is
present when dissecting layer 4 ports.

Without this patch the absence of ports in truncated - e.g. UDP - packets
is treated the same way by the flow dissector as the presence of ports with
a value of zero. And without this patch the flower classifier is unable to
differentiate between these two cases which may lead to unexpected matching
of truncated packets.

With this patch the flow dissector and in turn the flower classifier can
differentiate between packets with zero L4 ports and truncated packets.

The approach taken here is to only return an error if the offset of ports
for the previously dissected IP protocol is known - a non error return from
proto_ports_offset() - but port data is not present in the packet - an
error return value from __skb_header_pointer().

The behaviour for callers of __skb_flow_get_ports() is changed but the only
callers are skb_flow_get_ports() and the flow dissector.  The former has
been updated so that its behaviour is unchanged.  Behavioural change of the
latter is the intended purpose of this patch but will only take effect with
a separate patch to have it refuse to match if dissection fails.

This change will lead to behavioural changes of the users of the dissector
with FLOW_DISSECTOR_KEY_PORTS - flower, and users of
flow_keys_dissector_keys[] and flow_keys_dissector_symmetric_keys[].  The
behavioural change for *_keys[] changes seem reasonable as the change will
should only be for truncated packets.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Benjamin LaHaise <benjamin.lahaise@netronome.com>
---
rfc2
* Update changelog
---
 include/linux/skbuff.h    | 11 ++++++++---
 net/core/flow_dissector.c | 40 ++++++++++++++++++++++++++--------------
 2 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a098d95b3d84..eb1b06dd0c8f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1108,13 +1108,18 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen);
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen_proto);
+bool __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+			  void *data, int hlen_proto, __be32 *ports);
 
 static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
 					int thoff, u8 ip_proto)
 {
-	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
+	__be32 ports;
+
+	if (__skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0, &ports))
+		return ports;
+	else
+		return 0;
 }
 
 void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 28d94bce4df8..b3bf4886f71f 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -86,30 +86,41 @@ static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
  * @ip_proto: protocol for which to get port offset
  * @data: raw buffer pointer to the packet, if NULL use skb->data
  * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ * @ports: pointer to return ports in
  *
  * The function will try to retrieve the ports at offset thoff + poff where poff
- * is the protocol port offset returned from proto_ports_offset
+ * is the protocol port offset returned from proto_ports_offset.
+ *
+ * Returns false on error, true otherwise.
  */
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen)
+bool __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+			  void *data, int hlen, __be32 *ports)
 {
 	int poff = proto_ports_offset(ip_proto);
+	__be32 *p, _p;
+
+	/* proto_ports_offset returning an error indicates that ip_proto is
+	 * not known to have ports. This is not considered an error here.
+	 * Rather it is considered that the flow key of the caller may use
+	 * the default value of port fields: 0.
+	 */
+	if (poff < 0) {
+		*ports = 0;
+		return true;
+	}
 
 	if (!data) {
 		data = skb->data;
 		hlen = skb_headlen(skb);
 	}
 
-	if (poff >= 0) {
-		__be32 *ports, _ports;
+	p = __skb_header_pointer(skb, thoff + poff, sizeof(_p),
+				 data, hlen, &_p);
+	if (!p)
+		return false;
+	*ports = *p;
 
-		ports = __skb_header_pointer(skb, thoff + poff,
-					     sizeof(_ports), data, hlen, &_ports);
-		if (ports)
-			return *ports;
-	}
-
-	return 0;
+	return true;
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
@@ -692,8 +703,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 		key_ports = skb_flow_dissector_target(flow_dissector,
 						      FLOW_DISSECTOR_KEY_PORTS,
 						      target_container);
-		key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
-							data, hlen);
+		if (!__skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen,
+					  &key_ports->ports))
+			goto out_bad;
 	}
 
 	if (dissector_uses_key(flow_dissector,
-- 
2.1.4

^ permalink raw reply related

* [PATCH/RFC net-next v2 0/4] net/sched: cls_flower: avoid false matching of truncated packets
From: Simon Horman @ 2017-05-05 12:47 UTC (permalink / raw)
  To: Jiri Pirko, Jamal Hadi Salim, Cong Wang
  Cc: Dinan Gunawardena, netdev, oss-drivers, Benjamin LaHaise,
	Simon Horman

Hi,

this series is intended to avoid false-positives which match
truncated packets against flower classifiers which match on:
* zero L4 ports or;
* zero ICMP code or type

This requires updating the flow dissector to return an error in such cases
and updating flower to not match on the result of a failed dissection.

In the case of UDP this results in a behavioural change to users of
flow_keys_dissector_keys[] and flow_keys_dissector_symmetric_keys[] -
dissection will fail on truncated packets where the IP protocol of the
packets indicates ports should be present (according to skb_flow_get_ports()).

The last patch of the series builds on the above to allow users to specify
a policy for how to handle packets whose dissection fails.

I will separately provide RFC patches to iproute2 to allow exercising the
last patch.

Changes between RFCv1 and RFCv2
* Rename new attribute in last path TCA_FLOWER_META_TRUNCATED
  after discussion with Jamal.
* Update changelog for "flow dissector" patches to make it clearer what
  the before and after behaviours are.

Simon Horman (4):
  flow dissector: return error on port dissection under-run
  flow dissector: return error on icmp dissection under-run
  net/sched: cls_flower: do not match if dissection fails
  net/sched: cls_flower: allow control of tree traversal on packet parse
    errors

 include/linux/skbuff.h       |  11 +++--
 include/uapi/linux/pkt_cls.h |   2 +
 net/core/flow_dissector.c    | 105 ++++++++++++++++++++++++-------------------
 net/sched/cls_flower.c       |  46 ++++++++++++++-----
 4 files changed, 106 insertions(+), 58 deletions(-)

-- 
2.1.4

^ permalink raw reply

* Re: FEC on i.MX 7 transmit queue timeout
From: Andrew Lunn @ 2017-05-05 12:23 UTC (permalink / raw)
  To: Andy Duan
  Cc: Stefan Agner, festevam@gmail.com, netdev@vger.kernel.org,
	netdev-owner@vger.kernel.org
In-Reply-To: <46a27329-36df-1eaf-1321-24db037842fe@nxp.com>

> No, it is not workaround. As i said, quque1 and queue2 are for AVB paths 
> have higher priority in transmition.

Does this higher priority result in the low priority queue being
starved? Is that why the timer goes off? What happens when somebody
does use AVB. Are we back to the same problem? This is what seems to
make is sounds like a work around, not a fix.

     Andrew

^ permalink raw reply

* Re: net/key: slab-out-of-bounds in pfkey_compile_policy
From: Andrey Konovalov @ 2017-05-05 12:18 UTC (permalink / raw)
  To: Steffen Klassert
  Cc: Herbert Xu, David S. Miller, netdev, LKML, Dmitry Vyukov,
	Kostya Serebryany, Eric Dumazet, Cong Wang, syzkaller
In-Reply-To: <20170505091105.GA9813@secunet.com>

On Fri, May 5, 2017 at 11:11 AM, Steffen Klassert
<steffen.klassert@secunet.com> wrote:
> On Tue, May 02, 2017 at 06:45:03PM +0200, Andrey Konovalov wrote:
>> Hi,
>>
>> I've got the following error report while fuzzing the kernel with syzkaller.
>>
>> On commit d3b5d35290d729a2518af00feca867385a1b08fa (4.11).
>>
>> A reproducer and .config are attached.
>>
>> ==================================================================
>> BUG: KASAN: slab-out-of-bounds in pfkey_compile_policy+0x8e6/0xd40 at
>> addr ffff88006701f798
>> Read of size 1280 by task a.out/4181
>
>
> This bug was introduced twelve years ago...
>
> This patch is based just on code review, I don't have an option to
> function test this. But I see that we now exit with -EINVAL before the
> memcpy that causes the slab-out-of-bounds when using your reproducer,
> so it should at least fix the bug.

Hi Steffen,

This patch fixes the issue for me.

Thanks!

Tested-by: Andrey Konovalov <andreyknvl@google.com>

>
> Subject: [PATCH RFC] af_key: Fix slab-out-of-bounds in pfkey_compile_policy.
>
> The sadb_x_sec_len is stored in the unit 'byte divided by eight'.
> So we have to multiply this value by eight before we can do
> size checks. Otherwise we may get a slab-out-of-bounds when
> we memcpy the user sec_ctx.
>
> Fixes: df71837d502 ("[LSM-IPSec]: Security association restriction.")
> Reported-by: Andrey Konovalov <andreyknvl@google.com>
> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> ---
>  net/key/af_key.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/key/af_key.c b/net/key/af_key.c
> index c1950bb..512dc43 100644
> --- a/net/key/af_key.c
> +++ b/net/key/af_key.c
> @@ -3285,7 +3285,7 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt,
>                 p += pol->sadb_x_policy_len*8;
>                 sec_ctx = (struct sadb_x_sec_ctx *)p;
>                 if (len < pol->sadb_x_policy_len*8 +
> -                   sec_ctx->sadb_x_sec_len) {
> +                   sec_ctx->sadb_x_sec_len*8) {
>                         *dir = -EINVAL;
>                         goto out;
>                 }
> --
> 2.7.4
>
>

^ permalink raw reply

* Re: [PATCH v5 00/20] net-next: stmmac: add dwmac-sun8i ethernet driver
From: Corentin Labbe @ 2017-05-05 12:13 UTC (permalink / raw)
  To: peppe.cavallaro-qxv4g6HH51o, alexandre.torgue-qxv4g6HH51o
  Cc: devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	robh+dt-DgEjT+Ai2ygdnm+yROfE0A, mark.rutland-5wv7dgnIgG8,
	maxime.ripard-wi1+55ScJUtKEb57/3fJTNBPR1lH4CV8, wens-jdAy2FN1RRM,
	linux-I+IVW8TIWO2tmTQ+vhA3Yw, catalin.marinas-5wv7dgnIgG8,
	will.deacon-5wv7dgnIgG8
In-Reply-To: <20170501124520.3769-1-clabbe.montjoie-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On Mon, May 01, 2017 at 02:45:00PM +0200, Corentin Labbe wrote:
> Hello
> 
> This patch series add the driver for dwmac-sun8i which handle the Ethernet MAC
> present on Allwinner H3/H5/A83T/A64 SoCs.
> 
> This driver is the continuation of the sun8i-emac driver.
> During the development, it appeared that in fact the hardware was a modified
> version of some dwmac.
> So the driver is now written as a glue driver for stmmac.
> 
> It supports 10/100/1000 Mbit/s speed with half/full duplex.
> It can use an internal PHY (MII 10/100) or an external PHY
> via RGMII/RMII.
> 
> This patch series enable the driver only for the H3/A64/H5 SoC since A83T
> doesn't have the necessary clocks present in mainline.
> 
> The driver have been tested on the following boards:
> - H3 Orange PI PC, BananaPI-M2+
> - A64 Pine64, BananaPi-M64
> - A83T BananaPI-M3
> 
> The first two patchs are some mandatory changes for letting dwmac-sun8i be used.
> The following three patchs add the driver and its documentation.
> The remaining are DT patch enabling it.
> 
> Regards
> Corentin Labbe
> 

Hello Giuseppe

Since this serie have only one minor comment on documentation, could you give me your view about it ?

Regards
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v3 2/2] can: m_can: add deep Suspend/Resume support
From: Marc Kleine-Budde @ 2017-05-05 12:06 UTC (permalink / raw)
  To: Quentin Schulz, wg, mario.huettel, socketcan
  Cc: linux-can, netdev, linux-kernel, alexandre.belloni,
	thomas.petazzoni
In-Reply-To: <20170505114646.1278-2-quentin.schulz@free-electrons.com>


[-- Attachment #1.1: Type: text/plain, Size: 2548 bytes --]

On 05/05/2017 01:46 PM, Quentin Schulz wrote:
> This adds Power Management deep Suspend/Resume support for Bosch M_CAN
> chip.
> 
> When entering deep sleep, the clocks are gated, the interrupts are
> disabled. When resuming from deep sleep, the chip needs to be
> reinitialized, the clocks ungated and the interrupts enabled.
> 
> Signed-off-by: Quentin Schulz <quentin.schulz@free-electrons.com>
> ---
> 
> v3:
>  - do not close/reopen the can interface (which was previously done when
>  calling m_can_close), basically do the same routine as in probe but
>  it does not close/open the can device,

much better!

>  - update commit log,
> 
> v2:
>  - fix erroneous commit log,
> 
>  drivers/net/can/m_can/m_can.c | 17 ++++++++++++++---
>  1 file changed, 14 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
> index 3f0445440146..0a06690febe2 100644
> --- a/drivers/net/can/m_can/m_can.c
> +++ b/drivers/net/can/m_can/m_can.c
> @@ -1672,10 +1672,9 @@ static __maybe_unused int m_can_suspend(struct device *dev)
>  	if (netif_running(ndev)) {
>  		netif_stop_queue(ndev);
>  		netif_device_detach(ndev);
> +		m_can_stop(ndev);
>  	}
>  
> -	/* TODO: enter low power */
> -
>  	priv->can.state = CAN_STATE_SLEEPING;
>  
>  	return 0;
> @@ -1686,11 +1685,23 @@ static __maybe_unused int m_can_resume(struct device *dev)
>  	struct net_device *ndev = dev_get_drvdata(dev);
>  	struct m_can_priv *priv = netdev_priv(ndev);
>  
> -	/* TODO: exit low power */
> +	m_can_init_ram(priv);
>  
>  	priv->can.state = CAN_STATE_ERROR_ACTIVE;
>  
>  	if (netif_running(ndev)) {
> +		int ret = clk_prepare_enable(priv->hclk);
> +
> +		if (ret)
> +			return ret;
> +
> +		ret = clk_prepare_enable(priv->cclk);
> +		if (ret) {
> +			clk_disable_unprepare(priv->hclk);
> +			return ret;
> +		}
> +
> +		m_can_start(ndev);

Using m_can_stop() m_can_start() here is the way to go. However, when
looking at this hook we se, that the m_can_start/stop functions are not
symmetric (as they should be). Either move the clock handling to
completely in or out of m_can_start/stop.

>  		netif_device_attach(ndev);
>  		netif_start_queue(ndev);
>  	}
> 

Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH v3 1/2] can: m_can: move Message RAM initialization to function
From: Quentin Schulz @ 2017-05-05 11:46 UTC (permalink / raw)
  To: wg, mkl, mario.huettel, socketcan
  Cc: Quentin Schulz, linux-can, netdev, linux-kernel,
	alexandre.belloni, thomas.petazzoni

To avoid possible ECC/parity checksum errors when reading an
uninitialized buffer, the entire Message RAM is initialized when probing
the driver. This initialization is done in the same function reading the
Device Tree properties.

This patch moves the RAM initialization to a separate function so it can
be called separately from device initialization from Device Tree.

Signed-off-by: Quentin Schulz <quentin.schulz@free-electrons.com>
---
 drivers/net/can/m_can/m_can.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index bf8fdaeb955e..3f0445440146 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1489,6 +1489,20 @@ static int register_m_can_dev(struct net_device *dev)
 	return register_candev(dev);
 }
 
+static void m_can_init_ram(struct m_can_priv *priv)
+{
+	int end, i, start;
+
+	/* initialize the entire Message RAM in use to avoid possible
+	 * ECC/parity checksum errors when reading an uninitialized buffer
+	 */
+	start = priv->mcfg[MRAM_SIDF].off;
+	end = priv->mcfg[MRAM_TXB].off +
+		priv->mcfg[MRAM_TXB].num * TXB_ELEMENT_SIZE;
+	for (i = start; i < end; i += 4)
+		writel(0x0, priv->mram_base + i);
+}
+
 static void m_can_of_parse_mram(struct m_can_priv *priv,
 				const u32 *mram_config_vals)
 {
@@ -1529,15 +1543,7 @@ static void m_can_of_parse_mram(struct m_can_priv *priv,
 		priv->mcfg[MRAM_TXE].off, priv->mcfg[MRAM_TXE].num,
 		priv->mcfg[MRAM_TXB].off, priv->mcfg[MRAM_TXB].num);
 
-	/* initialize the entire Message RAM in use to avoid possible
-	 * ECC/parity checksum errors when reading an uninitialized buffer
-	 */
-	start = priv->mcfg[MRAM_SIDF].off;
-	end = priv->mcfg[MRAM_TXB].off +
-		priv->mcfg[MRAM_TXB].num * TXB_ELEMENT_SIZE;
-	for (i = start; i < end; i += 4)
-		writel(0x0, priv->mram_base + i);
-
+	m_can_init_ram(priv);
 }
 
 static int m_can_plat_probe(struct platform_device *pdev)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v3 2/2] can: m_can: add deep Suspend/Resume support
From: Quentin Schulz @ 2017-05-05 11:46 UTC (permalink / raw)
  To: wg, mkl, mario.huettel, socketcan
  Cc: Quentin Schulz, linux-can, netdev, linux-kernel,
	alexandre.belloni, thomas.petazzoni
In-Reply-To: <20170505114646.1278-1-quentin.schulz@free-electrons.com>

This adds Power Management deep Suspend/Resume support for Bosch M_CAN
chip.

When entering deep sleep, the clocks are gated, the interrupts are
disabled. When resuming from deep sleep, the chip needs to be
reinitialized, the clocks ungated and the interrupts enabled.

Signed-off-by: Quentin Schulz <quentin.schulz@free-electrons.com>
---

v3:
 - do not close/reopen the can interface (which was previously done when
 calling m_can_close), basically do the same routine as in probe but
 it does not close/open the can device,
 - update commit log,

v2:
 - fix erroneous commit log,

 drivers/net/can/m_can/m_can.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 3f0445440146..0a06690febe2 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1672,10 +1672,9 @@ static __maybe_unused int m_can_suspend(struct device *dev)
 	if (netif_running(ndev)) {
 		netif_stop_queue(ndev);
 		netif_device_detach(ndev);
+		m_can_stop(ndev);
 	}
 
-	/* TODO: enter low power */
-
 	priv->can.state = CAN_STATE_SLEEPING;
 
 	return 0;
@@ -1686,11 +1685,23 @@ static __maybe_unused int m_can_resume(struct device *dev)
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct m_can_priv *priv = netdev_priv(ndev);
 
-	/* TODO: exit low power */
+	m_can_init_ram(priv);
 
 	priv->can.state = CAN_STATE_ERROR_ACTIVE;
 
 	if (netif_running(ndev)) {
+		int ret = clk_prepare_enable(priv->hclk);
+
+		if (ret)
+			return ret;
+
+		ret = clk_prepare_enable(priv->cclk);
+		if (ret) {
+			clk_disable_unprepare(priv->hclk);
+			return ret;
+		}
+
+		m_can_start(ndev);
 		netif_device_attach(ndev);
 		netif_start_queue(ndev);
 	}
-- 
2.11.0


^ permalink raw reply related

* [PATCH] net: alx: handle pci_alloc_irq_vectors return correctly
From: Rakesh Pandit @ 2017-05-05 11:28 UTC (permalink / raw)
  To: Jay Cliburn, Chris Snook
  Cc: David S. Miller, Tobias Regnery, Feng Tang, Eric Dumazet, netdev,
	linux-kernel, Christoph Hellwig

It was introduced while switching to pci_alloc_irq_vectors recently
and fixes:

[   60.527052] alx 0000:03:00.0 enp3s0: Enabling MSI-X interrupts failed!
[   60.529323] BUG: unable to handle kernel NULL pointer dereference at 00000000000000b8
[   60.531589] IP: alx_alloc_napis+0xe6/0x1e0 [alx]
[   60.533831] PGD 0
[   60.533833] P4D 0

[   60.540559] Oops: 0002 [#1] SMP
[   60.542759] Modules linked in: ebtables ip6table_filter ip6_tables.....
[   60.549990]  drm_kms_helper drm crc32c_intel alx serio_raw mdio wmi video i2c_hid uas usb_storage
[   60.551404] CPU: 0 PID: 999 Comm: NetworkManager Not tainted 4.11.0+ #1
[   60.552813] Hardware name: Acer Predator G9-591/Mustang_SLS, BIOS V1.10 03/03/2016
[   60.554219] task: ffff8804ae833c00 task.stack: ffffc90003eec000
[   60.555383] RIP: 0010:alx_alloc_napis+0xe6/0x1e0 [alx]
[   60.556615] RSP: 0018:ffffc90003eef660 EFLAGS: 00010286
[   60.557787] RAX: ffff8804962835a0 RBX: ffff8804aee8a8c0 RCX: 0000000000000000
[   60.558987] RDX: 0000000000000060 RSI: 0000000000000000 RDI: ffff880496283600
[   60.559979] RBP: ffffc90003eef688 R08: ffff8804c1c1e7e0 R09: ffff8804962835a0
[   60.560978] R10: ffff8804962835a0 R11: 0000000000000102 R12: 0000000000000000
[   60.561974] R13: 0000000000000000 R14: ffff8804aee8aaf0 R15: ffffffffa0052ea0
[   60.562974] FS:  00007f1cecbc9940(0000) GS:ffff8804c1c00000(0000) knlGS:0000000000000000
[   60.564003] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   60.564884] CR2: 00000000000000b8 CR3: 0000000496025000 CR4: 00000000003406f0
[   60.565782] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   60.566676] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   60.567560] Call Trace:
[   60.568500]  __alx_open+0xa2/0x570 [alx]
[   60.569355]  ? notifier_call_chain+0x4a/0x70
[   60.570144]  alx_open+0x17/0x20 [alx]
[   60.570909]  __dev_open+0xc6/0x140
[   60.571682]  ? _raw_spin_unlock_bh+0x1a/0x20
[   60.572469]  __dev_change_flags+0x9d/0x160
[   60.573252]  dev_change_flags+0x29/0x60
[   60.574070]  do_setlink+0x32e/0xc90
[   60.574815]  ? ttwu_do_activate+0x77/0x80
[   60.575544]  ? drm_fb_helper_dirty.isra.17+0xc7/0xe0 [drm_kms_helper]
[   60.576273]  ? drm_fb_helper_cfb_imageblit+0x30/0x40 [drm_kms_helper]
[   60.577004]  ? bit_putcs+0x2f7/0x560
[   60.577729]  ? nla_parse+0x35/0x140
[   60.578518]  rtnl_newlink+0x7d3/0x900
[   60.579280]  ? security_capset+0x30/0x80
[   60.580029]  ? ns_capable_common+0x68/0x80
[   60.580747]  ? ns_capable+0x13/0x20
[   60.581453]  rtnetlink_rcv_msg+0xee/0x220
[   60.582198]  ? rtnl_newlink+0x900/0x900
[   60.582909]  netlink_rcv_skb+0xe7/0x120
[   60.583601]  rtnetlink_rcv+0x28/0x30
[   60.584303]  netlink_unicast+0x18c/0x220
[   60.585002]  netlink_sendmsg+0x2ba/0x3b0
[   60.585703]  sock_sendmsg+0x38/0x50
[   60.586436]  ___sys_sendmsg+0x2b6/0x2d0
[   60.587123]  ? lockref_put_or_lock+0x5e/0x80
[   60.587822]  ? dput+0x155/0x1d0
[   60.588518]  ? mntput+0x24/0x40
[   60.589215]  __sys_sendmsg+0x54/0x90
[   60.589907]  ? __sys_sendmsg+0x54/0x90
[   60.590627]  SyS_sendmsg+0x12/0x20
[   60.591333]  entry_SYSCALL_64_fastpath+0x1a/0xa5
[   60.592021] RIP: 0033:0x7f1ceb44e3b0
[   60.592697] RSP: 002b:00007fffd7f0a2d0 EFLAGS: 00000293 ORIG_RAX: 000000000000002e
[   60.593385] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f1ceb44e3b0
[   60.594107] RDX: 0000000000000000 RSI: 00007fffd7f0a380 RDI: 000000000000000c
[   60.594798] RBP: 00007fffd7f0a800 R08: 0000000000000000 R09: 0000000000000000
[   60.595502] R10: 0000564ffbae6e20 R11: 0000000000000293 R12: 0000000000000001
[   60.596200] R13: 0000000000000002 R14: 0000000000000010 R15: 00007fffd7f0a4d0
[   60.596899] Code: ed 85 c9 0f 8f ec 00 00 00 48 8b 3d 9d 97 1a e2 ba 50 00 00 00 be c0 80 40 01 4c 8b a3 30 02 00 00 e8 ff e5 1d e1 48 85 c0 74 a3 <49> 89 84 24 b8 00 00 00 48 8b 93 30 02 00 00 48 8b 4b 08 48 89
[   60.597642] RIP: alx_alloc_napis+0xe6/0x1e0 [alx] RSP: ffffc90003eef660
[   60.598427] CR2: 00000000000000b8

Fixes: f3297f68 ("net: alx: switch to pci_alloc_irq_vectors")
Signed-off-by: Rakesh Pandit <rakesh@tuxera.com>
---
 drivers/net/ethernet/atheros/alx/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
index a8c2db8..567ee54 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -838,7 +838,7 @@ static int alx_enable_msix(struct alx_priv *alx)
 
 	err = pci_alloc_irq_vectors(alx->hw.pdev, num_vec, num_vec,
 			PCI_IRQ_MSIX);
-	if (err) {
+	if (err < 0) {
 		netdev_warn(alx->dev, "Enabling MSI-X interrupts failed!\n");
 		return err;
 	}
@@ -904,7 +904,7 @@ static int alx_init_intr(struct alx_priv *alx)
 
 	ret = pci_alloc_irq_vectors(alx->hw.pdev, 1, 1,
 			PCI_IRQ_MSI | PCI_IRQ_LEGACY);
-	if (ret)
+	if (ret < 0)
 		return ret;
 
 	alx->num_vec = 1;
-- 
2.9.3

^ permalink raw reply related

* RE: Question about SOCK_SEQPACKET
From: David Laight @ 2017-05-05 11:21 UTC (permalink / raw)
  To: 'Steven Whitehouse', Sowmini Varadhan
  Cc: Sam Kumar, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <e413b04e-b441-604c-f605-c5a28389d064@redhat.com>

From: Steven Whitehouse
> Sent: 05 May 2017 11:34
...
> Just before the part that you've quoted, the description for
> SOCK_SEQPACKET says:
> "The SOCK_SEQPACKET socket type is similar to the SOCK_STREAM type, and
> is also connection-oriented. The only difference between these types is
> that record boundaries are maintained using the SOCK_SEQPACKET type. A
> record can be sent using one or more output operations and received
> using one or more input operations, but a single operation never
> transfers parts of more than one record."

Right SOCK_SEQPACKET is for protocols like ISO transport.
There is no limit on the length of a 'record'.
I've written file transfer programs that put the entire file
data into a single 'record'. The receiver disconnected on
receipt of the 'end of record'.

The socket man pages could easily be wrong - they are very IP-centric.
Remember ISO transport use declined when Unix became more popular
back in the mid 1980s. Unix sockets have never really been used for
it - the address information needed just doesn't match that IP
(especially IPv4).

	David

^ permalink raw reply

* Re: Question about SOCK_SEQPACKET
From: Steven Whitehouse @ 2017-05-05 10:34 UTC (permalink / raw)
  To: Sowmini Varadhan; +Cc: Sam Kumar, linux-kernel, netdev@vger.kernel.org
In-Reply-To: <20170505100926.GD1547@oracle.com>

Hi,


On 05/05/17 11:09, Sowmini Varadhan wrote:
> On (05/05/17 10:45), Steven Whitehouse wrote:
>> I do wonder if the man page for recvmsg is wrong, or at least a bit
>> confusing. SOCK_SEQPACKET is stream based not message based - it just
>> happens to have EOR markers in the stream. There is no reason that the whole
>> message needs to be returned in a single read, and in fact that would be
>> impossible if the sender didn't insert any EOR markers but kept sending data
>> beyond the size that the socket could buffer.
>>
>> I notice that man 7 socket says SOCK_SEQPACKET is for datagrams of fixed
>> maximum length which is definitely wrong, as is the statement that a
>> consumer has to read an entire packet with each system call.
> Which man page do you think is wrong here? The POSIX definition is here
>
> http://pubs.opengroup.org/onlinepubs/009695399/functions/recvmsg.html
>
> The description in
>
> http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_10.html
>
> says, "It is protocol-specific whether a maximum record size is imposed."
> In my machine (Ubuntu 4.4.0-72, and it is in socket(2), not socket(7), btw)
> doesnt have any references to max length, but I'm not sure I'd boldly assert
> "definitely wrong" about the requirement of having to read entire
> packet in a system call (see POSIX man page)
>
> --Sowmini
>
Just before the part that you've quoted, the description for 
SOCK_SEQPACKET says:
"The SOCK_SEQPACKET socket type is similar to the SOCK_STREAM type, and 
is also connection-oriented. The only difference between these types is 
that record boundaries are maintained using the SOCK_SEQPACKET type. A 
record can be sent using one or more output operations and received 
using one or more input operations, but a single operation never 
transfers parts of more than one record."

The man page for socket says SOCK_SEQPACKET "Provides  a sequenced,  
reliable,  two-way connection-based data transmission path  for  
datagrams  of  fixed maximum  length" which is not true, because while 
there may be a length restriction, it is quite possible that there is 
not a length restriction (as per DECnet). It also says "a  consumer  is  
required  to read an entire packet with each input system call" which is 
also contradicted by POSIX which says that a record can be "received 
using one or more input operations". So both statements in the man page 
are wrong, I think.

I have to say that I'd not spotted the POSIX recvmsg wording before, 
which says "For message-based sockets, such as SOCK_DGRAM and 
SOCK_SEQPACKET, the entire message shall be read in a single operation" 
however that does contradict the earlier wording, where it explicitly 
says that multiple receive operations per record are ok for 
SOCK_SEQPACKET - at least if we assume that record == message in this 
case. Also, if this restriction was true (one message per recvmsg call) 
then MSG_EOR would never be needed on receive, since every recvmsg would 
be a single message/record only, and that same document does say that 
MSG_EOR can be set on receive for protocols which support it,

Steve.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox