Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next,RFC 10/13] netfilter: nf_flow_table: add flowtable for early ingress hook
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert
In-Reply-To: <20180614141947.3580-1-pablo@netfilter.org>

Add the new flowtable type for the early ingress hook. This allows us
to combine the custom GRO chaining with the flowtable abstraction to
define fastpaths.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/netfilter/nf_flow_table.h   |  3 ++
 net/ipv4/netfilter/nf_flow_table_ipv4.c | 11 ++++++
 net/netfilter/nf_flow_table_ip.c        | 62 +++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 4606bad41155..e270269dd1e8 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -126,6 +126,9 @@ unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 				     const struct nf_hook_state *state);
 unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 				       const struct nf_hook_state *state);
+unsigned int nf_flow_offload_early_ingress_ip_hook(void *priv,
+						   struct sk_buff *skb,
+						   const struct nf_hook_state *state);
 
 #define MODULE_ALIAS_NF_FLOWTABLE(family)	\
 	MODULE_ALIAS("nf-flowtable-" __stringify(family))
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 681c0d5c47d7..b771000ca894 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -14,15 +14,26 @@ static struct nf_flowtable_type flowtable_ipv4 = {
 	.owner		= THIS_MODULE,
 };
 
+static struct nf_flowtable_type flowtable_ipv4_early = {
+	.family		= NFPROTO_IPV4,
+	.hooknum	= NF_NETDEV_EARLY_INGRESS,
+	.init		= nf_flow_table_init,
+	.free		= nf_flow_table_free,
+	.hook		= nf_flow_offload_early_ingress_ip_hook,
+	.owner		= THIS_MODULE,
+};
+
 static int __init nf_flow_ipv4_module_init(void)
 {
 	nft_register_flowtable_type(&flowtable_ipv4);
+	nft_register_flowtable_type(&flowtable_ipv4_early);
 
 	return 0;
 }
 
 static void __exit nf_flow_ipv4_module_exit(void)
 {
+	nft_unregister_flowtable_type(&flowtable_ipv4_early);
 	nft_unregister_flowtable_type(&flowtable_ipv4);
 }
 
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 15ed91309992..0828e49bd95e 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -11,6 +11,7 @@
 #include <net/ip6_route.h>
 #include <net/neighbour.h>
 #include <net/netfilter/nf_flow_table.h>
+#include <net/xfrm.h>
 /* For layer 4 checksum field offset. */
 #include <linux/tcp.h>
 #include <linux/udp.h>
@@ -487,3 +488,64 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	return NF_STOLEN;
 }
 EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
+
+unsigned int
+nf_flow_offload_early_ingress_ip_hook(void *priv, struct sk_buff *skb,
+				      const struct nf_hook_state *state)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct nf_flowtable *flow_table = priv;
+	struct flow_offload_tuple tuple = {};
+	enum flow_offload_tuple_dir dir;
+	struct flow_offload *flow;
+	struct net_device *outdev;
+	const struct rtable *rt;
+	unsigned int thoff;
+	struct iphdr *iph;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return NF_ACCEPT;
+
+	if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+		return NF_ACCEPT;
+
+	tuplehash = flow_offload_lookup(flow_table, &tuple);
+	if (tuplehash == NULL)
+		return NF_ACCEPT;
+
+	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+	if (!outdev)
+		return NF_ACCEPT;
+
+	dir = tuplehash->tuple.dir;
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
+	    (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
+		return NF_ACCEPT;
+
+	if (skb_try_make_writable(skb, sizeof(*iph)))
+		return NF_DROP;
+
+	thoff = ip_hdr(skb)->ihl * 4;
+	if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
+		return NF_ACCEPT;
+
+	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+	    nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
+		return NF_DROP;
+
+	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+
+	skb_dst_set_noref(skb, flow->tuplehash[dir].tuple.dst_cache);
+
+	if (skb_dst(skb)->xfrm &&
+	    !xfrm_dev_offload_ok(skb, skb_dst(skb)->xfrm))
+		return NF_ACCEPT;
+
+	NAPI_GRO_CB(skb)->is_ffwd = 1;
+
+	return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_early_ingress_ip_hook);
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next,RFC 09/13] netfilter: nf_flow_table: add hooknum to flowtable type
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert
In-Reply-To: <20180614141947.3580-1-pablo@netfilter.org>

This allows us to register different flowtable variants depending on the
hook type, so that we can define flowtables for new hook types.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/netfilter/nf_flow_table.h   |   1 +
 net/ipv4/netfilter/nf_flow_table_ipv4.c |   1 +
 net/ipv6/netfilter/nf_flow_table_ipv6.c |   1 +
 net/netfilter/nf_flow_table_inet.c      |   1 +
 net/netfilter/nf_tables_api.c           | 120 +++++++++++++++++---------------
 5 files changed, 67 insertions(+), 57 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index ba9fa4592f2b..4606bad41155 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -14,6 +14,7 @@ struct nf_flowtable;
 struct nf_flowtable_type {
 	struct list_head		list;
 	int				family;
+	unsigned int			hooknum;
 	int				(*init)(struct nf_flowtable *ft);
 	void				(*free)(struct nf_flowtable *ft);
 	nf_hookfn			*hook;
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index e1e56d7123d2..681c0d5c47d7 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -7,6 +7,7 @@
 
 static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
+	.hooknum	= NF_NETDEV_INGRESS,
 	.init		= nf_flow_table_init,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ip_hook,
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index c511d206bf9b..f1f976bdc151 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -8,6 +8,7 @@
 
 static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
+	.hooknum	= NF_NETDEV_INGRESS,
 	.init		= nf_flow_table_init,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ipv6_hook,
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 99771aa7e7ea..347a640d9723 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -22,6 +22,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_flowtable_type flowtable_inet = {
 	.family		= NFPROTO_INET,
+	.hooknum	= NF_NETDEV_INGRESS,
 	.init		= nf_flow_table_init,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_inet_hook,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index ca4c4d994ddb..5d6c3b9eee6b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5266,6 +5266,40 @@ static int nf_tables_parse_devices(const struct nft_ctx *ctx,
 	return err;
 }
 
+static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family,
+								int hooknum)
+{
+	const struct nf_flowtable_type *type;
+
+	list_for_each_entry(type, &nf_tables_flowtables, list) {
+		if (family == type->family &&
+		    hooknum == type->hooknum)
+			return type;
+	}
+	return NULL;
+}
+
+static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family,
+							      int hooknum)
+{
+	const struct nf_flowtable_type *type;
+
+	type = __nft_flowtable_type_get(family, hooknum);
+	if (type != NULL && try_module_get(type->owner))
+		return type;
+
+#ifdef CONFIG_MODULES
+	if (type == NULL) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nf-flowtable-%u", family);
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		if (__nft_flowtable_type_get(family, hooknum))
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	return ERR_PTR(-ENOENT);
+}
+
 static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = {
 	[NFTA_FLOWTABLE_HOOK_NUM]	= { .type = NLA_U32 },
 	[NFTA_FLOWTABLE_HOOK_PRIORITY]	= { .type = NLA_U32 },
@@ -5278,6 +5312,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 {
 	struct net_device *dev_array[NFT_FLOWTABLE_DEVICE_MAX];
 	struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
+	const struct nf_flowtable_type *type;
 	struct nf_hook_ops *ops;
 	int hooknum, priority;
 	int err, n = 0, i;
@@ -5293,19 +5328,31 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 		return -EINVAL;
 
 	hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
-	if (hooknum != NF_NETDEV_INGRESS)
+	if (hooknum != NF_NETDEV_INGRESS &&
+	    hooknum != NF_NETDEV_EARLY_INGRESS)
 		return -EINVAL;
 
+	type = nft_flowtable_type_get(ctx->family, hooknum);
+	if (IS_ERR(type))
+		return PTR_ERR(type);
+
+	flowtable->data.type = type;
+	err = type->init(&flowtable->data);
+	if (err < 0)
+		goto err1;
+
 	priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
 
 	err = nf_tables_parse_devices(ctx, tb[NFTA_FLOWTABLE_HOOK_DEVS],
 				      dev_array, &n);
 	if (err < 0)
-		return err;
+		goto err2;
 
 	ops = kzalloc(sizeof(struct nf_hook_ops) * n, GFP_KERNEL);
-	if (!ops)
-		return -ENOMEM;
+	if (!ops) {
+		err = -ENOMEM;
+		goto err2;
+	}
 
 	flowtable->hooknum	= hooknum;
 	flowtable->priority	= priority;
@@ -5323,38 +5370,13 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 							  GFP_KERNEL);
 	}
 
-	return err;
-}
-
-static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
-{
-	const struct nf_flowtable_type *type;
-
-	list_for_each_entry(type, &nf_tables_flowtables, list) {
-		if (family == type->family)
-			return type;
-	}
-	return NULL;
-}
-
-static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
-{
-	const struct nf_flowtable_type *type;
-
-	type = __nft_flowtable_type_get(family);
-	if (type != NULL && try_module_get(type->owner))
-		return type;
+	return 0;
+err2:
+	flowtable->data.type->free(&flowtable->data);
+err1:
+	module_put(type->owner);
 
-#ifdef CONFIG_MODULES
-	if (type == NULL) {
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nf-flowtable-%u", family);
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
-		if (__nft_flowtable_type_get(family))
-			return ERR_PTR(-EAGAIN);
-	}
-#endif
-	return ERR_PTR(-ENOENT);
+	return err;
 }
 
 static void nft_unregister_flowtable_net_hooks(struct net *net,
@@ -5377,7 +5399,6 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 				  struct netlink_ext_ack *extack)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
-	const struct nf_flowtable_type *type;
 	struct nft_flowtable *flowtable, *ft;
 	u8 genmask = nft_genmask_next(net);
 	int family = nfmsg->nfgen_family;
@@ -5429,21 +5450,10 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 		goto err1;
 	}
 
-	type = nft_flowtable_type_get(family);
-	if (IS_ERR(type)) {
-		err = PTR_ERR(type);
-		goto err2;
-	}
-
-	flowtable->data.type = type;
-	err = type->init(&flowtable->data);
-	if (err < 0)
-		goto err3;
-
 	err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
 					     flowtable);
 	if (err < 0)
-		goto err4;
+		goto err2;
 
 	for (i = 0; i < flowtable->ops_len; i++) {
 		if (!flowtable->ops[i].dev)
@@ -5457,37 +5467,33 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 				if (flowtable->ops[i].dev == ft->ops[k].dev &&
 				    flowtable->ops[i].pf == ft->ops[k].pf) {
 					err = -EBUSY;
-					goto err5;
+					goto err3;
 				}
 			}
 		}
 
 		err = nf_register_net_hook(net, &flowtable->ops[i]);
 		if (err < 0)
-			goto err5;
+			goto err3;
 	}
 
 	err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
 	if (err < 0)
-		goto err6;
+		goto err4;
 
 	list_add_tail_rcu(&flowtable->list, &table->flowtables);
 	table->use++;
 
 	return 0;
-err6:
+err4:
 	i = flowtable->ops_len;
-err5:
+err3:
 	for (k = i - 1; k >= 0; k--) {
 		kfree(flowtable->dev_name[k]);
 		nf_unregister_net_hook(net, &flowtable->ops[k]);
 	}
 
 	kfree(flowtable->ops);
-err4:
-	flowtable->data.type->free(&flowtable->data);
-err3:
-	module_put(type->owner);
 err2:
 	kfree(flowtable->name);
 err1:
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next,RFC 08/13] netfilter: nft_chain_filter: add support for early ingress
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert
In-Reply-To: <20180614141947.3580-1-pablo@netfilter.org>

This patch adds the new filter chain at the early ingress hook.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/netfilter/nft_chain_filter.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index 84c902477a91..bc7fb2dc0e44 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -277,9 +277,11 @@ static const struct nft_chain_type nft_chain_filter_netdev = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
 	.family		= NFPROTO_NETDEV,
-	.hook_mask	= (1 << NF_NETDEV_INGRESS),
+	.hook_mask	= (1 << NF_NETDEV_INGRESS) |
+			  (1 << NF_NETDEV_EARLY_INGRESS),
 	.hooks		= {
-		[NF_NETDEV_INGRESS]	= nft_do_chain_netdev,
+		[NF_NETDEV_INGRESS]		= nft_do_chain_netdev,
+		[NF_NETDEV_EARLY_INGRESS]	= nft_do_chain_netdev,
 	},
 };
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next,RFC 07/13] netfilter: add ESP support for early ingress
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert
In-Reply-To: <20180614141947.3580-1-pablo@netfilter.org>

From: Steffen Klassert <steffen.klassert@secunet.com>

This patch adds the GSO logic for ESP and the codepath that allows
the xfrm infrastructure to signal the GRO layer that the packet is
following the fast forwarding path.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/early_ingress.h |  2 ++
 net/ipv4/netfilter/early_ingress.c    |  8 ++++++++
 net/ipv6/netfilter/early_ingress.c    |  8 ++++++++
 net/netfilter/early_ingress.c         | 36 +++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_output.c                |  4 ++++
 5 files changed, 58 insertions(+)

diff --git a/include/net/netfilter/early_ingress.h b/include/net/netfilter/early_ingress.h
index 9ba8e2875345..6653b294f25a 100644
--- a/include/net/netfilter/early_ingress.h
+++ b/include/net/netfilter/early_ingress.h
@@ -8,6 +8,8 @@ struct sk_buff **nft_udp_gro_receive(struct sk_buff **head,
 				     struct sk_buff *skb);
 struct sk_buff **nft_tcp_gro_receive(struct sk_buff **head,
 				     struct sk_buff *skb);
+struct sk_buff *nft_esp_gso_segment(struct sk_buff *skb,
+				    netdev_features_t features);
 
 int nf_hook_early_ingress(struct sk_buff *skb);
 
diff --git a/net/ipv4/netfilter/early_ingress.c b/net/ipv4/netfilter/early_ingress.c
index 6ff6e34e5eff..74f3a7f1273d 100644
--- a/net/ipv4/netfilter/early_ingress.c
+++ b/net/ipv4/netfilter/early_ingress.c
@@ -5,6 +5,7 @@
 #include <net/arp.h>
 #include <net/udp.h>
 #include <net/tcp.h>
+#include <net/esp.h>
 #include <net/protocol.h>
 #include <net/netfilter/early_ingress.h>
 
@@ -303,9 +304,16 @@ static const struct net_offload nft_tcp4_offload = {
 	},
 };
 
+static const struct net_offload nft_esp4_offload = {
+	.callbacks = {
+		.gso_segment = nft_esp_gso_segment,
+	},
+};
+
 static const struct net_offload __rcu *nft_ip_offloads[MAX_INET_PROTOS] __read_mostly = {
 	[IPPROTO_UDP]	= &nft_udp4_offload,
 	[IPPROTO_TCP]	= &nft_tcp4_offload,
+	[IPPROTO_ESP]	= &nft_esp4_offload,
 };
 
 void nf_early_ingress_ip_enable(void)
diff --git a/net/ipv6/netfilter/early_ingress.c b/net/ipv6/netfilter/early_ingress.c
index 026d2814530a..fb00b083593b 100644
--- a/net/ipv6/netfilter/early_ingress.c
+++ b/net/ipv6/netfilter/early_ingress.c
@@ -5,6 +5,7 @@
 #include <net/arp.h>
 #include <net/udp.h>
 #include <net/tcp.h>
+#include <net/esp.h>
 #include <net/protocol.h>
 #include <net/netfilter/early_ingress.h>
 #include <net/ip6_route.h>
@@ -291,9 +292,16 @@ static const struct net_offload nft_tcp6_offload = {
 	},
 };
 
+static const struct net_offload nft_esp6_offload = {
+	.callbacks = {
+		.gso_segment = nft_esp_gso_segment,
+	},
+};
+
 static const struct net_offload __rcu *nft_ip6_offloads[MAX_INET_PROTOS] __read_mostly = {
 	[IPPROTO_UDP]	= &nft_udp6_offload,
 	[IPPROTO_TCP]	= &nft_tcp6_offload,
+	[IPPROTO_ESP]	= &nft_esp6_offload,
 };
 
 void nf_early_ingress_ip6_enable(void)
diff --git a/net/netfilter/early_ingress.c b/net/netfilter/early_ingress.c
index 4daf6cfea304..10d718bbe495 100644
--- a/net/netfilter/early_ingress.c
+++ b/net/netfilter/early_ingress.c
@@ -5,6 +5,7 @@
 #include <net/arp.h>
 #include <net/udp.h>
 #include <net/tcp.h>
+#include <net/esp.h>
 #include <net/protocol.h>
 #include <crypto/aead.h>
 #include <net/netfilter/early_ingress.h>
@@ -274,6 +275,41 @@ struct sk_buff **nft_tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	return pp;
 }
 
+struct sk_buff *nft_esp_gso_segment(struct sk_buff *skb,
+				    netdev_features_t features)
+{
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	netdev_features_t esp_features = features;
+	struct crypto_aead *aead;
+	struct ip_esp_hdr *esph;
+	struct xfrm_state *x;
+
+	if (!xo)
+		return ERR_PTR(-EINVAL);
+
+	x = skb->sp->xvec[skb->sp->len - 1];
+	aead = x->data;
+	esph = ip_esp_hdr(skb);
+
+	if (esph->spi != x->id.spi)
+		return ERR_PTR(-EINVAL);
+
+	if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+		return ERR_PTR(-EINVAL);
+
+	__skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead));
+
+	skb->encap_hdr_csum = 1;
+
+	if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
+	    (x->xso.dev != skb->dev))
+		esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
+
+	xo->flags |= XFRM_GSO_SEGMENT;
+
+	return x->outer_mode->gso_segment(x, skb, esp_features);
+}
+
 static inline bool nf_hook_early_ingress_active(const struct sk_buff *skb)
 {
 #ifdef HAVE_JUMP_LABEL
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 89b178a78dc7..c63b157f46ce 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -146,6 +146,10 @@ int xfrm_output_resume(struct sk_buff *skb, int err)
 	while (likely((err = xfrm_output_one(skb, err)) == 0)) {
 		nf_reset(skb);
 
+		if (!skb_dst(skb)->xfrm && skb->sp &&
+		    (skb_shinfo(skb)->gso_type & SKB_GSO_NFT))
+			return -EREMOTE;
+
 		err = skb_dst(skb)->ops->local_out(net, skb->sk, skb);
 		if (unlikely(err != 1))
 			goto out;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next,RFC 06/13] netfilter: add early ingress support for IPv6
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert
In-Reply-To: <20180614141947.3580-1-pablo@netfilter.org>

From: Steffen Klassert <steffen.klassert@secunet.com>

This patch adds the custom GSO and GRO logic for the IPv6 early ingress
hook. Layer 4 supports UDP and TCP at this stage.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/early_ingress.h |   2 +
 net/ipv6/netfilter/Makefile           |   1 +
 net/ipv6/netfilter/early_ingress.c    | 307 ++++++++++++++++++++++++++++++++++
 net/netfilter/early_ingress.c         |   2 +
 4 files changed, 312 insertions(+)
 create mode 100644 net/ipv6/netfilter/early_ingress.c

diff --git a/include/net/netfilter/early_ingress.h b/include/net/netfilter/early_ingress.h
index caaef9fe619f..9ba8e2875345 100644
--- a/include/net/netfilter/early_ingress.h
+++ b/include/net/netfilter/early_ingress.h
@@ -13,6 +13,8 @@ int nf_hook_early_ingress(struct sk_buff *skb);
 
 void nf_early_ingress_ip_enable(void);
 void nf_early_ingress_ip_disable(void);
+void nf_early_ingress_ip6_enable(void);
+void nf_early_ingress_ip6_disable(void);
 
 void nf_early_ingress_enable(void);
 void nf_early_ingress_disable(void);
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 10a5a1c87320..445dfcf51ca8 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -2,6 +2,7 @@
 #
 # Makefile for the netfilter modules on top of IPv6.
 #
+obj-$(CONFIG_NETFILTER_EARLY_INGRESS) += early_ingress.o
 
 # Link order matters here.
 obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o
diff --git a/net/ipv6/netfilter/early_ingress.c b/net/ipv6/netfilter/early_ingress.c
new file mode 100644
index 000000000000..026d2814530a
--- /dev/null
+++ b/net/ipv6/netfilter/early_ingress.c
@@ -0,0 +1,307 @@
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/types.h>
+#include <net/xfrm.h>
+#include <net/arp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+#include <net/netfilter/early_ingress.h>
+#include <net/ip6_route.h>
+
+static const struct net_offload __rcu *nft_ip6_offloads[MAX_INET_PROTOS] __read_mostly;
+
+static struct sk_buff *nft_udp6_gso_segment(struct sk_buff *skb,
+					    netdev_features_t features)
+{
+	skb_push(skb, sizeof(struct ipv6hdr));
+	return nft_skb_segment(skb);
+}
+
+static struct sk_buff *nft_tcp6_gso_segment(struct sk_buff *skb,
+					    netdev_features_t features)
+{
+	skb_push(skb, sizeof(struct ipv6hdr));
+	return nft_skb_segment(skb);
+}
+
+static struct sk_buff *nft_ipv6_gso_segment(struct sk_buff *skb,
+					    netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	const struct net_offload *ops;
+	struct packet_offload *ptype;
+	struct ipv6hdr *iph;
+	int proto;
+
+	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_NFT)) {
+		ptype = dev_get_packet_offload(skb->protocol, 1);
+		if (ptype)
+			return ptype->callbacks.gso_segment(skb, features);
+
+		return ERR_PTR(-EPROTONOSUPPORT);
+	}
+
+	if (SKB_GSO_CB(skb)->encap_level == 0) {
+		iph = ipv6_hdr(skb);
+		skb_reset_network_header(skb);
+	} else {
+		iph = (struct ipv6hdr *)skb->data;
+	}
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+		goto out;
+
+	SKB_GSO_CB(skb)->encap_level += sizeof(*iph);
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+		goto out;
+
+	__skb_pull(skb, sizeof(*iph));
+
+	proto = iph->nexthdr;
+
+	segs = ERR_PTR(-EPROTONOSUPPORT);
+
+	ops = rcu_dereference(nft_ip6_offloads[proto]);
+	if (likely(ops && ops->callbacks.gso_segment))
+		segs = ops->callbacks.gso_segment(skb, features);
+
+out:
+	return segs;
+}
+
+static int nft_ipv6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
+	struct dst_entry *dst = skb_dst(skb);
+	struct rt6_info *rt = (struct rt6_info *)dst;
+	const struct net_offload *ops;
+	struct packet_offload *ptype;
+	int proto = iph->nexthdr;
+	struct in6_addr *nexthop;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	unsigned int hh_len;
+	int err = 0;
+	u16 count;
+
+	count = NAPI_GRO_CB(skb)->count;
+
+	if (!NAPI_GRO_CB(skb)->is_ffwd) {
+		ptype = dev_get_packet_offload(skb->protocol, 1);
+		if (ptype)
+			return ptype->callbacks.gro_complete(skb, nhoff);
+
+		return 0;
+	}
+
+	rcu_read_lock();
+	ops = rcu_dereference(nft_ip6_offloads[proto]);
+	if (!ops || !ops->callbacks.gro_complete)
+		goto out_unlock;
+
+	/* Only need to add sizeof(*iph) to get to the next hdr below
+	 * because any hdr with option will have been flushed in
+	 * inet_gro_receive().
+	 */
+	err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
+
+out_unlock:
+	rcu_read_unlock();
+
+	if (err)
+		return err;
+
+	skb_shinfo(skb)->gso_type |= SKB_GSO_NFT;
+	skb_shinfo(skb)->gso_segs = count;
+
+	dev = dst->dev;
+	dev_hold(dev);
+	skb->dev = dev;
+
+	if (skb_dst(skb)->xfrm) {
+		err = dst_output(dev_net(dev), NULL, skb);
+		if (err != -EREMOTE)
+			return -EINPROGRESS;
+	}
+
+	if (count <= 1)
+		skb_gso_reset(skb);
+
+	hh_len = LL_RESERVED_SPACE(dev);
+
+	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+		struct sk_buff *skb2;
+
+		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
+		if (!skb2) {
+			kfree_skb(skb);
+			return -ENOMEM;
+		}
+		consume_skb(skb);
+		skb = skb2;
+	}
+	rcu_read_lock();
+	nexthop = rt6_nexthop(rt, &iph->daddr);
+	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
+	if (unlikely(!neigh))
+		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+	if (!IS_ERR(neigh))
+		neigh_output(neigh, skb);
+	rcu_read_unlock();
+
+	return -EINPROGRESS;
+}
+
+static struct sk_buff **nft_ipv6_gro_receive(struct sk_buff **head,
+					     struct sk_buff *skb)
+{
+	const struct net_offload *ops;
+	struct packet_offload *ptype;
+	struct sk_buff **pp = NULL;
+	struct sk_buff *p;
+	struct ipv6hdr *iph;
+	unsigned int nlen;
+	unsigned int hlen;
+	unsigned int off;
+	int proto, ret;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*iph);
+
+	iph = skb_gro_header_slow(skb, hlen, off);
+	if (unlikely(!iph))
+		goto out;
+
+	proto = iph->nexthdr;
+
+	rcu_read_lock();
+
+	if (iph->version != 6)
+		goto out_unlock;
+
+	nlen = skb_network_header_len(skb);
+
+	ret = nf_hook_early_ingress(skb);
+	switch (ret) {
+	case NF_STOLEN:
+		break;
+	case NF_ACCEPT:
+		ptype = dev_get_packet_offload(skb->protocol, 1);
+		if (ptype)
+			pp = ptype->callbacks.gro_receive(head, skb);
+
+		goto out_unlock;
+	case NF_DROP:
+		pp = ERR_PTR(-EPERM);
+		goto out_unlock;
+	}
+
+	ops = rcu_dereference(nft_ip6_offloads[proto]);
+	if (!ops || !ops->callbacks.gro_receive)
+		goto out_unlock;
+
+	if (iph->hop_limit <= 1)
+		goto out_unlock;
+
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	for (p = *head; p; p = p->next) {
+		struct ipv6hdr *iph2;
+		__be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		if (!NAPI_GRO_CB(p)->is_ffwd) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		if (!skb_dst(p)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		iph2 = ipv6_hdr(p);
+		first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
+
+		/* All fields must match except length and Traffic Class.
+		 * XXX skbs on the gro_list have all been parsed and pulled
+		 * already so we don't need to compare nlen
+		 * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
+		 * memcmp() alone below is suffcient, right?
+		 */
+		if ((first_word & htonl(0xF00FFFFF)) ||
+		   memcmp(&iph->nexthdr, &iph2->nexthdr,
+			  nlen - offsetof(struct ipv6hdr, nexthdr))) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+		/* flush if Traffic Class fields are different */
+		NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
+
+		NAPI_GRO_CB(skb)->is_ffwd = 1;
+		skb_dst_set_noref(skb, skb_dst(p));
+		pp = &p;
+
+		break;
+	}
+
+	NAPI_GRO_CB(skb)->is_atomic = true;
+
+	iph->hop_limit--;
+
+	skb_pull(skb, off);
+	NAPI_GRO_CB(skb)->data_offset = sizeof(*iph);
+	skb_reset_network_header(skb);
+	skb_set_transport_header(skb, sizeof(*iph));
+
+	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+out_unlock:
+	rcu_read_unlock();
+
+out:
+	NAPI_GRO_CB(skb)->data_offset = 0;
+	return pp;
+}
+
+static struct packet_offload nft_ip6_packet_offload __read_mostly = {
+	.type = cpu_to_be16(ETH_P_IPV6),
+	.priority = 0,
+	.callbacks = {
+		.gro_receive = nft_ipv6_gro_receive,
+		.gro_complete = nft_ipv6_gro_complete,
+		.gso_segment = nft_ipv6_gso_segment,
+	},
+};
+
+static const struct net_offload nft_udp6_offload = {
+	.callbacks = {
+		.gso_segment = nft_udp6_gso_segment,
+		.gro_receive  =	nft_udp_gro_receive,
+	},
+};
+
+static const struct net_offload nft_tcp6_offload = {
+	.callbacks = {
+		.gso_segment = nft_tcp6_gso_segment,
+		.gro_receive  =	nft_tcp_gro_receive,
+	},
+};
+
+static const struct net_offload __rcu *nft_ip6_offloads[MAX_INET_PROTOS] __read_mostly = {
+	[IPPROTO_UDP]	= &nft_udp6_offload,
+	[IPPROTO_TCP]	= &nft_tcp6_offload,
+};
+
+void nf_early_ingress_ip6_enable(void)
+{
+	dev_add_offload(&nft_ip6_packet_offload);
+}
+
+void nf_early_ingress_ip6_disable(void)
+{
+	dev_remove_offload(&nft_ip6_packet_offload);
+}
diff --git a/net/netfilter/early_ingress.c b/net/netfilter/early_ingress.c
index bf31aa8b3721..4daf6cfea304 100644
--- a/net/netfilter/early_ingress.c
+++ b/net/netfilter/early_ingress.c
@@ -312,6 +312,7 @@ void nf_early_ingress_enable(void)
 	if (nf_early_ingress_use++ == 0) {
 		nf_early_ingress_use++;
 		nf_early_ingress_ip_enable();
+		nf_early_ingress_ip6_enable();
 	}
 }
 
@@ -319,5 +320,6 @@ void nf_early_ingress_disable(void)
 {
 	if (--nf_early_ingress_use == 0) {
 		nf_early_ingress_ip_disable();
+		nf_early_ingress_ip6_disable();
 	}
 }
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next,RFC 05/13] netfilter: add early ingress hook for IPv4
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert
In-Reply-To: <20180614141947.3580-1-pablo@netfilter.org>

From: Steffen Klassert <steffen.klassert@secunet.com>

Add the new early ingress hook for the netdev family. This new hook is
called from the GRO layer before the standard IPv4 GRO layers.

This hook allows us to perform early packet filtering and to define a fast
forwarding path through packet chaining and flowtables using the new GSO
netfilter type. Packets that don't follow the fast path are passed up to
the standard GRO path for aggregation as usual.

This patch adds the GRO and GSO logic for this custom packet chaining.
The chaining uses the frag_list pointer, so we do not need to mangle the
packets. Therefore, unlike the standard GRO path, the aggregation strategy
we follow does not modify the packets, and we have no need to recalculate
the checksum. This chain of packets is sent from the .gro_complete callback
directly to the neighbour layer. The first packet in the chain holds a
reference to the destination route.

Supported layer 4 protocols for this custom GRO packet chaining include
TCP and UDP.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netdevice.h             |   2 +
 include/linux/netfilter.h             |   6 +
 include/linux/netfilter_ingress.h     |   1 +
 include/net/netfilter/early_ingress.h |  20 +++
 include/uapi/linux/netfilter.h        |   1 +
 net/ipv4/netfilter/Makefile           |   1 +
 net/ipv4/netfilter/early_ingress.c    | 319 +++++++++++++++++++++++++++++++++
 net/netfilter/Kconfig                 |   8 +
 net/netfilter/Makefile                |   1 +
 net/netfilter/core.c                  |  35 +++-
 net/netfilter/early_ingress.c         | 323 ++++++++++++++++++++++++++++++++++
 11 files changed, 716 insertions(+), 1 deletion(-)
 create mode 100644 include/net/netfilter/early_ingress.h
 create mode 100644 net/ipv4/netfilter/early_ingress.c
 create mode 100644 net/netfilter/early_ingress.c

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 62734cf0c43a..c79922665be5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1880,6 +1880,8 @@ struct net_device {
 	rx_handler_func_t __rcu	*rx_handler;
 	void __rcu		*rx_handler_data;
 
+	struct nf_hook_entries __rcu *nf_hooks_early_ingress;
+
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc __rcu	*miniq_ingress;
 #endif
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 04551af2ff23..ad3f0b9ae4f1 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -429,4 +429,10 @@ extern struct nfnl_ct_hook __rcu *nfnl_ct_hook;
  */
 DECLARE_PER_CPU(bool, nf_skb_duplicated);
 
+int nf_hook_netdev(struct sk_buff *skb, struct nf_hook_state *state,
+		   const struct nf_hook_entries *e);
+
+void nf_early_ingress_enable(void);
+void nf_early_ingress_disable(void);
+
 #endif /*__LINUX_NETFILTER_H*/
diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index 554c920691dd..7b70c9d4c435 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -40,6 +40,7 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 
 static inline void nf_hook_ingress_init(struct net_device *dev)
 {
+	RCU_INIT_POINTER(dev->nf_hooks_early_ingress, NULL);
 	RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
 }
 #else /* CONFIG_NETFILTER_INGRESS */
diff --git a/include/net/netfilter/early_ingress.h b/include/net/netfilter/early_ingress.h
new file mode 100644
index 000000000000..caaef9fe619f
--- /dev/null
+++ b/include/net/netfilter/early_ingress.h
@@ -0,0 +1,20 @@
+#ifndef _NF_EARLY_INGRESS_H_
+#define _NF_EARLY_INGRESS_H_
+
+#include <net/protocol.h>
+
+struct sk_buff *nft_skb_segment(struct sk_buff *head_skb);
+struct sk_buff **nft_udp_gro_receive(struct sk_buff **head,
+				     struct sk_buff *skb);
+struct sk_buff **nft_tcp_gro_receive(struct sk_buff **head,
+				     struct sk_buff *skb);
+
+int nf_hook_early_ingress(struct sk_buff *skb);
+
+void nf_early_ingress_ip_enable(void);
+void nf_early_ingress_ip_disable(void);
+
+void nf_early_ingress_enable(void);
+void nf_early_ingress_disable(void);
+
+#endif
diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index cca10e767cd8..55d26b20e09f 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -54,6 +54,7 @@ enum nf_inet_hooks {
 
 enum nf_dev_hooks {
 	NF_NETDEV_INGRESS,
+	NF_NETDEV_EARLY_INGRESS,
 	NF_NETDEV_NUMHOOKS
 };
 
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 8394c17c269f..faf5fab59f0f 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -2,6 +2,7 @@
 #
 # Makefile for the netfilter modules on top of IPv4.
 #
+obj-$(CONFIG_NETFILTER_EARLY_INGRESS) += early_ingress.o
 
 # objects for l3 independent conntrack
 nf_conntrack_ipv4-y	:=  nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
diff --git a/net/ipv4/netfilter/early_ingress.c b/net/ipv4/netfilter/early_ingress.c
new file mode 100644
index 000000000000..6ff6e34e5eff
--- /dev/null
+++ b/net/ipv4/netfilter/early_ingress.c
@@ -0,0 +1,319 @@
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/types.h>
+#include <net/xfrm.h>
+#include <net/arp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+#include <net/netfilter/early_ingress.h>
+
+static const struct net_offload __rcu *nft_ip_offloads[MAX_INET_PROTOS] __read_mostly;
+
+/* GSO callback for UDP on the netfilter fast path: push a basic IPv4 header
+ * length back in front of skb->data and split the frag_list chain with
+ * nft_skb_segment(). @features is not used.
+ * NOTE(review): pushes a fixed sizeof(struct iphdr) while the caller pulled
+ * ihl bytes; presumably fine because chained packets never carry IP
+ * options - confirm.
+ */
+static struct sk_buff *nft_udp4_gso_segment(struct sk_buff *skb,
+					    netdev_features_t features)
+{
+	skb_push(skb, sizeof(struct iphdr));
+	return nft_skb_segment(skb);
+}
+
+/* GSO callback for TCP on the netfilter fast path: push a basic IPv4 header
+ * length back in front of skb->data and split the frag_list chain with
+ * nft_skb_segment(). @features is not used.
+ * NOTE(review): pushes a fixed sizeof(struct iphdr) while the caller pulled
+ * ihl bytes; presumably fine because chained packets never carry IP
+ * options - confirm.
+ */
+static struct sk_buff *nft_tcp4_gso_segment(struct sk_buff *skb,
+					    netdev_features_t features)
+{
+	skb_push(skb, sizeof(struct iphdr));
+	return nft_skb_segment(skb);
+}
+
+/* GSO segmentation entry point of the early-ingress IPv4 offload.
+ *
+ * Packets not marked SKB_GSO_NFT are handed to the standard IPv4 offload
+ * (the entry with priority >= 1). For SKB_GSO_NFT packets the IPv4 header
+ * is pulled and the per-protocol callback from nft_ip_offloads[] splits
+ * the chain.
+ *
+ * Returns the segment list, ERR_PTR(-EINVAL) on a malformed header, or
+ * ERR_PTR(-EPROTONOSUPPORT) when no suitable callback exists.
+ */
+static struct sk_buff *nft_ipv4_gso_segment(struct sk_buff *skb,
+					    netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	const struct net_offload *ops;
+	struct packet_offload *ptype;
+	struct iphdr *iph;
+	int proto;
+	int ihl;
+
+	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_NFT)) {
+		/* dev_get_packet_offload() only checks the GRO callbacks,
+		 * so make sure gso_segment exists before calling it.
+		 */
+		ptype = dev_get_packet_offload(skb->protocol, 1);
+		if (ptype && ptype->callbacks.gso_segment)
+			return ptype->callbacks.gso_segment(skb, features);
+
+		return ERR_PTR(-EPROTONOSUPPORT);
+	}
+
+	/* NOTE(review): iph is taken before pskb_may_pull(); if that pull
+	 * can reallocate the header here, iph should be re-read afterwards.
+	 */
+	if (SKB_GSO_CB(skb)->encap_level == 0) {
+		iph = ip_hdr(skb);
+		skb_reset_network_header(skb);
+	} else {
+		iph = (struct iphdr *)skb->data;
+	}
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+		goto out;
+
+	ihl = iph->ihl * 4;
+	if (ihl < sizeof(*iph))
+		goto out;
+
+	/* Fetch the protocol now: the pull below may reallocate the header
+	 * and leave iph dangling.
+	 */
+	proto = iph->protocol;
+
+	SKB_GSO_CB(skb)->encap_level += ihl;
+
+	if (unlikely(!pskb_may_pull(skb, ihl)))
+		goto out;
+
+	__skb_pull(skb, ihl);
+
+	segs = ERR_PTR(-EPROTONOSUPPORT);
+
+	ops = rcu_dereference(nft_ip_offloads[proto]);
+	if (likely(ops && ops->callbacks.gso_segment))
+		segs = ops->callbacks.gso_segment(skb, features);
+
+out:
+	return segs;
+}
+
+/* GRO completion handler of the early-ingress IPv4 offload.
+ *
+ * Packets that are not on the fast path (is_ffwd clear) are completed by
+ * the standard IPv4 offload. Fast path packets are marked SKB_GSO_NFT and
+ * transmitted from here: through dst_output() when the route has an xfrm
+ * state, otherwise straight to the neighbour layer of the route's device.
+ *
+ * Returns -EINPROGRESS once the skb has been consumed here (sent or
+ * freed), so the caller must not touch it again. Any other non-zero value
+ * leaves the skb to the caller.
+ */
+static int nft_ipv4_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
+	struct dst_entry *dst = skb_dst(skb);
+	struct rtable *rt = (struct rtable *)dst;
+	const struct net_offload *ops;
+	struct packet_offload *ptype;
+	struct net_device *dev;
+	struct neighbour *neigh;
+	unsigned int hh_len;
+	int err = 0;
+	u32 nexthop;
+	u16 count;
+
+	count = NAPI_GRO_CB(skb)->count;
+
+	if (!NAPI_GRO_CB(skb)->is_ffwd) {
+		ptype = dev_get_packet_offload(skb->protocol, 1);
+		if (ptype)
+			return ptype->callbacks.gro_complete(skb, nhoff);
+
+		return 0;
+	}
+
+	rcu_read_lock();
+	ops = rcu_dereference(nft_ip_offloads[iph->protocol]);
+	if (!ops || !ops->callbacks.gro_complete)
+		goto out_unlock;
+
+	/* Only need to add sizeof(*iph) to get to the next hdr below
+	 * because any hdr with option will have been flushed in
+	 * inet_gro_receive().
+	 */
+	err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
+
+out_unlock:
+	rcu_read_unlock();
+
+	if (err)
+		return err;
+
+	skb_shinfo(skb)->gso_type |= SKB_GSO_NFT;
+	skb_shinfo(skb)->gso_segs = count;
+
+	dev = dst->dev;
+	/* NOTE(review): no matching dev_put() is visible in this function;
+	 * confirm where this reference is dropped.
+	 */
+	dev_hold(dev);
+	skb->dev = dev;
+
+	if (skb_dst(skb)->xfrm) {
+		err = dst_output(dev_net(dev), NULL, skb);
+		if (err != -EREMOTE)
+			return -EINPROGRESS;
+	}
+
+	if (count <= 1)
+		skb_gso_reset(skb);
+
+	/* Resolve the next hop while iph is still valid: the headroom
+	 * reallocation below releases the original skb and its data.
+	 */
+	nexthop = (__force u32) rt_nexthop(rt, iph->daddr);
+
+	hh_len = LL_RESERVED_SPACE(dev);
+
+	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+		struct sk_buff *skb2;
+
+		skb2 = skb_realloc_headroom(skb, hh_len);
+		if (!skb2) {
+			/* The skb is freed here, so report it as consumed;
+			 * any other error makes the caller free it again.
+			 */
+			kfree_skb(skb);
+			return -EINPROGRESS;
+		}
+		consume_skb(skb);
+		skb = skb2;
+	}
+	rcu_read_lock();
+	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
+	if (unlikely(!neigh))
+		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+	if (!IS_ERR(neigh))
+		neigh_output(neigh, skb);
+	else
+		kfree_skb(skb);	/* no neighbour entry: drop, do not leak */
+	rcu_read_unlock();
+
+	return -EINPROGRESS;
+}
+
+/* GRO receive handler of the early-ingress IPv4 offload.
+ *
+ * Validates the IPv4 header, runs the netdev early ingress hooks and then:
+ *  - NF_ACCEPT: passes the packet to the standard IPv4 GRO offload (the
+ *    entry with priority >= 1).
+ *  - NF_DROP: reports the packet as consumed; nf_hook_netdev() frees the
+ *    skb for this verdict.
+ *  - otherwise (NF_STOLEN in practice): treats the packet as a fast path
+ *    candidate, decrements its TTL and chains it through the per-protocol
+ *    handler in nft_ip_offloads[].
+ *
+ * Returns ERR_PTR(-EPERM) when the packet was dropped; the skb must not be
+ * touched afterwards. Otherwise returns the flush pointer of the lower
+ * handler, or NULL.
+ */
+static struct sk_buff **nft_ipv4_gro_receive(struct sk_buff **head,
+					     struct sk_buff *skb)
+{
+	const struct net_offload *ops;
+	struct packet_offload *ptype;
+	struct sk_buff **pp = NULL;
+	struct sk_buff *p;
+	struct iphdr *iph;
+	unsigned int hlen;
+	unsigned int off;
+	int proto, ret;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*iph);
+
+	iph = skb_gro_header_slow(skb, hlen, off);
+	if (unlikely(!iph))
+		goto drop;
+
+	proto = iph->protocol;
+
+	rcu_read_lock();
+
+	/* Only plain headers are handled: version 4, ihl 5. */
+	if (*(u8 *)iph != 0x45)
+		goto drop_unlock;
+
+	if (unlikely(ip_fast_csum((u8 *)iph, 5)))
+		goto drop_unlock;
+
+	if (ip_is_fragment(iph))
+		goto out_unlock;
+
+	ret = nf_hook_early_ingress(skb);
+	switch (ret) {
+	case NF_STOLEN:
+		break;
+	case NF_ACCEPT:
+		ptype = dev_get_packet_offload(skb->protocol, 1);
+		if (ptype)
+			pp = ptype->callbacks.gro_receive(head, skb);
+
+		goto out_unlock;
+	case NF_DROP:
+		/* nf_hook_netdev() frees the skb on NF_DROP. */
+		goto consumed_unlock;
+	}
+
+	ops = rcu_dereference(nft_ip_offloads[proto]);
+	if (!ops || !ops->callbacks.gro_receive)
+		goto out_unlock;
+
+	if (iph->ttl <= 1)
+		goto drop_unlock;
+
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	for (p = *head; p; p = p->next) {
+		struct iphdr *iph2;
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		iph2 = ip_hdr(p);
+		/* The above works because, with the exception of the top
+		 * (inner most) layer, we only aggregate pkts with the same
+		 * hdr length so all the hdrs we'll need to verify will start
+		 * at the same offset.
+		 */
+		if ((iph->protocol ^ iph2->protocol) |
+		    ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+		    ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		if (!NAPI_GRO_CB(p)->is_ffwd)
+			continue;
+
+		if (!skb_dst(p))
+			continue;
+
+		/* All fields must match except length and checksum. The
+		 * held packet already had its TTL decremented, hence the
+		 * comparison against ttl - 1.
+		 */
+		NAPI_GRO_CB(p)->flush |=
+			((iph->ttl - 1) ^ iph2->ttl) |
+			(iph->tos ^ iph2->tos) |
+			((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
+
+		break;
+	}
+
+	NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
+
+	ip_decrease_ttl(iph);
+	skb->priority = rt_tos2priority(iph->tos);
+
+	skb_pull(skb, off);
+	NAPI_GRO_CB(skb)->data_offset = sizeof(*iph);
+	skb_reset_network_header(skb);
+	skb_set_transport_header(skb, sizeof(*iph));
+
+	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+out_unlock:
+	rcu_read_unlock();
+
+	NAPI_GRO_CB(skb)->data_offset = 0;
+	return pp;
+
+drop_unlock:
+	rcu_read_unlock();
+drop:
+	/* The skb is gone after this; do not write to its control block. */
+	kfree_skb(skb);
+	return ERR_PTR(-EPERM);
+
+consumed_unlock:
+	rcu_read_unlock();
+	return ERR_PTR(-EPERM);
+}
+
+/* Early-ingress packet offload for ETH_P_IP. It registers with priority 0,
+ * so lookups via dev_get_packet_offload(type, 1) skip this entry and find
+ * the standard IPv4 offload instead.
+ */
+static struct packet_offload nft_ipv4_packet_offload __read_mostly = {
+	.type = cpu_to_be16(ETH_P_IP),
+	.priority = 0,
+	.callbacks = {
+		.gro_receive = nft_ipv4_gro_receive,
+		.gro_complete = nft_ipv4_gro_complete,
+		.gso_segment = nft_ipv4_gso_segment,
+	},
+};
+
+/* UDP fast path callbacks; no gro_complete callback is provided. */
+static const struct net_offload nft_udp4_offload = {
+	.callbacks = {
+		.gso_segment = nft_udp4_gso_segment,
+		.gro_receive  =	nft_udp_gro_receive,
+	},
+};
+
+/* TCP fast path callbacks; no gro_complete callback is provided. */
+static const struct net_offload nft_tcp4_offload = {
+	.callbacks = {
+		.gso_segment = nft_tcp4_gso_segment,
+		.gro_receive  =	nft_tcp_gro_receive,
+	},
+};
+
+/* Layer 4 dispatch table indexed by IP protocol number; only UDP and TCP
+ * are populated, every other slot is NULL.
+ */
+static const struct net_offload __rcu *nft_ip_offloads[MAX_INET_PROTOS] __read_mostly = {
+	[IPPROTO_UDP]	= &nft_udp4_offload,
+	[IPPROTO_TCP]	= &nft_tcp4_offload,
+};
+
+/* Register the early-ingress IPv4 packet offload with the core. */
+void nf_early_ingress_ip_enable(void)
+{
+	dev_add_offload(&nft_ipv4_packet_offload);
+}
+
+/* Unregister the early-ingress IPv4 packet offload from the core. */
+void nf_early_ingress_ip_disable(void)
+{
+	dev_remove_offload(&nft_ipv4_packet_offload);
+}
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index dbd7d1fad277..8f803a1fd76e 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -9,6 +9,14 @@ config NETFILTER_INGRESS
 	  This allows you to classify packets from ingress using the Netfilter
 	  infrastructure.
 
+config NETFILTER_EARLY_INGRESS
+	bool "Netfilter early ingress support"
+	default y
+	help
+	  This allows you to perform very early filtering and packet aggregation
+	  for fast forwarding bypass by exercising the GRO engine from the
+	  Netfilter infrastructure.
+
 config NETFILTER_NETLINK
 	tristate
 
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 44449389e527..eebc0e35f9e5 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o
+netfilter-$(CONFIG_NETFILTER_EARLY_INGRESS) += early_ingress.o
 
 nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 168af54db975..4885365380d3 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -306,6 +306,11 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum,
 			return &dev->nf_hooks_ingress;
 	}
 #endif
+	if (hooknum == NF_NETDEV_EARLY_INGRESS) {
+		if (dev && dev_net(dev) == net)
+			return &dev->nf_hooks_early_ingress;
+	}
+
 	WARN_ON_ONCE(1);
 	return NULL;
 }
@@ -321,7 +326,8 @@ static int __nf_register_net_hook(struct net *net, int pf,
 		if (reg->hooknum == NF_NETDEV_INGRESS)
 			return -EOPNOTSUPP;
 #endif
-		if (reg->hooknum != NF_NETDEV_INGRESS ||
+		if ((reg->hooknum != NF_NETDEV_INGRESS &&
+		     reg->hooknum != NF_NETDEV_EARLY_INGRESS) ||
 		    !reg->dev || dev_net(reg->dev) != net)
 			return -EINVAL;
 	}
@@ -347,6 +353,9 @@ static int __nf_register_net_hook(struct net *net, int pf,
 	if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
 		net_inc_ingress_queue();
 #endif
+	if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EARLY_INGRESS)
+		nf_early_ingress_enable();
+
 #ifdef HAVE_JUMP_LABEL
 	static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]);
 #endif
@@ -404,6 +413,9 @@ static void __nf_unregister_net_hook(struct net *net, int pf,
 #ifdef CONFIG_NETFILTER_INGRESS
 		if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
 			net_dec_ingress_queue();
+
+		if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EARLY_INGRESS)
+			nf_early_ingress_disable();
 #endif
 #ifdef HAVE_JUMP_LABEL
 		static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
@@ -535,6 +547,27 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 }
 EXPORT_SYMBOL(nf_hook_slow);
 
+/* Run every hook in @e on @skb and return the first verdict that is not
+ * NF_ACCEPT (masked with NF_VERDICT_MASK), or NF_ACCEPT if all hooks
+ * accept. On NF_DROP the skb is freed here before returning; for any other
+ * non-accept verdict the skb is left untouched.
+ */
+int nf_hook_netdev(struct sk_buff *skb, struct nf_hook_state *state,
+		   const struct nf_hook_entries *e)
+{
+	unsigned int verdict, s, v = NF_ACCEPT;
+
+	for (s = 0; s < e->num_hook_entries; s++) {
+		verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
+		v = verdict & NF_VERDICT_MASK;
+		switch (v) {
+		case NF_ACCEPT:
+			break;
+		case NF_DROP:
+			kfree_skb(skb);
+			/* Fall through */
+		default:
+			return v;
+		}
+	}
+
+	return v;
+}
 
 int skb_make_writable(struct sk_buff *skb, unsigned int writable_len)
 {
diff --git a/net/netfilter/early_ingress.c b/net/netfilter/early_ingress.c
new file mode 100644
index 000000000000..bf31aa8b3721
--- /dev/null
+++ b/net/netfilter/early_ingress.c
@@ -0,0 +1,323 @@
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/types.h>
+#include <net/xfrm.h>
+#include <net/arp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <net/netfilter/early_ingress.h>
+
+/* XXX: Maybe export this from net/core/skbuff.c
+ * instead of holding a local copy */
+/* Shift all header offsets stored in @skb by @off bytes. csum_start is
+ * adjusted only for CHECKSUM_PARTIAL, and mac_header only when it was set.
+ */
+static void skb_headers_offset_update(struct sk_buff *skb, int off)
+{
+	/* Only adjust this if it actually is csum_start rather than csum */
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		skb->csum_start += off;
+	/* {transport,network,mac}_header and tail are relative to skb->head */
+	skb->transport_header += off;
+	skb->network_header   += off;
+	if (skb_mac_header_was_set(skb))
+		skb->mac_header += off;
+	skb->inner_transport_header += off;
+	skb->inner_network_header += off;
+	skb->inner_mac_header += off;
+}
+
+/* Split a fast path GRO chain back into individual packets.
+ *
+ * @head_skb carries the remaining packets of the flow on its frag_list.
+ * The head is cloned to become the first segment and every frag_list
+ * member is linked behind it through ->next, after receiving the head's
+ * device, queue mapping, header offsets and a copy of the head's
+ * link-layer (and tunnel) header bytes.
+ *
+ * Returns the first segment (its ->prev points at the last one) or
+ * ERR_PTR(-ENOMEM) when the head cannot be cloned. On failure the chain
+ * stays attached to @head_skb.
+ *
+ * NOTE(review): the loop assumes frag_list is non-NULL; confirm that no
+ * caller can reach this with a single-packet skb.
+ */
+struct sk_buff *nft_skb_segment(struct sk_buff *head_skb)
+{
+	unsigned int headroom;
+	struct sk_buff *nskb;
+	struct sk_buff *segs = NULL;
+	struct sk_buff *tail = NULL;
+	unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
+	struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
+	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
+	unsigned int delta_segs, delta_len, delta_truesize;
+
+	/* Expose the link-layer header again so it can be copied below. */
+	__skb_push(head_skb, doffset);
+
+	headroom = skb_headroom(head_skb);
+
+	delta_segs = delta_len = delta_truesize = 0;
+
+	/* Detach the chain before cloning so the clone has no frag_list. */
+	skb_shinfo(head_skb)->frag_list = NULL;
+
+	segs = skb_clone(head_skb, GFP_ATOMIC);
+	if (unlikely(!segs)) {
+		/* Re-attach the chain: otherwise the chained skbs become
+		 * unreachable and leak when the caller frees head_skb.
+		 */
+		skb_shinfo(head_skb)->frag_list = list_skb;
+		return ERR_PTR(-ENOMEM);
+	}
+
+	do {
+		nskb = list_skb;
+
+		list_skb = list_skb->next;
+
+		if (!tail)
+			segs->next = nskb;
+		else
+			tail->next = nskb;
+
+		tail = nskb;
+
+		delta_len += nskb->len;
+		delta_truesize += nskb->truesize;
+
+		skb_push(nskb, doffset);
+
+		nskb->dev = head_skb->dev;
+		nskb->queue_mapping = head_skb->queue_mapping;
+		nskb->network_header = head_skb->network_header;
+		nskb->mac_len = head_skb->mac_len;
+		nskb->mac_header = head_skb->mac_header;
+		nskb->transport_header = head_skb->transport_header;
+
+		if (!secpath_exists(nskb))
+			nskb->sp = secpath_get(head_skb->sp);
+
+		/* The offsets copied above are relative to the head's
+		 * buffer; rebase them onto this skb's own headroom.
+		 */
+		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
+
+		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
+						 nskb->data - tnl_hlen,
+						 doffset + tnl_hlen);
+
+	} while (list_skb);
+
+	/* NOTE(review): the truesize accounting below mixes data_len and
+	 * truesize values; verify it is what was intended.
+	 */
+	segs->len = head_skb->len - delta_len;
+	segs->data_len = head_skb->data_len - delta_len;
+	segs->truesize += head_skb->data_len - delta_truesize;
+
+	head_skb->len = segs->len;
+	head_skb->data_len = segs->data_len;
+	head_skb->truesize += segs->truesize;
+
+	skb_shinfo(segs)->gso_size = 0;
+	skb_shinfo(segs)->gso_segs = 0;
+	skb_shinfo(segs)->gso_type = 0;
+
+	segs->prev = tail;
+
+	return segs;
+}
+
+/* Append @skb to the fast path chain headed by *@head.
+ *
+ * The first chained packet goes on the head's frag_list, later ones are
+ * linked behind the current last packet through ->next. The head's count,
+ * len, data_len and truesize are grown to cover @skb, and @skb is marked
+ * same_flow.
+ *
+ * Returns 0 on success, or -EINVAL when the head is not a fast path packet
+ * or has no dst attached.
+ */
+static int nft_skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+	struct sk_buff *p = *head;
+
+	if (unlikely((!NAPI_GRO_CB(p)->is_ffwd) || !skb_dst(p)))
+		return -EINVAL;
+
+	/* last == p means nothing has been chained to this head yet. */
+	if (NAPI_GRO_CB(p)->last == p)
+		skb_shinfo(p)->frag_list = skb;
+	else
+		NAPI_GRO_CB(p)->last->next = skb;
+	NAPI_GRO_CB(p)->last = skb;
+
+	NAPI_GRO_CB(p)->count++;
+	p->data_len += skb->len;
+	p->truesize += skb->truesize;
+	p->len += skb->len;
+
+	NAPI_GRO_CB(skb)->same_flow = 1;
+	return 0;
+}
+
+/* Find the held packet that belongs to the same UDP flow as @skb and chain
+ * @skb to it.
+ *
+ * Held packets whose ports or checksum presence differ are marked as a
+ * different flow. Returns the list position of the matching packet when it
+ * has to be flushed (chaining failed), otherwise NULL. The flush state is
+ * also recorded in NAPI_GRO_CB(skb)->flush.
+ */
+static struct sk_buff **udp_gro_ffwd_receive(struct sk_buff **head,
+					     struct sk_buff *skb,
+					     struct udphdr *uh)
+{
+	struct sk_buff *p = NULL;
+	struct sk_buff **pp = NULL;
+	struct udphdr *uh2;
+	int flush = 0;
+
+	for (; (p = *head); head = &p->next) {
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		uh2 = udp_hdr(p);
+
+		/* Ports must match, and the checksums must be either both
+		 * zero or both nonzero. Source and destination port are
+		 * compared together as one 32-bit word.
+		 */
+		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) ||
+		    (!uh->check ^ !uh2->check)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		goto found;
+	}
+
+	goto out;
+
+found:
+	p = *head;
+
+	if (nft_skb_gro_receive(head, skb))
+		flush = 1;
+
+out:
+	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
+		pp = head;
+
+	NAPI_GRO_CB(skb)->flush |= flush;
+	return pp;
+}
+
+/* UDP gro_receive callback for the early-ingress offload.
+ *
+ * Fast path packets (is_ffwd set) with a readable UDP header and no
+ * pending flush are chained via udp_gro_ffwd_receive(). Everything else is
+ * marked for flush and NULL is returned.
+ */
+struct sk_buff **nft_udp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+	struct udphdr *uh;
+
+	uh = skb_gro_header_slow(skb, skb_transport_offset(skb) + sizeof(struct udphdr),
+				 skb_transport_offset(skb));
+
+	if (unlikely(!uh))
+		goto flush;
+
+	if (NAPI_GRO_CB(skb)->flush)
+		goto flush;
+
+	if (NAPI_GRO_CB(skb)->is_ffwd)
+		return udp_gro_ffwd_receive(head, skb, uh);
+
+flush:
+	NAPI_GRO_CB(skb)->flush = 1;
+	return NULL;
+}
+
+/* TCP gro_receive callback for the early-ingress offload.
+ *
+ * Looks for a held packet of the same flow (matching ports) and, if the
+ * new segment directly continues it with compatible flags, ack number and
+ * options, chains @skb behind it with nft_skb_gro_receive().
+ *
+ * Returns the list position of the matching held packet when it must be
+ * flushed, otherwise NULL. NAPI_GRO_CB(skb)->flush is set when @skb itself
+ * must not be held.
+ */
+struct sk_buff **nft_tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+	struct sk_buff **pp = NULL;
+	struct sk_buff *p;
+	struct tcphdr *th;
+	struct tcphdr *th2;
+	unsigned int len;
+	unsigned int thlen;
+	__be32 flags;
+	unsigned int mss = 1;
+	unsigned int hlen;
+	int flush = 1;
+	int i;
+
+	th = skb_gro_header_slow(skb, skb_transport_offset(skb) + sizeof(struct tcphdr),
+				 skb_transport_offset(skb));
+	if (unlikely(!th))
+		goto out;
+
+	thlen = th->doff * 4;
+	if (thlen < sizeof(*th))
+		goto out;
+
+	hlen = skb_transport_offset(skb) + thlen;
+
+	/* Fetch the header again, this time including the TCP options. */
+	th = skb_gro_header_slow(skb, hlen, skb_transport_offset(skb));
+	if (unlikely(!th))
+		goto out;
+
+	skb_gro_pull(skb, thlen);
+	len = skb_gro_len(skb);
+	flags = tcp_flag_word(th);
+
+	for (; (p = *head); head = &p->next) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		th2 = tcp_hdr(p);
+
+		/* Source and destination port compared as one 32-bit word. */
+		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		goto found;
+	}
+
+	goto out_check_final;
+
+found:
+	/* Any difference below forces the held packet to be flushed. */
+	flush = NAPI_GRO_CB(p)->flush;
+	flush |= (__force int)(flags & TCP_FLAG_CWR);
+	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
+	for (i = sizeof(*th); i < thlen; i += 4)
+		flush |= *(u32 *)((u8 *)th + i) ^
+			 *(u32 *)((u8 *)th2 + i);
+
+	mss = skb_shinfo(p)->gso_size;
+
+	flush |= (len - 1) >= mss;
+	/* NOTE(review): presumably checks that th->seq directly follows the
+	 * payload already chained on p - confirm the hlen * (count - 1) term.
+	 */
+	flush |= (ntohl(th2->seq) + (skb_gro_len(p) - (hlen * (NAPI_GRO_CB(p)->count - 1)))) ^ ntohl(th->seq);
+
+	if (flush || nft_skb_gro_receive(head, skb)) {
+		mss = 1;
+		goto out_check_final;
+	}
+
+	p = *head;
+
+out_check_final:
+	flush = len < mss;
+	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+					TCP_FLAG_RST | TCP_FLAG_SYN |
+					TCP_FLAG_FIN));
+
+	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
+		pp = head;
+
+out:
+	NAPI_GRO_CB(skb)->flush |= (flush != 0);
+
+	return pp;
+}
+
+/* Return true when skb->dev has early ingress hook entries attached. With
+ * jump labels available, the static key for the hook is tested first.
+ */
+static inline bool nf_hook_early_ingress_active(const struct sk_buff *skb)
+{
+#ifdef HAVE_JUMP_LABEL
+	if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EARLY_INGRESS]))
+		return false;
+#endif
+	return rcu_access_pointer(skb->dev->nf_hooks_early_ingress);
+}
+
+/* Run the NF_NETDEV_EARLY_INGRESS hooks attached to skb->dev.
+ *
+ * Returns the netfilter verdict from nf_hook_netdev(), or NF_ACCEPT when
+ * no hook is active for the device.
+ */
+int nf_hook_early_ingress(struct sk_buff *skb)
+{
+	struct nf_hook_entries *e =
+		rcu_dereference(skb->dev->nf_hooks_early_ingress);
+	struct nf_hook_state state;
+	int ret = NF_ACCEPT;
+
+	if (nf_hook_early_ingress_active(skb)) {
+		/* The entries may have gone away since the check above.
+		 * Accept the packet: a literal 0 would be NF_DROP, which
+		 * tells the caller the skb was already freed.
+		 */
+		if (unlikely(!e))
+			return NF_ACCEPT;
+
+		nf_hook_state_init(&state, NF_NETDEV_EARLY_INGRESS,
+				   NFPROTO_NETDEV, skb->dev, NULL, NULL,
+				   dev_net(skb->dev), NULL);
+
+		ret = nf_hook_netdev(skb, &state, e);
+	}
+
+	return ret;
+}
+
+/* protected by nf_hook_mutex. */
+static int nf_early_ingress_use;
+
+/* Take one reference on the early ingress infrastructure; the first user
+ * registers the IPv4 offload. The count goes up exactly once per call so
+ * that nf_early_ingress_disable() can bring it back to zero.
+ */
+void nf_early_ingress_enable(void)
+{
+	if (nf_early_ingress_use++ == 0)
+		nf_early_ingress_ip_enable();
+}
+
+/* Drop one reference on the early ingress infrastructure; when the count
+ * reaches zero the IPv4 offload is unregistered.
+ */
+void nf_early_ingress_disable(void)
+{
+	if (--nf_early_ingress_use == 0) {
+		nf_early_ingress_ip_disable();
+	}
+}
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next,RFC 04/13] net: Use one bit of NAPI_GRO_CB for the netfilter fastpath.
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert
In-Reply-To: <20180614141947.3580-1-pablo@netfilter.org>

From: Steffen Klassert <steffen.klassert@secunet.com>

This patch adds an is_ffwd bit to the NAPI_GRO_CB to indicate
fastpath packets in the GRO layer. It also implements the
logic we need for this in the generic codepath. The rest
of the needed logic is implemented within netfilter and
introduced with a followup patch.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netdevice.h |  2 +-
 net/core/dev.c            | 36 +++++++++++++++++++++++++++---------
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d8cadfa3769b..62734cf0c43a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2238,7 +2238,7 @@ struct napi_gro_cb {
 	/* Number of gro_receive callbacks this packet already went through */
 	u8 recursion_counter:4;
 
-	/* 1 bit hole */
+	u8	is_ffwd:1;
 
 	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
 	__wsum	csum;
diff --git a/net/core/dev.c b/net/core/dev.c
index 115de8bfcb54..75f530886874 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4864,7 +4864,8 @@ static int napi_gro_complete(struct sk_buff *skb)
 
 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
 
-	if (NAPI_GRO_CB(skb)->count == 1) {
+	if (NAPI_GRO_CB(skb)->count == 1 &&
+	    !(NAPI_GRO_CB(skb)->is_ffwd)) {
 		skb_shinfo(skb)->gso_size = 0;
 		goto out;
 	}
@@ -4880,8 +4881,10 @@ static int napi_gro_complete(struct sk_buff *skb)
 	rcu_read_unlock();
 
 	if (err) {
-		WARN_ON(&ptype->list == head);
-		kfree_skb(skb);
+		if (err != -EINPROGRESS) {
+			WARN_ON(&ptype->list == head);
+			kfree_skb(skb);
+		}
 		return NET_RX_SUCCESS;
 	}
 
@@ -4936,8 +4939,10 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 
 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 		diffs |= p->vlan_tci ^ skb->vlan_tci;
-		diffs |= skb_metadata_dst_cmp(p, skb);
-		diffs |= skb_metadata_differs(p, skb);
+		if (!NAPI_GRO_CB(p)->is_ffwd) {
+			diffs |= skb_metadata_dst_cmp(p, skb);
+			diffs |= skb_metadata_differs(p, skb);
+		}
 		if (maclen == ETH_HLEN)
 			diffs |= compare_ether_header(skb_mac_header(p),
 						      skb_mac_header(skb));
@@ -5019,6 +5024,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		NAPI_GRO_CB(skb)->is_fou = 0;
 		NAPI_GRO_CB(skb)->is_atomic = 1;
 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
+		NAPI_GRO_CB(skb)->is_ffwd = 0;
 
 		/* Setup for GRO checksum validation */
 		switch (skb->ip_summed) {
@@ -5044,9 +5050,14 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (&ptype->list == head)
 		goto normal;
 
-	if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
-		ret = GRO_CONSUMED;
-		goto ok;
+	if (IS_ERR(pp)) {
+		int err;
+
+		err = PTR_ERR(pp);
+		if (err == -EINPROGRESS || err == -EPERM) {
+			ret = GRO_CONSUMED;
+			goto ok;
+		}
 	}
 
 	same_flow = NAPI_GRO_CB(skb)->same_flow;
@@ -5064,8 +5075,15 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	if (same_flow)
 		goto ok;
 
-	if (NAPI_GRO_CB(skb)->flush)
+	if (NAPI_GRO_CB(skb)->flush) {
+		if (NAPI_GRO_CB(skb)->is_ffwd) {
+			napi_gro_complete(skb);
+			ret = GRO_CONSUMED;
+			goto ok;
+		}
+
 		goto normal;
+	}
 
 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
 		struct sk_buff *nskb = napi->gro_list;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next,RFC 03/13] net: Add a GSO feature bit for the netfilter forward fastpath.
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert
In-Reply-To: <20180614141947.3580-1-pablo@netfilter.org>

From: Steffen Klassert <steffen.klassert@secunet.com>

The netfilter forward fastpath has its own logic to create
GSO packets. So add a feature bit so that we can catch GSO
packets that are generated by the fastpath GRO handler.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netdev_features.h | 4 +++-
 include/linux/netdevice.h       | 1 +
 include/linux/skbuff.h          | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 623bb8ced060..f380a27410ef 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -56,8 +56,9 @@ enum {
 	NETIF_F_GSO_ESP_BIT,		/* ... ESP with TSO */
 	NETIF_F_GSO_UDP_BIT,		/* ... UFO, deprecated except tuntap */
 	NETIF_F_GSO_UDP_L4_BIT,		/* ... UDP payload GSO (not UFO) */
+	NETIF_F_GSO_NFT_BIT,		/* ... NFT generic */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
-		NETIF_F_GSO_UDP_L4_BIT,
+		NETIF_F_GSO_NFT_BIT,
 
 	NETIF_F_FCOE_CRC_BIT,		/* FCoE CRC32 */
 	NETIF_F_SCTP_CRC_BIT,		/* SCTP checksum offload */
@@ -140,6 +141,7 @@ enum {
 #define NETIF_F_GSO_SCTP	__NETIF_F(GSO_SCTP)
 #define NETIF_F_GSO_ESP		__NETIF_F(GSO_ESP)
 #define NETIF_F_GSO_UDP		__NETIF_F(GSO_UDP)
+#define NETIF_F_GSO_NFT		__NETIF_F(GSO_NFT)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 13a56f9b2a32..d8cadfa3769b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4229,6 +4229,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_NFT != (NETIF_F_GSO_NFT >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c86885954994..4a5cff1ffcaa 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -575,6 +575,8 @@ enum {
 	SKB_GSO_UDP = 1 << 16,
 
 	SKB_GSO_UDP_L4 = 1 << 17,
+
+	SKB_GSO_NFT = 1 << 18,
 };
 
 #if BITS_PER_LONG > 32
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next,RFC 02/13] net: Change priority of ipv4 and ipv6 packet offloads.
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert
In-Reply-To: <20180614141947.3580-1-pablo@netfilter.org>

From: Steffen Klassert <steffen.klassert@secunet.com>

The forward fastpath needs to insert callbacks with
higher priority than the standard callbacks. So change
the priority of ipv4 and ipv6 packet offloads from zero
to one. With this we are able to insert callbacks with
priority zero if needed.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/af_inet.c     | 1 +
 net/ipv6/ip6_offload.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 15e125558c76..fbb90f7556ea 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1841,6 +1841,7 @@ static int ipv4_proc_init(void);
 
 static struct packet_offload ip_packet_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IP),
+	.priority = 1,
 	.callbacks = {
 		.gso_segment = inet_gso_segment,
 		.gro_receive = inet_gro_receive,
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 5b3f2f89ef41..863913fb690f 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -343,6 +343,7 @@ static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff)
 
 static struct packet_offload ipv6_packet_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IPV6),
+	.priority = 1,
 	.callbacks = {
 		.gso_segment = ipv6_gso_segment,
 		.gro_receive = ipv6_gro_receive,
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next,RFC 01/13] net: Add a helper to get the packet offload callbacks by priority.
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert
In-Reply-To: <20180614141947.3580-1-pablo@netfilter.org>

From: Steffen Klassert <steffen.klassert@secunet.com>

With this helper it is possible to request callbacks with
a certain priority. This will be used in the upcoming forward
fastpath to pass packets to the standard GRO path.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec9850c7936..13a56f9b2a32 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2523,6 +2523,7 @@ void dev_remove_pack(struct packet_type *pt);
 void __dev_remove_pack(struct packet_type *pt);
 void dev_add_offload(struct packet_offload *po);
 void dev_remove_offload(struct packet_offload *po);
+struct packet_offload *dev_get_packet_offload(__be16 type, int priority);
 
 int dev_get_iflink(const struct net_device *dev);
 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 6e18242a1cae..115de8bfcb54 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -468,7 +468,21 @@ void dev_remove_pack(struct packet_type *pt)
 }
 EXPORT_SYMBOL(dev_remove_pack);
 
+struct packet_offload *dev_get_packet_offload(__be16 type, int priority)
+{
+	struct list_head *offload_head = &offload_base;
+	struct packet_offload *ptype;
+
+	list_for_each_entry_rcu(ptype, offload_head, list) {
+		if (ptype->type != type || !ptype->callbacks.gro_receive || !ptype->callbacks.gro_complete || ptype->priority < priority)
+			continue;
 
+		return ptype;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(dev_get_packet_offload);
 /**
  *	dev_add_offload - register offload handlers
  *	@po: protocol offload declaration
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next,RFC 00/13] New fast forwarding path
From: Pablo Neira Ayuso @ 2018-06-14 14:19 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev, steffen.klassert

Hi,

This patchset proposes a new fast forwarding path infrastructure that
combines the GRO/GSO and the flowtable infrastructures. The idea is to
add a hook at the GRO layer that is invoked before the standard GRO
protocol offloads. This allows us to build custom packet chains that we
can quickly pass in one go to the neighbour layer to define fast
forwarding path for flows.

For each packet that gets into the GRO layer, we first check if there is
an entry in the flowtable, if so, the packet is placed in a list until
the GRO infrastructure decides to send the batch from gro_complete to
the neighbour layer. The first packet in the list takes the route from
the flowtable entry, so we avoid reiterative routing lookups.

In case no entry is found in the flowtable, the packet is passed up to
the classic GRO offload handlers. Thus, this packet follows the standard
forwarding path. Note that the initial packets of the flow always go
through the standard IPv4/IPv6 netfilter forward hook, that is used to
configure what flows are placed in the flowtable. Therefore, only a few
(initial) packets follow the standard forwarding path while most of the
follow up packets take this new fast forwarding path.

The fast forwarding path is enabled through explicit user policy, so the
user needs to request this behaviour from control plane, the following
example shows how to place flows in the new fast forwarding path from
the netfilter forward chain:

 table x {
        flowtable f {
                hook early_ingress priority 0; devices = { eth0, eth1 }
        }

        chain y {
                type filter hook forward priority 0;
                ip protocol tcp flow offload @f
        }
 }

The example above defines a fastpath for TCP flows that are placed in
the flowtable 'f', this flowtable is hooked at the new early_ingress
hook. The initial TCP packets that match this rule from the standard
forwarding path create an entry in the flowtable; thus, GRO creates chains
of packets for those that find an entry in the flowtable and sends
them through the neighbour layer.

This new hook is happening before the ingress taps, therefore, packets
that follow this new fast forwarding path are not shown by tcpdump.

This patchset supports both layer 3 IPv4 and IPv6, and layer 4 TCP and
UDP protocols. This fastpath also integrates with the IPSec
infrastructure and the ESP protocol.

We have collected performance numbers:

        TCP TSO         TCP Fast Forward
        32.5 Gbps       35.6 Gbps

        UDP             UDP Fast Forward
        17.6 Gbps       35.6 Gbps

        ESP             ESP Fast Forward
        6 Gbps          7.5 Gbps

For UDP, this is doubling performance, and we almost achieve line rate
with one single CPU using the Intel i40e NIC. We got similar numbers
with the Mellanox ConnectX-4. For TCP, this is slightly improving things
even if TSO is being defeated given that we need to segment the packet
chain in software. We would like to explore HW GRO support with hardware
vendors with this new mode, we think that should improve the TCP numbers
we are showing above even more. For ESP traffic, performance improvement
is ~25%, in this case, perf shows the bottleneck becomes the crypto layer.

This patchset is co-authored work with Steffen Klassert.

Comments are welcome, thanks.


Pablo Neira Ayuso (6):
  netfilter: nft_chain_filter: add support for early ingress
  netfilter: nf_flow_table: add hooknum to flowtable type
  netfilter: nf_flow_table: add flowtable for early ingress hook
  netfilter: nft_flow_offload: enable offload after second packet is seen
  netfilter: nft_flow_offload: remove secpath check
  netfilter: nft_flow_offload: make sure route is not stale

Steffen Klassert (7):
  net: Add a helper to get the packet offload callbacks by priority.
  net: Change priority of ipv4 and ipv6 packet offloads.
  net: Add a GSO feature bit for the netfilter forward fastpath.
  net: Use one bit of NAPI_GRO_CB for the netfilter fastpath.
  netfilter: add early ingress hook for IPv4
  netfilter: add early ingress support for IPv6
  netfilter: add ESP support for early ingress

 include/linux/netdev_features.h         |   4 +-
 include/linux/netdevice.h               |   6 +-
 include/linux/netfilter.h               |   6 +
 include/linux/netfilter_ingress.h       |   1 +
 include/linux/skbuff.h                  |   2 +
 include/net/netfilter/early_ingress.h   |  24 +++
 include/net/netfilter/nf_flow_table.h   |   4 +
 include/uapi/linux/netfilter.h          |   1 +
 net/core/dev.c                          |  50 ++++-
 net/ipv4/af_inet.c                      |   1 +
 net/ipv4/netfilter/Makefile             |   1 +
 net/ipv4/netfilter/early_ingress.c      | 327 +++++++++++++++++++++++++++++
 net/ipv4/netfilter/nf_flow_table_ipv4.c |  12 ++
 net/ipv6/ip6_offload.c                  |   1 +
 net/ipv6/netfilter/Makefile             |   1 +
 net/ipv6/netfilter/early_ingress.c      | 315 ++++++++++++++++++++++++++++
 net/ipv6/netfilter/nf_flow_table_ipv6.c |   1 +
 net/netfilter/Kconfig                   |   8 +
 net/netfilter/Makefile                  |   1 +
 net/netfilter/core.c                    |  35 +++-
 net/netfilter/early_ingress.c           | 361 ++++++++++++++++++++++++++++++++
 net/netfilter/nf_flow_table_inet.c      |   1 +
 net/netfilter/nf_flow_table_ip.c        |  72 +++++++
 net/netfilter/nf_tables_api.c           | 120 ++++++-----
 net/netfilter/nft_chain_filter.c        |   6 +-
 net/netfilter/nft_flow_offload.c        |  13 +-
 net/xfrm/xfrm_output.c                  |   4 +
 27 files changed, 1297 insertions(+), 81 deletions(-)
 create mode 100644 include/net/netfilter/early_ingress.h
 create mode 100644 net/ipv4/netfilter/early_ingress.c
 create mode 100644 net/ipv6/netfilter/early_ingress.c
 create mode 100644 net/netfilter/early_ingress.c

-- 
2.11.0

^ permalink raw reply

* Re: FW: [PATCH 2/2] ath10k: allow ATH10K_SNOC with COMPILE_TEST
From: Kalle Valo @ 2018-06-14 14:09 UTC (permalink / raw)
  To: Niklas Cassel
  Cc: Govind Singh, bjorn.andersson, davem, netdev, linux-wireless,
	linux-kernel, ath10k
In-Reply-To: <20180613132819.GA12603@centauri.ideon.se>

Niklas Cassel <niklas.cassel@linaro.org> writes:

> On Tue, Jun 12, 2018 at 02:44:03PM +0200, Niklas Cassel wrote:
>> On Tue, Jun 12, 2018 at 06:02:48PM +0530, Govind Singh wrote:
>> > On 2018-06-12 17:45, Govind Singh wrote:
>> > > 
>> > > ATH10K_SNOC builds just fine with COMPILE_TEST, so make that possible.
>> > > 
>> > > Signed-off-by: Niklas Cassel <niklas.cassel@linaro.org>
>> > > ---
>> > >  drivers/net/wireless/ath/ath10k/Kconfig | 3 ++-
>> > >  1 file changed, 2 insertions(+), 1 deletion(-)
>> > > 
>> > > diff --git a/drivers/net/wireless/ath/ath10k/Kconfig
>> > > b/drivers/net/wireless/ath/ath10k/Kconfig
>> > > index 54ff5930126c..6572a43590a8 100644
>> > > --- a/drivers/net/wireless/ath/ath10k/Kconfig
>> > > +++ b/drivers/net/wireless/ath/ath10k/Kconfig
>> > > @@ -42,7 +42,8 @@ config ATH10K_USB
>> > > 
>> > >  config ATH10K_SNOC
>> > >  	tristate "Qualcomm ath10k SNOC support (EXPERIMENTAL)"
>> > > -	depends on ATH10K && ARCH_QCOM
>> > > +	depends on ATH10K
>> > > +	depends on ARCH_QCOM || COMPILE_TEST
>> > >  	---help---
>> > >  	  This module adds support for integrated WCN3990 chip connected
>> > >  	  to system NOC(SNOC). Currently work in progress and will not
>> > 
>> > Thanks Niklas for enabling COMPILE_TEST. With QMI set of
>> > changes(https://patchwork.kernel.org/patch/10448183/), we need to enable
>> > COMPILE_TEST for
>> > QCOM_SCM/QMI_HELPERS which seems broken today. Are you planning to fix the
>> > same.
>
> This patch is good as is.
>
> However, Govind's QMI patch set together with this patch
> resulted in build errors.
>
> FTR, these are fixed by:
> https://marc.info/?l=linux-kernel&m=152880985402356
> https://marc.info/?l=linux-kernel&m=152889452326350

So the problem is that if I apply this patch I can't apply Govind's QMI
patchset (due to the build problems) until Niklas' fixes to qcom and
rpmsg subsystems propagate back to my tree and that might take weeks, or
even months. But I really would like to apply the QMI patchset ASAP so
that we can complete the wcn3990 support and not unnecessarily delay it.

So what I propose is that I put this patch 2 as 'Awaiting Upstream' in
patchwork and apply it once Niklas' patches get to my tree. Does that
sound good?

-- 
Kalle Valo

^ permalink raw reply

* [PATCH] net: cxgb3: add error handling for sysfs_create_group
From: Zhouyang Jia @ 2018-06-14 13:56 UTC (permalink / raw)
  Cc: Zhouyang Jia, Santosh Raspatur, David S. Miller, netdev,
	linux-kernel

When sysfs_create_group fails, the lack of error-handling code may
cause unexpected results.

This patch adds error-handling code after calling sysfs_create_group.

Signed-off-by: Zhouyang Jia <jiazhouyang09@gmail.com>
---
 drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
index 2edfdbd..73d6aa9 100644
--- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
@@ -3362,6 +3362,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	err = sysfs_create_group(&adapter->port[0]->dev.kobj,
 				 &cxgb3_attr_group);
+	if (err) {
+		dev_err(&pdev->dev, "cannot create sysfs group\n");
+		goto out_free_dev;
+	}
 
 	print_port_info(adapter, ai);
 	return 0;
-- 
2.7.4

^ permalink raw reply related

* Re: [iproute2 1/1] rdma: sync some IP headers with glibc
From: Leon Romanovsky @ 2018-06-14 13:52 UTC (permalink / raw)
  To: Hoang Le; +Cc: jon.maloy, maloy, ying.xue, netdev, tipc-discussion
In-Reply-To: <1528862996-7045-1-git-send-email-hoang.h.le@dektech.com.au>

[-- Attachment #1: Type: text/plain, Size: 614 bytes --]

On Wed, Jun 13, 2018 at 11:09:56AM +0700, Hoang Le wrote:
> In the commit 9a362cc71a45, new userspace header:
>   (i.e rdma/rdma_user_cm.h -> linux/in6.h)
> is included before the kernel space header:
>   (i.e utils.h -> resolv.h -> netinet/in.h).
>
> This leads to unsynchronous some IP headers and compiler got failure
> with error: redefinition of some structs IP.
>
> In this commit, just reorder this including to make them in-sync.
>
> Signed-off-by: Hoang Le <hoang.h.le@dektech.com.au>
> ---
>  rdma/rdma.h | 1 +
>  1 file changed, 1 insertion(+)
>

Thanks,
Acked-by: Leon Romanovsky <leonro@mellanox.com>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH iproute2 v2] ipaddress: strengthen check on 'label' input
From: Patrick Talbert @ 2018-06-14 13:46 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20180601155641.76616597@shemminger-XPS-13-9360>

On Fri, Jun 1, 2018 at 9:56 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Tue, 29 May 2018 16:57:07 +0200
> Patrick Talbert <ptalbert@redhat.com> wrote:
>
>> As mentioned in the ip-address man page, an address label must
>> be equal to the device name or prefixed by the device name
>> followed by a colon. Currently the only check on this input is
>> to see if the device name appears at the beginning of the label
>> string.
>>
>> This commit adds an additional check to ensure label == dev or
>> continues with a colon.
>>
>> Signed-off-by: Patrick Talbert <ptalbert@redhat.com>
>> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
>
> Yes, this looks better but still have some feedback.
>
>> ---
>>  ip/ipaddress.c | 21 +++++++++++++++++++--
>>  1 file changed, 19 insertions(+), 2 deletions(-)
>>
>> diff --git a/ip/ipaddress.c b/ip/ipaddress.c
>> index 00da14c..fce2008 100644
>> --- a/ip/ipaddress.c
>> +++ b/ip/ipaddress.c
>> @@ -2040,6 +2040,22 @@ static bool ipaddr_is_multicast(inet_prefix *a)
>>               return false;
>>  }
>>
>> +static bool is_valid_label(const char *dev, const char *label)
>> +{
>> +     char alias[strlen(dev) + 1];
>> +
>> +     if (strlen(label) < strlen(dev))
>> +             return false;
>> +
>> +     strcpy(alias, dev);
>> +     strcat(alias, ":");
>> +     if (strncmp(label, dev, strlen(dev)) == 0 ||
>> +         strncmp(label, alias, strlen(alias)) == 0)
>> +             return true;
>> +     else
>> +             return false;
>> +}
>
> This string copying and comparison still is much more overhead than it
> needs to be. The following tests out to be equivalent with a single strncmp
> and strlen.
>
> Why not just:
> diff --git a/ip/ipaddress.c b/ip/ipaddress.c
> index 00da14c6f97c..eac489e94fe4 100644
> --- a/ip/ipaddress.c
> +++ b/ip/ipaddress.c
> @@ -2040,6 +2040,16 @@ static bool ipaddr_is_multicast(inet_prefix *a)
>                 return false;
>  }
>
> +static bool is_valid_label(const char *label, const char *dev)
> +{
> +       size_t len = strlen(dev);
> +
> +       if (strncmp(label, dev, len) != 0)
> +               return false;
> +
> +       return label[len] == '\0' || label[len] == ':';
> +}
> +

Woah. This is way better. v3 coming up....

Thank you for all of your help with this... and by help I mean writing
the patch.

>
>
> Doesn't matter much now, but code seems to get copied.

^ permalink raw reply

* [PATCH iproute2 v3] ipaddress: strengthen check on 'label' input
From: Patrick Talbert @ 2018-06-14 13:46 UTC (permalink / raw)
  To: netdev; +Cc: stephen

As mentioned in the ip-address man page, an address label must
be equal to the device name or prefixed by the device name
followed by a colon. Currently the only check on this input is
to see if the device name appears at the beginning of the label
string.

This commit adds an additional check to ensure label == dev or
continues with a colon.

Signed-off-by: Patrick Talbert <ptalbert@redhat.com>
Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
---
 ip/ipaddress.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index bbd35e7..713962b 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -2065,6 +2065,16 @@ static bool ipaddr_is_multicast(inet_prefix *a)
 		return false;
 }
 
+static bool is_valid_label(const char *dev, const char *label)
+{
+	size_t len = strlen(dev);
+
+	if (strncmp(label, dev, len) != 0)
+		return false;
+
+	return label[len] == '\0' || label[len] == ':';
+}
+
 static int ipaddr_modify(int cmd, int flags, int argc, char **argv)
 {
 	struct {
@@ -2208,8 +2218,9 @@ static int ipaddr_modify(int cmd, int flags, int argc, char **argv)
 		fprintf(stderr, "Not enough information: \"dev\" argument is required.\n");
 		return -1;
 	}
-	if (l && matches(d, l) != 0) {
-		fprintf(stderr, "\"dev\" (%s) must match \"label\" (%s).\n", d, l);
+	if (l && ! is_valid_label(d, l)) {
+		fprintf(stderr, "\"label\" (%s) must match \"dev\" (%s) or be prefixed by"
+			" \"dev\" with a colon.\n", l, d);
 		return -1;
 	}
 
-- 
1.8.3.1

^ permalink raw reply related

* Re: [RFC PATCH RESEND] tcp: avoid F-RTO if SACK and timestamps are disabled
From: Michal Kubecek @ 2018-06-14 13:18 UTC (permalink / raw)
  To: Ilpo Järvinen; +Cc: Yuchung Cheng, netdev, Eric Dumazet, LKML
In-Reply-To: <alpine.DEB.2.20.1806141409150.29120@whs-18.cs.helsinki.fi>

On Thu, Jun 14, 2018 at 02:51:18PM +0300, Ilpo Järvinen wrote:
> On Thu, 14 Jun 2018, Michal Kubecek wrote:
> > On Thu, Jun 14, 2018 at 11:42:43AM +0300, Ilpo Järvinen wrote:
> > > On Wed, 13 Jun 2018, Yuchung Cheng wrote:
> > > > On Wed, Jun 13, 2018 at 9:55 AM, Michal Kubecek <mkubecek@suse.cz> wrote:
> > 
> > AFAICS RFC 5682 is not explicit about this and offers multiple options.
> > Anyway, this is not essential and in most of the customer provided
> > captures, it wasn't the case.
> 
> Lacking the new segments is essential for hiding the actual bug as the 
> trace would look weird otherwise with a burst of new data segments (due 
> to the other bug).

The trace wouldn't look so nice but it can be reproduced even with more
data to send. I've copied an example below. I couldn't find a really
nice one quickly, so the first few retransmits (17:22:13.865105 through
17:23:05.841105) are without new data but starting at 17:23:58.189150,
you can see that sending new (previously unsent) data may not suffice to
break the loop.

> > Normally, we would have timestamps (and even SACK). Without them, you
> > cannot reliably recognize a dupack with changed window size from
> > a spontaneous window update.
> 
> No! The receiver should not update the window on ACKs the receiver intends to 
> designate as "duplicate ACKs". That is not without some potential cost 
> though as it requires delaying window updates up to the next cumulative 
> ACK. In the non-SACK series one of the changes is fixing this for
> non-SACK Linux TCP flows.

That sounds like a reasonable change (at least at the first glance,
I didn't think about it too deeply) but even if we fix Linux stack to
behave like this, we cannot force everyone else to do the same.

Michal Kubecek


17:22:13.660030 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1101588007:1101650727, ack 1871152053, win 28, length 62720
17:22:13.660039 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294151936, win 12146, length 0
17:22:13.660047 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 62720:125440, ack 1, win 28, length 62720
17:22:13.660050 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294178816, win 12146, length 0
17:22:13.660052 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294196736, win 12146, length 0
17:22:13.660131 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 125440:188160, ack 1, win 28, length 62720
17:22:13.660142 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294223616, win 12146, length 0
17:22:13.660164 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 188160:250880, ack 1, win 28, length 62720
17:22:13.660171 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294250496, win 12146, length 0
17:22:13.660177 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294277376, win 12146, length 0
17:22:13.660181 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294304256, win 12146, length 0
17:22:13.660185 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294331136, win 12146, length 0
17:22:13.660196 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294349056, win 12146, length 0
17:22:13.660212 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 250880:313600, ack 1, win 28, length 62720
17:22:13.660224 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 313600:376320, ack 1, win 28, length 62720
17:22:13.660266 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294384896, win 12146, length 0
17:22:13.660292 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294411776, win 12146, length 0
17:22:13.660294 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294438656, win 12146, length 0
17:22:13.660295 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294465536, win 12146, length 0
17:22:13.660353 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 376320:439040, ack 1, win 28, length 62720
17:22:13.660377 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 439040:501760, ack 1, win 28, length 62720
17:22:13.660391 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294501376, win 12146, length 0
17:22:13.660396 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294519296, win 12146, length 0
17:22:13.660400 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 501760:564480, ack 1, win 28, length 62720
17:22:13.660409 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294555136, win 12146, length 0
17:22:13.660420 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294582016, win 12146, length 0
17:22:13.660434 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294608896, win 12146, length 0
17:22:13.660458 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 564480:627200, ack 1, win 28, length 62720
17:22:13.660515 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294644736, win 12146, length 0
17:22:13.660527 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294671616, win 12146, length 0
17:22:13.660540 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294698496, win 12146, length 0
17:22:13.660541 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294725376, win 12146, length 0
17:22:13.660542 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294743296, win 12146, length 0
17:22:13.660580 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 627200:689920, ack 1, win 28, length 62720
17:22:13.660597 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 689920:752640, ack 1, win 28, length 62720     <--- first loss
17:22:13.660642 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294770176, win 12146, length 0
17:22:13.660648 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 752640:815360, ack 1, win 28, length 62720
17:22:13.660655 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294797056, win 12146, length 0
17:22:13.660662 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294823936, win 12146, length 0
17:22:13.660666 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 815360:878080, ack 1, win 28, length 62720
17:22:13.660672 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294850816, win 12146, length 0
17:22:13.660696 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294877696, win 12146, length 0
17:22:13.660704 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 878080:940800, ack 1, win 28, length 62720
17:22:13.660765 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294913536, win 12146, length 0
17:22:13.660779 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 4294940416, win 12146, length 0
17:22:13.660791 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 0, win 12146, length 0
17:22:13.660793 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 26880, win 12146, length 0
17:22:13.660795 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 53760, win 12146, length 0
17:22:13.660821 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 940800:1003520, ack 1, win 28, length 62720
17:22:13.660837 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1003520:1066240, ack 1, win 28, length 62720
17:22:13.660890 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 80640, win 12146, length 0
17:22:13.660897 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1066240:1128960, ack 1, win 28, length 62720
17:22:13.660923 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 107520, win 12146, length 0
17:22:13.660928 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 134400, win 12146, length 0
17:22:13.660932 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 161280, win 12146, length 0
17:22:13.660936 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1128960:1191680, ack 1, win 28, length 62720
17:22:13.660944 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 188160, win 12146, length 0
17:22:13.661015 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 215040, win 12146, length 0
17:22:13.661044 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 241920, win 12146, length 0
17:22:13.661045 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 268800, win 12146, length 0
17:22:13.661047 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 295680, win 12146, length 0
17:22:13.661048 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 322560, win 12135, length 0
17:22:13.661106 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1191680:1254400, ack 1, win 28, length 62720
17:22:13.661139 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1254400:1317120, ack 1, win 28, length 62720
17:22:13.661145 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 349440, win 12146, length 0
17:22:13.661148 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 358400, win 12146, length 0
17:22:13.661149 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 358400, win 12146, length 0
17:22:13.661150 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 358400, win 12146, length 0
17:22:13.661151 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 358400, win 12146, length 0
17:22:13.661153 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 456960, win 12130, length 0
17:22:13.661155 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1317120:1379840, ack 1, win 28, length 62720
17:22:13.661178 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1379840:1442560, ack 1, win 28, length 62720
17:22:13.661192 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1442560:1505280, ack 1, win 28, length 62720
17:22:13.661264 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 483840, win 12146, length 0
17:22:13.661286 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 510720, win 12146, length 0
17:22:13.661292 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1505280:1568000, ack 1, win 28, length 62720
17:22:13.661299 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 537600, win 12146, length 0
17:22:13.661303 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 564480, win 12146, length 0
17:22:13.661308 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1568000:1630720, ack 1, win 28, length 62720
17:22:13.661317 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 582400, win 12140, length 0
17:22:13.661390 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 609280, win 12146, length 0
17:22:13.661411 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 636160, win 12146, length 0
17:22:13.661412 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 663040, win 12146, length 0
17:22:13.661429 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 689920, win 12146, length 0
17:22:13.661430 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 716800, win 12146, length 0
17:22:13.661437 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1630720:1693440, ack 1, win 28, length 62720
17:22:13.661445 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 725760, win 12146, length 0
17:22:13.661447 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661454 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1693440:1756160, ack 1, win 28, length 62720
17:22:13.661508 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0                    <--- first dupack
17:22:13.661513 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1756160:1818880, ack 1, win 28, length 62720
17:22:13.661520 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661524 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661527 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661530 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661532 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661635 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661637 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661638 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661640 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661641 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661642 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661653 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1818880:1881600, ack 1, win 28, length 62720
17:22:13.661757 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661761 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661764 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661768 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1881600:1944320, ack 1, win 28, length 62720
17:22:13.661778 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661782 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661886 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661891 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661894 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661897 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661900 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661902 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1944320:2007040, ack 1, win 28, length 62720
17:22:13.661928 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.661931 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662016 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662020 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662023 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662026 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662029 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662032 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2007040:2069760, ack 1, win 28, length 62720
17:22:13.662039 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662042 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662132 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662136 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662139 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662142 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662145 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662148 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2069760:2132480, ack 1, win 28, length 62720
17:22:13.662154 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662263 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662267 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662269 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662272 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662275 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662385 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662390 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2132480:2195200, ack 1, win 28, length 62720
17:22:13.662397 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662400 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662402 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662405 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662408 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662508 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662512 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662515 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2195200:2257920, ack 1, win 28, length 62720
17:22:13.662522 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662525 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662527 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662530 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662633 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662637 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662640 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662643 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2257920:2320640, ack 1, win 28, length 62720
17:22:13.662649 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662652 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662759 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662763 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662766 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.662881 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 734720, win 12146, length 0
17:22:13.865105 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 734720:743680, ack 1, win 28, length 8960      <--- first retransmit
17:22:13.865227 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 851200, win 12033, length 0
17:22:14.273092 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 851200:860160, ack 1, win 28, length 8960
17:22:14.273207 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 869120, win 12016, length 0
17:22:15.089125 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 869120:878080, ack 1, win 28, length 8960
17:22:15.089244 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 887040, win 11999, length 0
17:22:16.725135 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 887040:896000, ack 1, win 28, length 8960
17:22:16.725269 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 976640, win 11912, length 0
17:22:19.997144 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 976640:985600, ack 1, win 28, length 8960
17:22:19.997257 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1039360, win 11851, length 0
17:22:26.545096 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1039360:1048320, ack 1, win 28, length 8960
17:22:26.545212 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1057280, win 11834, length 0
17:22:39.629137 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1057280:1066240, ack 1, win 28, length 8960
17:22:39.629268 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1173760, win 11721, length 0
17:23:05.841105 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1173760:1182720, ack 1, win 28, length 8960
17:23:05.841229 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1290240, win 11613, length 0
17:23:58.189150 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1290240:1299200, ack 1, win 28, length 8960
17:23:58.189268 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1344000, win 11670, length 0
17:23:58.189310 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2320640:2383360, ack 1, win 28, length 62720   <--- new data
17:23:58.189416 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1344000, win 11689, length 0                   <--- ack but window update
17:23:58.189424 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1344000, win 11689, length 0
17:23:58.189458 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1344000, win 11689, length 0
17:23:58.189466 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1344000, win 11689, length 0
17:23:58.189475 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2383360:2446080, ack 1, win 28, length 62720   <--- more new data
17:23:58.189575 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1344000, win 11689, length 0
17:23:58.189620 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1344000, win 11689, length 0
17:23:58.189623 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1344000, win 11689, length 0
17:25:42.769136 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1344000:1352960, ack 1, win 28, length 8960    <--- retransmit only after RTO
17:25:42.769243 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1361920, win 11672, length 0
17:27:43.085128 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1361920:1370880, ack 1, win 28, length 8960
17:27:43.085240 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1469440, win 11631, length 0
17:27:43.085261 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2446080:2508800, ack 1, win 28, length 62720
17:27:43.085363 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1469440, win 11678, length 0
17:27:43.085425 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1469440, win 11678, length 0
17:27:43.085430 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1469440, win 11678, length 0
17:27:43.085433 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1469440, win 11678, length 0
17:27:43.085437 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2508800:2571520, ack 1, win 28, length 62720
17:27:43.085458 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1469440, win 11678, length 0
17:27:43.085461 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1469440, win 11678, length 0
17:27:43.085531 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1469440, win 11678, length 0
17:27:43.085578 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1469440, win 11678, length 0
17:27:43.085581 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1469440, win 11678, length 0
17:29:43.405123 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1469440:1478400, ack 1, win 28, length 8960
17:29:43.405249 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1594880, win 11614, length 0
17:29:43.405288 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2571520:2634240, ack 1, win 28, length 62720
17:29:43.405400 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1594880, win 11671, length 0
17:29:43.405408 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1594880, win 11671, length 0
17:29:43.405446 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1594880, win 11671, length 0
17:29:43.405454 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1594880, win 11671, length 0
17:29:43.405462 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2634240:2696960, ack 1, win 28, length 62720
17:29:43.405502 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1594880, win 11671, length 0
17:29:43.405579 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1594880, win 11671, length 0
17:29:43.405626 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1594880, win 11671, length 0
17:29:43.405629 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1594880, win 11671, length 0
17:31:43.725113 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1594880:1603840, ack 1, win 28, length 8960
17:31:43.725273 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1657600, win 11610, length 0
17:33:44.045093 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1657600:1666560, ack 1, win 28, length 8960
17:33:44.045248 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1675520, win 11636, length 0
17:35:44.365137 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1675520:1684480, ack 1, win 28, length 8960
17:35:44.365319 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11642, length 0
17:35:44.365345 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2696960:2759680, ack 1, win 28, length 62720
17:35:44.365370 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2759680:2822400, ack 1, win 28, length 62720
17:35:44.365463 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365467 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365509 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365513 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365517 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2822400:2885120, ack 1, win 28, length 62720
17:35:44.365541 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365563 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365567 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365623 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365670 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365674 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365678 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365682 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2885120:2947840, ack 1, win 28, length 62720
17:35:44.365801 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365850 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365854 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:35:44.365894 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1783040, win 11689, length 0
17:37:44.685086 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1783040:1792000, ack 1, win 28, length 8960
17:37:44.685204 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 1899520, win 11576, length 0
17:39:45.005099 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 1899520:1908480, ack 1, win 28, length 8960
17:39:45.005228 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 2947840, win 11616, length 0
17:39:45.005304 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 2947840:3010560, ack 1, win 28, length 62720
17:39:45.005339 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3010560:3073280, ack 1, win 28, length 62720
17:39:45.005385 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3073280:3136000, ack 1, win 28, length 62720
17:39:45.005408 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3136000:3198720, ack 1, win 28, length 62720
17:39:45.005430 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3198720:3261440, ack 1, win 28, length 62720
17:39:45.005458 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3261440:3324160, ack 1, win 28, length 62720
17:39:45.005516 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3324160:3386880, ack 1, win 28, length 62720
17:39:45.005572 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3386880:3449600, ack 1, win 28, length 62720
17:39:45.005595 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3449600:3512320, ack 1, win 28, length 62720
17:39:45.005616 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3512320:3575040, ack 1, win 28, length 62720
17:39:45.005654 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3575040:3637760, ack 1, win 28, length 62720
17:39:45.005675 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3637760:3700480, ack 1, win 28, length 62720
17:39:45.005710 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3700480:3763200, ack 1, win 28, length 62720
17:39:45.005739 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 2956800, win 11851, length 0
17:39:45.005765 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3763200:3825920, ack 1, win 28, length 62720
17:39:45.005798 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3825920:3888640, ack 1, win 28, length 62720
17:39:45.005824 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 2974720, win 11841, length 0
17:39:45.005826 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3001600, win 11827, length 0
17:39:45.005827 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3019520, win 11961, length 0
17:39:45.005829 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3046400, win 11947, length 0
17:39:45.005831 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3073280, win 11932, length 0
17:39:45.005832 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3100160, win 11918, length 0
17:39:45.005834 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3127040, win 11904, length 0
17:39:45.005835 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3136000, win 11898, length 0
17:39:45.005837 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3162880, win 12029, length 0
17:39:45.005838 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3171840, win 12023, length 0
17:39:45.005840 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3198720, win 12009, length 0
17:39:45.005841 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3207680, win 12004, length 0
17:39:45.005842 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3234560, win 11990, length 0
17:39:45.005844 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3252480, win 11980, length 0
17:39:45.005845 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3297280, win 12092, length 0
17:39:45.005859 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3888640:3951360, ack 1, win 28, length 62720
17:39:45.005892 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 3951360:4014080, ack 1, win 28, length 62720
17:39:45.005899 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3395840, win 12146, length 0
17:39:45.005903 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3422720, win 12146, length 0
17:39:45.005904 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3449600, win 12146, length 0
17:39:45.005906 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3512320, win 12115, length 0
17:39:45.005923 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 4014080:4076800, ack 1, win 28, length 62720
17:39:45.005946 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3539200, win 12146, length 0
17:39:45.005948 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3566080, win 12146, length 0
17:39:45.005996 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3601920, win 12146, length 0
17:39:45.005998 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3628800, win 12146, length 0
17:39:45.005999 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3646720, win 12146, length 0
17:39:45.006002 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 4076800:4139520, ack 1, win 28, length 62720
17:39:45.006040 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3673600, win 12146, length 0
17:39:45.006043 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3700480, win 12146, length 0
17:39:45.006045 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 4139520:4202240, ack 1, win 28, length 62720
17:39:45.006085 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 4202240:4264960, ack 1, win 28, length 62720
17:39:45.006087 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3718400, win 12146, length 0
17:39:45.006111 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 4264960:4327680, ack 1, win 28, length 62720
17:39:45.006135 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3745280, win 12146, length 0
17:39:45.006137 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3772160, win 12146, length 0
17:39:45.006139 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3790080, win 12146, length 0
17:39:45.006168 IP 10.30.59.58.1556 > 10.31.112.14.30284: Flags [.], seq 4327680:4390400, ack 1, win 28, length 62720
17:39:45.006197 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3816960, win 12146, length 0
17:39:45.006200 IP 10.31.112.14.30284 > 10.30.59.58.1556: Flags [.], ack 3843840, win 12146, length 0

^ permalink raw reply

* Re: [PATCH net 1/2] ipv4: igmp: use alarmtimer to prevent delayed reports
From: Tejaswi Tanikella @ 2018-06-14 13:14 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev, f.fainelli, davem
In-Reply-To: <20180613144437.GA31647@lunn.ch>

On Wed, Jun 13, 2018 at 04:44:37PM +0200, Andrew Lunn wrote:
> While it has been asleep, it has also been dropping any multicast
> traffic in the stream. So it does not really matter it has left the
> group. You were not receiving the packets anyway.
> 
> Think about this from another angle. I have an NTP client running on
> my laptop, using multicast address 224.0.1.1. I suspend my laptop and
> walk away for two hours. When I come back, I find that 20 seconds
> after I suspended it, it resumed and sent a group response
> message. And an hour later, since it was still running, the battery
> went flat.
> 
> It seems to me, the change you are proposing cannot be the default
> behaviour.
> 
> I actually think you need to be looking at some sort of WoL feature.
> You need the multicast stream data packets to wake you, and you also
> need to wake up on the IGMP query message. And you need to wake up to
> send the group membership. Does your hardware have this sort of WoL
> support? You can then explicitly enable this WoL for your application.
> 
> 	Andrew

Thanks Andrew.
You are right, this should not be the default behaviour.

-Tejaswi

^ permalink raw reply

* Re: [B.A.T.M.A.N.] [PATCH v2 3/5] batman: use BIT_ULL for NL80211_STA_INFO_* attribute types
From: Omer Efrat @ 2018-06-14 12:50 UTC (permalink / raw)
  To: Sven Eckelmann, Johannes Berg
  Cc: b.a.t.m.a.n@lists.open-mesh.org, netdev@vger.kernel.org,
	linux-wireless@vger.kernel.org
In-Reply-To: <7318287.kp0SrnPS43@bentobox>

Sven Eckelmann wrote:
>@Omer: If you want it as a cleanup patch then make it clear in the patch that
>the warning you've shown here is actually not something which you will
>see in the modified code.

I will send v3 as a cleanup patch.

Omer Efrat.

________________________________________
From: Sven Eckelmann <sven@narfation.org>
Sent: Thursday, June 14, 2018 2:20:17 PM
To: Johannes Berg
Cc: b.a.t.m.a.n@lists.open-mesh.org; Omer Efrat; netdev@vger.kernel.org; linux-wireless@vger.kernel.org
Subject: Re: [B.A.T.M.A.N.] [PATCH v2 3/5] batman: use BIT_ULL for NL80211_STA_INFO_* attribute types

On Donnerstag, 14. Juni 2018 13:05:16 CEST Johannes Berg wrote:
[...]
> > in commit 739960f128e5 ("cfg80211/nl80211: Add support for
> > NL80211_STA_INFO_RX_DURATION")
>
> Yeah, which actually means this patch isn't needed?
>
> BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT) is fine since
> NL80211_STA_INFO_EXPECTED_THROUGHPUT is actually == 27.

Hadn't verified this before but this would make sense. So no fixes here - just
some "cleanup" patch to make these tests more consistent. Thanks for checking.

@Omer: If you want it as a cleanup patch then make it clear in the patch that
the warning you've shown here is actually not something which you will
see in the modified code.

Kind regards,
        Sven

^ permalink raw reply

* Re: Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net
From: Michael S. Tsirkin @ 2018-06-14 12:50 UTC (permalink / raw)
  To: Siwei Liu
  Cc: Samudrala, Sridhar, Jason Wang, Alexander Duyck, virtio-dev,
	qemu-devel, Jiri Pirko, Jakub Kicinski, Netdev, Brandeburg, Jesse,
	virtualization, aaron.f.brown
In-Reply-To: <CADGSJ213f8tJpNXuOhv8qRew-Y5VZAwA+srNMrLZYnKdVGLdAA@mail.gmail.com>

On Wed, Jun 13, 2018 at 06:02:01PM -0700, Siwei Liu wrote:
> >> And it's the guest that needs failover support not the VM.
> >
> >
> > Isn't guest and VM synonymous?

Guest is whatever software is running on top of the hypervisor.

The virtual machine is the interface between the two.

-- 
MST

^ permalink raw reply

* WARNING in sk_stream_kill_queues (3)
From: syzbot @ 2018-06-14 12:47 UTC (permalink / raw)
  To: davem, gregkh, kstewart, linux-kernel, netdev, pombredanne,
	syzkaller-bugs, tglx

Hello,

syzbot found the following crash on:

HEAD commit:    81c310582f0e kmsan: unpoison virtio input buffers when add..
git tree:       https://github.com/google/kmsan.git/master
console output: https://syzkaller.appspot.com/x/log.txt?x=1747c21f800000
kernel config:  https://syzkaller.appspot.com/x/.config?x=848e40757852af3e
dashboard link: https://syzkaller.appspot.com/bug?extid=13e1ee9caeab5a9abc62
compiler:       clang version 7.0.0 (trunk 334104)
syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=105f5eaf800000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=13b15b6f800000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+13e1ee9caeab5a9abc62@syzkaller.appspotmail.com

WARNING: CPU: 0 PID: 4964 at net/core/stream.c:206  
sk_stream_kill_queues+0x944/0x970 net/core/stream.c:206
Kernel panic - not syncing: panic_on_warn set ...

CPU: 0 PID: 4964 Comm: syz-executor457 Not tainted 4.17.0+ #6
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x185/0x1d0 lib/dump_stack.c:113
  panic+0x3d0/0x990 kernel/panic.c:184
  __warn+0x40f/0x580 kernel/panic.c:536
  report_bug+0x72a/0x880 lib/bug.c:186
  fixup_bug arch/x86/kernel/traps.c:179 [inline]
  do_error_trap+0x1c1/0x620 arch/x86/kernel/traps.c:298
  do_invalid_op+0x46/0x50 arch/x86/kernel/traps.c:317
  invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:sk_stream_kill_queues+0x944/0x970 net/core/stream.c:206
RSP: 0018:ffff8801a867f368 EFLAGS: 00010293
RAX: ffffffff87dbf654 RBX: 0000000000000813 RCX: ffff8801ab7bd7c0
RDX: 0000000000000000 RSI: aaaaaaaaaaaab000 RDI: ffffea0000000000
RBP: ffff8801a867f3e8 R08: 0000000000000000 R09: 0000000000000002
R10: ffff8801a66d3a00 R11: ffffffff88c44c40 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000813
  inet_csk_destroy_sock+0x2a4/0x5d0 net/ipv4/inet_connection_sock.c:833
  tcp_close+0xe37/0x18f0 net/ipv4/tcp.c:2323
  tls_sk_proto_close+0xc2f/0xcd0 net/tls/tls_main.c:291
  inet_release+0x249/0x2b0 net/ipv4/af_inet.c:427
  inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:460
  sock_release net/socket.c:594 [inline]
  sock_close+0xeb/0x310 net/socket.c:1149
  __fput+0x458/0xa30 fs/file_table.c:209
  ____fput+0x37/0x40 fs/file_table.c:243
  task_work_run+0x22e/0x2b0 kernel/task_work.c:113
  exit_task_work include/linux/task_work.h:22 [inline]
  do_exit+0x110e/0x3930 kernel/exit.c:867
  do_group_exit+0x1a0/0x360 kernel/exit.c:970
  get_signal+0x1405/0x1ec0 kernel/signal.c:2482
  do_signal+0xb8/0x1d20 arch/x86/kernel/signal.c:810
  exit_to_usermode_loop arch/x86/entry/common.c:162 [inline]
  prepare_exit_to_usermode+0x271/0x3a0 arch/x86/entry/common.c:196
  syscall_return_slowpath+0xe9/0x710 arch/x86/entry/common.c:265
  do_syscall_64+0x1ad/0x230 arch/x86/entry/common.c:290
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x447ce9
RSP: 002b:00007feb54132d98 EFLAGS: 00000212 ORIG_RAX: 000000000000002c
RAX: 0000000000008000 RBX: 00000000006dec5c RCX: 0000000000447ce9
RDX: 00000000fffffdef RSI: 00000000200005c0 RDI: 0000000000000007
RBP: 0000000000000000 R08: 0000000020000000 R09: 000000000000001c
R10: 0000000000000000 R11: 0000000000000212 R12: 00000000006dec58
R13: 0100000000000000 R14: 00007feb541339c0 R15: 000000000000000c
Dumping ftrace buffer:
    (ftrace buffer empty)
Kernel Offset: disabled
Rebooting in 86400 seconds..


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with  
syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* [PATCH] net: Fix device name resolving crash in default_device_exit()
From: Kirill Tkhai @ 2018-06-14 12:38 UTC (permalink / raw)
  To: netdev
  Cc: davem, daniel, jakub.kicinski, ktkhai, ast, linux, john.fastabend,
	brouer, dsahern

The following script makes the kernel crash, since it can't obtain
a name for a device when the name is occupied by another device:

#!/bin/bash
ifconfig eth0 down
ifconfig eth1 down
index=`cat /sys/class/net/eth1/ifindex`
ip link set eth1 name dev$index
unshare -n sleep 1h &
pid=$!
while [[ "`readlink /proc/self/ns/net`" == "`readlink /proc/$pid/ns/net`" ]]; do continue; done
ip link set dev$index netns $pid
ip link set eth0 name dev$index
kill -9 $pid

Kernel messages:

virtio_net virtio1 dev3: renamed from eth1
virtio_net virtio0 dev3: renamed from eth0
default_device_exit: failed to move dev3 to init_net: -17
------------[ cut here ]------------
kernel BUG at net/core/dev.c:8978!
invalid opcode: 0000 [#1] PREEMPT SMP
CPU: 1 PID: 276 Comm: kworker/u8:3 Not tainted 4.17.0+ #292
Workqueue: netns cleanup_net
RIP: 0010:default_device_exit+0x9c/0xb0
[stack trace snipped]

This patch allows more flexibility when choosing the new name
of the device, which fixes the problem.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 net/core/dev.c |    4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 6e18242a1cae..6c9b9303ded6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8959,7 +8959,6 @@ static void __net_exit default_device_exit(struct net *net)
 	rtnl_lock();
 	for_each_netdev_safe(net, dev, aux) {
 		int err;
-		char fb_name[IFNAMSIZ];
 
 		/* Ignore unmoveable devices (i.e. loopback) */
 		if (dev->features & NETIF_F_NETNS_LOCAL)
@@ -8970,8 +8969,7 @@ static void __net_exit default_device_exit(struct net *net)
 			continue;
 
 		/* Push remaining network devices to init_net */
-		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
-		err = dev_change_net_namespace(dev, &init_net, fb_name);
+		err = dev_change_net_namespace(dev, &init_net, "dev%d");
 		if (err) {
 			pr_emerg("%s: failed to move %s to init_net: %d\n",
 				 __func__, dev->name, err);

^ permalink raw reply related

* [PATCH 3/3] net: dsa: Add Vitesse VSC73xx DSA router driver
From: Linus Walleij @ 2018-06-14 12:35 UTC (permalink / raw)
  To: Andrew Lunn, Vivien Didelot, Florian Fainelli
  Cc: netdev, openwrt-devel, LEDE Development List, Gabor Juhos,
	Linus Walleij
In-Reply-To: <20180614123534.8063-1-linus.walleij@linaro.org>

This adds a DSA driver for:

Vitesse VSC7385 SparX-G5 5-port Integrated Gigabit Ethernet Switch
Vitesse VSC7388 SparX-G8 8-port Integrated Gigabit Ethernet Switch
Vitesse VSC7395 SparX-G5e 5+1-port Integrated Gigabit Ethernet Switch
Vitesse VSC7398 SparX-G8e 8-port Integrated Gigabit Ethernet Switch

These switches have a built-in 8051 CPU and can download and execute
firmware in this CPU. They can also be configured to use an external
CPU handling the switch in a memory-mapped manner by connecting to
that external CPU's memory bus.

This driver (currently) only takes control of the switch chip over
SPI and configures it to route packets around when connected to a
CPU port. The chip has embedded PHYs and VLAN support so we model it
using DSA as a best fit so we can easily add VLAN support and maybe
later also exploit the internal frame header to get more direct
control over the switch.

The four built-in GPIO lines are exposed using a standard GPIO chip.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/net/dsa/Kconfig           |   12 +
 drivers/net/dsa/Makefile          |    1 +
 drivers/net/dsa/vitesse-vsc73xx.c | 1362 +++++++++++++++++++++++++++++
 3 files changed, 1375 insertions(+)
 create mode 100644 drivers/net/dsa/vitesse-vsc73xx.c

diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig
index 2b81b97e994f..2f6207b969e3 100644
--- a/drivers/net/dsa/Kconfig
+++ b/drivers/net/dsa/Kconfig
@@ -76,4 +76,16 @@ config NET_DSA_SMSC_LAN9303_MDIO
 	  Enable access functions if the SMSC/Microchip LAN9303 is configured
 	  for MDIO managed mode.
 
+config NET_DSA_VITESSE_VSC73XX
+	tristate "Vitesse VSC7385/7388/7395/7398 support"
+	depends on OF && SPI
+	depends on NET_DSA
+	select FIXED_PHY
+	select VITESSE_PHY
+	select NET_DSA_TAG_TRAILER
+	select GPIOLIB
+	---help---
+	  This enables support for the Vitesse VSC7385, VSC7388,
+	  VSC7395 and VSC7398 SparX integrated ethernet switches.
+
 endmenu
diff --git a/drivers/net/dsa/Makefile b/drivers/net/dsa/Makefile
index 15c2a831edf1..d4f873ae2f6a 100644
--- a/drivers/net/dsa/Makefile
+++ b/drivers/net/dsa/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_NET_DSA_QCA8K)	+= qca8k.o
 obj-$(CONFIG_NET_DSA_SMSC_LAN9303) += lan9303-core.o
 obj-$(CONFIG_NET_DSA_SMSC_LAN9303_I2C) += lan9303_i2c.o
 obj-$(CONFIG_NET_DSA_SMSC_LAN9303_MDIO) += lan9303_mdio.o
+obj-$(CONFIG_NET_DSA_VITESSE_VSC73XX) += vitesse-vsc73xx.o
 obj-y				+= b53/
 obj-y				+= microchip/
 obj-y				+= mv88e6xxx/
diff --git a/drivers/net/dsa/vitesse-vsc73xx.c b/drivers/net/dsa/vitesse-vsc73xx.c
new file mode 100644
index 000000000000..cf478856e53f
--- /dev/null
+++ b/drivers/net/dsa/vitesse-vsc73xx.c
@@ -0,0 +1,1362 @@
+// SPDX-License-Identifier: GPL-2.0
+/* DSA driver for:
+ * Vitesse VSC7385 SparX-G5 5+1-port Integrated Gigabit Ethernet Switch
+ * Vitesse VSC7388 SparX-G8 8-port Integrated Gigabit Ethernet Switch
+ * Vitesse VSC7395 SparX-G5e 5+1-port Integrated Gigabit Ethernet Switch
+ * Vitesse VSC7398 SparX-G8e 8-port Integrated Gigabit Ethernet Switch
+ *
+ * These switches have a built-in 8051 CPU and can download and execute a
+ * firmware in this CPU. They can also be configured to use an external CPU
+ * handling the switch in a memory-mapped manner by connecting to that external
+ * CPU's memory bus.
+ *
+ * This driver (currently) only takes control of the switch chip over SPI and
+ * configures it to route packets around when connected to a CPU port. The
+ * chip has embedded PHYs and VLAN support so we model it using DSA.
+ *
+ * Copyright (C) 2018 Linus Walleij <linus.walleij@linaro.org>
+ * Includes portions of code from the firmware uploader by:
+ * Copyright (C) 2009 Gabor Juhos <juhosg@openwrt.org>
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_mdio.h>
+#include <linux/platform_device.h>
+#include <linux/spi/spi.h>
+#include <linux/bitops.h>
+#include <linux/if_bridge.h>
+#include <linux/etherdevice.h>
+#include <linux/gpio/consumer.h>
+#include <linux/gpio/driver.h>
+#include <linux/random.h>
+#include <net/dsa.h>
+
+#define VSC73XX_BLOCK_MAC	0x1 /* Subblocks 0-4, 6 (CPU port) */
+#define VSC73XX_BLOCK_ANALYZER	0x2 /* Only subblock 0 */
+#define VSC73XX_BLOCK_MII	0x3 /* Subblocks 0 and 1 */
+#define VSC73XX_BLOCK_MEMINIT	0x3 /* Only subblock 2 */
+#define VSC73XX_BLOCK_CAPTURE	0x4 /* Only subblock 2 */
+#define VSC73XX_BLOCK_ARBITER	0x5 /* Only subblock 0 */
+#define VSC73XX_BLOCK_SYSTEM	0x7 /* Only subblock 0 */
+
+#define CPU_PORT	6 /* CPU port */
+
+/* MAC Block registers */
+#define VSC73XX_MAC_CFG		0x00
+#define VSC73XX_MACHDXGAP	0x02
+#define VSC73XX_FCCONF		0x04
+#define VSC73XX_FCMACHI		0x08
+#define VSC73XX_FCMACLO		0x0c
+#define VSC73XX_MAXLEN		0x10
+#define VSC73XX_ADVPORTM	0x19
+#define VSC73XX_TXUPDCFG	0x24
+#define VSC73XX_TXQ_SELECT_CFG	0x28
+#define VSC73XX_RXOCT		0x50
+#define VSC73XX_TXOCT		0x51
+#define VSC73XX_C_RX0		0x52
+#define VSC73XX_C_RX1		0x53
+#define VSC73XX_C_RX2		0x54
+#define VSC73XX_C_TX0		0x55
+#define VSC73XX_C_TX1		0x56
+#define VSC73XX_C_TX2		0x57
+#define VSC73XX_C_CFG		0x58
+#define VSC73XX_CAT_DROP	0x6e
+#define VSC73XX_CAT_PR_MISC_L2	0x6f
+#define VSC73XX_CAT_PR_USR_PRIO	0x75
+#define VSC73XX_Q_MISC_CONF	0xdf
+
+/* MAC_CFG register bits */
+#define VSC73XX_MAC_CFG_WEXC_DIS	BIT(31)
+#define VSC73XX_MAC_CFG_PORT_RST	BIT(29)
+#define VSC73XX_MAC_CFG_TX_EN		BIT(28)
+#define VSC73XX_MAC_CFG_SEED_LOAD	BIT(27)
+#define VSC73XX_MAC_CFG_SEED_MASK	GENMASK(26, 19)
+#define VSC73XX_MAC_CFG_SEED_OFFSET	19
+#define VSC73XX_MAC_CFG_FDX		BIT(18)
+#define VSC73XX_MAC_CFG_GIGA_MODE	BIT(17)
+#define VSC73XX_MAC_CFG_RX_EN		BIT(16)
+#define VSC73XX_MAC_CFG_VLAN_DBLAWR	BIT(15)
+#define VSC73XX_MAC_CFG_VLAN_AWR	BIT(14)
+#define VSC73XX_MAC_CFG_100_BASE_T	BIT(13) /* Not in manual */
+#define VSC73XX_MAC_CFG_TX_IPG_MASK	GENMASK(10, 6)
+#define VSC73XX_MAC_CFG_TX_IPG_OFFSET	6
+#define VSC73XX_MAC_CFG_TX_IPG_1000M	(6 << VSC73XX_MAC_CFG_TX_IPG_OFFSET)
+#define VSC73XX_MAC_CFG_TX_IPG_100_10M	(17 << VSC73XX_MAC_CFG_TX_IPG_OFFSET)
+#define VSC73XX_MAC_CFG_MAC_RX_RST	BIT(5)
+#define VSC73XX_MAC_CFG_MAC_TX_RST	BIT(4)
+#define VSC73XX_MAC_CFG_CLK_SEL_MASK	GENMASK(2, 0)
+#define VSC73XX_MAC_CFG_CLK_SEL_OFFSET	0
+#define VSC73XX_MAC_CFG_CLK_SEL_1000M	1
+#define VSC73XX_MAC_CFG_CLK_SEL_100M	2
+#define VSC73XX_MAC_CFG_CLK_SEL_10M	3
+#define VSC73XX_MAC_CFG_CLK_SEL_EXT	4
+
+#define VSC73XX_MAC_CFG_1000M_F_PHY	(VSC73XX_MAC_CFG_FDX | \
+					 VSC73XX_MAC_CFG_GIGA_MODE | \
+					 VSC73XX_MAC_CFG_TX_IPG_1000M | \
+					 VSC73XX_MAC_CFG_CLK_SEL_EXT)
+#define VSC73XX_MAC_CFG_100_10M_F_PHY	(VSC73XX_MAC_CFG_FDX | \
+					 VSC73XX_MAC_CFG_TX_IPG_100_10M | \
+					 VSC73XX_MAC_CFG_CLK_SEL_EXT)
+#define VSC73XX_MAC_CFG_100_10M_H_PHY	(VSC73XX_MAC_CFG_TX_IPG_100_10M | \
+					 VSC73XX_MAC_CFG_CLK_SEL_EXT)
+#define VSC73XX_MAC_CFG_1000M_F_RGMII	(VSC73XX_MAC_CFG_FDX | \
+					 VSC73XX_MAC_CFG_GIGA_MODE | \
+					 VSC73XX_MAC_CFG_TX_IPG_1000M | \
+					 VSC73XX_MAC_CFG_CLK_SEL_1000M)
+#define VSC73XX_MAC_CFG_RESET		(VSC73XX_MAC_CFG_PORT_RST | \
+					 VSC73XX_MAC_CFG_MAC_RX_RST | \
+					 VSC73XX_MAC_CFG_MAC_TX_RST)
+
+/* Flow control register bits */
+#define VSC73XX_FCCONF_ZERO_PAUSE_EN	BIT(17)
+#define VSC73XX_FCCONF_FLOW_CTRL_OBEY	BIT(16)
+#define VSC73XX_FCCONF_PAUSE_VAL_MASK	GENMASK(15, 0)
+
+/* ADVPORTM advanced port setup register bits */
+#define VSC73XX_ADVPORTM_IFG_PPM	BIT(7)
+#define VSC73XX_ADVPORTM_EXC_COL_CONT	BIT(6)
+#define VSC73XX_ADVPORTM_EXT_PORT	BIT(5)
+#define VSC73XX_ADVPORTM_INV_GTX	BIT(4)
+#define VSC73XX_ADVPORTM_ENA_GTX	BIT(3)
+#define VSC73XX_ADVPORTM_DDR_MODE	BIT(2)
+#define VSC73XX_ADVPORTM_IO_LOOPBACK	BIT(1)
+#define VSC73XX_ADVPORTM_HOST_LOOPBACK	BIT(0)
+
+/* CAT_DROP categorizer frame dropping register bits */
+#define VSC73XX_CAT_DROP_DROP_MC_SMAC_ENA	BIT(6)
+#define VSC73XX_CAT_DROP_FWD_CTRL_ENA		BIT(4)
+#define VSC73XX_CAT_DROP_FWD_PAUSE_ENA		BIT(3)
+#define VSC73XX_CAT_DROP_UNTAGGED_ENA		BIT(2)
+#define VSC73XX_CAT_DROP_TAGGED_ENA		BIT(1)
+#define VSC73XX_CAT_DROP_NULL_MAC_ENA		BIT(0)
+
+#define VSC73XX_Q_MISC_CONF_EXTENT_MEM		BIT(31)
+#define VSC73XX_Q_MISC_CONF_EARLY_TX_MASK	GENMASK(4, 1)
+#define VSC73XX_Q_MISC_CONF_EARLY_TX_512	(1 << 1)
+#define VSC73XX_Q_MISC_CONF_MAC_PAUSE_MODE	BIT(0)
+
+/* Frame analyzer block 2 registers */
+#define VSC73XX_STORMLIMIT	0x02
+#define VSC73XX_ADVLEARN	0x03
+#define VSC73XX_IFLODMSK	0x04
+#define VSC73XX_VLANMASK	0x05
+#define VSC73XX_MACHDATA	0x06
+#define VSC73XX_MACLDATA	0x07
+#define VSC73XX_ANMOVED		0x08
+#define VSC73XX_ANAGEFIL	0x09
+#define VSC73XX_ANEVENTS	0x0a
+#define VSC73XX_ANCNTMASK	0x0b
+#define VSC73XX_ANCNTVAL	0x0c
+#define VSC73XX_LEARNMASK	0x0d
+#define VSC73XX_UFLODMASK	0x0e
+#define VSC73XX_MFLODMASK	0x0f
+#define VSC73XX_RECVMASK	0x10
+#define VSC73XX_AGGRCTRL	0x20
+#define VSC73XX_AGGRMSKS	0x30 /* Until 0x3f */
+#define VSC73XX_DSTMASKS	0x40 /* Until 0x7f */
+#define VSC73XX_SRCMASKS	0x80 /* Until 0x87 */
+#define VSC73XX_CAPENAB		0xa0
+#define VSC73XX_MACACCESS	0xb0
+#define VSC73XX_IPMCACCESS	0xb1
+#define VSC73XX_MACTINDX	0xc0
+#define VSC73XX_VLANACCESS	0xd0
+#define VSC73XX_VLANTIDX	0xe0
+#define VSC73XX_AGENCTRL	0xf0
+#define VSC73XX_CAPRST		0xff
+
+#define VSC73XX_MACACCESS_CPU_COPY		BIT(14)
+#define VSC73XX_MACACCESS_FWD_KILL		BIT(13)
+#define VSC73XX_MACACCESS_IGNORE_VLAN		BIT(12)
+#define VSC73XX_MACACCESS_AGED_FLAG		BIT(11)
+#define VSC73XX_MACACCESS_VALID			BIT(10)
+#define VSC73XX_MACACCESS_LOCKED		BIT(9)
+#define VSC73XX_MACACCESS_DEST_IDX_MASK		GENMASK(8, 3)
+#define VSC73XX_MACACCESS_CMD_MASK		GENMASK(2, 0)
+#define VSC73XX_MACACCESS_CMD_IDLE		0
+#define VSC73XX_MACACCESS_CMD_LEARN		1
+#define VSC73XX_MACACCESS_CMD_FORGET		2
+#define VSC73XX_MACACCESS_CMD_AGE_TABLE		3
+#define VSC73XX_MACACCESS_CMD_FLUSH_TABLE	4
+#define VSC73XX_MACACCESS_CMD_CLEAR_TABLE	5
+#define VSC73XX_MACACCESS_CMD_READ_ENTRY	6
+#define VSC73XX_MACACCESS_CMD_WRITE_ENTRY	7
+
+#define VSC73XX_VLANACCESS_LEARN_DISABLED	BIT(30)
+#define VSC73XX_VLANACCESS_VLAN_MIRROR		BIT(29)
+#define VSC73XX_VLANACCESS_VLAN_SRC_CHECK	BIT(28)
+#define VSC73XX_VLANACCESS_VLAN_PORT_MASK	GENMASK(9, 2)
+#define VSC73XX_VLANACCESS_VLAN_TBL_CMD_MASK	GENMASK(2, 0)
+#define VSC73XX_VLANACCESS_VLAN_TBL_CMD_IDLE	0
+#define VSC73XX_VLANACCESS_VLAN_TBL_CMD_READ_ENTRY	1
+#define VSC73XX_VLANACCESS_VLAN_TBL_CMD_WRITE_ENTRY	2
+#define VSC73XX_VLANACCESS_VLAN_TBL_CMD_CLEAR_TABLE	3
+
+/* MII block 3 registers */
+#define VSC73XX_MII_STAT	0x0
+#define VSC73XX_MII_CMD		0x1
+#define VSC73XX_MII_DATA	0x2
+
+/* Arbiter block 5 registers */
+#define VSC73XX_ARBEMPTY		0x0c
+#define VSC73XX_ARBDISC			0x0e
+#define VSC73XX_SBACKWDROP		0x12
+#define VSC73XX_DBACKWDROP		0x13
+#define VSC73XX_ARBBURSTPROB		0x15
+
+/* System block 7 registers */
+#define VSC73XX_ICPU_SIPAD		0x01
+#define VSC73XX_GMIIDELAY		0x05
+#define VSC73XX_ICPU_CTRL		0x10
+#define VSC73XX_ICPU_ADDR		0x11
+#define VSC73XX_ICPU_SRAM		0x12
+#define VSC73XX_HWSEM			0x13
+#define VSC73XX_GLORESET		0x14
+#define VSC73XX_ICPU_MBOX_VAL		0x15
+#define VSC73XX_ICPU_MBOX_SET		0x16
+#define VSC73XX_ICPU_MBOX_CLR		0x17
+#define VSC73XX_CHIPID			0x18
+#define VSC73XX_GPIO			0x34
+
+#define VSC73XX_GMIIDELAY_GMII0_GTXDELAY_NONE	0
+#define VSC73XX_GMIIDELAY_GMII0_GTXDELAY_1_4_NS	1
+#define VSC73XX_GMIIDELAY_GMII0_GTXDELAY_1_7_NS	2
+#define VSC73XX_GMIIDELAY_GMII0_GTXDELAY_2_0_NS	3
+
+#define VSC73XX_GMIIDELAY_GMII0_RXDELAY_NONE	(0 << 4)
+#define VSC73XX_GMIIDELAY_GMII0_RXDELAY_1_4_NS	(1 << 4)
+#define VSC73XX_GMIIDELAY_GMII0_RXDELAY_1_7_NS	(2 << 4)
+#define VSC73XX_GMIIDELAY_GMII0_RXDELAY_2_0_NS	(3 << 4)
+
+#define VSC73XX_ICPU_CTRL_WATCHDOG_RST	BIT(31)
+#define VSC73XX_ICPU_CTRL_CLK_DIV_MASK	GENMASK(12, 8)
+#define VSC73XX_ICPU_CTRL_SRST_HOLD	BIT(7)
+#define VSC73XX_ICPU_CTRL_ICPU_PI_EN	BIT(6)
+#define VSC73XX_ICPU_CTRL_BOOT_EN	BIT(3)
+#define VSC73XX_ICPU_CTRL_EXT_ACC_EN	BIT(2)
+#define VSC73XX_ICPU_CTRL_CLK_EN	BIT(1)
+#define VSC73XX_ICPU_CTRL_SRST		BIT(0)
+
+#define VSC73XX_CHIPID_ID_SHIFT		12
+#define VSC73XX_CHIPID_ID_MASK		0xffff
+#define VSC73XX_CHIPID_REV_SHIFT	28
+#define VSC73XX_CHIPID_REV_MASK		0xf
+#define VSC73XX_CHIPID_ID_7385		0x7385
+#define VSC73XX_CHIPID_ID_7388		0x7388
+#define VSC73XX_CHIPID_ID_7395		0x7395
+#define VSC73XX_CHIPID_ID_7398		0x7398
+
+#define VSC73XX_GLORESET_STROBE		BIT(4)
+#define VSC73XX_GLORESET_ICPU_LOCK	BIT(3)
+#define VSC73XX_GLORESET_MEM_LOCK	BIT(2)
+#define VSC73XX_GLORESET_PHY_RESET	BIT(1)
+#define VSC73XX_GLORESET_MASTER_RESET	BIT(0)
+
+#define VSC73XX_CMD_MODE_READ		0
+#define VSC73XX_CMD_MODE_WRITE		1
+#define VSC73XX_CMD_MODE_SHIFT		4
+#define VSC73XX_CMD_BLOCK_SHIFT		5
+#define VSC73XX_CMD_BLOCK_MASK		0x7
+#define VSC73XX_CMD_SUBBLOCK_MASK	0xf
+
+#define VSC7385_CLOCK_DELAY		((3 << 4) | 3)
+#define VSC7385_CLOCK_DELAY_MASK	((3 << 4) | 3)
+
+#define VSC73XX_ICPU_CTRL_STOP	(VSC73XX_ICPU_CTRL_SRST_HOLD | \
+				 VSC73XX_ICPU_CTRL_BOOT_EN | \
+				 VSC73XX_ICPU_CTRL_EXT_ACC_EN)
+/* NOTE(review): VSC73XX_ICPU_CTRL_CLK_DIV is not defined in the visible part
+ * of this file (only VSC73XX_ICPU_CTRL_CLK_DIV_MASK is), so expanding this
+ * macro would presumably fail to build - confirm the intended divider value.
+ */
+#define VSC73XX_ICPU_CTRL_START	(VSC73XX_ICPU_CTRL_CLK_DIV | \
+				 VSC73XX_ICPU_CTRL_BOOT_EN | \
+				 VSC73XX_ICPU_CTRL_CLK_EN | \
+				 VSC73XX_ICPU_CTRL_SRST)
+
+/**
+ * struct vsc73xx - VSC73xx state container
+ * @dev: device used for the driver's log messages
+ * @reset: GPIO descriptor pulsed by vsc73xx_detect() when the chip seems dead
+ * @spi: SPI device that all register reads and writes go through
+ * @ds: DSA switch instance (presumably set up at probe time - not visible
+ *	in this part of the file)
+ * @gc: GPIO chip, presumably for the chip's built-in GPIO lines
+ * @chipid: chip ID field of the CHIPID register, stored by vsc73xx_detect()
+ * @is_vsc7385: set by vsc73xx_detect() when the chip ID is 0x7385
+ * @is_vsc7388: set by vsc73xx_detect() when the chip ID is 0x7388
+ * @is_vsc7395: set by vsc73xx_detect() when the chip ID is 0x7395
+ * @is_vsc7398: set by vsc73xx_detect() when the chip ID is 0x7398
+ * @addr: MAC address written to the per-port flow control MAC registers
+ * @lock: held around every SPI message sent to the chip
+ */
+struct vsc73xx {
+	struct device		*dev;
+	struct gpio_desc	*reset;
+	struct spi_device	*spi;
+	struct dsa_switch	*ds;
+	struct gpio_chip	gc;
+	u16			chipid;
+	bool			is_vsc7385;
+	bool			is_vsc7388;
+	bool			is_vsc7395;
+	bool			is_vsc7398;
+	u8			addr[ETH_ALEN];
+	struct mutex		lock; /* Protects SPI traffic */
+};
+
+struct vsc73xx_counter {
+	u8 counter; /* index matched by vsc73xx_find_counter() */
+	const char *name; /* name string copied out for this counter */
+};
+
+/* Counters are named according to the MIB standards where applicable.
+ * Some counters are custom, non-standard. The standard counters are
+ * named in accordance with RFC2819, RFC2021 and IEEE Std 802.3-2002 Annex
+ * 30A Counters.
+ */
+static const struct vsc73xx_counter vsc73xx_rx_counters[] = {
+	{ 0, "RxEtherStatsPkts" },
+	{ 1, "RxBroadcast+MulticastPkts" }, /* non-standard counter */
+	{ 2, "RxTotalErrorPackets" }, /* non-standard counter */
+	{ 3, "RxEtherStatsBroadcastPkts" },
+	{ 4, "RxEtherStatsMulticastPkts" },
+	{ 5, "RxEtherStatsPkts64Octets" },
+	{ 6, "RxEtherStatsPkts65to127Octets" },
+	{ 7, "RxEtherStatsPkts128to255Octets" },
+	{ 8, "RxEtherStatsPkts256to511Octets" },
+	{ 9, "RxEtherStatsPkts512to1023Octets" },
+	{ 10, "RxEtherStatsPkts1024to1518Octets" },
+	{ 11, "RxJumboFrames" }, /* non-standard counter */
+	{ 12, "RxaPauseMACControlFramesTransmitted" },
+	{ 13, "RxFIFODrops" }, /* non-standard counter */
+	{ 14, "RxBackwardDrops" }, /* non-standard counter */
+	{ 15, "RxClassifierDrops" }, /* non-standard counter */
+	{ 16, "RxEtherStatsCRCAlignErrors" },
+	{ 17, "RxEtherStatsUndersizePkts" },
+	{ 18, "RxEtherStatsOversizePkts" },
+	{ 19, "RxEtherStatsFragments" },
+	{ 20, "RxEtherStatsJabbers" },
+	{ 21, "RxaMACControlFramesReceived" },
+	/* 22-24 are undefined */
+	{ 25, "RxaFramesReceivedOK" },
+	{ 26, "RxQoSClass0" }, /* non-standard counter */
+	{ 27, "RxQoSClass1" }, /* non-standard counter */
+	{ 28, "RxQoSClass2" }, /* non-standard counter */
+	{ 29, "RxQoSClass3" }, /* non-standard counter */
+};
+
+static const struct vsc73xx_counter vsc73xx_tx_counters[] = {
+	{ 0, "TxEtherStatsPkts" },
+	{ 1, "TxBroadcast+MulticastPkts" }, /* non-standard counter */
+	{ 2, "TxTotalErrorPackets" }, /* non-standard counter */
+	{ 3, "TxEtherStatsBroadcastPkts" },
+	{ 4, "TxEtherStatsMulticastPkts" },
+	{ 5, "TxEtherStatsPkts64Octets" },
+	{ 6, "TxEtherStatsPkts65to127Octets" },
+	{ 7, "TxEtherStatsPkts128to255Octets" },
+	{ 8, "TxEtherStatsPkts256to511Octets" },
+	{ 9, "TxEtherStatsPkts512to1023Octets" },
+	{ 10, "TxEtherStatsPkts1024to1518Octets" },
+	{ 11, "TxJumboFrames" }, /* non-standard counter */
+	{ 12, "TxaPauseMACControlFramesTransmitted" },
+	{ 13, "TxFIFODrops" }, /* non-standard counter */
+	{ 14, "TxDrops" }, /* non-standard counter */
+	{ 15, "TxEtherStatsCollisions" },
+	{ 16, "TxEtherStatsCRCAlignErrors" },
+	{ 17, "TxEtherStatsUndersizePkts" },
+	{ 18, "TxEtherStatsOversizePkts" },
+	{ 19, "TxEtherStatsFragments" },
+	{ 20, "TxEtherStatsJabbers" },
+	/* 21-24 are undefined */
+	{ 25, "TxaFramesReceivedOK" },
+	{ 26, "TxQoSClass0" }, /* non-standard counter */
+	{ 27, "TxQoSClass1" }, /* non-standard counter */
+	{ 28, "TxQoSClass2" }, /* non-standard counter */
+	{ 29, "TxQoSClass3" }, /* non-standard counter */
+};
+
+/* Tell whether a block/subblock pair may be addressed over SPI.
+ *
+ * Returns 1 for an accepted pair and 0 for everything else:
+ * - MAC block: subblocks 0-4 and 6
+ * - analyzer and system blocks: subblock 0
+ * - MII, capture and arbiter blocks: subblocks 0 and 1
+ *
+ * NOTE(review): vsc73xx_setup() writes VSC73XX_BLOCK_MEMINIT (same number
+ * as VSC73XX_BLOCK_MII) subblock 2 and VSC73XX_BLOCK_MAC subblock 0x1f, and
+ * both are rejected here, so those writes return -EINVAL - confirm whether
+ * that is intended.
+ */
+static int vsc73xx_is_addr_valid(u8 block, u8 subblock)
+{
+	if (block == VSC73XX_BLOCK_MAC)
+		return subblock <= 4 || subblock == 6;
+
+	if (block == VSC73XX_BLOCK_ANALYZER || block == VSC73XX_BLOCK_SYSTEM)
+		return subblock == 0;
+
+	if (block == VSC73XX_BLOCK_MII || block == VSC73XX_BLOCK_CAPTURE ||
+	    block == VSC73XX_BLOCK_ARBITER)
+		return subblock <= 1;
+
+	return 0;
+}
+
+/* Build the first command byte of an SPI transaction.
+ *
+ * The block number goes into bits 7:5, the mode (VSC73XX_CMD_MODE_READ or
+ * VSC73XX_CMD_MODE_WRITE) into bit 4 and the subblock into bits 3:0. Bits of
+ * the arguments outside those fields are masked away.
+ */
+static u8 vsc73xx_make_addr(u8 mode, u8 block, u8 subblock)
+{
+	u8 blk_bits = (block & VSC73XX_CMD_BLOCK_MASK) << VSC73XX_CMD_BLOCK_SHIFT;
+	u8 mode_bit = (mode & 1) << VSC73XX_CMD_MODE_SHIFT;
+	u8 sub_bits = subblock & VSC73XX_CMD_SUBBLOCK_MASK;
+
+	return blk_bits | mode_bit | sub_bits;
+}
+
+/* Read one 32-bit register over SPI.
+ *
+ * @vsc: switch state; vsc->lock is held around the SPI message
+ * @block: block number, see the VSC73XX_BLOCK_* defines
+ * @subblock: subblock inside the block
+ * @reg: register number inside the subblock
+ * @val: where the register value is stored; only written on success
+ *
+ * The command phase is four bytes: the address byte, the register number
+ * and two zero bytes. The reply is four bytes, most significant byte first.
+ *
+ * Returns 0 on success, -EINVAL if vsc73xx_is_addr_valid() rejects the
+ * block/subblock pair, or the error returned by spi_sync().
+ */
+static int vsc73xx_read(struct vsc73xx *vsc, u8 block, u8 subblock, u8 reg,
+			u32 *val)
+{
+	struct spi_transfer t[2];
+	struct spi_message m;
+	u8 cmd[4];
+	u8 buf[4];
+	int ret;
+
+	if (!vsc73xx_is_addr_valid(block, subblock))
+		return -EINVAL;
+
+	spi_message_init(&m);
+
+	memset(&t, 0, sizeof(t));
+
+	t[0].tx_buf = cmd;
+	t[0].len = sizeof(cmd);
+	spi_message_add_tail(&t[0], &m);
+
+	t[1].rx_buf = buf;
+	t[1].len = sizeof(buf);
+	spi_message_add_tail(&t[1], &m);
+
+	cmd[0] = vsc73xx_make_addr(VSC73XX_CMD_MODE_READ, block, subblock);
+	cmd[1] = reg;
+	cmd[2] = 0;
+	cmd[3] = 0;
+
+	mutex_lock(&vsc->lock);
+	ret = spi_sync(vsc->spi, &m);
+	mutex_unlock(&vsc->lock);
+
+	if (ret)
+		return ret;
+
+	/* Widen each byte before shifting: a plain u8 is promoted to a signed
+	 * int, and shifting a byte >= 0x80 left by 24 would overflow it.
+	 */
+	*val = ((u32)buf[0] << 24) | ((u32)buf[1] << 16) |
+		((u32)buf[2] << 8) | (u32)buf[3];
+
+	return 0;
+}
+
+/* Write one 32-bit register over SPI.
+ *
+ * The message is a two byte command (address byte, register number)
+ * followed by the four value bytes, most significant byte first.
+ * vsc->lock is held while the message is on the bus.
+ *
+ * Returns 0 on success, -EINVAL if vsc73xx_is_addr_valid() rejects the
+ * block/subblock pair, or the error returned by spi_sync().
+ */
+static int vsc73xx_write(struct vsc73xx *vsc, u8 block, u8 subblock, u8 reg,
+			 u32 val)
+{
+	u8 cmd[2];
+	u8 buf[4];
+	struct spi_transfer xfers[2] = {
+		{ .tx_buf = cmd, .len = sizeof(cmd) },
+		{ .tx_buf = buf, .len = sizeof(buf) },
+	};
+	struct spi_message msg;
+	int err;
+
+	if (!vsc73xx_is_addr_valid(block, subblock))
+		return -EINVAL;
+
+	/* Command phase */
+	cmd[0] = vsc73xx_make_addr(VSC73XX_CMD_MODE_WRITE, block, subblock);
+	cmd[1] = reg;
+
+	/* Data phase, big endian */
+	buf[0] = (val >> 24) & 0xff;
+	buf[1] = (val >> 16) & 0xff;
+	buf[2] = (val >> 8) & 0xff;
+	buf[3] = val & 0xff;
+
+	spi_message_init(&msg);
+	spi_message_add_tail(&xfers[0], &msg);
+	spi_message_add_tail(&xfers[1], &msg);
+
+	mutex_lock(&vsc->lock);
+	err = spi_sync(vsc->spi, &msg);
+	mutex_unlock(&vsc->lock);
+
+	return err;
+}
+
+/* Read-modify-write a register: the bits selected by @mask are replaced
+ * with the corresponding bits of @val, all other bits are written back
+ * unchanged.
+ *
+ * The read and the write each take vsc->lock on their own, so the sequence
+ * as a whole is not protected against another register access in between.
+ *
+ * Returns 0 on success or the error from the failing read or write.
+ */
+static int vsc73xx_update_bits(struct vsc73xx *vsc, u8 block, u8 subblock,
+			       u8 reg, u32 mask, u32 val)
+{
+	u32 cur;
+	int err;
+
+	err = vsc73xx_read(vsc, block, subblock, reg, &cur);
+	if (err)
+		return err;
+
+	return vsc73xx_write(vsc, block, subblock, reg,
+			     (cur & ~mask) | (val & mask));
+}
+
+/* Probe the chip over SPI and record which variant it is.
+ *
+ * The iCPU mailbox register is read first; if it reads back as all ones the
+ * chip is reset once through the reset GPIO and the read is retried. The
+ * chip ID is then decoded into vsc->chipid and the vsc->is_vsc73xx flags,
+ * and the iCPU control register is checked: any configuration where the
+ * iCPU boots from SI or has PI external memory enabled is refused.
+ *
+ * Returns 0 when a supported chip with a disabled iCPU was found, -ENODEV
+ * when the chip does not answer, is unsupported or has the iCPU enabled, or
+ * the error from a failing register read.
+ */
+static int vsc73xx_detect(struct vsc73xx *vsc)
+{
+	bool icpu_si_boot_en;
+	bool icpu_pi_en;
+	u32 val;
+	u32 rev;
+	int ret;
+	u32 id;
+
+	ret = vsc73xx_read(vsc, VSC73XX_BLOCK_SYSTEM, 0,
+			   VSC73XX_ICPU_MBOX_VAL, &val);
+	if (ret) {
+		dev_err(vsc->dev, "unable to read mailbox (%d)\n", ret);
+		return ret;
+	}
+
+	if (val == 0xffffffff) {
+		dev_info(vsc->dev, "chip seems dead, assert reset\n");
+		gpiod_set_value_cansleep(vsc->reset, 1);
+		/* Reset pulse should be 20ns minimum, according to datasheet
+		 * table 245, so 10us should be fine
+		 */
+		usleep_range(10, 100);
+		gpiod_set_value_cansleep(vsc->reset, 0);
+		/* Wait 20ms according to datasheet table 245 */
+		msleep(20);
+
+		ret = vsc73xx_read(vsc, VSC73XX_BLOCK_SYSTEM, 0,
+				   VSC73XX_ICPU_MBOX_VAL, &val);
+		/* Report a failing SPI transfer as such instead of silently
+		 * judging the stale value from the first read.
+		 */
+		if (ret) {
+			dev_err(vsc->dev, "unable to read mailbox (%d)\n", ret);
+			return ret;
+		}
+		if (val == 0xffffffff) {
+			dev_err(vsc->dev, "seems not to help, giving up\n");
+			return -ENODEV;
+		}
+	}
+
+	ret = vsc73xx_read(vsc, VSC73XX_BLOCK_SYSTEM, 0,
+			   VSC73XX_CHIPID, &val);
+	if (ret) {
+		dev_err(vsc->dev, "unable to read chip id (%d)\n", ret);
+		return ret;
+	}
+
+	id = (val >> VSC73XX_CHIPID_ID_SHIFT) &
+		VSC73XX_CHIPID_ID_MASK;
+	switch (id) {
+	case VSC73XX_CHIPID_ID_7385:
+		vsc->is_vsc7385 = true;
+		break;
+	case VSC73XX_CHIPID_ID_7388:
+		vsc->is_vsc7388 = true;
+		break;
+	case VSC73XX_CHIPID_ID_7395:
+		vsc->is_vsc7395 = true;
+		break;
+	case VSC73XX_CHIPID_ID_7398:
+		vsc->is_vsc7398 = true;
+		break;
+	default:
+		dev_err(vsc->dev, "unsupported chip, id=%04x\n", id);
+		return -ENODEV;
+	}
+
+	vsc->chipid = id;
+	rev = (val >> VSC73XX_CHIPID_REV_SHIFT) &
+		VSC73XX_CHIPID_REV_MASK;
+	dev_info(vsc->dev, "VSC%04X (rev: %d) switch found\n", id, rev);
+
+	ret = vsc73xx_read(vsc, VSC73XX_BLOCK_SYSTEM, 0,
+			   VSC73XX_ICPU_CTRL, &val);
+	if (ret) {
+		dev_err(vsc->dev, "unable to read iCPU control\n");
+		return ret;
+	}
+
+	/* The iCPU can always be used but can boot in different ways.
+	 * If it is initially disabled and has no external memory,
+	 * we are in control and can do whatever we like, else we
+	 * are probably in trouble (we need some way to communicate
+	 * with the running firmware) so we bail out for now.
+	 */
+	icpu_pi_en = !!(val & VSC73XX_ICPU_CTRL_ICPU_PI_EN);
+	icpu_si_boot_en = !!(val & VSC73XX_ICPU_CTRL_BOOT_EN);
+	if (icpu_si_boot_en && icpu_pi_en) {
+		dev_err(vsc->dev,
+			"iCPU enabled boots from SI, has external memory\n");
+		dev_err(vsc->dev, "no idea how to deal with this\n");
+		return -ENODEV;
+	}
+	if (icpu_si_boot_en && !icpu_pi_en) {
+		dev_err(vsc->dev,
+			"iCPU enabled boots from SI, no external memory\n");
+		dev_err(vsc->dev, "no idea how to deal with this\n");
+		return -ENODEV;
+	}
+	if (!icpu_si_boot_en && icpu_pi_en) {
+		dev_err(vsc->dev,
+			"iCPU enabled, boots from PI external memory\n");
+		dev_err(vsc->dev, "no idea how to deal with this\n");
+		return -ENODEV;
+	}
+	/* !icpu_si_boot_en && !icpu_pi_en */
+	dev_info(vsc->dev, "iCPU disabled, no external memory\n");
+
+	return 0;
+}
+
+/* Read a PHY register through the MII block.
+ *
+ * A read command holding the PHY address and register number is written to
+ * the MII command register, and after a 2 ms sleep the result is fetched
+ * from the MII data register. Bit 16 of the result flags a failed read.
+ *
+ * Returns the 16-bit register value on success, -EIO when the chip flags
+ * the read as failed, or the error from the underlying SPI access.
+ */
+static int vsc73xx_phy_read(struct dsa_switch *ds, int phy, int regnum)
+{
+	struct vsc73xx *vsc = ds->priv;
+	u32 data;
+	int err;
+
+	/* Setting bit 26 means "read" */
+	err = vsc73xx_write(vsc, VSC73XX_BLOCK_MII, 0, VSC73XX_MII_CMD,
+			    BIT(26) | (phy << 21) | (regnum << 16));
+	if (err)
+		return err;
+
+	msleep(2);
+
+	err = vsc73xx_read(vsc, VSC73XX_BLOCK_MII, 0, VSC73XX_MII_DATA, &data);
+	if (err)
+		return err;
+
+	if (data & BIT(16)) {
+		dev_err(vsc->dev, "reading reg %02x from phy%d failed\n",
+			regnum, phy);
+		return -EIO;
+	}
+
+	/* Only the low 16 bits carry the register contents */
+	data &= 0xFFFFU;
+	dev_dbg(vsc->dev, "read reg %02x from phy%d = %04x\n",
+		regnum, phy, data);
+
+	return data;
+}
+
+/* Write a PHY register through the MII block.
+ *
+ * The command word carries the PHY address in bits 25:21, the register
+ * number in bits 20:16 and the value to write in bits 15:0; bit 26 is left
+ * clear, which selects a write (setting it means "read").
+ *
+ * Returns 0 on success (also when a PHY reset request is filtered out) or
+ * the error from the underlying SPI write.
+ */
+static int vsc73xx_phy_write(struct dsa_switch *ds, int phy, int regnum,
+			     u16 val)
+{
+	struct vsc73xx *vsc = ds->priv;
+	u32 cmd;
+	int ret;
+
+	/* It was found through tedious experiments that this router
+	 * chip really hates to have its PHYs reset. They
+	 * never recover if that happens: autonegotiation stops
+	 * working after a reset. Just filter out this command.
+	 * (Resetting the whole chip is OK.)
+	 */
+	if (regnum == 0 && (val & BIT(15))) {
+		dev_info(vsc->dev, "reset PHY - disallowed\n");
+		return 0;
+	}
+
+	/* The value must be part of the command, otherwise the register is
+	 * always written with zero.
+	 */
+	cmd = (phy << 21) | (regnum << 16) | val;
+	ret = vsc73xx_write(vsc, VSC73XX_BLOCK_MII, 0, VSC73XX_MII_CMD, cmd);
+	if (ret)
+		return ret;
+
+	dev_dbg(vsc->dev, "write %04x to reg %02x in phy%d\n",
+		val, regnum, phy);
+	return 0;
+}
+
+static enum dsa_tag_protocol vsc73xx_get_tag_protocol(struct dsa_switch *ds,
+						      int port)
+{
+	/* The switch internally uses a 8 byte header with length,
+	 * source port, tag, LPA and priority. This is supposedly
+	 * only accessible when operating the switch using the internal
+	 * CPU or with an external CPU mapping the device in, but not
+	 * when operating the switch over SPI and putting frames in/out
+	 * on port 6 (the CPU port). So far we must assume that we
+	 * cannot access the tag. (See "Internal frame header" section
+	 * 3.9.1 in the manual.)
+	 *
+	 * Both arguments are ignored: every port reports that no tagging
+	 * protocol is in use.
+	 *
+	 * NOTE(review): the Kconfig entry for this driver selects
+	 * NET_DSA_TAG_TRAILER although no trailer tagging is requested
+	 * here - confirm whether that select is needed.
+	 */
+	return DSA_TAG_PROTO_NONE;
+}
+
+/* One-time switch initialization, called through the DSA setup hook.
+ *
+ * Resets the chip, runs the memory initialization sequence, clears the MAC
+ * and VLAN tables, parks every port in reset, programs the GMII delays and
+ * the receive/flood masks and finally writes the PHY reset bit in GLORESET.
+ *
+ * The return values of the individual register writes are not checked;
+ * the function always returns 0.
+ */
+static int vsc73xx_setup(struct dsa_switch *ds)
+{
+	struct vsc73xx *vsc = ds->priv;
+	int i;
+
+	dev_info(vsc->dev, "set up the switch\n");
+
+	/* Issue RESET */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_SYSTEM, 0, VSC73XX_GLORESET,
+		      VSC73XX_GLORESET_MASTER_RESET);
+	usleep_range(125, 200);
+
+	/* Initialize memory, initialize RAM bank 0..15 except 6 and 7
+	 * This sequence appears in the
+	 * VSC7385 SparX-G5 datasheet section 6.6.1
+	 * VSC7395 SparX-G5e datasheet section 6.6.1
+	 * "initialization sequence".
+	 * No explanation is given to the 0x1010400 magic number.
+	 *
+	 * NOTE(review): vsc73xx_is_addr_valid() only accepts subblocks 0
+	 * and 1 for block 0x3, so these writes return -EINVAL - confirm.
+	 */
+	for (i = 0; i <= 15; i++) {
+		if (i != 6 && i != 7) {
+			vsc73xx_write(vsc, VSC73XX_BLOCK_MEMINIT,
+				      2,
+				      0, 0x1010400 + i);
+			mdelay(1);
+		}
+	}
+	mdelay(30);
+
+	/* Clear MAC table */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_ANALYZER, 0,
+		      VSC73XX_MACACCESS,
+		      VSC73XX_MACACCESS_CMD_CLEAR_TABLE);
+
+	/* Clear VLAN table */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_ANALYZER, 0,
+		      VSC73XX_VLANACCESS,
+		      VSC73XX_VLANACCESS_VLAN_TBL_CMD_CLEAR_TABLE);
+
+	msleep(40);
+
+	/* Use 20KiB buffers on all ports on VSC7395
+	 * The VSC7385 has 16KiB buffers and that is the
+	 * default if we don't set this up explicitly.
+	 * Port "31" is "all ports".
+	 *
+	 * NOTE(review): subblock 0x1f is not accepted by
+	 * vsc73xx_is_addr_valid(), so this write returns -EINVAL - confirm.
+	 */
+	if (vsc->is_vsc7395 || vsc->is_vsc7398)
+		vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, 0x1f,
+			      VSC73XX_Q_MISC_CONF,
+			      VSC73XX_Q_MISC_CONF_EXTENT_MEM);
+
+	/* Put all ports into reset until enabled. Address each MAC subblock
+	 * in turn (0..4 and 6); subblock 5 is not a valid MAC subblock.
+	 */
+	for (i = 0; i < 7; i++) {
+		if (i == 5)
+			continue;
+		vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, i,
+			      VSC73XX_MAC_CFG, VSC73XX_MAC_CFG_RESET);
+	}
+
+	/* MII delay, set both GTX and RX delay to 2 ns */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_SYSTEM, 0, VSC73XX_GMIIDELAY,
+		      VSC73XX_GMIIDELAY_GMII0_GTXDELAY_2_0_NS |
+		      VSC73XX_GMIIDELAY_GMII0_RXDELAY_2_0_NS);
+	/* Enable reception of frames on all ports (bits 0..4 and 6) */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_ANALYZER, 0, VSC73XX_RECVMASK,
+		      0x5f);
+	/* IP multicast flood mask (table 144) */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_ANALYZER, 0, VSC73XX_IFLODMSK,
+		      0xff);
+
+	mdelay(50);
+
+	/* Release reset from the internal PHYs */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_SYSTEM, 0, VSC73XX_GLORESET,
+		      VSC73XX_GLORESET_PHY_RESET);
+
+	udelay(4);
+
+	return 0;
+}
+
+/* Bring one port's MAC to its default configuration.
+ *
+ * The port is reset and then enabled for TX and RX in 1 Gbit full duplex
+ * mode (RGMII clocking on the CPU port, external clock on the other ports),
+ * after which frame length, flow control, queue, categorizer and counter
+ * registers are written. Register write errors are not checked.
+ */
+static void vsc73xx_init_port(struct vsc73xx *vsc, int port)
+{
+	bool is_cpu = (port == CPU_PORT);
+	u32 mac_cfg;
+	u32 q_misc;
+	u32 fc_hi;
+	u32 fc_lo;
+
+	/* First reset the MAC */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_MAC_CFG,
+		      VSC73XX_MAC_CFG_RESET);
+
+	/* Start out in 1Gbit mode; vsc73xx_adjust_link() reprograms the
+	 * MAC once the negotiated speed is known.
+	 */
+	mac_cfg = is_cpu ? VSC73XX_MAC_CFG_1000M_F_RGMII :
+			   VSC73XX_MAC_CFG_1000M_F_PHY;
+	mac_cfg |= VSC73XX_MAC_CFG_TX_EN | VSC73XX_MAC_CFG_RX_EN;
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_MAC_CFG, mac_cfg);
+
+	/* Maximum frame length: allow jumbo frames of up to 9600 bytes,
+	 * which the "VSC7398 Jumbo Frames" application note calls
+	 * "9.6 kilobytes" and says does not hurt performance on standard
+	 * frames.
+	 */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_MAXLEN, 9600);
+
+	/* Flow control: send a zero delay pause frame when the pause
+	 * condition ends, and obey received pause frames.
+	 */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_FCCONF,
+		      VSC73XX_FCCONF_ZERO_PAUSE_EN |
+		      VSC73XX_FCCONF_FLOW_CTRL_OBEY);
+
+	/* Queue setup: early MAC transmission below 512 bytes of egress
+	 * data on the CPU port, MAC pause mode on the PHY facing ports.
+	 * The extended memory bit is set on every port.
+	 * FIXME: enable 20KiB buffers?
+	 */
+	q_misc = is_cpu ? VSC73XX_Q_MISC_CONF_EARLY_TX_512 :
+			  VSC73XX_Q_MISC_CONF_MAC_PAUSE_MODE;
+	q_misc |= VSC73XX_Q_MISC_CONF_EXTENT_MEM;
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_Q_MISC_CONF,
+		      q_misc);
+
+	/* MAC address used in flow control frames, three bytes per register */
+	fc_hi = (vsc->addr[5] << 16) | (vsc->addr[4] << 8) | (vsc->addr[3]);
+	fc_lo = (vsc->addr[2] << 16) | (vsc->addr[1] << 8) | (vsc->addr[0]);
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_FCMACHI, fc_hi);
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_FCMACLO, fc_lo);
+
+	/* Categorizer: only the "forward pause frames" bit is set, so no
+	 * frame category is dropped.
+	 */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_CAT_DROP,
+		      VSC73XX_CAT_DROP_FWD_PAUSE_ENA);
+
+	/* Write zero to the first RX counter register; the driver relies on
+	 * this to clear all counters.
+	 */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_C_RX0, 0);
+}
+
+/* Reprogram a port's MAC with a new speed/duplex configuration and enable it.
+ *
+ * @initval holds the MAC_CFG speed/duplex/clock bits to use. The MAC is
+ * held in reset while those bits and a fresh random seed are loaded, flow
+ * control is configured, and then reset and seed loading are released with
+ * TX and RX enabled. @phydev is not used. Register access errors are not
+ * checked.
+ */
+static void vsc73xx_adjust_enable_port(struct vsc73xx *vsc,
+				       int port, struct phy_device *phydev,
+				       u32 initval)
+{
+	u32 cfg = initval | VSC73XX_MAC_CFG_RESET;
+	u8 seed;
+
+	/* Write the new configuration with the port held in reset
+	 * FIXME: break out subroutine
+	 */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_MAC_CFG, cfg);
+
+	/* Load one random byte as the port's seed */
+	get_random_bytes(&seed, 1);
+	cfg |= seed << VSC73XX_MAC_CFG_SEED_OFFSET;
+	cfg |= VSC73XX_MAC_CFG_SEED_LOAD | VSC73XX_MAC_CFG_WEXC_DIS;
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_MAC_CFG, cfg);
+
+	/* Flow control: zero delay pause frame when the pause condition
+	 * ends, obey received pause frames, and use 0xff as the pause
+	 * value in generated pause frames.
+	 */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_FCCONF,
+		      VSC73XX_FCCONF_ZERO_PAUSE_EN |
+		      VSC73XX_FCCONF_FLOW_CTRL_OBEY |
+		      0xff);
+
+	/* Clear this port's bit to disallow backward dropping of frames */
+	vsc73xx_update_bits(vsc, VSC73XX_BLOCK_ARBITER, 0,
+			    VSC73XX_SBACKWDROP, BIT(port), 0);
+
+	/* Clear the reset and seed load bits, set TX and RX enable */
+	vsc73xx_update_bits(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_MAC_CFG,
+			    VSC73XX_MAC_CFG_RESET |
+			    VSC73XX_MAC_CFG_SEED_LOAD |
+			    VSC73XX_MAC_CFG_TX_EN |
+			    VSC73XX_MAC_CFG_RX_EN,
+			    VSC73XX_MAC_CFG_TX_EN | VSC73XX_MAC_CFG_RX_EN);
+}
+
+/* React to a link change on a port.
+ *
+ * For the CPU port the MAC is first initialized and switched to the
+ * external RGMII interface. Then PHY register 1 is checked: if the link is
+ * down the port is drained, put into reset and removed from the receive
+ * mask. If the link is up the negotiated speed and duplex are derived from
+ * PHY registers 0x0a and 0x05, the MAC is reprogrammed to match and the
+ * port is added to the receive mask.
+ *
+ * Waiting for the port queue to drain is bounded, and a failing register
+ * read ends the wait, so a stuck queue or a broken SPI link cannot hang
+ * the caller.
+ */
+static void vsc73xx_adjust_link(struct dsa_switch *ds, int port,
+				struct phy_device *phydev)
+{
+	/* NOTE(review): arbitrary upper bound for the drain wait - tune */
+	const unsigned int max_polls = 1000;
+	struct vsc73xx *vsc = ds->priv;
+	unsigned int polls;
+	u32 val;
+	int ret;
+
+	/* Special handling of the CPU-facing port */
+	if (port == CPU_PORT) {
+		/* Other ports are already initialized but not this one */
+		vsc73xx_init_port(vsc, CPU_PORT);
+		/* Select the external port for this interface (EXT_PORT)
+		 * Enable the GMII GTX external clock
+		 * Use double data rate (DDR mode)
+		 */
+		vsc73xx_write(vsc, VSC73XX_BLOCK_MAC,
+			      CPU_PORT,
+			      VSC73XX_ADVPORTM,
+			      VSC73XX_ADVPORTM_EXT_PORT |
+			      VSC73XX_ADVPORTM_ENA_GTX |
+			      VSC73XX_ADVPORTM_DDR_MODE);
+	}
+
+	/* This is the MAC configuration that always needs to happen
+	 * after a PHY or the CPU port comes up or down.
+	 */
+	val = phy_read(phydev, 1);
+	if ((val & 0x0024) != 0x0024) {
+		dev_info(vsc->dev, "port %d: went down\n",
+			 port);
+
+		/* Disable RX on this port */
+		vsc73xx_update_bits(vsc, VSC73XX_BLOCK_MAC, port,
+				    VSC73XX_MAC_CFG,
+				    VSC73XX_MAC_CFG_RX_EN, 0);
+
+		/* Discard packets */
+		vsc73xx_update_bits(vsc, VSC73XX_BLOCK_ARBITER, 0,
+				    VSC73XX_ARBDISC, BIT(port), BIT(port));
+
+		/* Wait until queue is empty. Stop on the first read error
+		 * (val would be undefined) and after max_polls attempts.
+		 */
+		for (polls = 0; polls < max_polls; polls++) {
+			ret = vsc73xx_read(vsc, VSC73XX_BLOCK_ARBITER, 0,
+					   VSC73XX_ARBEMPTY, &val);
+			if (ret || (val & BIT(port)))
+				break;
+			msleep(1);
+		}
+		if (ret)
+			dev_err(vsc->dev,
+				"port %d: unable to read queue state (%d)\n",
+				port, ret);
+		else if (polls == max_polls)
+			dev_err(vsc->dev,
+				"port %d: timeout waiting for empty queue\n",
+				port);
+
+		/* Put this port into reset */
+		vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_MAC_CFG,
+			      VSC73XX_MAC_CFG_RESET);
+
+		/* Accept packets again */
+		vsc73xx_update_bits(vsc, VSC73XX_BLOCK_ARBITER, 0,
+				    VSC73XX_ARBDISC, BIT(port), 0);
+
+		/* Allow backward dropping of frames from this port */
+		vsc73xx_update_bits(vsc, VSC73XX_BLOCK_ARBITER, 0,
+				    VSC73XX_SBACKWDROP, BIT(port), BIT(port));
+
+		/* Receive mask (disable forwarding) */
+		vsc73xx_update_bits(vsc, VSC73XX_BLOCK_ANALYZER, 0,
+				    VSC73XX_RECVMASK, BIT(port), 0);
+
+		return;
+	}
+
+	/* Figure out what speed was negotiated */
+	val = phy_read(phydev, 0x0a);
+	if (val & 0x0c00) {
+		dev_info(vsc->dev, "port %d: 1000 Mbit mode full duplex\n",
+			 port);
+
+		/* Set up default for internal or external RGMII */
+		if (port == CPU_PORT)
+			val = VSC73XX_MAC_CFG_1000M_F_RGMII;
+		else
+			val = VSC73XX_MAC_CFG_1000M_F_PHY;
+		vsc73xx_adjust_enable_port(vsc, port, phydev, val);
+	} else {
+		/* Bits 5..10 (minus bit 9) of PHY register 5, shifted down:
+		 * 0x08/0x04 select 100 Mbit full/half duplex and
+		 * 0x02/0x01 select 10 Mbit full/half duplex.
+		 */
+		val = phy_read(phydev, 0x05);
+		val &= 0x05e0;
+		val >>= 5;
+		if (val & 0x0c) {
+			if (val & 0x08) {
+				val = VSC73XX_MAC_CFG_100_10M_F_PHY;
+				dev_info(vsc->dev,
+					 "port %d: 100 Mbit full duplex mode\n",
+					 port);
+			} else {
+				val = VSC73XX_MAC_CFG_100_10M_H_PHY;
+				dev_info(vsc->dev,
+					 "port %d: 100 Mbit half duplex mode\n",
+					 port);
+			}
+			vsc73xx_adjust_enable_port(vsc, port, phydev, val);
+		} else if (val & 0x03) {
+			if (val & 0x02) {
+				val = VSC73XX_MAC_CFG_100_10M_F_PHY;
+				dev_info(vsc->dev,
+					 "port %d: 10 Mbit full duplex mode\n",
+					 port);
+			} else {
+				val = VSC73XX_MAC_CFG_100_10M_H_PHY;
+				dev_info(vsc->dev,
+					 "port %d: 10 Mbit half duplex mode\n",
+					 port);
+			}
+			vsc73xx_adjust_enable_port(vsc, port, phydev, val);
+		} else {
+			dev_err(vsc->dev,
+				"could not adjust link: unknown speed\n");
+		}
+	}
+
+	/* Enable port (forwarding) in the receive mask */
+	vsc73xx_update_bits(vsc, VSC73XX_BLOCK_ANALYZER, 0,
+			    VSC73XX_RECVMASK, BIT(port), BIT(port));
+}
+
+/* DSA port enable hook: log the request and initialize the port's MAC.
+ *
+ * Ports above 4 are refused on VSC7385/VSC7395 and ports above 7 on
+ * VSC7388/VSC7398. @phy is not used.
+ *
+ * Returns 0 on success or -ENODEV for a port outside the accepted range.
+ */
+static int vsc73xx_port_enable(struct dsa_switch *ds, int port,
+			       struct phy_device *phy)
+{
+	struct vsc73xx *vsc = ds->priv;
+	bool five_port = vsc->is_vsc7385 || vsc->is_vsc7395;
+	bool eight_port = vsc->is_vsc7388 || vsc->is_vsc7398;
+
+	dev_info(vsc->dev, "enable port %d\n", port);
+
+	if ((five_port && port > 4) || (eight_port && port > 7))
+		return -ENODEV;
+
+	vsc73xx_init_port(vsc, port);
+
+	return 0;
+}
+
+/* DSA port disable hook: park the port's MAC in reset.
+ *
+ * Nothing is done for ports above 4 on VSC7385/VSC7395 or above 7 on
+ * VSC7388/VSC7398. @phy is not used.
+ */
+static void vsc73xx_port_disable(struct dsa_switch *ds, int port,
+				 struct phy_device *phy)
+{
+	struct vsc73xx *vsc = ds->priv;
+	bool five_port = vsc->is_vsc7385 || vsc->is_vsc7395;
+	bool eight_port = vsc->is_vsc7388 || vsc->is_vsc7398;
+
+	if ((five_port && port > 4) || (eight_port && port > 7))
+		return;
+
+	/* Writing only the reset bits also clears TX and RX enable */
+	vsc73xx_write(vsc, VSC73XX_BLOCK_MAC, port,
+		      VSC73XX_MAC_CFG, VSC73XX_MAC_CFG_RESET);
+}
+
+/* Look up the descriptor of a hardware counter by its selector value.
+ *
+ * @counter is matched against the ->counter field of the entries in
+ * vsc73xx_tx_counters when @tx is true and vsc73xx_rx_counters otherwise.
+ * Returns the matching entry, or NULL when the table has no such entry.
+ * The @vsc argument is not used.
+ */
+const struct vsc73xx_counter *vsc73xx_find_counter(struct vsc73xx *vsc,
+						   u8 counter,
+						   bool tx)
+{
+	const struct vsc73xx_counter *table;
+	int entries;
+	int idx;
+
+	table = tx ? vsc73xx_tx_counters : vsc73xx_rx_counters;
+	entries = tx ? ARRAY_SIZE(vsc73xx_tx_counters) :
+		       ARRAY_SIZE(vsc73xx_rx_counters);
+
+	for (idx = 0; idx < entries; idx++)
+		if (table[idx].counter == counter)
+			return &table[idx];
+
+	return NULL;
+}
+
+/* DSA .get_strings callback: write the names of the eight per-port
+ * statistics into @data, one ETH_GSTRING_LEN sized slot each, in the
+ * order RX octets, three RX counters, TX octets, three TX counters.
+ *
+ * Which events the six selectable counters track is read back from the
+ * port's counter configuration register. Nothing is written when that
+ * read fails, and a slot is left untouched when its selector has no
+ * entry in the counter tables.
+ */
+void vsc73xx_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+{
+	struct vsc73xx *vsc = ds->priv;
+	const struct vsc73xx_counter *cnt;
+	uint8_t *slot = data;
+	u8 sel[6];
+	u32 cfg;
+	int i;
+
+	if (vsc73xx_read(vsc, VSC73XX_BLOCK_MAC, port, VSC73XX_C_CFG, &cfg))
+		return;
+
+	/* Three 5-bit RX selectors start at bit 0 and three 5-bit TX
+	 * selectors start at bit 16 of the configuration word.
+	 */
+	for (i = 0; i < 3; i++) {
+		sel[i] = (cfg >> (5 * i)) & 0x1f;
+		sel[i + 3] = (cfg >> (16 + 5 * i)) & 0x1f;
+	}
+
+	/* The first counter is always the RX octets */
+	strncpy(slot, "RxEtherStatsOctets", ETH_GSTRING_LEN);
+	slot += ETH_GSTRING_LEN;
+
+	/* Each port supports recording 3 RX counters and 3 TX counters,
+	 * figure out what counters we use in this set-up and return the
+	 * names of them. The hardware default counters will be number of
+	 * packets on RX/TX, combined broadcast+multicast packets RX/TX and
+	 * total error packets RX/TX.
+	 */
+	for (i = 0; i < 3; i++, slot += ETH_GSTRING_LEN) {
+		cnt = vsc73xx_find_counter(vsc, sel[i], false);
+		if (cnt)
+			strncpy(slot, cnt->name, ETH_GSTRING_LEN);
+	}
+
+	/* The TX group likewise begins with the number of TX octets */
+	strncpy(slot, "TxEtherStatsOctets", ETH_GSTRING_LEN);
+	slot += ETH_GSTRING_LEN;
+
+	for (i = 3; i < 6; i++, slot += ETH_GSTRING_LEN) {
+		cnt = vsc73xx_find_counter(vsc, sel[i], true);
+		if (cnt)
+			strncpy(slot, cnt->name, ETH_GSTRING_LEN);
+	}
+}
+
+/* DSA .get_sset_count callback: number of statistics reported per port.
+ * The value is the same for every switch and port.
+ */
+int vsc73xx_get_sset_count(struct dsa_switch *ds, int port)
+{
+	/* RX and TX octets, plus 3 selectable RX and 3 selectable TX counters */
+	return 8;
+}
+
+/* DSA .get_ethtool_stats callback: read the eight per-port counters
+ * from the MAC block and store them in @data.
+ *
+ * Reading stops at the first register that cannot be read; an error is
+ * logged and the remaining entries of @data are left untouched.
+ */
+void vsc73xx_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data)
+{
+	struct vsc73xx *vsc = ds->priv;
+	/* Listed in the order in which vsc73xx_get_strings() names them */
+	u8 counter_regs[] = {
+		VSC73XX_RXOCT,
+		VSC73XX_C_RX0,
+		VSC73XX_C_RX1,
+		VSC73XX_C_RX2,
+		VSC73XX_TXOCT,
+		VSC73XX_C_TX0,
+		VSC73XX_C_TX1,
+		VSC73XX_C_TX2,
+	};
+	int err = 0;
+	u32 raw;
+	int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(counter_regs); idx++) {
+		err = vsc73xx_read(vsc, VSC73XX_BLOCK_MAC, port,
+				   counter_regs[idx], &raw);
+		if (err)
+			break;
+		data[idx] = raw;
+	}
+
+	if (err)
+		dev_err(vsc->dev, "error reading counter %d\n", idx);
+}
+
+/* DSA switch operations implemented by this driver. The table is
+ * attached to the switch in vsc73xx_probe() before it is registered.
+ */
+static const struct dsa_switch_ops vsc73xx_ds_ops = {
+	.get_tag_protocol = vsc73xx_get_tag_protocol,
+	.setup = vsc73xx_setup,
+	.phy_read = vsc73xx_phy_read,
+	.phy_write = vsc73xx_phy_write,
+	.adjust_link = vsc73xx_adjust_link,
+	.get_strings = vsc73xx_get_strings,
+	.get_ethtool_stats = vsc73xx_get_ethtool_stats,
+	.get_sset_count = vsc73xx_get_sset_count,
+	.port_enable = vsc73xx_port_enable,
+	.port_disable = vsc73xx_port_disable,
+};
+
+/* gpio_chip .get callback: sample GPIO line @offset.
+ *
+ * Returns 1 when the line's bit is set in the GPIO register, 0 when it
+ * is clear, or the error returned by the register read.
+ */
+static int vsc73xx_gpio_get(struct gpio_chip *chip, unsigned int offset)
+{
+	struct vsc73xx *vsc = gpiochip_get_data(chip);
+	u32 gpio_reg;
+	int err;
+
+	err = vsc73xx_read(vsc, VSC73XX_BLOCK_SYSTEM, 0, VSC73XX_GPIO,
+			   &gpio_reg);
+	if (err)
+		return err;
+
+	return (gpio_reg & BIT(offset)) ? 1 : 0;
+}
+
+/* gpio_chip .set callback: drive GPIO line @offset high when @val is
+ * non-zero and low otherwise. The result of the register update is not
+ * reported, as the callback returns void.
+ */
+static void vsc73xx_gpio_set(struct gpio_chip *chip, unsigned int offset,
+			     int val)
+{
+	struct vsc73xx *vsc = gpiochip_get_data(chip);
+	u32 line = BIT(offset);
+
+	vsc73xx_update_bits(vsc, VSC73XX_BLOCK_SYSTEM, 0,
+			    VSC73XX_GPIO, line, val ? line : 0);
+}
+
+/* gpio_chip .direction_output callback: make GPIO line @offset an output
+ * and set its level to @val in the same register update.
+ *
+ * The direction bit of a line sits four bits above its value bit.
+ * Returns the result of the register update.
+ */
+static int vsc73xx_gpio_direction_output(struct gpio_chip *chip,
+					 unsigned int offset, int val)
+{
+	struct vsc73xx *vsc = gpiochip_get_data(chip);
+	u32 level_bit = BIT(offset);
+	u32 dir_bit = BIT(offset + 4);
+	u32 bits = dir_bit;
+
+	if (val)
+		bits |= level_bit;
+
+	return vsc73xx_update_bits(vsc, VSC73XX_BLOCK_SYSTEM, 0,
+				   VSC73XX_GPIO, dir_bit | level_bit, bits);
+}
+
+/* gpio_chip .direction_input callback: make GPIO line @offset an input
+ * by clearing its direction bit, which sits four bits above its value
+ * bit. Returns the result of the register update.
+ */
+static int vsc73xx_gpio_direction_input(struct gpio_chip *chip,
+					unsigned int offset)
+{
+	struct vsc73xx *vsc = gpiochip_get_data(chip);
+	u32 dir_bit = BIT(offset + 4);
+
+	return vsc73xx_update_bits(vsc, VSC73XX_BLOCK_SYSTEM, 0,
+				   VSC73XX_GPIO, dir_bit, 0);
+}
+
+/* gpio_chip .get_direction callback for GPIO line @offset.
+ *
+ * Returns 0 when the line's direction bit is set (the state programmed
+ * by vsc73xx_gpio_direction_output()), 1 when it is clear, or the error
+ * returned by the register read.
+ */
+static int vsc73xx_gpio_get_direction(struct gpio_chip *chip,
+				      unsigned int offset)
+{
+	struct vsc73xx *vsc = gpiochip_get_data(chip);
+	u32 gpio_reg;
+	int err;
+
+	err = vsc73xx_read(vsc, VSC73XX_BLOCK_SYSTEM, 0, VSC73XX_GPIO,
+			   &gpio_reg);
+	if (err)
+		return err;
+
+	return (gpio_reg & BIT(offset + 4)) ? 0 : 1;
+}
+
+/* Probe one SPI-attached VSC73xx switch.
+ *
+ * Allocates the driver state, requests the optional reset line, sets up
+ * the SPI link, detects the chip, picks a random MAC address for control
+ * frames and finally registers the DSA switch followed by the GPIO chip.
+ *
+ * Returns 0 on success or a negative errno. The DSA switch is
+ * unregistered again by hand when a step after its registration fails.
+ */
+static int vsc73xx_probe(struct spi_device *spi)
+{
+	struct device *dev = &spi->dev;
+	struct vsc73xx *vsc;
+	int ret;
+
+	vsc = devm_kzalloc(dev, sizeof(*vsc), GFP_KERNEL);
+	if (!vsc)
+		return -ENOMEM;
+
+	spi_set_drvdata(spi, vsc);
+	/* Deliberately no spi_dev_get(): neither the error paths below
+	 * nor vsc73xx_remove() would drop the extra reference again, and
+	 * the SPI device stays valid for as long as the driver is bound.
+	 */
+	vsc->spi = spi;
+	vsc->dev = dev;
+	mutex_init(&vsc->lock);
+
+	/* Release reset, if any */
+	vsc->reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
+	if (IS_ERR(vsc->reset)) {
+		dev_err(dev, "failed to get RESET GPIO\n");
+		return PTR_ERR(vsc->reset);
+	}
+	if (vsc->reset) {
+		/* Wait 20ms according to datasheet table 245 */
+		msleep(20);
+	}
+
+	spi->mode = SPI_MODE_0;
+	spi->bits_per_word = 8;
+	ret = spi_setup(spi);
+	if (ret < 0) {
+		dev_err(dev, "spi setup failed.\n");
+		return ret;
+	}
+
+	ret = vsc73xx_detect(vsc);
+	if (ret) {
+		dev_err(dev, "no chip found (%d)\n", ret);
+		return -ENODEV;
+	}
+
+	eth_random_addr(vsc->addr);
+	dev_info(vsc->dev,
+		 "MAC for control frames: %02X:%02X:%02X:%02X:%02X:%02X\n",
+		 vsc->addr[0], vsc->addr[1], vsc->addr[2],
+		 vsc->addr[3], vsc->addr[4], vsc->addr[5]);
+
+	/* The VSC7395 switch chips have 5+1 ports which means 5
+	 * ordinary ports and a sixth CPU port facing the processor
+	 * with an RGMII interface. These ports are numbered 0..4
+	 * and 6, so they leave a "hole" in the port map for port 5,
+	 * which is invalid.
+	 *
+	 * The VSC7398 has 8 ports, port 7 is again the CPU port.
+	 *
+	 * We allocate 8 ports and avoid access to the nonexistent
+	 * ports.
+	 */
+	vsc->ds = dsa_switch_alloc(dev, 8);
+	if (!vsc->ds)
+		return -ENOMEM;
+	vsc->ds->priv = vsc;
+
+	vsc->ds->ops = &vsc73xx_ds_ops;
+	ret = dsa_register_switch(vsc->ds);
+	if (ret) {
+		dev_err(dev, "unable to register switch (%d)\n", ret);
+		return ret;
+	}
+
+	/* Expose the four GPIO lines of the chip as a GPIO controller */
+	vsc->gc.label = devm_kasprintf(dev, GFP_KERNEL, "VSC%04x",
+				       vsc->chipid);
+	if (!vsc->gc.label) {
+		/* Do not register a GPIO chip without its label */
+		dsa_unregister_switch(vsc->ds);
+		return -ENOMEM;
+	}
+	vsc->gc.ngpio = 4;
+	vsc->gc.owner = THIS_MODULE;
+	vsc->gc.parent = dev;
+	vsc->gc.of_node = dev->of_node;
+	/* A base of -1 asks gpiolib to pick the GPIO numbers itself */
+	vsc->gc.base = -1;
+	vsc->gc.get = vsc73xx_gpio_get;
+	vsc->gc.set = vsc73xx_gpio_set;
+	vsc->gc.direction_input = vsc73xx_gpio_direction_input;
+	vsc->gc.direction_output = vsc73xx_gpio_direction_output;
+	vsc->gc.get_direction = vsc73xx_gpio_get_direction;
+	vsc->gc.can_sleep = true;
+	ret = devm_gpiochip_add_data(dev, &vsc->gc, vsc);
+	if (ret) {
+		dev_err(dev, "unable to register GPIO chip\n");
+		dsa_unregister_switch(vsc->ds);
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Unbind the driver: unregister the DSA switch, then drive the reset
+ * line to its asserted (logical 1) state. Always returns 0.
+ */
+static int vsc73xx_remove(struct spi_device *spi)
+{
+	struct vsc73xx *vsc = spi_get_drvdata(spi);
+
+	dsa_unregister_switch(vsc->ds);
+	/* remove() may sleep and the reset line may sit behind a GPIO
+	 * controller that sleeps, so use the _cansleep variant; the
+	 * plain gpiod_set_value() warns when used on such a line.
+	 * vsc->reset is NULL when no reset GPIO was described, which
+	 * the gpiod API accepts for optional lines.
+	 */
+	gpiod_set_value_cansleep(vsc->reset, 1);
+
+	return 0;
+}
+
+/* Device tree compatible strings handled by this driver, one per
+ * supported chip. No per-chip match data is attached to the entries.
+ */
+static const struct of_device_id vsc73xx_of_match[] = {
+	{
+		.compatible = "vitesse,vsc7385",
+	},
+	{
+		.compatible = "vitesse,vsc7388",
+	},
+	{
+		.compatible = "vitesse,vsc7395",
+	},
+	{
+		.compatible = "vitesse,vsc7398",
+	},
+	{ }, /* sentinel */
+};
+MODULE_DEVICE_TABLE(of, vsc73xx_of_match);
+
+/* SPI driver glue: ties the probe/remove callbacks and the OF match
+ * table together under the driver name "vsc73xx".
+ */
+static struct spi_driver vsc73xx_driver = {
+	.probe = vsc73xx_probe,
+	.remove = vsc73xx_remove,
+	.driver = {
+		.name = "vsc73xx",
+		.of_match_table = vsc73xx_of_match,
+	},
+};
+/* Generates the module init/exit code that (un)registers the driver */
+module_spi_driver(vsc73xx_driver);
+
+MODULE_AUTHOR("Linus Walleij <linus.walleij@linaro.org>");
+MODULE_DESCRIPTION("Vitesse VSC7385/7388/7395/7398 driver");
+MODULE_LICENSE("GPL v2");
-- 
2.17.1

^ permalink raw reply related

* [PATCH 2/3] net: phy: vitesse: Add support for VSC73xx
From: Linus Walleij @ 2018-06-14 12:35 UTC (permalink / raw)
  To: Andrew Lunn, Vivien Didelot, Florian Fainelli
  Cc: netdev, openwrt-devel, LEDE Development List, Gabor Juhos,
	Linus Walleij
In-Reply-To: <20180614123534.8063-1-linus.walleij@linaro.org>

The VSC7385, VSC7388, VSC7395 and VSC7398 are integrated
switch/router chips for 5+1 or 8-port switches/routers. When
managed directly by Linux using DSA we need to do a special
set-up "dance" on the PHY. Unfortunately these sequences
switch the PHY to undocumented pages named 2a30 and 52b5
and do undocumented things. The reference manual also
describes them only as these opaque sequences. This is a best
effort to integrate them anyway.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/net/phy/vitesse.c | 162 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)

diff --git a/drivers/net/phy/vitesse.c b/drivers/net/phy/vitesse.c
index d9dd8fbfffc7..526c71ae7d96 100644
--- a/drivers/net/phy/vitesse.c
+++ b/drivers/net/phy/vitesse.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/mii.h>
 #include <linux/ethtool.h>
+#include <linux/delay.h>
 #include <linux/phy.h>
 
 /* Vitesse Extended Page Magic Register(s) */
@@ -72,6 +73,10 @@
 #define PHY_ID_VSC8572			0x000704d0
 #define PHY_ID_VSC8574			0x000704a0
 #define PHY_ID_VSC8601			0x00070420
+#define PHY_ID_VSC7385			0x00070450
+#define PHY_ID_VSC7388			0x00070480
+#define PHY_ID_VSC7395			0x00070550
+#define PHY_ID_VSC7398			0x00070580
 #define PHY_ID_VSC8662			0x00070660
 #define PHY_ID_VSC8221			0x000fc550
 #define PHY_ID_VSC8211			0x000fc4b0
@@ -116,6 +121,127 @@ static int vsc824x_config_init(struct phy_device *phydev)
 	return err;
 }
 
+#define VSC73XX_EXT_PAGE_ACCESS 0x1f
+
+/* phy_driver .read_page hook: fetch the currently selected register page
+ * from the extended page access register. Uses the unlocked __phy_read()
+ * accessor. Returns the register value or a negative error.
+ */
+static int vsc73xx_read_page(struct phy_device *phydev)
+{
+	int page;
+
+	page = __phy_read(phydev, VSC73XX_EXT_PAGE_ACCESS);
+
+	return page;
+}
+
+/* phy_driver .write_page hook: select register page @page by writing it
+ * to the extended page access register. Uses the unlocked __phy_write()
+ * accessor and returns its result.
+ */
+static int vsc73xx_write_page(struct phy_device *phydev, int page)
+{
+	int err;
+
+	err = __phy_write(phydev, VSC73XX_EXT_PAGE_ACCESS, page);
+
+	return err;
+}
+
+/* Common tail of the VSC738x and VSC739x init sequences: receiver
+ * set-up followed by LED configuration. Called at the end of
+ * vsc738x_config_init() and vsc739x_config_init().
+ *
+ * The results of the individual register accesses are not checked.
+ */
+static void vsc73xx_config_init(struct phy_device *phydev)
+{
+	/* Receiver init */
+	/* Register 0x1f is the page register: select page 0x2a30 */
+	phy_write(phydev, 0x1f, 0x2a30);
+	phy_modify(phydev, 0x0c, 0x0300, 0x0200);
+	/* Back to page 0 */
+	phy_write(phydev, 0x1f, 0x0000);
+
+	/* Config LEDs 0x61
+	 * NOTE(review): the mask clears the upper byte while the value
+	 * sets bits in the lower byte, so the lower byte ends up as
+	 * (old | 0x61) rather than exactly 0x61 - confirm that this is
+	 * what the application note intends.
+	 */
+	phy_modify(phydev, 0x1b, 0xff00, 0x0061);
+}
+
+/* phy_driver .config_init hook for the PHYs inside VSC7385/VSC7388.
+ *
+ * Plays the vendor's init sequence, with an extra quirk for silicon
+ * revision 0, then runs the common VSC73xx set-up and the generic PHY
+ * initialisation.
+ *
+ * Returns the result of genphy_config_init(), or a negative error when
+ * the revision cannot be read. The results of the individual register
+ * writes are not checked.
+ */
+static int vsc738x_config_init(struct phy_device *phydev)
+{
+	/* Kept as int so that a negative error from phy_read() is not
+	 * truncated and then mistaken for a revision number.
+	 */
+	int rev;
+
+	/* This magic sequence appears in the application note
+	 * "VSC7385/7388 PHY Configuration".
+	 *
+	 * Maybe one day we will get to know what it all means.
+	 *
+	 * Register 0x1f is the page register; the statements between two
+	 * writes to it operate on the page selected by the first.
+	 */
+	phy_write(phydev, 0x1f, 0x2a30);
+	phy_modify(phydev, 0x08, 0x0200, 0x0200);
+	phy_write(phydev, 0x1f, 0x52b5);
+	phy_write(phydev, 0x10, 0xb68a);
+	phy_modify(phydev, 0x12, 0xff07, 0x0003);
+	phy_modify(phydev, 0x11, 0x00ff, 0x00a2);
+	phy_write(phydev, 0x10, 0x968a);
+	phy_write(phydev, 0x1f, 0x2a30);
+	phy_modify(phydev, 0x08, 0x0200, 0x0000);
+	phy_write(phydev, 0x1f, 0x0000);
+
+	/* Read revision: low four bits of register 0x03 on page 0 */
+	rev = phy_read(phydev, 0x03);
+	if (rev < 0)
+		return rev;
+	rev &= 0x0f;
+
+	/* Special quirk for revision 0 */
+	if (rev == 0) {
+		phy_write(phydev, 0x1f, 0x2a30);
+		phy_modify(phydev, 0x08, 0x0200, 0x0200);
+		phy_write(phydev, 0x1f, 0x52b5);
+		phy_write(phydev, 0x12, 0x0000);
+		phy_write(phydev, 0x11, 0x0689);
+		phy_write(phydev, 0x10, 0x8f92);
+		phy_write(phydev, 0x1f, 0x52b5);
+		phy_write(phydev, 0x12, 0x0000);
+		phy_write(phydev, 0x11, 0x0e35);
+		phy_write(phydev, 0x10, 0x9786);
+		phy_write(phydev, 0x1f, 0x2a30);
+		phy_modify(phydev, 0x08, 0x0200, 0x0000);
+		phy_write(phydev, 0x17, 0xff80);
+		phy_write(phydev, 0x17, 0x0000);
+	}
+
+	phy_write(phydev, 0x1f, 0x0000);
+	phy_write(phydev, 0x12, 0x0048);
+
+	if (rev == 0) {
+		phy_write(phydev, 0x1f, 0x2a30);
+		phy_write(phydev, 0x14, 0x6600);
+		phy_write(phydev, 0x1f, 0x0000);
+		phy_write(phydev, 0x18, 0xa24e);
+	} else {
+		phy_write(phydev, 0x1f, 0x2a30);
+		phy_modify(phydev, 0x16, 0x0fc0, 0x0240);
+		phy_modify(phydev, 0x14, 0x6000, 0x4000);
+		/* bits 14-15 in extended register 0x14 controls DACG amplitude
+		 * 6 = -8%, 2 is hardware default
+		 */
+		phy_write(phydev, 0x1f, 0x0001);
+		phy_modify(phydev, 0x14, 0xe000, 0x6000);
+		phy_write(phydev, 0x1f, 0x0000);
+	}
+
+	vsc73xx_config_init(phydev);
+
+	return genphy_config_init(phydev);
+}
+
+/* phy_driver .config_init hook for the PHYs inside VSC7395/VSC7398.
+ *
+ * Plays the vendor's init sequence, then runs the common VSC73xx set-up
+ * and the generic PHY initialisation. Returns the result of
+ * genphy_config_init(); the results of the individual register accesses
+ * are not checked.
+ */
+static int vsc739x_config_init(struct phy_device *phydev)
+{
+	/* This magic sequence appears in the VSC7395 SparX-G5e application
+	 * note "VSC7395/VSC7398 PHY Configuration"
+	 *
+	 * Maybe one day we will get to know what it all means.
+	 *
+	 * Register 0x1f is the page register; the statements between two
+	 * writes to it operate on the page selected by the first.
+	 */
+	phy_write(phydev, 0x1f, 0x2a30);
+	phy_modify(phydev, 0x08, 0x0200, 0x0200);
+	phy_write(phydev, 0x1f, 0x52b5);
+	phy_write(phydev, 0x10, 0xb68a);
+	phy_modify(phydev, 0x12, 0xff07, 0x0003);
+	phy_modify(phydev, 0x11, 0x00ff, 0x00a2);
+	phy_write(phydev, 0x10, 0x968a);
+	phy_write(phydev, 0x1f, 0x2a30);
+	phy_modify(phydev, 0x08, 0x0200, 0x0000);
+	phy_write(phydev, 0x1f, 0x0000);
+
+	/* Page 0 is selected once more although the write above already
+	 * selected it.
+	 */
+	phy_write(phydev, 0x1f, 0x0000);
+	phy_write(phydev, 0x12, 0x0048);
+	phy_write(phydev, 0x1f, 0x2a30);
+	phy_modify(phydev, 0x16, 0x0fc0, 0x0240);
+	phy_modify(phydev, 0x14, 0x6000, 0x4000);
+	/* Same page 1 / register 0x14 update as the non-zero revision
+	 * branch of vsc738x_config_init().
+	 */
+	phy_write(phydev, 0x1f, 0x0001);
+	phy_modify(phydev, 0x14, 0xe000, 0x6000);
+	phy_write(phydev, 0x1f, 0x0000);
+
+	vsc73xx_config_init(phydev);
+
+	return genphy_config_init(phydev);
+}
+
 /* This adds a skew for both TX and RX clocks, so the skew should only be
  * applied to "rgmii-id" interfaces. It may not work as expected
  * on "rgmii-txid", "rgmii-rxid" or "rgmii" interfaces. */
@@ -318,6 +444,38 @@ static struct phy_driver vsc82xx_driver[] = {
 	.config_init    = &vsc8601_config_init,
 	.ack_interrupt  = &vsc824x_ack_interrupt,
 	.config_intr    = &vsc82xx_config_intr,
+}, {
+	.phy_id         = PHY_ID_VSC7385,
+	.name           = "Vitesse VSC7385",
+	.phy_id_mask    = 0x000ffff0,
+	.features       = PHY_GBIT_FEATURES,
+	.config_init    = vsc738x_config_init,
+	.read_page      = vsc73xx_read_page,
+	.write_page     = vsc73xx_write_page,
+}, {
+	.phy_id         = PHY_ID_VSC7388,
+	.name           = "Vitesse VSC7388",
+	.phy_id_mask    = 0x000ffff0,
+	.features       = PHY_GBIT_FEATURES,
+	.config_init    = vsc738x_config_init,
+	.read_page      = vsc73xx_read_page,
+	.write_page     = vsc73xx_write_page,
+}, {
+	.phy_id         = PHY_ID_VSC7395,
+	.name           = "Vitesse VSC7395",
+	.phy_id_mask    = 0x000ffff0,
+	.features       = PHY_GBIT_FEATURES,
+	.config_init    = vsc739x_config_init,
+	.read_page      = vsc73xx_read_page,
+	.write_page     = vsc73xx_write_page,
+}, {
+	.phy_id         = PHY_ID_VSC7398,
+	.name           = "Vitesse VSC7398",
+	.phy_id_mask    = 0x000ffff0,
+	.features       = PHY_GBIT_FEATURES,
+	.config_init    = vsc739x_config_init,
+	.read_page      = vsc73xx_read_page,
+	.write_page     = vsc73xx_write_page,
 }, {
 	.phy_id         = PHY_ID_VSC8662,
 	.name           = "Vitesse VSC8662",
@@ -358,6 +516,10 @@ static struct mdio_device_id __maybe_unused vitesse_tbl[] = {
 	{ PHY_ID_VSC8514, 0x000ffff0 },
 	{ PHY_ID_VSC8572, 0x000ffff0 },
 	{ PHY_ID_VSC8574, 0x000ffff0 },
+	{ PHY_ID_VSC7385, 0x000ffff0 },
+	{ PHY_ID_VSC7388, 0x000ffff0 },
+	{ PHY_ID_VSC7395, 0x000ffff0 },
+	{ PHY_ID_VSC7398, 0x000ffff0 },
 	{ PHY_ID_VSC8662, 0x000ffff0 },
 	{ PHY_ID_VSC8221, 0x000ffff0 },
 	{ PHY_ID_VSC8211, 0x000ffff0 },
-- 
2.17.1

^ permalink raw reply related

* [PATCH 1/3] net: dsa: Add DT bindings for Vitesse VSC73xx switches
From: Linus Walleij @ 2018-06-14 12:35 UTC (permalink / raw)
  To: Andrew Lunn, Vivien Didelot, Florian Fainelli
  Cc: netdev, openwrt-devel, LEDE Development List, Gabor Juhos,
	Linus Walleij, devicetree
In-Reply-To: <20180614123534.8063-1-linus.walleij@linaro.org>

This adds the device tree bindings for the Vitesse VSC73xx
switches. We also add the vendor name for Vitesse.

Cc: devicetree@vger.kernel.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 .../bindings/net/dsa/vitesse,vsc73xx.txt      | 81 +++++++++++++++++++
 .../devicetree/bindings/vendor-prefixes.txt   |  1 +
 2 files changed, 82 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt

diff --git a/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt b/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
new file mode 100644
index 000000000000..474cdba5fa37
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
@@ -0,0 +1,81 @@
+Vitesse VSC73xx Switches
+========================
+
+This defines device tree bindings for the Vitesse VSC73xx switch chips.
+The Vitesse company has been acquired by Microsemi and Microsemi in turn
+acquired by Microchip but retains this vendor branding.
+
+The currently supported switch chips are:
+Vitesse VSC7385 SparX-G5 5+1-port Integrated Gigabit Ethernet Switch
+Vitesse VSC7388 SparX-G8 8-port Integrated Gigabit Ethernet Switch
+Vitesse VSC7395 SparX-G5e 5+1-port Integrated Gigabit Ethernet Switch
+Vitesse VSC7398 SparX-G8e 8-port Integrated Gigabit Ethernet Switch
+
+The device tree node is an SPI device so it must reside inside an SPI bus
+device tree node, see spi/spi-bus.txt
+
+Required properties:
+
+- compatible: must be exactly one of:
+	"vitesse,vsc7385"
+	"vitesse,vsc7388"
+	"vitesse,vsc7395"
+	"vitesse,vsc7398"
+- gpio-controller: indicates that this switch is also a GPIO controller,
+  see gpio/gpio.txt
+- #gpio-cells: this must be set to <2> and indicates that we are a two-cell
+  GPIO controller.
+
+Optional properties:
+
+- reset-gpios: a handle to a GPIO line that can issue reset of the chip.
+  It should be tagged as active low.
+
+Required subnodes:
+
+See net/dsa/dsa.txt for a list of additional required and optional properties
+and subnodes of DSA switches.
+
+Examples:
+
+switch@0 {
+	compatible = "vitesse,vsc7395";
+	reg = <0>;
+	/* Specified for 2.5 MHz or below */
+	spi-max-frequency = <2500000>;
+	gpio-controller;
+	#gpio-cells = <2>;
+
+	ports {
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		port@0 {
+			reg = <0>;
+			label = "lan1";
+		};
+		port@1 {
+			reg = <1>;
+			label = "lan2";
+		};
+		port@2 {
+			reg = <2>;
+			label = "lan3";
+		};
+		port@3 {
+			reg = <3>;
+			label = "lan4";
+		};
+		vsc: port@6 {
+			reg = <6>;
+			label = "cpu";
+			ethernet = <&gmac1>;
+			phy-mode = "rgmii";
+			fixed-link {
+				speed = <1000>;
+				full-duplex;
+				pause;
+			};
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index b5f978a4cac6..e8473894700c 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -385,6 +385,7 @@ v3	V3 Semiconductor
 variscite	Variscite Ltd.
 via	VIA Technologies, Inc.
 virtio	Virtual I/O Device Specification, developed by the OASIS consortium
+vitesse	Vitesse Semiconductor Corporation
 vivante	Vivante Corporation
 vocore VoCore Studio
 voipac	Voipac Technologies s.r.o.
-- 
2.17.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox