Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 03/11] netfilter: fix nf_conntrack_helper documentation
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Since kernel 4.7 this defaults to off.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/nf_conntrack-sysctl.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/nf_conntrack-sysctl.txt b/Documentation/networking/nf_conntrack-sysctl.txt
index 399e4e866a9c..433b6724797a 100644
--- a/Documentation/networking/nf_conntrack-sysctl.txt
+++ b/Documentation/networking/nf_conntrack-sysctl.txt
@@ -62,10 +62,13 @@ nf_conntrack_generic_timeout - INTEGER (seconds)
 	protocols.
 
 nf_conntrack_helper - BOOLEAN
-	0 - disabled
-	not 0 - enabled (default)
+	0 - disabled (default)
+	not 0 - enabled
 
 	Enable automatic conntrack helper assignment.
+	If disabled it is required to set up iptables rules to assign
+	helpers to connections.  See the CT target description in the
+	iptables-extensions(8) man page for further information.
 
 nf_conntrack_icmp_timeout - INTEGER (seconds)
 	default 30
-- 
2.1.4


^ permalink raw reply related

* [PATCH 06/11] netfilter: nat: switch to new rhlist interface
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

I got offlist bug report about failing connections and high cpu usage.
This happens because we hit 'elasticity' checks in rhashtable that
refuses bucket list exceeding 16 entries.

The nat bysrc hash unfortunately needs to insert distinct objects that
share same key and are identical (have same source tuple), this cannot
be avoided.

Switch to the rhlist interface which is designed for this.

The nulls_base is removed here, I don't think its needed:

A (unlikely) false positive results in unneeded port clash resolution,
a false negative results in packet drop during conntrack confirmation,
when we try to insert the duplicate into main conntrack hash table.

Tested by adding multiple ip addresses to host, then adding
iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE

... and then creating multiple connections, from same source port but
different addresses:

for i in $(seq 2000 2032);do nc -p 1234 192.168.7.1 $i > /dev/null  & done

(all of these then get hashed to same bysource slot)

Then, to test that nat conflict resultion is working:

nc -s 10.0.0.1 -p 1234 192.168.7.1 2000
nc -s 10.0.0.2 -p 1234 192.168.7.1 2000

tcp  .. src=10.0.0.1 dst=192.168.7.1 sport=1234 dport=2000 src=192.168.7.1 dst=192.168.7.10 sport=2000 dport=1024 [ASSURED]
tcp  .. src=10.0.0.2 dst=192.168.7.1 sport=1234 dport=2000 src=192.168.7.1 dst=192.168.7.10 sport=2000 dport=1025 [ASSURED]
tcp  .. src=192.168.7.10 dst=192.168.7.1 sport=1234 dport=2000 src=192.168.7.1 dst=192.168.7.10 sport=2000 dport=1234 [ASSURED]
tcp  .. src=192.168.7.10 dst=192.168.7.1 sport=1234 dport=2001 src=192.168.7.1 dst=192.168.7.10 sport=2001 dport=1234 [ASSURED]
[..]

-> nat altered source ports to 1024 and 1025, respectively.
This can also be confirmed on destination host which shows
ESTAB      0      0   192.168.7.1:2000      192.168.7.10:1024
ESTAB      0      0   192.168.7.1:2000      192.168.7.10:1025
ESTAB      0      0   192.168.7.1:2000      192.168.7.10:1234

Cc: Herbert Xu <herbert@gondor.apana.org.au>
Fixes: 870190a9ec907 ("netfilter: nat: convert nat bysrc hash to rhashtable")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h |  2 +-
 net/netfilter/nf_nat_core.c          | 40 +++++++++++++++++++++---------------
 2 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 50418052a520..dc143ada9762 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -118,7 +118,7 @@ struct nf_conn {
 	struct nf_ct_ext *ext;
 
 #if IS_ENABLED(CONFIG_NF_NAT)
-	struct rhash_head	nat_bysource;
+	struct rhlist_head nat_bysource;
 #endif
 	/* Storage reserved for other modules, must be the last member */
 	union nf_conntrack_proto proto;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index c632429706eb..5b9c884a452e 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -42,7 +42,7 @@ struct nf_nat_conn_key {
 	const struct nf_conntrack_zone *zone;
 };
 
-static struct rhashtable nf_nat_bysource_table;
+static struct rhltable nf_nat_bysource_table;
 
 inline const struct nf_nat_l3proto *
 __nf_nat_l3proto_find(u8 family)
@@ -207,7 +207,6 @@ static struct rhashtable_params nf_nat_bysource_params = {
 	.obj_cmpfn = nf_nat_bysource_cmp,
 	.nelem_hint = 256,
 	.min_size = 1024,
-	.nulls_base = (1U << RHT_BASE_SHIFT),
 };
 
 /* Only called for SRC manip */
@@ -226,12 +225,15 @@ find_appropriate_src(struct net *net,
 		.tuple = tuple,
 		.zone = zone
 	};
+	struct rhlist_head *hl;
 
-	ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key,
-				    nf_nat_bysource_params);
-	if (!ct)
+	hl = rhltable_lookup(&nf_nat_bysource_table, &key,
+			     nf_nat_bysource_params);
+	if (!hl)
 		return 0;
 
+	ct = container_of(hl, typeof(*ct), nat_bysource);
+
 	nf_ct_invert_tuplepr(result,
 			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 	result->dst = tuple->dst;
@@ -449,11 +451,17 @@ nf_nat_setup_info(struct nf_conn *ct,
 	}
 
 	if (maniptype == NF_NAT_MANIP_SRC) {
+		struct nf_nat_conn_key key = {
+			.net = nf_ct_net(ct),
+			.tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+			.zone = nf_ct_zone(ct),
+		};
 		int err;
 
-		err = rhashtable_insert_fast(&nf_nat_bysource_table,
-					     &ct->nat_bysource,
-					     nf_nat_bysource_params);
+		err = rhltable_insert_key(&nf_nat_bysource_table,
+					  &key,
+					  &ct->nat_bysource,
+					  nf_nat_bysource_params);
 		if (err)
 			return NF_DROP;
 	}
@@ -570,8 +578,8 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
 	 * will delete entry from already-freed table.
 	 */
 	ct->status &= ~IPS_NAT_DONE_MASK;
-	rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
-			       nf_nat_bysource_params);
+	rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
+			nf_nat_bysource_params);
 
 	/* don't delete conntrack.  Although that would make things a lot
 	 * simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -701,8 +709,8 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 	if (!nat)
 		return;
 
-	rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource,
-			       nf_nat_bysource_params);
+	rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
+			nf_nat_bysource_params);
 }
 
 static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -837,13 +845,13 @@ static int __init nf_nat_init(void)
 {
 	int ret;
 
-	ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
+	ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
 	if (ret)
 		return ret;
 
 	ret = nf_ct_extend_register(&nat_extend);
 	if (ret < 0) {
-		rhashtable_destroy(&nf_nat_bysource_table);
+		rhltable_destroy(&nf_nat_bysource_table);
 		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
 		return ret;
 	}
@@ -867,7 +875,7 @@ static int __init nf_nat_init(void)
 	return 0;
 
  cleanup_extend:
-	rhashtable_destroy(&nf_nat_bysource_table);
+	rhltable_destroy(&nf_nat_bysource_table);
 	nf_ct_extend_unregister(&nat_extend);
 	return ret;
 }
@@ -886,7 +894,7 @@ static void __exit nf_nat_cleanup(void)
 	for (i = 0; i < NFPROTO_NUMPROTO; i++)
 		kfree(nf_nat_l4protos[i]);
 
-	rhashtable_destroy(&nf_nat_bysource_table);
+	rhltable_destroy(&nf_nat_bysource_table);
 }
 
 MODULE_LICENSE("GPL");
-- 
2.1.4


^ permalink raw reply related

* [PATCH 01/11] netfilter: Update ip_route_me_harder to consider L3 domain
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: David Ahern <dsa@cumulusnetworks.com>

ip_route_me_harder is not considering the L3 domain and sending lookups
to the wrong table. For example consider the following output rule:

iptables -I OUTPUT -p tcp --dport 12345 -j REJECT --reject-with tcp-reset

using perf to analyze lookups via the fib_table_lookup tracepoint shows:

vrf-test  1187 [001] 46887.295927: fib:fib_table_lookup: table 255 oif 0 iif 0 src 0.0.0.0 dst 10.100.1.254 tos 0 scope 0 flags 0
        ffffffff8143922c perf_trace_fib_table_lookup ([kernel.kallsyms])
        ffffffff81493aac fib_table_lookup ([kernel.kallsyms])
        ffffffff8148dda3 __inet_dev_addr_type ([kernel.kallsyms])
        ffffffff8148ddf6 inet_addr_type ([kernel.kallsyms])
        ffffffff8149e344 ip_route_me_harder ([kernel.kallsyms])

and

vrf-test  1187 [001] 46887.295933: fib:fib_table_lookup: table 255 oif 0 iif 1 src 10.100.1.254 dst 10.100.1.2 tos 0 scope 0 flags
        ffffffff8143922c perf_trace_fib_table_lookup ([kernel.kallsyms])
        ffffffff81493aac fib_table_lookup ([kernel.kallsyms])
        ffffffff814998ff fib4_rule_action ([kernel.kallsyms])
        ffffffff81437f35 fib_rules_lookup ([kernel.kallsyms])
        ffffffff81499758 __fib_lookup ([kernel.kallsyms])
        ffffffff8144f010 fib_lookup.constprop.34 ([kernel.kallsyms])
        ffffffff8144f759 __ip_route_output_key_hash ([kernel.kallsyms])
        ffffffff8144fc6a ip_route_output_flow ([kernel.kallsyms])
        ffffffff8149e39b ip_route_me_harder ([kernel.kallsyms])

In both cases the lookups are directed to table 255 rather than the
table associated with the device via the L3 domain. Update both
lookups to pull the L3 domain from the dst currently attached to the
skb.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c3776ff6749f..b3cc1335adbc 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -24,10 +24,11 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
 	struct flowi4 fl4 = {};
 	__be32 saddr = iph->saddr;
 	__u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+	struct net_device *dev = skb_dst(skb)->dev;
 	unsigned int hh_len;
 
 	if (addr_type == RTN_UNSPEC)
-		addr_type = inet_addr_type(net, saddr);
+		addr_type = inet_addr_type_dev_table(net, dev, saddr);
 	if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
 		flags |= FLOWI_FLAG_ANYSRC;
 	else
@@ -40,6 +41,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
 	fl4.saddr = saddr;
 	fl4.flowi4_tos = RT_TOS(iph->tos);
 	fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+	if (!fl4.flowi4_oif)
+		fl4.flowi4_oif = l3mdev_master_ifindex(dev);
 	fl4.flowi4_mark = skb->mark;
 	fl4.flowi4_flags = flags;
 	rt = ip_route_output_key(net, &fl4);
-- 
2.1.4

^ permalink raw reply related

* [PATCH 05/11] netfilter: nat: fix cmp return value
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

The comparator works like memcmp, i.e. 0 means objects are equal.
In other words, when objects are distinct they are treated as identical,
when they are distinct they are allegedly the same.

The first case is rare (distinct objects are unlikely to get hashed to
same bucket).

The second case results in unneeded port conflict resolutions attempts.

Fixes: 870190a9ec907 ("netfilter: nat: convert nat bysrc hash to rhashtable")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_core.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index bbb8f3df79f7..c632429706eb 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -193,9 +193,12 @@ static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
 	const struct nf_nat_conn_key *key = arg->key;
 	const struct nf_conn *ct = obj;
 
-	return same_src(ct, key->tuple) &&
-	       net_eq(nf_ct_net(ct), key->net) &&
-	       nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL);
+	if (!same_src(ct, key->tuple) ||
+	    !net_eq(nf_ct_net(ct), key->net) ||
+	    !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
+		return 1;
+
+	return 0;
 }
 
 static struct rhashtable_params nf_nat_bysource_params = {
-- 
2.1.4

^ permalink raw reply related

* [PATCH 07/11] netfilter: nf_tables: fix inconsistent element expiration calculation
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: "Anders K. Pedersen" <akp@cohaesio.com>

As Liping Zhang reports, after commit a8b1e36d0d1d ("netfilter: nft_dynset:
fix element timeout for HZ != 1000"), priv->timeout was stored in jiffies,
while set->timeout was stored in milliseconds. This is inconsistent and
incorrect.

Firstly, we already call msecs_to_jiffies in nft_set_elem_init, so
priv->timeout will be converted to jiffies twice.

Secondly, if the user did not specify the NFTA_DYNSET_TIMEOUT attr,
set->timeout will be used, but we forget to call msecs_to_jiffies
when do update elements.

Fix this by using jiffies internally for traditional sets and doing the
conversions to/from msec when interacting with userspace - as dynset
already does.

This is preferable to doing the conversions, when elements are inserted or
updated, because this can happen very frequently on busy dynsets.

Fixes: a8b1e36d0d1d ("netfilter: nft_dynset: fix element timeout for HZ != 1000")
Reported-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Anders K. Pedersen <akp@cohaesio.com>
Acked-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  2 +-
 net/netfilter/nf_tables_api.c     | 14 +++++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index d79d1e9b9546..b02af0bf5777 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -313,7 +313,7 @@ void nft_unregister_set(struct nft_set_ops *ops);
  * 	@size: maximum set size
  * 	@nelems: number of elements
  * 	@ndeact: number of deactivated elements queued for removal
- * 	@timeout: default timeout value in msecs
+ *	@timeout: default timeout value in jiffies
  * 	@gc_int: garbage collection interval in msecs
  *	@policy: set parameterization (see enum nft_set_policies)
  *	@udlen: user data length
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 026581b04ea8..e5194f6f906c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2570,7 +2570,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 	}
 
 	if (set->timeout &&
-	    nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout),
+	    nla_put_be64(skb, NFTA_SET_TIMEOUT,
+			 cpu_to_be64(jiffies_to_msecs(set->timeout)),
 			 NFTA_SET_PAD))
 		goto nla_put_failure;
 	if (set->gc_int &&
@@ -2859,7 +2860,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	if (nla[NFTA_SET_TIMEOUT] != NULL) {
 		if (!(flags & NFT_SET_TIMEOUT))
 			return -EINVAL;
-		timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_TIMEOUT]));
+		timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
+						nla[NFTA_SET_TIMEOUT])));
 	}
 	gc_int = 0;
 	if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
@@ -3178,7 +3180,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
 
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
 	    nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT,
-			 cpu_to_be64(*nft_set_ext_timeout(ext)),
+			 cpu_to_be64(jiffies_to_msecs(
+						*nft_set_ext_timeout(ext))),
 			 NFTA_SET_ELEM_PAD))
 		goto nla_put_failure;
 
@@ -3447,7 +3450,7 @@ void *nft_set_elem_init(const struct nft_set *set,
 		memcpy(nft_set_ext_data(ext), data, set->dlen);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION))
 		*nft_set_ext_expiration(ext) =
-			jiffies + msecs_to_jiffies(timeout);
+			jiffies + timeout;
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
 		*nft_set_ext_timeout(ext) = timeout;
 
@@ -3535,7 +3538,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
 		if (!(set->flags & NFT_SET_TIMEOUT))
 			return -EINVAL;
-		timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_ELEM_TIMEOUT]));
+		timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64(
+					nla[NFTA_SET_ELEM_TIMEOUT])));
 	} else if (set->flags & NFT_SET_TIMEOUT) {
 		timeout = set->timeout;
 	}
-- 
2.1.4

^ permalink raw reply related

* [PATCH 02/11] netfilter: Update nf_send_reset6 to consider L3 domain
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: David Ahern <dsa@cumulusnetworks.com>

nf_send_reset6 is not considering the L3 domain and lookups are sent
to the wrong table. For example consider the following output rule:

ip6tables -A OUTPUT -p tcp --dport 12345 -j REJECT --reject-with tcp-reset

using perf to analyze lookups via the fib6_table_lookup tracepoint shows:

swapper     0 [001]   248.787816: fib6:fib6_table_lookup: table 255 oif 0 iif 1 src 2100:1::3 dst 2100:1:
        ffffffff81439cdc perf_trace_fib6_table_lookup ([kernel.kallsyms])
        ffffffff814c1ce3 trace_fib6_table_lookup ([kernel.kallsyms])
        ffffffff814c3e89 ip6_pol_route ([kernel.kallsyms])
        ffffffff814c40d5 ip6_pol_route_output ([kernel.kallsyms])
        ffffffff814e7b6f fib6_rule_action ([kernel.kallsyms])
        ffffffff81437f60 fib_rules_lookup ([kernel.kallsyms])
        ffffffff814e7c79 fib6_rule_lookup ([kernel.kallsyms])
        ffffffff814c2541 ip6_route_output_flags ([kernel.kallsyms])
                     528 nf_send_reset6 ([nf_reject_ipv6])

The lookup is directed to table 255 rather than the table associated with
the device via the L3 domain. Update nf_send_reset6 to pull the L3 domain
from the dst currently attached to the skb.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_reject_ipv6.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index a5400223fd74..10090400c72f 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -156,6 +156,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
 	fl6.daddr = oip6h->saddr;
 	fl6.fl6_sport = otcph->dest;
 	fl6.fl6_dport = otcph->source;
+	fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
 	security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
 	dst = ip6_route_output(net, NULL, &fl6);
 	if (dst->error) {
-- 
2.1.4

^ permalink raw reply related

* [PATCH 04/11] netfilter: nft_hash: validate maximum value of u32 netlink hash attribute
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: Laura Garcia Liebana <nevola@gmail.com>

Use the function nft_parse_u32_check() to fetch the value and validate
the u32 attribute into the hash len u8 field.

This patch revisits 4da449ae1df9 ("netfilter: nft_exthdr: Add size check
on u8 nft_exthdr attributes").

Fixes: cb1b69b0b15b ("netfilter: nf_tables: add hash expression")
Signed-off-by: Laura Garcia Liebana <nevola@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_hash.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index baf694de3935..d5447a22275c 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -53,6 +53,7 @@ static int nft_hash_init(const struct nft_ctx *ctx,
 {
 	struct nft_hash *priv = nft_expr_priv(expr);
 	u32 len;
+	int err;
 
 	if (!tb[NFTA_HASH_SREG] ||
 	    !tb[NFTA_HASH_DREG] ||
@@ -67,8 +68,10 @@ static int nft_hash_init(const struct nft_ctx *ctx,
 	priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]);
 	priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]);
 
-	len = ntohl(nla_get_be32(tb[NFTA_HASH_LEN]));
-	if (len == 0 || len > U8_MAX)
+	err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len);
+	if (err < 0)
+		return err;
+	if (len == 0)
 		return -ERANGE;
 
 	priv->len = len;
-- 
2.1.4

^ permalink raw reply related

* [PATCH 08/11] netfilter: nft_range: add the missing NULL pointer check
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: Liping Zhang <zlpnobody@gmail.com>

Otherwise, kernel panic will happen if the user does not specify
the related attributes.

Fixes: 0f3cd9b36977 ("netfilter: nf_tables: add range expression")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_range.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index fbc88009ca2e..8f0aaaea1376 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -59,6 +59,12 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
 	int err;
 	u32 op;
 
+	if (!tb[NFTA_RANGE_SREG]      ||
+	    !tb[NFTA_RANGE_OP]	      ||
+	    !tb[NFTA_RANGE_FROM_DATA] ||
+	    !tb[NFTA_RANGE_TO_DATA])
+		return -EINVAL;
+
 	err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from),
 			    &desc_from, tb[NFTA_RANGE_FROM_DATA]);
 	if (err < 0)
-- 
2.1.4

^ permalink raw reply related

* [PATCH 11/11] netfilter: arp_tables: fix invoking 32bit "iptable -P INPUT ACCEPT" failed in 64bit kernel
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: Hongxu Jia <hongxu.jia@windriver.com>

Since 09d9686047db ("netfilter: x_tables: do compat validation via
translate_table"), it used compatr structure to assign newinfo
structure.  In translate_compat_table of ip_tables.c and ip6_tables.c,
it used compatr->hook_entry to replace info->hook_entry and
compatr->underflow to replace info->underflow, but not do the same
replacement in arp_tables.c.

It caused invoking 32-bit "arptbale -P INPUT ACCEPT" failed in 64bit
kernel.
--------------------------------------
root@qemux86-64:~# arptables -P INPUT ACCEPT
root@qemux86-64:~# arptables -P INPUT ACCEPT
ERROR: Policy for `INPUT' offset 448 != underflow 0
arptables: Incompatible with this kernel
--------------------------------------

Fixes: 09d9686047db ("netfilter: x_tables: do compat validation via translate_table")
Signed-off-by: Hongxu Jia <hongxu.jia@windriver.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b31df597fd37..697538464e6e 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1201,8 +1201,8 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 
 	newinfo->number = compatr->num_entries;
 	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
-		newinfo->hook_entry[i] = info->hook_entry[i];
-		newinfo->underflow[i] = info->underflow[i];
+		newinfo->hook_entry[i] = compatr->hook_entry[i];
+		newinfo->underflow[i] = compatr->underflow[i];
 	}
 	entry1 = newinfo->entries;
 	pos = entry1;
-- 
2.1.4

^ permalink raw reply related

* [PATCH 10/11] netfilter: ipv6: nf_defrag: drop mangled skb on ream error
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Dmitry Vyukov reported GPF in network stack that Andrey traced down to
negative nh offset in nf_ct_frag6_queue().

Problem is that all network headers before fragment header are pulled.
Normal ipv6 reassembly will drop the skb when errors occur further down
the line.

netfilter doesn't do this, and instead passed the original fragment
along.  That was also fine back when netfilter ipv6 defrag worked with
cloned fragments, as the original, pristine fragment was passed on.

So we either have to undo the pull op, or discard such fragments.
Since they're malformed after all (e.g. overlapping fragment) it seems
preferrable to just drop them.

Same for temporary errors -- it doesn't make sense to accept (and
perhaps forward!) only some fragments of same datagram.

Fixes: 029f7f3b8701cc7ac ("netfilter: ipv6: nf_defrag: avoid/free clone operations")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Debugged-by: Andrey Konovalov <andreyknvl@google.com>
Diagnosed-by: Eric Dumazet <Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c   | 4 ++--
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index e4347aeb2e65..9948b5ce52da 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -576,11 +576,11 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	/* Jumbo payload inhibits frag. header */
 	if (ipv6_hdr(skb)->payload_len == 0) {
 		pr_debug("payload len = 0\n");
-		return -EINVAL;
+		return 0;
 	}
 
 	if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
-		return -EINVAL;
+		return 0;
 
 	if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr)))
 		return -ENOMEM;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index f7aab5ab93a5..f06b0471f39f 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -69,7 +69,7 @@ static unsigned int ipv6_defrag(void *priv,
 	if (err == -EINPROGRESS)
 		return NF_STOLEN;
 
-	return NF_ACCEPT;
+	return err == 0 ? NF_ACCEPT : NF_DROP;
 }
 
 static struct nf_hook_ops ipv6_defrag_ops[] = {
-- 
2.1.4

^ permalink raw reply related

* [PATCH 09/11] netfilter: nat: fix crash when conntrack entry is re-used
From: Pablo Neira Ayuso @ 2016-11-30 21:57 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1480543045-3389-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

Stas Nichiporovich reports oops in nf_nat_bysource_cmp(), trying to
access nf_conn struct at address 0xffffffffffffff50.

This is the result of fetching a null rhash list (struct embedded at
offset 176; 0 - 176 gets us ...fff50).

The problem is that conntrack entries are allocated from a
SLAB_DESTROY_BY_RCU cache, i.e. entries can be free'd and reused
on another cpu while nf nat bysource hash access the same conntrack entry.

Freeing is fine (we hold rcu read lock); zeroing rhlist_head isn't.

-> Move the rhlist struct outside of the memset()-inited area.

Fixes: 7c9664351980aaa6a ("netfilter: move nat hlist_head to nf_conn")
Reported-by: Stas Nichiporovich <stasn77@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index dc143ada9762..d9d52c020a70 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -100,6 +100,9 @@ struct nf_conn {
 
 	possible_net_t ct_net;
 
+#if IS_ENABLED(CONFIG_NF_NAT)
+	struct rhlist_head nat_bysource;
+#endif
 	/* all members below initialized via memset */
 	u8 __nfct_init_offset[0];
 
@@ -117,9 +120,6 @@ struct nf_conn {
 	/* Extensions */
 	struct nf_ct_ext *ext;
 
-#if IS_ENABLED(CONFIG_NF_NAT)
-	struct rhlist_head nat_bysource;
-#endif
 	/* Storage reserved for other modules, must be the last member */
 	union nf_conntrack_proto proto;
 };
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH net] cdc_ether: Fix handling connection notification
From: Kristian Evensen @ 2016-11-30 21:59 UTC (permalink / raw)
  To: Bjørn Mork
  Cc: Oliver Neukum, linux-usb, Network Development, linux-kernel,
	Henning Schild
In-Reply-To: <87shq83acr.fsf@miraculix.mork.no>

On Wed, Nov 30, 2016 at 10:51 PM, Bjørn Mork <bjorn@mork.no> wrote:
> Kristian Evensen <kristian.evensen@gmail.com> writes:
>
>> +void usbnet_cdc_zte_status(struct usbnet *dev, struct urb *urb)
>> +{
>> +     struct usb_cdc_notification *event;
>> +
>> +     if (urb->actual_length < sizeof(*event))
>> +             return;
>> +
>> +     event = urb->transfer_buffer;
>> +
>> +     if (event->bNotificationType != USB_CDC_NOTIFY_NETWORK_CONNECTION) {
>> +             usbnet_cdc_status(dev, urb);
>> +             return;
>> +     }
>> +
>> +     netif_dbg(dev, timer, dev->net, "CDC: carrier %s\n",
>> +               event->wValue ? "on" : "off");
>> +
>> +     if (event->wValue &&
>> +         !test_bit(__LINK_STATE_NOCARRIER, &dev->net->state))
>> +             usbnet_link_change(dev, 0, 0);
>> +
>> +     usbnet_link_change(dev, !!event->wValue, 0);
>> +}
>
> As Henning said: Use netif_carrier_ok instead of open coding it.
>
> But I also think you need to replace the first usbnet_link_change() with
> a plain netif_carrier_off(dev->net).  Calling usbnet_link_change() twice
> here is only going to set off the "kevent XX may have been dropped"
> message since you call schedule_work() twice without giving the work
> queue a chance to be processed.  No need to do that.

Thanks for the feedback and agree. Will submit a v2 tomorrow.

-Kristian

^ permalink raw reply

* [PATCH] net: atarilance: use %8ph for printing hex string
From: Rasmus Villemoes @ 2016-11-30 22:02 UTC (permalink / raw)
  To: Felipe Balbi; +Cc: Rasmus Villemoes, netdev, linux-kernel

This is already using the %pM printf extension; might as well also use
%ph to make the code smaller.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
---
 drivers/net/ethernet/amd/atarilance.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/amd/atarilance.c b/drivers/net/ethernet/amd/atarilance.c
index d2bc8e5dcd23..d208a0532811 100644
--- a/drivers/net/ethernet/amd/atarilance.c
+++ b/drivers/net/ethernet/amd/atarilance.c
@@ -1013,13 +1013,9 @@ static int lance_rx( struct net_device *dev )
 					u_char *data = PKTBUF_ADDR(head);
 
 					printk(KERN_DEBUG "%s: RX pkt type 0x%04x from %pM to %pM "
-						   "data %02x %02x %02x %02x %02x %02x %02x %02x "
-						   "len %d\n",
+						   "data %8ph len %d\n",
 						   dev->name, ((u_short *)data)[6],
-						   &data[6], data,
-						   data[15], data[16], data[17], data[18],
-						   data[19], data[20], data[21], data[22],
-						   pkt_len);
+						   &data[6], data, &data[15], pkt_len);
 				}
 
 				skb_reserve( skb, 2 );	/* 16 byte align */
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH 4/6] net: ethernet: ti: cpts: add ptp pps support
From: Richard Cochran @ 2016-11-30 22:17 UTC (permalink / raw)
  To: Grygorii Strashko
  Cc: Murali Karicheri, Wingman Kwok, David S. Miller, netdev,
	Mugunthan V N, Sekhar Nori, linux-kernel, linux-omap, Rob Herring,
	devicetree
In-Reply-To: <875d4cc2-8a47-b06d-fb46-0cacc28dbaee@ti.com>

On Wed, Nov 30, 2016 at 02:43:57PM -0600, Grygorii Strashko wrote:
> > In order to produce the PPS edge correctly, you would have to adjust
> > the comparison value whenever cc.mult changes, 
> 
> yes. And that is done in cpts_ptp_adjfreq()
> 	if (cpts->ts_comp_enabled)
> 		cpts->ts_comp_one_sec_cycs = cpts_cc_ns2cyc(cpts, NSEC_PER_SEC);
> 	^^^ re-calculate reload value for 
>  
> 	cpts_ts_comp_settime(cpts, ns);
> 	^^^ adjust the ts_comp

And it races with the pulse itself.  You forgot about this part:

> @@ -172,14 +232,31 @@ static int cpts_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
>  	adj *= ppb;
>  	diff = div_u64(adj, 1000000000ULL);
>  
> +	mutex_lock(&cpts->ptp_clk_mutex);
> +
>  	spin_lock_irqsave(&cpts->lock, flags);
> +	if (cpts->ts_comp_enabled) {
> +		cpts_ts_comp_disable(cpts);

Sorry, but this is a train wreck.

> > but of course this is unworkable.
> > 
> 
> Sry, but this is questionable - code for pps comes from TI internal
> branches (SDK releases) where it survived for a pretty long time.

That doesn't mean the code is any good.  If you adjust at the right
moment, then no pulse occurs at all!

> I'm, of course, agree that without HW support for freq adjustment
> this PPS feature is not super precise and has some limitation,
> but that is what we agree to live with. 

I do NOT agree to live with this.  I am one who is going to have to
explain to the world why their beagle bone PPS sucks.
 
Thanks,
Richard

^ permalink raw reply

* Re: [WIP] net+mlx4: auto doorbell
From: Jesper Dangaard Brouer @ 2016-11-30 22:30 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Rick Jones, netdev, Saeed Mahameed, Tariq Toukan, Achiad Shochat,
	brouer
In-Reply-To: <1480534200.18162.203.camel@edumazet-glaptop3.roam.corp.google.com>

On Wed, 30 Nov 2016 11:30:00 -0800
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Wed, 2016-11-30 at 20:17 +0100, Jesper Dangaard Brouer wrote:
> 
> > Don't take is as critique Eric.  I was hoping your patch would have
> > solved this issue of being sensitive to TX completion adjustments.  You
> > usually have good solutions for difficult issues. I basically rejected
> > Achiad's approach/patch because it was too sensitive to these kind of
> > adjustments.  
> 
> Well, this patch can hurt latencies, because a doorbell can be delayed,
> and softirqs can be delayed by many hundred of usec in some cases.
> 
> I would not enable this behavior by default.

What about another scheme, where dev_hard_start_xmit() can return an
indication that driver choose not to flush (based on TX queue depth),
and there by requesting stack to call flush at a later point.

Would that introduce less latency issues?


Patch muckup (not even compile tested):

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4ffcd874cc20..d7d15e4e6766 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -109,6 +109,7 @@ enum netdev_tx {
 	__NETDEV_TX_MIN	 = INT_MIN,	/* make sure enum is signed */
 	NETDEV_TX_OK	 = 0x00,	/* driver took care of packet */
 	NETDEV_TX_BUSY	 = 0x10,	/* driver tx path was busy*/
+	NETDEV_TX_FLUSHME= 0x04,	/* driver request doorbell/flush later */
 };
 typedef enum netdev_tx netdev_tx_t;
 
@@ -536,6 +537,8 @@ enum netdev_queue_state_t {
 	__QUEUE_STATE_DRV_XOFF,
 	__QUEUE_STATE_STACK_XOFF,
 	__QUEUE_STATE_FROZEN,
+	// __QUEUE_STATE_NEED_FLUSH
+	// is is better to store in txq state?
 };
 
 #define QUEUE_STATE_DRV_XOFF	(1 << __QUEUE_STATE_DRV_XOFF)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6aa0a249672..7480e44c5a50 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -75,6 +75,7 @@ struct Qdisc {
 	void			*u32_node;
 
 	struct netdev_queue	*dev_queue;
+	struct netdev_queue	*flush_dev_queue; // store txq to flush here?
 
 	struct gnet_stats_rate_est64	rate_est;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
@@ -98,6 +99,20 @@ struct Qdisc {
 	spinlock_t		busylock ____cacheline_aligned_in_smp;
 };
 
+static inline void qdisc_request_txq_flush(struct Qdisc *qdisc,
+					   struct netdev_queue *txq)
+{
+	struct net_device dev;
+
+	if (qdisc->flush_dev_queue) {
+		if (likely(qdisc->flush_dev_queue == txq))
+			return;
+		/* Flush existing txq before reassignment */
+		dev_flush_xmit(qdisc_dev(q), txq);
+	}
+	qdisc->flush_dev_queue = txq;
+}
+
 static inline bool qdisc_is_running(const struct Qdisc *qdisc)
 {
 	return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
@@ -117,6 +132,19 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
+	/* flush device txq here, if needed */
+	if (qdisc->flush_dev_queue) {
+		struct netdev_queue *txq = qdisc->flush_dev_queue;
+		struct net_device *dev = qdisc_dev(q);
+
+		qdisc->flush_dev_queue = NULL;
+		dev_flush_xmit(dev, txq);
+		/*
+		 * DISCUSS: it is too soon to flush here? What about
+		 * rescheduling a NAPI poll cycle for this device,
+		 * before calling flush.
+		 */
+	}
 	write_seqcount_end(&qdisc->running);
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 048b46b7c92a..70339c267f33 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2880,6 +2880,15 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_skb_features);
 
+static int dev_flush_xmit(struct net_device *dev,
+			  struct netdev_queue *txq)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	ops->ndo_flush_xmit(dev, txq);
+	// Oh oh, do we need to take HARD_TX_LOCK ??
+}
+
 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 		    struct netdev_queue *txq, bool more)
 {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6cfb6e9038c2..55c01b6f6311 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -190,6 +190,13 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
 	if (dev_xmit_complete(ret)) {
 		/* Driver sent out skb successfully or skb was consumed */
+		if (ret == NETDEV_TX_FLUSHME) {
+			/* Driver choose no-TX-doorbell MMIO write.
+			 * This made taking qdisc root_lock less expensive.
+			 */
+			qdisc_request_txq_flush(q, txq);
+			// Flush happens later in qdisc_run_end()
+		}
 		ret = qdisc_qlen(q);
 	} else {
 		/* Driver returned NETDEV_TX_BUSY - requeue skb */


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply related

* [PATCH] tcp_bbr: fix Kconfig to be able to make BBR the default congestion algorithm
From: Bernhard Held @ 2016-11-30 22:31 UTC (permalink / raw)
  To: David S. Miller, linux-kernel, netdev; +Cc: Neal Cardwell

Add missing line to be able to make BBR the default
congestion algorithm.

Signed-off-by: Bernhard Held <berny156@gmx.de>
---
  net/ipv4/Kconfig | 1 +
  1 file changed, 1 insertion(+)

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 300b068..b54b3ca 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -715,6 +715,7 @@ config DEFAULT_TCP_CONG
         default "reno" if DEFAULT_RENO
         default "dctcp" if DEFAULT_DCTCP
         default "cdg" if DEFAULT_CDG
+       default "bbr" if DEFAULT_BBR
         default "cubic"

  config TCP_MD5SIG

^ permalink raw reply related

* Re: [PATCH v2 1/7] net: dt-bindings: add RGMII TX delay configuration to meson8b-dwmac
From: Martin Blumenstingl @ 2016-11-30 22:33 UTC (permalink / raw)
  To: Rob Herring
  Cc: linux-amlogic, devicetree, netdev, davem, khilman, mark.rutland,
	linux-arm-kernel, alexandre.torgue, peppe.cavallaro, will.deacon,
	catalin.marinas, carlo, f.fainelli
In-Reply-To: <20161130214429.albylffpoumbeugi@rob-hp-laptop>

On Wed, Nov 30, 2016 at 10:44 PM, Rob Herring <robh@kernel.org> wrote:
> On Fri, Nov 25, 2016 at 02:01:50PM +0100, Martin Blumenstingl wrote:
>> This allows configuring the RGMII TX clock delay. The RGMII clock is
>> generated by underlying hardware of the the Meson 8b / GXBB DWMAC glue.
>> The configuration depends on the actual hardware (no delay may be
>> needed due to the design of the actual circuit, the PHY might add this
>> delay, etc.).
>>
>> Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
>> ---
>>  Documentation/devicetree/bindings/net/meson-dwmac.txt | 14 ++++++++++++++
>>  1 file changed, 14 insertions(+)
>>
>> diff --git a/Documentation/devicetree/bindings/net/meson-dwmac.txt b/Documentation/devicetree/bindings/net/meson-dwmac.txt
>> index 89e62dd..f8bc540 100644
>> --- a/Documentation/devicetree/bindings/net/meson-dwmac.txt
>> +++ b/Documentation/devicetree/bindings/net/meson-dwmac.txt
>> @@ -25,6 +25,20 @@ Required properties on Meson8b and newer:
>>               - "clkin0" - first parent clock of the internal mux
>>               - "clkin1" - second parent clock of the internal mux
>>
>> +Optional properties on Meson8b and newer:
>> +- amlogic,tx-delay-ns:       The internal RGMII TX clock delay (provided
>> +                     by this driver) in nanoseconds. Allowed values
>> +                     are: 0ns, 2ns, 4ns, 6ns.
>> +                     This must be configured when the phy-mode is
>> +                     "rgmii" (typically a value of 2ns is used in
>> +                     this case).
>> +                     When phy-mode is set to "rgmii-id" or
>> +                     "rgmii-txid" the TX clock delay is already
>> +                     provided by the PHY. In that case this
>> +                     property should be set to 0ns (which disables
>> +                     the TX clock delay in the MAC to prevent the
>> +                     clock from going off because both PHY and MAC
>> +                     are adding a delay).
>
> What's the default? Why can't no property mean 0ns delay?
This value (2ns) was previously hardcoded. Having a default of 0ns
means that patch 7 ("ARM64: dts: amlogic: add the ethernet TX delay
configuration") becomes mandatory (otherwise we'll have broken Gbit
ethernet again because the TX delay is now disabled in the PHY when
phy-mode is "rgmii" and additionally we don't apply the 2ns TX delay
on the MAC side when the property is missing).
I'm fine with either way though - just let me know so I can adjust the
code accordingly.

^ permalink raw reply

* [PATCH 1/2] net: ethernet: altera: TSE: Remove unneeded dma sync for tx buffers
From: Lino Sanfilippo @ 2016-11-30 22:48 UTC (permalink / raw)
  To: vbridger; +Cc: nios2-dev, linux-kernel, netdev, Lino Sanfilippo

An explicit dma sync for device directly after mapping as well as an
explicit dma sync for cpu directly before unmapping is unnecessary and
costly on the hotpath. So remove these calls.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
---
 drivers/net/ethernet/altera/altera_tse_main.c | 10 ----------
 1 file changed, 10 deletions(-)

 Please note that this is only compile tested since I do not have the
 concerning hardware.

diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c
index bda31f3..16c4163 100644
--- a/drivers/net/ethernet/altera/altera_tse_main.c
+++ b/drivers/net/ethernet/altera/altera_tse_main.c
@@ -400,12 +400,6 @@ static int tse_rx(struct altera_tse_private *priv, int limit)
 
 		skb_put(skb, pktlength);
 
-		/* make cache consistent with receive packet buffer */
-		dma_sync_single_for_cpu(priv->device,
-					priv->rx_ring[entry].dma_addr,
-					priv->rx_ring[entry].len,
-					DMA_FROM_DEVICE);
-
 		dma_unmap_single(priv->device, priv->rx_ring[entry].dma_addr,
 				 priv->rx_ring[entry].len, DMA_FROM_DEVICE);
 
@@ -592,10 +586,6 @@ static int tse_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	buffer->dma_addr = dma_addr;
 	buffer->len = nopaged_len;
 
-	/* Push data out of the cache hierarchy into main memory */
-	dma_sync_single_for_device(priv->device, buffer->dma_addr,
-				   buffer->len, DMA_TO_DEVICE);
-
 	priv->dmaops->tx_buffer(priv, buffer);
 
 	skb_tx_timestamp(skb);
-- 
2.7.4

^ permalink raw reply related

* [PATCH 2/2] net: ethernet: altera: TSE: do not use tx queue lock in tx completion handler
From: Lino Sanfilippo @ 2016-11-30 22:48 UTC (permalink / raw)
  To: vbridger; +Cc: nios2-dev, linux-kernel, netdev, Lino Sanfilippo
In-Reply-To: <1480546112-3099-1-git-send-email-LinoSanfilippo@gmx.de>

The driver already uses its private lock for synchronization between xmit
and xmit completion handler making the additional use of the xmit_lock
unnecessary.
Furthermore the driver does not set NETIF_F_LLTX resulting in xmit to be
called with the xmit_lock held and then taking the private lock while xmit
completion handler does the reverse, first take the private lock, then the
xmit_lock.
Fix these issues by not taking the xmit_lock in the tx completion handler.

Signed-off-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
---
 drivers/net/ethernet/altera/altera_tse_main.c | 2 --
 1 file changed, 2 deletions(-)

 Please note that this is only compile tested since I do not have the
 concerning hardware.

diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c
index 16c4163..cddc532 100644
--- a/drivers/net/ethernet/altera/altera_tse_main.c
+++ b/drivers/net/ethernet/altera/altera_tse_main.c
@@ -463,7 +463,6 @@ static int tse_tx_complete(struct altera_tse_private *priv)

 	if (unlikely(netif_queue_stopped(priv->dev) &&
 		     tse_tx_avail(priv) > TSE_TX_THRESH(priv))) {
-		netif_tx_lock(priv->dev);
 		if (netif_queue_stopped(priv->dev) &&
 		    tse_tx_avail(priv) > TSE_TX_THRESH(priv)) {
 			if (netif_msg_tx_done(priv))
@@ -471,7 +470,6 @@ static int tse_tx_complete(struct altera_tse_private *priv)
 					   __func__);
 			netif_wake_queue(priv->dev);
 		}
-		netif_tx_unlock(priv->dev);
 	}

 	spin_unlock(&priv->tx_lock);
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next] bpf, xdp: drop rcu_read_lock from bpf_prog_run_xdp and move to caller
From: Jakub Kicinski @ 2016-11-30 22:50 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: davem, alexei.starovoitov, saeedm, Yuval.Mintz, netdev
In-Reply-To: <b70c47e0284823e6a5db600ae75c01eac4cf7922.1480538565.git.daniel@iogearbox.net>

On Wed, 30 Nov 2016 22:16:06 +0100, Daniel Borkmann wrote:
> After 326fe02d1ed6 ("net/mlx4_en: protect ring->xdp_prog with rcu_read_lock"),
> the rcu_read_lock() in bpf_prog_run_xdp() is superfluous, since callers
> need to hold rcu_read_lock() already to make sure BPF program doesn't
> get released in the background.
> 
> Thus, drop it from bpf_prog_run_xdp(), as it can otherwise be misleading.
> Still keeping the bpf_prog_run_xdp() is useful as it allows for grepping
> in XDP supported drivers and to keep the typecheck on the context intact.
> For mlx4, this means we don't have a double rcu_read_lock() anymore. nfp can
> just make use of bpf_prog_run_xdp(), too. For qede, just move rcu_read_lock()
> out of the helper. When the driver gets atomic replace support, this will
> move to call-sites eventually.
> 
> mlx5 needs actual fixing as it has the same issue as described already in
> 326fe02d1ed6 ("net/mlx4_en: protect ring->xdp_prog with rcu_read_lock"),
> that is, we're under RCU bh at this time, BPF programs are released via
> call_rcu(), and call_rcu() != call_rcu_bh(), so we need to properly mark
> read side as programs can get xchg()'ed in mlx5e_xdp_set() without queue
> reset.
> 
> Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>

FWIW:

Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Thanks!

^ permalink raw reply

* Initial thoughts on TXDP
From: Tom Herbert @ 2016-11-30 22:54 UTC (permalink / raw)
  To: Linux Kernel Network Developers

Posting for discussion....

Now that XDP seems to be nicely gaining traction we can start to
consider the next logical step which is to apply the principles of XDP
to accelerating transport protocols in the kernel. For lack of a
better name I'll refer to this as Transport eXpress Data Path, or just
TXDP :-). Pulling off TXDP might not be the most trivial of problems
to solve, but if we can this may address the performance gap between
kernel bypass and the stack for transport layer protocols (XDP
addresses the performance gap for stateless packet processing). The
problem statement is analogous to that which we had for XDP, namely
can we create a mode in the kernel that offer the same performance
that is seen with L4 protocols over kernel bypass (e.g. TCP/OpenOnload
or TCP/DPDK) or perhaps something reasonably close to a full HW
offload solution (such as RDMA)?

TXDP is different from XDP in that we are dealing with stateful
protocols and is part of a full protocol implementation, specifically
this would be an accelerated mode of transport connections (e.g. TCP)
in the kernel. Also, unlike XDP we now need to be concerned with
transmit path (both application generating packets as well as protocol
sourced packets like ACKs, retransmits, clocking out data, etc.).
Another distinction is that the user API needs to be considered, for
instance optimizing the nominal protocol stack but then using an
unmodified socket interface could easily undo the effects of
optimizing the lower layers. This last point actually implies a nice
constraint, if we can't keep the accelerated path simple its probably
not worth trying to accelerate.

One simplifying assumption we might make is that TXDP is primarily for
optimizing latency, specifically request/response type operations
(think HPC, HFT, flash server, or other tightly coupled communications
within the datacenter). Notably, I don't think that saving CPU is as
relevant to TXDP, in fact we have already seen that CPU utilization
can be traded off for lower latency via spin polling. Similar to XDP
though, we might assume that single CPU performance is relevant (i.e.
on a cache server we'd like to spin as few CPUs as needed and no more
to handle the load an maintain throughput and latency requirements).
High throughput (ops/sec) and low variance should be side effects of
any design.

As with XDP, TXDP is _not_ intended to be a completely generic and
transparent solution. The application may be specifically optimized
for use with TXDP (for instance to implement perfect lockless
silo'ing). So TXDP is not going to be for everyone and it should be as
minimally invasive to the rest of the stack as possible.

I imagine there are a few reasons why userspace TCP stacks can get
good performance:

- Spin polling (we already can do this in kernel)
- Lockless, I would assume that threads typically have exclusive
access to a queue pair for a connection
- Minimal TCP/IP stack code
- Zero copy TX/RX
- Light weight structures for queuing
- No context switches
- Fast data path for in order, uncongested flows
- Silo'ing between application and device queues

Not all of these have cognates in the Linux stack, for instance we
probably can't entirely eliminate context switches for a userspace
application.

So with that the components of TXDP might look something like:

RX

  - Call into TCP/IP stack with page data directly from driver-- no
skbuff allocation or interface. This is essentially provided by the
XDP API although we would need to generalize the interface to call
stack functions (I previously posted patches for that). We will also
need a new action, XDP_HELD?, that indicates the XDP function held the
packet (put on a socket for instance).
  - Perform connection lookup. If we assume lockless model as
described below then we should be able to perform lockless connection
lookup similar to work Eric did to optimize UDP lookups for tunnel
processing
  - Call function that implements expedited TCP/IP datapath (something
like Van Jacobson's famous 80 instructions? :-) ).
  - If there is anything funky about the packet, connection state, or
TCP connection is not being TXDP accelerated just return XDP_PASS so
that packet follows normal stack processing. Since we did connection
lookup we could return an early demux also.Since we're already in an
exception mode this is where we might want to move packet processing
to different CPU (can be done by RPS/RFS)..
  - If packet contains new data we can allocate a "mini skbfuf (talked
about that at netdev) for queuing on socket.
  - If packet is an ACK we can process it directly without ever creating skbuff
  - There is also the possibility of avoiding the skbuff allocation
for in-kernel applications. Stream parser might also be taught how to
deal with raw buffers.
  - If we're really ambitious we can also consider putting packets
into a packet ring for user space presuming that packets are typically
in order (might be a little orthogonal to TXDP.

TX

  - Normal TX socket options apply however they might be lockless
under locking constraints below
  - skbuff is required for keeping data on socket a TCP (not
necessarily for UDP though), but we might be able to use a mini skbuff
for this
  - We'd need an interface to transmit a packet in a page buffer
without an skbuff.
  - When we transmit, it would be nice to go straight from TCP
connection to an XDP device queue and in particular skip the qdisc
layer. This follows the principle of low latency being first criteria.
Effective use of qdiscs, especially non work conserving ones, imply
longer latencies in effect which likely means TXDP isn't appropriate
in such a cases. BQL is also out, however we would want the TX
batching of XDP.
  - If we really need priority queing we should have the option of
using device mulitple queue
  - Pure ACKs would not require an skb also
  - Zero copy TX (splice) can be used where needed. This might be
particular useful in a flash or remote memory server
  - TX completion would most like be happening on same CPU also

Miscellaneous

  - Under the simplicity principle we really only want the TXDP to
contain the minimal necessary path. What is "minimal" is a very
relevant question. If we constrain the use case to be communications
within a single rack such that we can engineer for basically no loss
and no congestion (pause might be reasonable here) then the TXDP data
path might just implement that. Mostly this would just be the
established data path that is accelerated, handling other states might
be done in existing path (which becomes slow path in TXDP).
  - To make transport sockets to have a lockless mode I am
contemplating that connections/sockets can be bound to particularly
CPUs and that any operations (socket operations, timers, receive
processing) must occur on that CPU. The CPU would be the one where RX
happens. Note this implies perfect silo'ing, everything for driver RX
to application processing happens inline on the CPU. The stack would
not cross CPUs for a connection while in this mode.
  - We might be able to take advantage of per connection queues or
ntuple filter to isolate flows to to certain queues and hence certain
CPUs. I don't think this can be a requirement though, TXDP should be
able to work well with generic MQ devices. Specialized devices queues
are an optimization, without having those those we'd want to push to
other CPUs as quickly as possible (RPS but maybe we can get away with
a mini-skbuff here).

Thanks,
Tom

^ permalink raw reply

* [PATCH net] packet: fix race condition in packet_set_ring
From: Eric Dumazet @ 2016-11-30 22:55 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Philip Pettersson

From: Philip Pettersson <philip.pettersson@gmail.com>

When packet_set_ring creates a ring buffer it will initialize a
struct timer_list if the packet version is TPACKET_V3. This value
can then be raced by a different thread calling setsockopt to
set the version to TPACKET_V1 before packet_set_ring has finished.

This leads to a use-after-free on a function pointer in the
struct timer_list when the socket is closed as the previously
initialized timer will not be deleted.

The bug is fixed by taking lock_sock(sk) in packet_setsockopt when
changing the packet version while also taking the lock at the start
of packet_set_ring.

Fixes: f6fb8f100b80 ("af-packet: TPACKET_V3 flexible buffer implementation.")
Signed-off-by: Philip Pettersson <philip.pettersson@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/packet/af_packet.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d2238b204691b8e4f2e3acb9bc167b553ba32d50..dd2332390c45bbff7c3fc5d259453f2e1ca352bf 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3648,19 +3648,25 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 
 		if (optlen != sizeof(val))
 			return -EINVAL;
-		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
-			return -EBUSY;
 		if (copy_from_user(&val, optval, sizeof(val)))
 			return -EFAULT;
 		switch (val) {
 		case TPACKET_V1:
 		case TPACKET_V2:
 		case TPACKET_V3:
-			po->tp_version = val;
-			return 0;
+			break;
 		default:
 			return -EINVAL;
 		}
+		lock_sock(sk);
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
+			ret = -EBUSY;
+		} else {
+			po->tp_version = val;
+			ret = 0;
+		}
+		release_sock(sk);
+		return ret;
 	}
 	case PACKET_RESERVE:
 	{
@@ -4164,6 +4170,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	/* Added to avoid minimal code churn */
 	struct tpacket_req *req = &req_u->req;
 
+	lock_sock(sk);
 	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
 	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
 		net_warn_ratelimited("Tx-ring is not supported.\n");
@@ -4245,7 +4252,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 			goto out;
 	}
 
-	lock_sock(sk);
 
 	/* Detach socket from network */
 	spin_lock(&po->bind_lock);
@@ -4294,11 +4300,11 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		if (!tx_ring)
 			prb_shutdown_retire_blk_timer(po, rb_queue);
 	}
-	release_sock(sk);
 
 	if (pg_vec)
 		free_pg_vec(pg_vec, order, req->tp_block_nr);
 out:
+	release_sock(sk);
 	return err;
 }
 

^ permalink raw reply related

* Re: [WIP] net+mlx4: auto doorbell
From: Eric Dumazet @ 2016-11-30 22:40 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Rick Jones, netdev, Saeed Mahameed, Tariq Toukan, Achiad Shochat
In-Reply-To: <20161130233015.3de95356@redhat.com>

On Wed, 2016-11-30 at 23:30 +0100, Jesper Dangaard Brouer wrote:
> On Wed, 30 Nov 2016 11:30:00 -0800
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > On Wed, 2016-11-30 at 20:17 +0100, Jesper Dangaard Brouer wrote:
> > 
> > > Don't take is as critique Eric.  I was hoping your patch would have
> > > solved this issue of being sensitive to TX completion adjustments.  You
> > > usually have good solutions for difficult issues. I basically rejected
> > > Achiad's approach/patch because it was too sensitive to these kind of
> > > adjustments.  
> > 
> > Well, this patch can hurt latencies, because a doorbell can be delayed,
> > and softirqs can be delayed by many hundred of usec in some cases.
> > 
> > I would not enable this behavior by default.
> 
> What about another scheme, where dev_hard_start_xmit() can return an
> indication that driver choose not to flush (based on TX queue depth),
> and there by requesting stack to call flush at a later point.
> 
> Would that introduce less latency issues?


Again, how is it going to change anything when your netperf UDP sends
one packet at a time ?

qdisc gets one packet , dequeue it immediately.

No pressure. -> doorbell will be sent as before.

Some TX producers do not even use a qdisc to begin with.

Please think of a solution that does not involve qdisc layer at all.

^ permalink raw reply

* [PATCH net-next 0/6] net: dsa: mv88e6xxx: rework reset and PPU code
From: Vivien Didelot @ 2016-11-30 22:59 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot

Old Marvell chips (like 88E6060) don't have a PHY Polling Unit (PPU).

Next chips (like 88E6185) have a PPU, which has exclusive access to the
PHY registers, thus must be disabled before access.

Newer chips (like 88E6352) have an indirect mechanism to access the PHY
registers whenever, thus loose control over the PPU (alway enabled).

Here's a summary:

Model | PPU? | Has PPU ctrl?  | PPU state readable? | PHY access
----- | ---- | -------------- | ------------------- | ----------
 6060 | no   | no             | no                  | direct
 6185 | yes  | yes, PPUEn bit | yes, PPUState 2-bit | direct w/ PPU dis.
 6352 | yes  | no             | yes, PPUState 1-bit | indirect
 6390 | yes  | no             | yes, InitState bit  | indirect

Depending on the PPU control, a switch may have to restart the PPU when
resetting the switch. Once the switch is reset, we must wait for the PPU
state to be active polling again before accessing the registers.

For that purpose, add new operations to the chips to enable/disable the
PPU and check its polling state, and execute software reset. With these
new ops in place, rework the switch reset code and finally get rid of
the MV88E6XXX_FLAG_PPU* flags.

Vivien Didelot (6):
  net: dsa: mv88e6xxx: add helper to disable ports
  net: dsa: mv88e6xxx: add helper to hardware reset
  net: dsa: mv88e6xxx: add a software reset op
  net: dsa: mv88e6xxx: add a PPU polling op
  net: dsa: mv88e6xxx: add helper for switch ready
  net: dsa: mv88e6xxx: add PPU enable/disable ops

 drivers/net/dsa/mv88e6xxx/chip.c      | 241 ++++++++++++++++++++++------------
 drivers/net/dsa/mv88e6xxx/global1.c   | 111 ++++++++++++++++
 drivers/net/dsa/mv88e6xxx/global1.h   |  10 ++
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |  35 ++---
 4 files changed, 292 insertions(+), 105 deletions(-)

-- 
2.10.2

^ permalink raw reply

* [PATCH net-next 1/6] net: dsa: mv88e6xxx: add helper to disable ports
From: Vivien Didelot @ 2016-11-30 22:59 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot
In-Reply-To: <20161130225930.25510-1-vivien.didelot@savoirfairelinux.com>

Before resetting a switch, the ports should be set to the Disabled state
and the transmit queues should be drained.

Add an helper to explicit that.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx/chip.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index ce2f7ff..c89701e 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2361,6 +2361,26 @@ static void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
 	mutex_unlock(&chip->reg_lock);
 }
 
+static int mv88e6xxx_disable_ports(struct mv88e6xxx_chip *chip)
+{
+	int i, err;
+
+	/* Set all ports to the Disabled state */
+	for (i = 0; i < mv88e6xxx_num_ports(chip); i++) {
+		err = mv88e6xxx_port_set_state(chip, i,
+					       PORT_CONTROL_STATE_DISABLED);
+		if (err)
+			return err;
+	}
+
+	/* Wait for transmit queues to drain,
+	 * i.e. 2ms for a maximum frame to be transmitted at 10 Mbps.
+	 */
+	usleep_range(2000, 4000);
+
+	return 0;
+}
+
 static int mv88e6xxx_switch_reset(struct mv88e6xxx_chip *chip)
 {
 	bool ppu_active = mv88e6xxx_has(chip, MV88E6XXX_FLAG_PPU_ACTIVE);
@@ -2369,18 +2389,10 @@ static int mv88e6xxx_switch_reset(struct mv88e6xxx_chip *chip)
 	unsigned long timeout;
 	u16 reg;
 	int err;
-	int i;
 
-	/* Set all ports to the disabled state. */
-	for (i = 0; i < mv88e6xxx_num_ports(chip); i++) {
-		err = mv88e6xxx_port_set_state(chip, i,
-					       PORT_CONTROL_STATE_DISABLED);
-		if (err)
-			return err;
-	}
-
-	/* Wait for transmit queues to drain. */
-	usleep_range(2000, 4000);
+	err = mv88e6xxx_disable_ports(chip);
+	if (err)
+		return err;
 
 	/* If there is a gpio connected to the reset pin, toggle it */
 	if (gpiod) {
-- 
2.10.2

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox