Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 23/30] netfilter: nf_flow_table: cache mtu in struct flow_offload_tuple
From: Pablo Neira Ayuso @ 2018-03-12 17:59 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180312175920.9022-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Reduces the number of cache lines touched in the offload forwarding
path. This is safe because PMTU limits are bypassed for the forwarding
path (see commit f87c10a8aa1e for more details).

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h   |  2 ++
 net/ipv4/netfilter/nf_flow_table_ipv4.c | 17 +++--------------
 net/ipv6/netfilter/nf_flow_table_ipv6.c | 17 +++--------------
 net/netfilter/nf_flow_table.c           |  8 ++++++--
 4 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 09ba67598991..76ee5c81b752 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -55,6 +55,8 @@ struct flow_offload_tuple {
 
 	int				oifidx;
 
+	u16				mtu;
+
 	struct dst_entry		*dst_cache;
 };
 
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 25d2975da156..e17ef57b0df4 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -177,7 +177,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
 }
 
 /* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 {
 	if (skb->len <= mtu)
 		return false;
@@ -191,17 +191,6 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	return true;
 }
 
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
-{
-	u32 mtu;
-
-	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
-	if (__nf_flow_exceeds_mtu(skb, mtu))
-		return true;
-
-	return false;
-}
-
 unsigned int
 nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 			const struct nf_hook_state *state)
@@ -232,9 +221,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
 	dir = tuplehash->tuple.dir;
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
 	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
-	if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
 		return NF_ACCEPT;
 
 	if (skb_try_make_writable(skb, sizeof(*iph)))
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index d346705d6ee6..f530efd3e378 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -173,7 +173,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
 }
 
 /* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 {
 	if (skb->len <= mtu)
 		return false;
@@ -184,17 +184,6 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	return true;
 }
 
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt)
-{
-	u32 mtu;
-
-	mtu = ip6_dst_mtu_forward(&rt->dst);
-	if (__nf_flow_exceeds_mtu(skb, mtu))
-		return true;
-
-	return false;
-}
-
 unsigned int
 nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 			  const struct nf_hook_state *state)
@@ -225,9 +214,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 
 	dir = tuplehash->tuple.dir;
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
 	rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
-	if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
 		return NF_ACCEPT;
 
 	if (skb_try_make_writable(skb, sizeof(*ip6h)))
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index db0673a40b97..7403a0dfddf7 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -4,6 +4,8 @@
 #include <linux/netfilter.h>
 #include <linux/rhashtable.h>
 #include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_conntrack.h>
@@ -23,6 +25,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 {
 	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
 	struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+	struct dst_entry *dst = route->tuple[dir].dst;
 
 	ft->dir = dir;
 
@@ -30,10 +33,12 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 	case NFPROTO_IPV4:
 		ft->src_v4 = ctt->src.u3.in;
 		ft->dst_v4 = ctt->dst.u3.in;
+		ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
 		break;
 	case NFPROTO_IPV6:
 		ft->src_v6 = ctt->src.u3.in6;
 		ft->dst_v6 = ctt->dst.u3.in6;
+		ft->mtu = ip6_dst_mtu_forward(dst);
 		break;
 	}
 
@@ -44,8 +49,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 
 	ft->iifidx = route->tuple[dir].ifindex;
 	ft->oifidx = route->tuple[!dir].ifindex;
-
-	ft->dst_cache = route->tuple[dir].dst;
+	ft->dst_cache = dst;
 }
 
 struct flow_offload *
-- 
2.11.0

^ permalink raw reply related

* [PATCH 24/30] netfilter: nf_flow_table: rename nf_flow_table.c to nf_flow_table_core.c
From: Pablo Neira Ayuso @ 2018-03-12 17:59 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180312175920.9022-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Preparation for adding more code to the same module

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Makefile                                  | 2 ++
 net/netfilter/{nf_flow_table.c => nf_flow_table_core.c} | 0
 2 files changed, 2 insertions(+)
 rename net/netfilter/{nf_flow_table.c => nf_flow_table_core.c} (100%)

diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 5d9b8b959e58..138db16d59ed 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -112,6 +112,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
 
 # flow table infrastructure
 obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
+nf_flow_table-objs := nf_flow_table_core.o
+
 obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
 
 # generic X tables 
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table_core.c
similarity index 100%
rename from net/netfilter/nf_flow_table.c
rename to net/netfilter/nf_flow_table_core.c
-- 
2.11.0

^ permalink raw reply related

* [PATCH 25/30] netfilter: x_tables: fix build with CONFIG_COMPAT=n
From: Pablo Neira Ayuso @ 2018-03-12 17:59 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180312175920.9022-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

I placed the helpers within CONFIG_COMPAT section, move them
outside.

Fixes: 472ebdcd15ebdb ("netfilter: x_tables: check error target size too")
Fixes: 07a9da51b4b6ae ("netfilter: x_tables: check standard verdicts in core")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 62 ++++++++++++++++++++++++------------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 7521e8a72c06..bac932f1c582 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -577,6 +577,37 @@ int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_ho
 }
 EXPORT_SYMBOL(xt_check_table_hooks);
 
+static bool verdict_ok(int verdict)
+{
+	if (verdict > 0)
+		return true;
+
+	if (verdict < 0) {
+		int v = -verdict - 1;
+
+		if (verdict == XT_RETURN)
+			return true;
+
+		switch (v) {
+		case NF_ACCEPT: return true;
+		case NF_DROP: return true;
+		case NF_QUEUE: return true;
+		default:
+			break;
+		}
+
+		return false;
+	}
+
+	return false;
+}
+
+static bool error_tg_ok(unsigned int usersize, unsigned int kernsize,
+			const char *msg, unsigned int msglen)
+{
+	return usersize == kernsize && strnlen(msg, msglen) < msglen;
+}
+
 #ifdef CONFIG_COMPAT
 int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
 {
@@ -736,37 +767,6 @@ struct compat_xt_error_target {
 	char errorname[XT_FUNCTION_MAXNAMELEN];
 };
 
-static bool verdict_ok(int verdict)
-{
-	if (verdict > 0)
-		return true;
-
-	if (verdict < 0) {
-		int v = -verdict - 1;
-
-		if (verdict == XT_RETURN)
-			return true;
-
-		switch (v) {
-		case NF_ACCEPT: return true;
-		case NF_DROP: return true;
-		case NF_QUEUE: return true;
-		default:
-			break;
-		}
-
-		return false;
-	}
-
-	return false;
-}
-
-static bool error_tg_ok(unsigned int usersize, unsigned int kernsize,
-			const char *msg, unsigned int msglen)
-{
-	return usersize == kernsize && strnlen(msg, msglen) < msglen;
-}
-
 int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset)
-- 
2.11.0

^ permalink raw reply related

* [PATCH 26/30] ipvs: use true and false for boolean values
From: Pablo Neira Ayuso @ 2018-03-12 17:59 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180312175920.9022-1-pablo@netfilter.org>

From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>

Assign true or false to boolean variables instead of an integer value.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_lblc.c  | 4 ++--
 net/netfilter/ipvs/ip_vs_lblcr.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 6a340c94c4b8..942e835caf7f 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -238,7 +238,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc)
 	int i;
 
 	spin_lock_bh(&svc->sched_lock);
-	tbl->dead = 1;
+	tbl->dead = true;
 	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
 			ip_vs_lblc_del(en);
@@ -369,7 +369,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
 	tbl->rover = 0;
 	tbl->counter = 1;
-	tbl->dead = 0;
+	tbl->dead = false;
 	tbl->svc = svc;
 
 	/*
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 0627881128da..a5acab25c36b 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -404,7 +404,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
 	struct hlist_node *next;
 
 	spin_lock_bh(&svc->sched_lock);
-	tbl->dead = 1;
+	tbl->dead = true;
 	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
 			ip_vs_lblcr_free(en);
@@ -532,7 +532,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
 	tbl->rover = 0;
 	tbl->counter = 1;
-	tbl->dead = 0;
+	tbl->dead = false;
 	tbl->svc = svc;
 
 	/*
-- 
2.11.0

^ permalink raw reply related

* [PATCH 27/30] netfilter: nf_tables: handle rt0 and rt2 properly
From: Pablo Neira Ayuso @ 2018-03-12 17:59 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180312175920.9022-1-pablo@netfilter.org>

From: Ahmed Abdelsalam <amsalam20@gmail.com>

This fixes Netfilter's bugzilla #1219.

Type 0 and 2 of the IPv6 Routing extension header are not handled
properly by exthdr_init_raw() in src/exthdr.c.

In order to fix the bug, we extended the "enum nft_exthdr_op" to
differentiate between rt, rt0, and rt2.

In this patch we extended the kernel implementation of nf_tables to
recognize the new options

Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 3 +++
 net/netfilter/nft_exthdr.c               | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 66dceee0ae30..bb2135c8ad73 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -731,6 +731,9 @@ enum nft_exthdr_flags {
 enum nft_exthdr_op {
 	NFT_EXTHDR_OP_IPV6,
 	NFT_EXTHDR_OP_TCPOPT,
+	NFT_EXTHDR_OP_RT0,
+	NFT_EXTHDR_OP_RT2,
+	NFT_EXTHDR_OP_RT4,
 	__NFT_EXTHDR_OP_MAX
 };
 #define NFT_EXTHDR_OP_MAX	(__NFT_EXTHDR_OP_MAX - 1)
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 47ec1046ad11..bbc1be2b3b73 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -399,6 +399,9 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
 			return &nft_exthdr_tcp_ops;
 		break;
 	case NFT_EXTHDR_OP_IPV6:
+	case NFT_EXTHDR_OP_RT0:
+	case NFT_EXTHDR_OP_RT2:
+	case NFT_EXTHDR_OP_RT4:
 		if (tb[NFTA_EXTHDR_DREG])
 			return &nft_exthdr_ipv6_ops;
 		break;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 28/30] netfilter: Refactor nf_conncount
From: Pablo Neira Ayuso @ 2018-03-12 17:59 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180312175920.9022-1-pablo@netfilter.org>

From: Yi-Hung Wei <yihung.wei@gmail.com>

Remove parameter 'family' in nf_conncount_count() and count_tree().
It is because the parameter is not useful after commit 625c556118f3
("netfilter: connlimit: split xt_connlimit into front and backend").

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_count.h | 1 -
 net/netfilter/nf_conncount.c               | 4 +---
 net/netfilter/xt_connlimit.c               | 4 ++--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index adf8db44cf86..e61184fbfb71 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -11,7 +11,6 @@ void nf_conncount_destroy(struct net *net, unsigned int family,
 unsigned int nf_conncount_count(struct net *net,
 				struct nf_conncount_data *data,
 				const u32 *key,
-				unsigned int family,
 				const struct nf_conntrack_tuple *tuple,
 				const struct nf_conntrack_zone *zone);
 #endif
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 6d65389e308f..9305a08b4422 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -158,7 +158,6 @@ static void tree_nodes_free(struct rb_root *root,
 static unsigned int
 count_tree(struct net *net, struct rb_root *root,
 	   const u32 *key, u8 keylen,
-	   u8 family,
 	   const struct nf_conntrack_tuple *tuple,
 	   const struct nf_conntrack_zone *zone)
 {
@@ -246,7 +245,6 @@ count_tree(struct net *net, struct rb_root *root,
 unsigned int nf_conncount_count(struct net *net,
 				struct nf_conncount_data *data,
 				const u32 *key,
-				unsigned int family,
 				const struct nf_conntrack_tuple *tuple,
 				const struct nf_conntrack_zone *zone)
 {
@@ -259,7 +257,7 @@ unsigned int nf_conncount_count(struct net *net,
 
 	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
 
-	count = count_tree(net, root, key, data->keylen, family, tuple, zone);
+	count = count_tree(net, root, key, data->keylen, tuple, zone);
 
 	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
 
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index b1b17b9353e1..6275106ccf50 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -67,8 +67,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		key[1] = zone->id;
 	}
 
-	connections = nf_conncount_count(net, info->data, key,
-					 xt_family(par), tuple_ptr, zone);
+	connections = nf_conncount_count(net, info->data, key, tuple_ptr,
+					 zone);
 	if (connections == 0)
 		/* kmalloc failed, drop it entirely */
 		goto hotdrop;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 29/30] netfilter: conncount: Support count only use case
From: Pablo Neira Ayuso @ 2018-03-12 17:59 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180312175920.9022-1-pablo@netfilter.org>

From: Yi-Hung Wei <yihung.wei@gmail.com>

Currently, nf_conncount_count() counts the number of connections that
match key and inserts a conntrack 'tuple' with the same key into the
accounting data structure.  This patch supports another use case that only
counts the number of connections where 'tuple' is not provided.  Therefore,
proper changes are made on nf_conncount_count() to support the case where
'tuple' is NULL.  This could be useful for querying statistics or
debugging purposes.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conncount.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 9305a08b4422..153e690e2893 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -104,7 +104,7 @@ static unsigned int check_hlist(struct net *net,
 	struct nf_conn *found_ct;
 	unsigned int length = 0;
 
-	*addit = true;
+	*addit = tuple ? true : false;
 
 	/* check the saved connections */
 	hlist_for_each_entry_safe(conn, n, head, node) {
@@ -117,7 +117,7 @@ static unsigned int check_hlist(struct net *net,
 
 		found_ct = nf_ct_tuplehash_to_ctrack(found);
 
-		if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
+		if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple)) {
 			/*
 			 * Just to be sure we have it only once in the list.
 			 * We should not see tuples twice unless someone hooks
@@ -220,6 +220,9 @@ count_tree(struct net *net, struct rb_root *root,
 		goto restart;
 	}
 
+	if (!tuple)
+		return 0;
+
 	/* no match, need to insert new node */
 	rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
 	if (rbconn == NULL)
@@ -242,6 +245,9 @@ count_tree(struct net *net, struct rb_root *root,
 	return 1;
 }
 
+/* Count and return number of conntrack entries in 'net' with particular 'key'.
+ * If 'tuple' is not null, insert it into the accounting data structure.
+ */
 unsigned int nf_conncount_count(struct net *net,
 				struct nf_conncount_data *data,
 				const u32 *key,
-- 
2.11.0

^ permalink raw reply related

* [PATCH 30/30] netfilter: nft_ct: add NFT_CT_{SRC,DST}_{IP,IP6}
From: Pablo Neira Ayuso @ 2018-03-12 17:59 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180312175920.9022-1-pablo@netfilter.org>

All existing keys, except the NFT_CT_SRC and NFT_CT_DST are assumed to
have strict datatypes. This is causing problems with sets and
concatenations given the specific length of these keys is not known.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Florian Westphal <fw@strlen.de>
---
 include/uapi/linux/netfilter/nf_tables.h | 12 ++++++++--
 net/netfilter/nft_ct.c                   | 38 ++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index bb2135c8ad73..09f4eb1928f0 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -912,8 +912,8 @@ enum nft_rt_attributes {
  * @NFT_CT_EXPIRATION: relative conntrack expiration time in ms
  * @NFT_CT_HELPER: connection tracking helper assigned to conntrack
  * @NFT_CT_L3PROTOCOL: conntrack layer 3 protocol
- * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address)
- * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address)
+ * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address, deprecated)
+ * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address, deprecated)
  * @NFT_CT_PROTOCOL: conntrack layer 4 protocol
  * @NFT_CT_PROTO_SRC: conntrack layer 4 protocol source
  * @NFT_CT_PROTO_DST: conntrack layer 4 protocol destination
@@ -923,6 +923,10 @@ enum nft_rt_attributes {
  * @NFT_CT_AVGPKT: conntrack average bytes per packet
  * @NFT_CT_ZONE: conntrack zone
  * @NFT_CT_EVENTMASK: ctnetlink events to be generated for this conntrack
+ * @NFT_CT_SRC_IP: conntrack layer 3 protocol source (IPv4 address)
+ * @NFT_CT_DST_IP: conntrack layer 3 protocol destination (IPv4 address)
+ * @NFT_CT_SRC_IP6: conntrack layer 3 protocol source (IPv6 address)
+ * @NFT_CT_DST_IP6: conntrack layer 3 protocol destination (IPv6 address)
  */
 enum nft_ct_keys {
 	NFT_CT_STATE,
@@ -944,6 +948,10 @@ enum nft_ct_keys {
 	NFT_CT_AVGPKT,
 	NFT_CT_ZONE,
 	NFT_CT_EVENTMASK,
+	NFT_CT_SRC_IP,
+	NFT_CT_DST_IP,
+	NFT_CT_SRC_IP6,
+	NFT_CT_DST_IP6,
 };
 
 /**
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 6ab274b14484..ea737fd789e8 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -196,6 +196,26 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
 	case NFT_CT_PROTO_DST:
 		nft_reg_store16(dest, (__force u16)tuple->dst.u.all);
 		return;
+	case NFT_CT_SRC_IP:
+		if (nf_ct_l3num(ct) != NFPROTO_IPV4)
+			goto err;
+		*dest = tuple->src.u3.ip;
+		return;
+	case NFT_CT_DST_IP:
+		if (nf_ct_l3num(ct) != NFPROTO_IPV4)
+			goto err;
+		*dest = tuple->dst.u3.ip;
+		return;
+	case NFT_CT_SRC_IP6:
+		if (nf_ct_l3num(ct) != NFPROTO_IPV6)
+			goto err;
+		memcpy(dest, tuple->src.u3.ip6, sizeof(struct in6_addr));
+		return;
+	case NFT_CT_DST_IP6:
+		if (nf_ct_l3num(ct) != NFPROTO_IPV6)
+			goto err;
+		memcpy(dest, tuple->dst.u3.ip6, sizeof(struct in6_addr));
+		return;
 	default:
 		break;
 	}
@@ -419,6 +439,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 			return -EAFNOSUPPORT;
 		}
 		break;
+	case NFT_CT_SRC_IP:
+	case NFT_CT_DST_IP:
+		if (tb[NFTA_CT_DIRECTION] == NULL)
+			return -EINVAL;
+
+		len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip);
+		break;
+	case NFT_CT_SRC_IP6:
+	case NFT_CT_DST_IP6:
+		if (tb[NFTA_CT_DIRECTION] == NULL)
+			return -EINVAL;
+
+		len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip6);
+		break;
 	case NFT_CT_PROTO_SRC:
 	case NFT_CT_PROTO_DST:
 		if (tb[NFTA_CT_DIRECTION] == NULL)
@@ -588,6 +622,10 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	switch (priv->key) {
 	case NFT_CT_SRC:
 	case NFT_CT_DST:
+	case NFT_CT_SRC_IP:
+	case NFT_CT_DST_IP:
+	case NFT_CT_SRC_IP6:
+	case NFT_CT_DST_IP6:
 	case NFT_CT_PROTO_SRC:
 	case NFT_CT_PROTO_DST:
 		if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH v3 net-next 3/4] net: macb: Add phy-handle DT support
From: Andrew Lunn @ 2018-03-12 17:59 UTC (permalink / raw)
  To: Brad Mouring
  Cc: Nicolas Ferre, Rob Herring, David S . Miller, Michael Grzeschik,
	Mark Rutland, netdev, Julia Cartwright, devicetree
In-Reply-To: <20180312171001.129209-4-brad.mouring@ni.com>

> @@ -488,6 +488,9 @@ static int macb_mii_probe(struct net_device *dev)
>  			}
>  			bp->phy_node = of_node_get(np);
>  		} else {
> +			/* attempt to find a phy-handle */
> +			bp->phy_node = of_parse_phandle(np, "phy-handle", 0);
> +

Hi Brad

It seems like you should check the return value here, and only
continue with the fallback if there was not a phy-handle.

>  			/* fallback to standard phy registration if no phy were
>  			 * found during dt phy registration
>  			 */

	 Andrew

^ permalink raw reply

* Re: [pci PATCH v5 1/4] pci: Add pci_sriov_configure_simple for PFs that don't manage VF resources
From: Alexander Duyck @ 2018-03-12 18:09 UTC (permalink / raw)
  To: Keith Busch
  Cc: Bjorn Helgaas, Duyck, Alexander H, linux-pci, virtio-dev, kvm,
	Netdev, Daly, Dan, LKML, linux-nvme, netanel, Maximilian Heyne,
	Wang, Liang-min, Rustad, Mark D, David Woodhouse,
	Christoph Hellwig, dwmw
In-Reply-To: <20180312174012.GE18494@localhost.localdomain>

On Mon, Mar 12, 2018 at 10:40 AM, Keith Busch <keith.busch@intel.com> wrote:
> On Mon, Mar 12, 2018 at 10:21:29AM -0700, Alexander Duyck wrote:
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index 024a1beda008..9cab9d0d51dc 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -1953,6 +1953,7 @@ static inline void pci_mmcfg_late_init(void) { }
>>  int pci_vfs_assigned(struct pci_dev *dev);
>>  int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
>>  int pci_sriov_get_totalvfs(struct pci_dev *dev);
>> +int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
>>  resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
>>  void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
>>  #else
>
> I recommend stubbing 'pci_sriov_configure_simple' or defining it to
> NULL in the '#else' section here so you don't need to repeat the "#ifdef
> CONFIG_PCI_IOV" in each driver wishing to use this function. Otherwise
> looks fine to me.

My concern with defining it as NULL is that somebody may end up
calling it in the future directly and that may end up causing issues.
One thought I have been debating is moving it to a different file. I
am just not sure where the best place to put something like this would
be. I could move this function to drivers/pci/pci.c if everyone is
okay with it and then I could just strip the contents out by wrapping
them in a #ifdef instead.

^ permalink raw reply

* Re: [PATCH] ipv6: Use ip6_multipath_hash_policy() in rt6_multipath_hash().
From: David Ahern @ 2018-03-12 18:11 UTC (permalink / raw)
  To: David Miller, netdev
In-Reply-To: <20180312.111017.1189396536892856155.davem@davemloft.net>

On 3/12/18 8:10 AM, David Miller wrote:
> 
> Make use of the new helper.
> 
> Suggested-by: David Ahern <dsahern@gmail.com>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  net/ipv6/route.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index f0ae58424c45..81711e3e2604 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -1846,7 +1846,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
>  	struct flow_keys hash_keys;
>  	u32 mhash;
>  
> -	switch (net->ipv6.sysctl.multipath_hash_policy) {
> +	switch (ip6_multipath_hash_policy(net)) {
>  	case 0:
>  		memset(&hash_keys, 0, sizeof(hash_keys));
>  		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
> 

Thanks for taking care of that!

^ permalink raw reply

* [PATCH v2] sctp: Fix double free in sctp_sendmsg_to_asoc
From: Neil Horman @ 2018-03-12 18:15 UTC (permalink / raw)
  To: linux-sctp; +Cc: netdev, Neil Horman, davem, Xin Long
In-Reply-To: <00000000000017dfe905670110cf@google.com>

syzbot/kasan detected a double free in sctp_sendmsg_to_asoc:
BUG: KASAN: use-after-free in sctp_association_free+0x7b7/0x930
net/sctp/associola.c:332
Read of size 8 at addr ffff8801d8006ae0 by task syzkaller914861/4202

CPU: 1 PID: 4202 Comm: syzkaller914861 Not tainted 4.16.0-rc4+ #258
Hardware name: Google Google Compute Engine/Google Compute Engine
01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x194/0x24d lib/dump_stack.c:53
 print_address_description+0x73/0x250 mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report+0x23c/0x360 mm/kasan/report.c:412
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
 sctp_association_free+0x7b7/0x930 net/sctp/associola.c:332
 sctp_sendmsg+0xc67/0x1a80 net/sctp/socket.c:2075
 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:639
 SYSC_sendto+0x361/0x5c0 net/socket.c:1748
 SyS_sendto+0x40/0x50 net/socket.c:1716
 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x42/0xb7

This was introduced by commit:
f84af33 sctp: factor out sctp_sendmsg_to_asoc from sctp_sendmsg

As the newly refactored function moved the wait_for_sndbuf call to a
point after the association was connected, allowing for peeloff events
to occur, which in turn caused wait_for_sndbuf to return -EPIPE which
was not caught by the logic that determines if an association should be
freed or not.

Fix it the easy way by returning the ordering of
sctp_primitive_ASSOCIATE and sctp_wait_for_sndbuf to the old order, to
ensure that EPIPE will not happen.

Tested by myself using the syzbot reproducers with positive results

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: davem@davemloft.net
CC: Xin Long <lucien.xin@gmail.com>
Reported-by: syzbot+a4e4112c3aff00c8cfd8@syzkaller.appspotmail.com

---
Change notes
v2)
 * Moved additional calls to restore original ordering
 * add sctp prefix
---
 net/sctp/socket.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 7d3476a4860d..4bbfcf9532c2 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1876,6 +1876,19 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 		goto err;
 	}
 
+	if (asoc->pmtu_pending)
+		sctp_assoc_pending_pmtu(asoc);
+
+	if (sctp_wspace(asoc) < msg_len)
+		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
+
+	if (!sctp_wspace(asoc)) {
+		timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+		err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
+		if (err)
+			goto err;
+	}
+
 	if (sctp_state(asoc, CLOSED)) {
 		err = sctp_primitive_ASSOCIATE(net, asoc, NULL);
 		if (err)
@@ -1893,19 +1906,6 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 		pr_debug("%s: we associated primitively\n", __func__);
 	}
 
-	if (asoc->pmtu_pending)
-		sctp_assoc_pending_pmtu(asoc);
-
-	if (sctp_wspace(asoc) < msg_len)
-		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
-
-	if (!sctp_wspace(asoc)) {
-		timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
-		err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
-		if (err)
-			goto err;
-	}
-
 	datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter);
 	if (IS_ERR(datamsg)) {
 		err = PTR_ERR(datamsg);
-- 
2.14.3

^ permalink raw reply related

* Re: [pci PATCH v5 1/4] pci: Add pci_sriov_configure_simple for PFs that don't manage VF resources
From: Keith Busch @ 2018-03-12 18:23 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Bjorn Helgaas, Duyck, Alexander H, linux-pci, virtio-dev, kvm,
	Netdev, Daly, Dan, LKML, linux-nvme, netanel, Maximilian Heyne,
	Wang, Liang-min, Rustad, Mark D, David Woodhouse,
	Christoph Hellwig, dwmw
In-Reply-To: <CAKgT0UdWc=Sz-xgd4H3bTDt5d17=-PY8VPWz6A3mAciWV6=97w@mail.gmail.com>

On Mon, Mar 12, 2018 at 11:09:34AM -0700, Alexander Duyck wrote:
> On Mon, Mar 12, 2018 at 10:40 AM, Keith Busch <keith.busch@intel.com> wrote:
> > On Mon, Mar 12, 2018 at 10:21:29AM -0700, Alexander Duyck wrote:
> >> diff --git a/include/linux/pci.h b/include/linux/pci.h
> >> index 024a1beda008..9cab9d0d51dc 100644
> >> --- a/include/linux/pci.h
> >> +++ b/include/linux/pci.h
> >> @@ -1953,6 +1953,7 @@ static inline void pci_mmcfg_late_init(void) { }
> >>  int pci_vfs_assigned(struct pci_dev *dev);
> >>  int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
> >>  int pci_sriov_get_totalvfs(struct pci_dev *dev);
> >> +int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
> >>  resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
> >>  void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
> >>  #else
> >
> > I recommend stubbing 'pci_sriov_configure_simple' or defining it to
> > NULL in the '#else' section here so you don't need to repeat the "#ifdef
> > CONFIG_PCI_IOV" in each driver wishing to use this function. Otherwise
> > looks fine to me.
> 
> My concern with defining it as NULL is that somebody may end up
> calling it in the future directly and that may end up causing issues.
> One thought I have been debating is moving it to a different file. I
> am just not sure where the best place to put something like this would
> be. I could move this function to drivers/pci/pci.c if everyone is
> okay with it and then I could just strip the contents out by wrapping
> them in a #ifdef instead.

Okay, instead of NULL, a stub implementation in the header file may
suffice when CONFIG_PCI_IOV is not defined:

static inline int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn)
{
	return -ENOSYS;
}

See pci_iov_virtfn_bus, pci_iov_virtfn_devfn, pci_iov_add_virtfn, or
pci_enable_sriov for other examples.

^ permalink raw reply

* RE: [PATCH v2 iproute2-next 0/6] cm_id, cq, mr, and pd resource tracking
From: Steve Wise @ 2018-03-12 18:43 UTC (permalink / raw)
  To: 'David Ahern', stephen; +Cc: leon, netdev, linux-rdma
In-Reply-To: <87cc06e1-0e54-12f7-a9b0-326e568c3d6f@gmail.com>

> 
> On 3/12/18 8:16 AM, Steve Wise wrote:
> > Hey all,
> >
> > The kernel side of this series has been merged for rdma-next [1].  Let me
> > know if this iproute2 series can be merged, or if it needs more changes.
> >
> 
> The problem is that iproute2 headers are synced to kernel headers from
> DaveM's tree (net-next mainly). I take it this series will not appear in
> Dave's tree until after a merge through Linus' tree. Correct?

Yes, I think so.

Steve.

^ permalink raw reply

* Re: [PATCH net-next 1/5] bridge: initialize port flags with switchdev defaults
From: Igor Mitsyanko @ 2018-03-12 18:44 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: ivecera, jiri, netdev, bridge, sergey.matyukevich.os, ashevchenko,
	smaksimenko, dlebed
In-Reply-To: <20180310163050.GF29174@lunn.ch>

On 03/10/2018 08:30 AM, Andrew Lunn wrote:
>> +
>> +     ret = switchdev_port_attr_get(dev, &attr);
>> +     if (ret)
>> +             return BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
> 
> 
> Hi Igor
> 
> Please check if ret == -EOPNOTSUPP and only then use the defaults. A
> real error should be propagated, causing new_nbp to fail. You might
> also consider what to do when ENODATA is returned.
> 
>       Andrew
> 

Hi Andrew,
ok, will change it so an error is propagated.
There is one more comment from Stephen suggesting that flags must be set
in switchdev, rather than queried: this approach should take care of
ENODATA, I assume.

^ permalink raw reply

* de-indirect TCP congestion control
From: Stephen Hemminger @ 2018-03-12 18:45 UTC (permalink / raw)
  To: Eric Dumazet, David Miller; +Cc: netdev

Since indirect calls are expensive, and now even more so, perhaps we should figure out
a way to make the default TCP congestion control hooks into direct calls.
99% of the users just use the single CC module compiled into the kernel.

Use some macros (or other stuff) to change indirect call to direct
calls. So that code like:

static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);

to:

static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	const struct tcp_congestion_ops *ops = inet_csk(sk)->icsk_ca_ops;

	if (ops == default_tcp_ops)
		default_tcp_cong_avoid(s, ack, acked);
	else
		icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);

Then use macros and/or compiler symbols so the builtin (non-module) congestion
control is always default_tcp_XXX.

^ permalink raw reply

* Re: Re: [PATCH v4 2/2] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-03-12 18:47 UTC (permalink / raw)
  To: Michael S. Tsirkin, Stephen Hemminger
  Cc: Alexander Duyck, Jiri Pirko, David Miller, Netdev, virtio-dev,
	Brandeburg, Jesse, Duyck, Alexander H, Jakub Kicinski
In-Reply-To: <20180307221001-mutt-send-email-mst@kernel.org>

On 3/7/2018 12:11 PM, Michael S. Tsirkin wrote:
> On Wed, Mar 07, 2018 at 10:06:30AM -0800, Stephen Hemminger wrote:
>> On Wed, 7 Mar 2018 09:50:50 -0800
>> Alexander Duyck <alexander.duyck@gmail.com> wrote:
>>
>>> On Tue, Mar 6, 2018 at 6:38 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>> On Tue, Mar 06, 2018 at 03:27:46PM -0800, Alexander Duyck wrote:
>>>>>> I definitely vote for a separate common shared code for both netvsc and
>>>>>> virtio_net - even if you use 2 and 3 netdev model, you could share the
>>>>>> common code. Strict checks and limitation should be in place.
>>>>> Noted. But as I also mentioned there isn't that much "common" code
>>>>> between the two models. I think if anything we could probably look at
>>>>> peeling out a few bits such as "get_<iface>_bymac" which really would
>>>>> become dev_get_by_mac_and_ops in order to find the device for the
>>>>> notifiers. I probably wouldn't even put that in our driver and would
>>>>> instead put it in the core code since it almost makes more sense
>>>>> there. Beyond that sharing becomes much more challenging due to the
>>>>> differences in the Rx and Tx paths that build out of the difference
>>>>> between the 2 driver and 3 driver models.
>>>> At this point it might be worth it to articulate the advantages
>>>> of the 3 netdev model.
>>> The main advantages are performance on the lower devices. Specifically
>>> we can run LLTX and IFF_NOQUEUE on the upper device, meaning the VF
>>> doesn't take a performance hit when we start transmitting through it.
>>> In addition the paravirtual device is able to fully expose its
>>> features, so for example virtio_net can maintain in-driver XDP, and we
>>> can provide generic XDP on the upper device. We can also have an
>>> asymmetric configuration where the number of queues or feature sets
>>> can be different between the ports.
>>>
>>> Basically what it comes down to is in the 3 netdev model we never have
>>> to deal with the issues of going from being a standard netdev to being
>>> a master netdev. The role of each netdev is clearly defined.
>>>
>>> As far as doing a generic solution it is the preferred way to go as
>>> well since we don't have to rewrite functionality of the lower device.
>>> Currently the only changes really needed are to add a phys_port_name
>>> operation so that you can distinguish between the bypass interface and
>>> the original paravirtual one. As such you have no confusion about what
>>> you are running.
>>>
>>>> If they are compelling, why wouldn't netvsc users want them?
>>> I believe part of the logic for Stephen's choice is that netvsc
>>> doesn't have a "BACKUP" bit like what we have in virtio_net. As a result
>>> they have no way of knowing if a VF will ever show up or not. That
>>> makes the 3 netdev solution less appealing as they always end up in
>>> the bonding mode. In addition they have intentionally limited
>>> themselves to avoid features such as XDP that would cause issues with
>>> the 2 netdev model.
>>>
>>>> Alex, I think you were one of the strongest proponents of this model,
>>>> you should be well placed to provide a summary.
>>> Hopefully the information provided helps. In my mind the issue is that
>>> netvsc was not designed correctly, and as such it is using the 2
>>> netdev model as a bit of a crutch to handle the case where they wanted
>>> to add a VF bypass. At this point it has become a part of their
>>> architecture so it would be confusing for their user base to deal with
>>> having 2 netdevs spawn every time their driver sees a new device. In
>>> the case of virtio we only spawn 2 netdevs if the backup bit is set
>>> which implies a specific use case. When the bit isn't set we are only
>>> spawning one device, and as a result we can get much better
>>> performance out of the resultant combination of devices, and can
>>> expose functionality such as in-driver XDP in virtio.
>>
>> I have experimented with toggling IFF_NOQUEUE and/or LLTX on the netvsc
>> device when doing passthrough. It didn't help performance much and there
>> are corner cases that make it painful, like what if qdisc was placed
>> by user on the netvsc device?  Should the qdisc then be moved back
>> and forth to the VF device when it arrives or is removed?
>>
>> As far as XDP support, it is on the plan to support XDP on the netvsc
>> device since the receive buffers do have to be copied. But there has
>> been no customer demand for it; and the VF model on Azure has limitations
>> which make a transparent XDP model pretty much impossible.
>
> Jiri, does that answer your question? Even though there is some
> similarity, the needs of netvsc and virtio appear significantly
> different.
>
> We do not yet know whether other PV devices will be virtio-like
> or netvsc-like.
>
> Do you agree?

Michael,

At this point, can we agree to go with the 3 netdev model for virtio, and
that, since it is not compatible with netvsc's 2 netdev model, the scope for
any common code between virtio and netvsc is very limited?

Should I submit a v5 version of this patch series with some minor fixes?
Or can you pull in this series and I can submit patches on top of that?

Thanks
Sridhar

^ permalink raw reply

* Re: [PATCH net-next 1/1] tc-testing: updated gact tests with batch test cases
From: David Miller @ 2018-03-12 18:47 UTC (permalink / raw)
  To: mrv; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri
In-Reply-To: <85woyhjjuw.fsf@mojatatu.com>

From: Roman Mashak <mrv@mojatatu.com>
Date: Mon, 12 Mar 2018 13:15:35 -0400

> David Miller <davem@davemloft.net> writes:
> 
>> From: Roman Mashak <mrv@mojatatu.com>
>> Date: Mon, 12 Mar 2018 12:39:57 -0400
>>
>>> David Miller <davem@davemloft.net> writes:
>>> 
>>>> From: Roman Mashak <mrv@mojatatu.com>
>>>> Date: Fri,  9 Mar 2018 17:16:12 -0500
>>>>
>>>>> Add test cases to exercise code paths responsible for adding or deleting
>>>>> batch of TC actions.
>>>>> 
>>>>> Signed-off-by: Roman Mashak <mrv@mojatatu.com>
>>>>
>>>> You submitted this patch twice.
>>>>
>>>> You didn't indicate if this is a new version of the patch or something
>>>> like this.
>>>>
>>> 
>>> Sorry for that glitch, I've just resent the patch.
>>> 
>>>> Please resubmit your tc-testing changes, all of them, with this
>>>> sorted out.
>>
>> I said "all of your tc-testing changes" not just this one.
> 
> On March 9th I submitted two tc-testing patches:
> 
>  - "add TC vlan action tests"
>  - "updated gact tests with batch test cases" (which I resent)
> 
> Do you want to have those submitted as series?

I said resubmit all of them, which means both of these patches.
You only resubmitted one of them.

I really can't figure out how I can be more clear about my
request.

^ permalink raw reply

* [PATCH net-next] ibmvnic: Bail from ibmvnic_open if driver is already open
From: John Allen @ 2018-03-12 18:47 UTC (permalink / raw)
  To: netdev; +Cc: Thomas Falcon, Nathan Fontenot

If the driver is already in the "open" state, don't attempt the procedure
for opening the driver.

Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
---
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 7be4b06..98c4f75 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1057,6 +1057,9 @@ static int ibmvnic_open(struct net_device *netdev)

 	mutex_lock(&adapter->reset_lock);

+	if (adapter->state == VNIC_OPEN)
+		return 0;
+
 	if (adapter->state != VNIC_CLOSED) {
 		rc = ibmvnic_login(netdev);
 		if (rc) {

^ permalink raw reply related

* Re: [PATCH 00/30] Netfilter/IPVS updates for net-next
From: David Miller @ 2018-03-12 18:58 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <20180312175920.9022-1-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 12 Mar 2018 18:58:50 +0100

> The following patchset contains Netfilter/IPVS updates for your net-next
> tree. This batch comes with more input sanitization for xtables to
> address bug reports from fuzzers, preparation works to the flowtable
> infrastructure and assorted updates. In no particular order, they are:

Sorry, I've seen enough.  I'm not pulling this.

What is the story with this flow table stuff?  I tried to ask you
about this before, but the response I was given was extremely vague
and did not answer my question at all.

This is a lot of code, and a lot of infrastructure, yet I see
no device using the infrastructure to offload conntrack.

Nor can I see how this can possibly be even useful for such an
application.  What conntrack offload needs are things completely
outside of what the flow table stuff provides.  Mainly, they
require that the SKB is completely abstracted away from all of
the conntrack code paths, and that the conntrack infrastructure
operates on an abstract packet metadata concept.

If you are targeting one specific piece of hardware with TCAMs
that you are familiar with, I'd like you to stop right there.
Because if that is all that this infrastructure can actually
be used for, it is definitely designed wrong.

This, as has been the case in the past, is what is wrong with
netfilter approach to supporting offloading.  We see all of this
infrastructure before an actual working use case is provided for a
specific piece of hardware for a specific driver in the tree.

Nobody can evaluate whether the approach is good or not without
a clear driver change implementing support for it.

No other area of networking puts the cart before the horse like this.

I do not agree at all with the flow table infrastructure and I
therefore do not want to pull any more flow table changes into my tree
until there is an actual user of this stuff in that pull request which
actually works in a way which is useful for people.  It is completely
dead and useless code currently.

If you disagree you have to not just say it, but show it with a driver
that successfully and cleanly uses this code to offload conntrack.

Meanwhile, remove the flow table commits from this pull request out of
your tree and ask me to pull in the rest.

Thanks.

^ permalink raw reply

* Re: [PATCH ethtool 0/3] RSS ntuple filters
From: John W. Linville @ 2018-03-12 18:51 UTC (permalink / raw)
  To: Edward Cree; +Cc: netdev
In-Reply-To: <c061eab4-6069-017c-554f-210feb170520@solarflare.com>

On Fri, Mar 09, 2018 at 03:01:58PM +0000, Edward Cree wrote:
> This series adds support for the management of extra RSS contexts and RSS
>  spreading of ntuple/NFC filters introduced in kernel commit 84a1d9c48200
>  ("net: ethtool: extend RXNFC API to support RSS spreading of filter matches").
> 
> Edward Cree (3):
>   ethtool-copy.h: sync with net-next
>   ethtool: add support for extra RSS contexts and RSS steering filters
>   ethtool.8: Document RSS context control and RSS filters
> 
>  ethtool-copy.h |  32 ++++++++++---
>  ethtool.8.in   |  28 ++++++++++++
>  ethtool.c      | 141 +++++++++++++++++++++++++++++++++++++++++++++++++--------
>  internal.h     |   4 +-
>  rxclass.c      |  58 ++++++++++++++++++++----
>  5 files changed, 227 insertions(+), 36 deletions(-)
> 

LGTM -- queued for the next release...thanks!

John
-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply

* Re: [PATCH net-next] ibmvnic: Bail from ibmvnic_open if driver is already open
From: Andrew Lunn @ 2018-03-12 19:01 UTC (permalink / raw)
  To: John Allen; +Cc: netdev, Thomas Falcon, Nathan Fontenot
In-Reply-To: <f4857e9e-31d9-faa0-43e2-0ec0dfeae7ad@linux.vnet.ibm.com>

On Mon, Mar 12, 2018 at 01:47:51PM -0500, John Allen wrote:
> If the driver is already in the "open" state, don't attempt the procedure
> for opening the driver.
> 
> Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
> ---
> diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
> index 7be4b06..98c4f75 100644
> --- a/drivers/net/ethernet/ibm/ibmvnic.c
> +++ b/drivers/net/ethernet/ibm/ibmvnic.c
> @@ -1057,6 +1057,9 @@ static int ibmvnic_open(struct net_device *netdev)
> 
>  	mutex_lock(&adapter->reset_lock);
> 
> +	if (adapter->state == VNIC_OPEN)
> +		return 0;
> +

I've not looked at the actual code, but what about the lock it just
took and does not unlock before returning?

	Andrew

^ permalink raw reply

* Re: [PATCH net-next 1/5] bridge: initialize port flags with switchdev defaults
From: Igor Mitsyanko @ 2018-03-12 19:03 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: ivecera, jiri, netdev, bridge, sergey.matyukevich.os, ashevchenko,
	smaksimenko, dlebed
In-Reply-To: <20180310083234.47db5f06@xeon-e3>

On 03/10/2018 08:32 AM, Stephen Hemminger wrote:
> Yes hardware devices may come with different default values.
> But the user experience on Linux needs to be consistent.
> 
> Instead, it makes more sense to me for each device driver using switchdev
> to program to the values that it sees in the bridge.
> 
> Please revise this.
> 

Right, it does change default user view, setting flags instead of 
querying makes more sense then.

However there is a problem that switchdev may not support all flags that 
bridge code sets by default. For example, looking at 
spectrum_switchdev.c, it only advertises support for BR_LEARNING | 
BR_FLOOD | BR_MCAST_FLOOD.
Currently, I assume that even though some flags are shown as set by 
default on a new bridge port, some of them are not actually working in 
switchdev?

^ permalink raw reply

* Re: de-indirect TCP congestion control
From: David Miller @ 2018-03-12 19:04 UTC (permalink / raw)
  To: stephen; +Cc: eric.dumazet, netdev
In-Reply-To: <20180312114552.3f51e6ac@xeon-e3>

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 12 Mar 2018 11:45:52 -0700

> Since indirect calls are expensive, and now even more so, perhaps we should figure out
> a way to make the default TCP congestion control hooks into direct calls.
> 99% of the users just use the single CC module compiled into the kernel.

Who is this magic user with only one CC algorithm enabled in their
kernel?  I want to know who this dude is?

I don't think it's going to help much since people will have I think
at least two algorithms compiled into nearly everyone's tree.

Distributions will enable everything.

Google is going to have at least two algorithms enabled.

etc. etc. etc.

Getting rid of indirect calls is a fine goal, but the precondition you
are mentioning to achieve this doesn't seem practical at all.

^ permalink raw reply

* Re: [PATCH net-next] ibmvnic: Bail from ibmvnic_open if driver is already open
From: John Allen @ 2018-03-12 19:06 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev, Thomas Falcon, Nathan Fontenot
In-Reply-To: <20180312190148.GT27783@lunn.ch>

On 03/12/2018 02:01 PM, Andrew Lunn wrote:
> On Mon, Mar 12, 2018 at 01:47:51PM -0500, John Allen wrote:
>> If the driver is already in the "open" state, don't attempt the procedure
>> for opening the driver.
>>
>> Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
>> ---
>> diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
>> index 7be4b06..98c4f75 100644
>> --- a/drivers/net/ethernet/ibm/ibmvnic.c
>> +++ b/drivers/net/ethernet/ibm/ibmvnic.c
>> @@ -1057,6 +1057,9 @@ static int ibmvnic_open(struct net_device *netdev)
>>
>>  	mutex_lock(&adapter->reset_lock);
>>
>> +	if (adapter->state == VNIC_OPEN)
>> +		return 0;
>> +
> 
> I've not looked at the actual code, but what about the lock it just
> took and does not unlock before returning?
Ah yes, I missed that. I will update and send a new version.

> 
> 	Andrew
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox