Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v4 1/3] ipv4: support sport, dport and ip_proto in RTM_GETROUTE
From: Roopa Prabhu @ 2018-05-22 14:51 UTC (permalink / raw)
  To: davem; +Cc: netdev, nikolay, dsa, idosch
In-Reply-To: <1527000668-25253-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This is a followup to fib rules sport, dport and ipproto
match support. Only supports tcp, udp and icmp for ipproto.
Used by fib rule self tests.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 include/net/ip.h               |   3 +
 include/uapi/linux/rtnetlink.h |   3 +
 net/ipv4/Makefile              |   2 +-
 net/ipv4/fib_frontend.c        |   4 ++
 net/ipv4/netlink.c             |  23 +++++++
 net/ipv4/route.c               | 146 ++++++++++++++++++++++++++++++-----------
 6 files changed, 141 insertions(+), 40 deletions(-)
 create mode 100644 net/ipv4/netlink.c

diff --git a/include/net/ip.h b/include/net/ip.h
index bada1f1..0d2281b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -664,4 +664,7 @@ extern int sysctl_icmp_msgs_burst;
 int ip_misc_proc_init(void);
 #endif
 
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+				struct netlink_ext_ack *extack);
+
 #endif	/* _IP_H */
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 9b15005..cabb210 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -327,6 +327,9 @@ enum rtattr_type_t {
 	RTA_PAD,
 	RTA_UID,
 	RTA_TTL_PROPAGATE,
+	RTA_IP_PROTO,
+	RTA_SPORT,
+	RTA_DPORT,
 	__RTA_MAX
 };
 
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index b379520..13f2ba9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -14,7 +14,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
 	     inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
-	     metrics.o
+	     metrics.o netlink.o
 
 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4d622112..cf5cfc5 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -649,6 +649,10 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_ENCAP]		= { .type = NLA_NESTED },
 	[RTA_UID]		= { .type = NLA_U32 },
 	[RTA_MARK]		= { .type = NLA_U32 },
+	[RTA_TABLE]		= { .type = NLA_U32 },
+	[RTA_IP_PROTO]		= { .type = NLA_U8 },
+	[RTA_SPORT]		= { .type = NLA_U16 },
+	[RTA_DPORT]		= { .type = NLA_U16 },
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
new file mode 100644
index 0000000..f86bb4f
--- /dev/null
+++ b/net/ipv4/netlink.c
@@ -0,0 +1,23 @@
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/ip.h>
+
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+				struct netlink_ext_ack *extack)
+{
+	*ip_proto = nla_get_u8(attr);
+
+	switch (*ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_ICMP:
+		return 0;
+	default:
+		NL_SET_ERR_MSG(extack, "Unsupported ip proto");
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2cfa1b5..0e401dc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2574,11 +2574,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 /* called with rcu_read_lock held */
-static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
-			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
-			u32 seq)
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
+			struct sk_buff *skb, u32 portid, u32 seq)
 {
-	struct rtable *rt = skb_rtable(skb);
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	unsigned long expires = 0;
@@ -2674,7 +2673,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 			}
 		} else
 #endif
-			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+			if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
 				goto nla_put_failure;
 	}
 
@@ -2689,43 +2688,93 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 	return -EMSGSIZE;
 }
 
+static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
+						   u8 ip_proto, __be16 sport,
+						   __be16 dport)
+{
+	struct sk_buff *skb;
+	struct iphdr *iph;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	/* Reserve room for dummy headers, this skb can pass
+	 * through good chunk of routing engine.
+	 */
+	skb_reset_mac_header(skb);
+	skb_reset_network_header(skb);
+	skb->protocol = htons(ETH_P_IP);
+	iph = skb_put(skb, sizeof(struct iphdr));
+	iph->protocol = ip_proto;
+	iph->saddr = src;
+	iph->daddr = dst;
+	iph->version = 0x4;
+	iph->frag_off = 0;
+	iph->ihl = 0x5;
+	skb_set_transport_header(skb, skb->len);
+
+	switch (iph->protocol) {
+	case IPPROTO_UDP: {
+		struct udphdr *udph;
+
+		udph = skb_put_zero(skb, sizeof(struct udphdr));
+		udph->source = sport;
+		udph->dest = dport;
+		udph->len = sizeof(struct udphdr);
+		udph->check = 0;
+		break;
+	}
+	case IPPROTO_TCP: {
+		struct tcphdr *tcph;
+
+		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
+		tcph->source	= sport;
+		tcph->dest	= dport;
+		tcph->doff	= sizeof(struct tcphdr) / 4;
+		tcph->rst = 1;
+		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
+					    src, dst, 0);
+		break;
+	}
+	case IPPROTO_ICMP: {
+		struct icmphdr *icmph;
+
+		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
+		icmph->type = ICMP_ECHO;
+		icmph->code = 0;
+	}
+	}
+
+	return skb;
+}
+
 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(in_skb->sk);
-	struct rtmsg *rtm;
 	struct nlattr *tb[RTA_MAX+1];
+	u32 table_id = RT_TABLE_MAIN;
+	__be16 sport = 0, dport = 0;
 	struct fib_result res = {};
+	u8 ip_proto = IPPROTO_UDP;
 	struct rtable *rt = NULL;
+	struct sk_buff *skb;
+	struct rtmsg *rtm;
 	struct flowi4 fl4;
 	__be32 dst = 0;
 	__be32 src = 0;
+	kuid_t uid;
 	u32 iif;
 	int err;
 	int mark;
-	struct sk_buff *skb;
-	u32 table_id = RT_TABLE_MAIN;
-	kuid_t uid;
 
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
 			  extack);
 	if (err < 0)
-		goto errout;
+		return err;
 
 	rtm = nlmsg_data(nlh);
-
-	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-	if (!skb) {
-		err = -ENOBUFS;
-		goto errout;
-	}
-
-	/* Reserve room for dummy headers, this skb can pass
-	   through good chunk of routing engine.
-	 */
-	skb_reset_mac_header(skb);
-	skb_reset_network_header(skb);
-
 	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
 	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
@@ -2735,14 +2784,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	else
 		uid = (iif ? INVALID_UID : current_uid());
 
-	/* Bugfix: need to give ip_route_input enough of an IP header to
-	 * not gag.
-	 */
-	ip_hdr(skb)->protocol = IPPROTO_UDP;
-	ip_hdr(skb)->saddr = src;
-	ip_hdr(skb)->daddr = dst;
+	if (tb[RTA_IP_PROTO]) {
+		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+						  &ip_proto, extack);
+		if (err)
+			return err;
+	}
+
+	if (tb[RTA_SPORT])
+		sport = nla_get_be16(tb[RTA_SPORT]);
 
-	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+	if (tb[RTA_DPORT])
+		dport = nla_get_be16(tb[RTA_DPORT]);
+
+	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
+	if (!skb)
+		return -ENOBUFS;
 
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = dst;
@@ -2751,6 +2808,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
 	fl4.flowi4_mark = mark;
 	fl4.flowi4_uid = uid;
+	if (sport)
+		fl4.fl4_sport = sport;
+	if (dport)
+		fl4.fl4_dport = dport;
+	fl4.flowi4_proto = ip_proto;
 
 	rcu_read_lock();
 
@@ -2760,10 +2822,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		dev = dev_get_by_index_rcu(net, iif);
 		if (!dev) {
 			err = -ENODEV;
-			goto errout_free;
+			goto errout_rcu;
 		}
 
-		skb->protocol	= htons(ETH_P_IP);
+		fl4.flowi4_iif = iif; /* for rt_fill_info */
 		skb->dev	= dev;
 		skb->mark	= mark;
 		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
@@ -2783,7 +2845,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	}
 
 	if (err)
-		goto errout_free;
+		goto errout_rcu;
 
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
@@ -2791,34 +2853,40 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
 		table_id = res.table ? res.table->tb_id : 0;
 
+	/* reset skb for netlink reply msg */
+	skb_trim(skb, 0);
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb_reset_mac_header(skb);
+
 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
 		if (!res.fi) {
 			err = fib_props[res.type].error;
 			if (!err)
 				err = -EHOSTUNREACH;
-			goto errout_free;
+			goto errout_rcu;
 		}
 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
 				    rt->rt_type, res.prefix, res.prefixlen,
 				    fl4.flowi4_tos, res.fi, 0);
 	} else {
-		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
+		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
 	}
 	if (err < 0)
-		goto errout_free;
+		goto errout_rcu;
 
 	rcu_read_unlock();
 
 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-errout:
-	return err;
 
 errout_free:
+	return err;
+errout_rcu:
 	rcu_read_unlock();
 	kfree_skb(skb);
-	goto errout;
+	goto errout_free;
 }
 
 void ip_rt_multicast_event(struct in_device *in_dev)
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v4 2/3] ipv6: support sport, dport and ip_proto in RTM_GETROUTE
From: Roopa Prabhu @ 2018-05-22 14:51 UTC (permalink / raw)
  To: davem; +Cc: netdev, nikolay, dsa, idosch
In-Reply-To: <1527000668-25253-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This is a followup to fib6 rules sport, dport and ipproto
match support. Only supports tcp, udp and icmp for ipproto.
Used by fib rule self tests.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 net/ipv6/route.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index cc24ed3..7f1babb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -63,6 +63,7 @@
 #include <net/lwtunnel.h>
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
+#include <net/ip.h>
 #include <trace/events/fib6.h>
 
 #include <linux/uaccess.h>
@@ -4083,6 +4084,9 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_UID]		= { .type = NLA_U32 },
 	[RTA_MARK]		= { .type = NLA_U32 },
 	[RTA_TABLE]		= { .type = NLA_U32 },
+	[RTA_IP_PROTO]		= { .type = NLA_U8 },
+	[RTA_SPORT]		= { .type = NLA_U16 },
+	[RTA_DPORT]		= { .type = NLA_U16 },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4794,6 +4798,19 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	else
 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
 
+	if (tb[RTA_SPORT])
+		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
+
+	if (tb[RTA_DPORT])
+		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
+
+	if (tb[RTA_IP_PROTO]) {
+		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+						  &fl6.flowi6_proto, extack);
+		if (err)
+			goto errout;
+	}
+
 	if (iif) {
 		struct net_device *dev;
 		int flags = 0;
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v4 3/3] selftests: net: initial fib rule tests
From: Roopa Prabhu @ 2018-05-22 14:51 UTC (permalink / raw)
  To: davem; +Cc: netdev, nikolay, dsa, idosch
In-Reply-To: <1527000668-25253-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This adds a first set of tests for fib rule match/action for
ipv4 and ipv6. Initial tests only cover action lookup table.
can be extended to cover other actions in the future.
Uses ip route get to validate the rule lookup.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 tools/testing/selftests/net/Makefile          |   2 +-
 tools/testing/selftests/net/fib_rule_tests.sh | 248 ++++++++++++++++++++++++++
 2 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/fib_rule_tests.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index e60dddb..7cb0f49 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -6,7 +6,7 @@ CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh
-TEST_PROGS += udpgso_bench.sh
+TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh
new file mode 100644
index 0000000..d4cfb6a
--- /dev/null
+++ b/tools/testing/selftests/net/fib_rule_tests.sh
@@ -0,0 +1,248 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking IPv4 and IPv6 FIB rules API
+
+ret=0
+
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+IP="ip -netns testns"
+
+RTABLE=100
+GW_IP4=192.51.100.2
+SRC_IP=192.51.100.3
+GW_IP6=2001:db8:1::2
+SRC_IP6=2001:db8:1::3
+
+DEV_ADDR=192.51.100.1
+DEV=dummy0
+
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+
+	if [ ${rc} -eq ${expected} ]; then
+		nsuccess=$((nsuccess+1))
+		printf "\n    TEST: %-50s  [ OK ]\n" "${msg}"
+	else
+		nfail=$((nfail+1))
+		printf "\n    TEST: %-50s  [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "hit enter to continue, 'q' to quit"
+			read a
+			[ "$a" = "q" ] && exit 1
+		fi
+	fi
+}
+
+log_section()
+{
+	echo
+	echo "######################################################################"
+	echo "TEST SECTION: $*"
+	echo "######################################################################"
+}
+
+setup()
+{
+	set -e
+	ip netns add testns
+	$IP link set dev lo up
+
+	$IP link add dummy0 type dummy
+	$IP link set dev dummy0 up
+	$IP address add 198.51.100.1/24 dev dummy0
+	$IP -6 address add 2001:db8:1::1/64 dev dummy0
+
+	set +e
+}
+
+cleanup()
+{
+	$IP link del dev dummy0 &> /dev/null
+	ip netns del testns
+}
+
+fib_check_iproute_support()
+{
+	ip rule help 2>&1 | grep -q $1
+	if [ $? -ne 0 ]; then
+		echo "SKIP: iproute2 iprule too old, missing $1 match"
+		return 1
+	fi
+
+	ip route get help 2>&1 | grep -q $2
+	if [ $? -ne 0 ]; then
+		echo "SKIP: iproute2 get route too old, missing $2 match"
+		return 1
+	fi
+
+	return 0
+}
+
+fib_rule6_del()
+{
+	$IP -6 rule del $1
+	log_test $? 0 "rule6 del $1"
+}
+
+fib_rule6_del_by_pref()
+{
+	pref=$($IP -6 rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+	$IP -6 rule del pref $pref
+}
+
+fib_rule6_test_match_n_redirect()
+{
+	local match="$1"
+	local getmatch="$2"
+
+	$IP -6 rule add $match table $RTABLE
+	$IP -6 route get $GW_IP6 $getmatch | grep -q "table $RTABLE"
+	log_test $? 0 "rule6 check: $1"
+
+	fib_rule6_del_by_pref "$match"
+	log_test $? 0 "rule6 del by pref: $match"
+}
+
+fib_rule6_test()
+{
+	# setup the fib rule redirect route
+	$IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink
+
+	match="oif $DEV"
+	fib_rule6_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+	match="from $SRC_IP6 iif $DEV"
+	fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+	match="tos 0x10"
+	fib_rule6_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+	match="fwmark 0x64"
+	getmatch="mark 0x64"
+	fib_rule6_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+	fib_check_iproute_support "uidrange" "uid"
+	if [ $? -eq 0 ]; then
+		match="uidrange 100-100"
+		getmatch="uid 100"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+	fi
+
+	fib_check_iproute_support "sport" "sport"
+	if [ $? -eq 0 ]; then
+		match="sport 666 dport 777"
+		fib_rule6_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto tcp"
+		fib_rule6_test_match_n_redirect "$match" "$match" "ipproto match"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto icmp"
+		fib_rule6_test_match_n_redirect "$match" "$match" "ipproto icmp match"
+	fi
+}
+
+fib_rule4_del()
+{
+	$IP rule del $1
+	log_test $? 0 "del $1"
+}
+
+fib_rule4_del_by_pref()
+{
+	pref=$($IP rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+	$IP rule del pref $pref
+}
+
+fib_rule4_test_match_n_redirect()
+{
+	local match="$1"
+	local getmatch="$2"
+
+	$IP rule add $match table $RTABLE
+	$IP route get $GW_IP4 $getmatch | grep -q "table $RTABLE"
+	log_test $? 0 "rule4 check: $1"
+
+	fib_rule4_del_by_pref "$match"
+	log_test $? 0 "rule4 del by pref: $match"
+}
+
+fib_rule4_test()
+{
+	# setup the fib rule redirect route
+	$IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink
+
+	match="oif $DEV"
+	fib_rule4_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+	match="from $SRC_IP iif $DEV"
+	fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+	match="tos 0x10"
+	fib_rule4_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+	match="fwmark 0x64"
+	getmatch="mark 0x64"
+	fib_rule4_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+	fib_check_iproute_support "uidrange" "uid"
+	if [ $? -eq 0 ]; then
+		match="uidrange 100-100"
+		getmatch="uid 100"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+	fi
+
+	fib_check_iproute_support "sport" "sport"
+	if [ $? -eq 0 ]; then
+		match="sport 666 dport 777"
+		fib_rule4_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto tcp"
+		fib_rule4_test_match_n_redirect "$match" "$match" "ipproto tcp match"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto icmp"
+		fib_rule4_test_match_n_redirect "$match" "$match" "ipproto icmp match"
+	fi
+}
+
+run_fibrule_tests()
+{
+	log_section "IPv4 fib rule"
+	fib_rule4_test
+	log_section "IPv6 fib rule"
+	fib_rule6_test
+}
+
+if [ "$(id -u)" -ne 0 ];then
+	echo "SKIP: Need root privileges"
+	exit 0
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+	echo "SKIP: Could not run test without ip tool"
+	exit 0
+fi
+
+# start clean
+cleanup &> /dev/null
+setup
+run_fibrule_tests
+cleanup
+
+exit $ret
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH net-next v4 1/3] ipv4: support sport, dport and ip_proto in RTM_GETROUTE
From: Eric Dumazet @ 2018-05-22 15:04 UTC (permalink / raw)
  To: Roopa Prabhu, davem; +Cc: netdev, nikolay, dsa, idosch
In-Reply-To: <1527000668-25253-2-git-send-email-roopa@cumulusnetworks.com>



On 05/22/2018 07:51 AM, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
> 
> 

...

> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 4d622112..cf5cfc5 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -649,6 +649,10 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
>  	[RTA_ENCAP]		= { .type = NLA_NESTED },
>  	[RTA_UID]		= { .type = NLA_U32 },
>  	[RTA_MARK]		= { .type = NLA_U32 },
> +	[RTA_TABLE]		= { .type = NLA_U32 },
> +	[RTA_IP_PROTO]		= { .type = NLA_U8 },
> +	[RTA_SPORT]		= { .type = NLA_U16 },
> +	[RTA_DPORT]		= { .type = NLA_U16 },
>  };

Hi Roopa

RTA_TABLE addition looks like a bug fix for net tree ?

This should be sent as an independent patch IMO.

Thanks.

^ permalink raw reply

* Re: [PATCH net-next v11 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Jiri Pirko @ 2018-05-22 15:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, stephen, davem, netdev, virtualization,
	virtio-dev, jesse.brandeburg, alexander.h.duyck, kubakici,
	jasowang, loseweigh, aaron.f.brown, anjali.singhai
In-Reply-To: <20180522163502-mutt-send-email-mst@kernel.org>

Tue, May 22, 2018 at 03:39:33PM CEST, mst@redhat.com wrote:
>On Tue, May 22, 2018 at 03:26:26PM +0200, Jiri Pirko wrote:
>> Tue, May 22, 2018 at 03:17:37PM CEST, mst@redhat.com wrote:
>> >On Tue, May 22, 2018 at 03:14:22PM +0200, Jiri Pirko wrote:
>> >> Tue, May 22, 2018 at 03:12:40PM CEST, mst@redhat.com wrote:
>> >> >On Tue, May 22, 2018 at 11:08:53AM +0200, Jiri Pirko wrote:
>> >> >> Tue, May 22, 2018 at 11:06:37AM CEST, jiri@resnulli.us wrote:
>> >> >> >Tue, May 22, 2018 at 04:06:18AM CEST, sridhar.samudrala@intel.com wrote:
>> >> >> >>Use the registration/notification framework supported by the generic
>> >> >> >>failover infrastructure.
>> >> >> >>
>> >> >> >>Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> >> >> >
>> >> >> >In previous patchset versions, the common code did
>> >> >> >netdev_rx_handler_register() and netdev_upper_dev_link() etc
>> >> >> >(netvsc_vf_join()). Now, this is still done in netvsc. Why?
>> >> >> >
>> >> >> >This should be part of the common "failover" code.
>> >> >> >
>> >> >> 
>> >> >> Also note that in the current patchset you use IFF_FAILOVER flag for
>> >> >> master, yet for the slave you use IFF_SLAVE. That is wrong.
>> >> >> IFF_FAILOVER_SLAVE should be used.
>> >> >
>> >> >Or drop IFF_FAILOVER_SLAVE and set both IFF_FAILOVER and IFF_SLAVE?
>> >> 
>> >> No. IFF_SLAVE is for bonding.
>> >
>> >What breaks if we reuse it for failover?
>> 
>> This is exposed to userspace. IFF_SLAVE is expected for bonding slaves.
>> And failover slave is not a bonding slave.
>
>That does not really answer the question.  I'd claim it's sufficiently
>like a bond slave for IFF_SLAVE to make sense.
>
>In fact you will find that netvsc already sets IFF_SLAVE, and so

netvsc does the whole failover thing in a wrong way. This patchset is
trying to fix it.

>does e.g. the eql driver.
>
>The advantage of using IFF_SLAVE is that userspace knows to skip it.  If

The userspace should know how to skip other types of slaves - team,
bridge, ovs, etc. The "master link" should be the one to look at.


>we don't set IFF_SLAVE existing userspace tries to use the lowerdev.

Each master type has a IFF_ master flag and IFF_ slave flag. In private
flag. I don't see no reason to break this pattern here.

^ permalink raw reply

* Re: [PATCH net-next] net: stmmac: Add PPS and Flexible PPS support
From: Andrew Lunn @ 2018-05-22 15:14 UTC (permalink / raw)
  To: Jose Abreu, Richard Cochran
  Cc: netdev, David S. Miller, Joao Pinto, Vitor Soares,
	Giuseppe Cavallaro, Alexandre Torgue
In-Reply-To: <072478625b1cb3d4af9e3b42f83ece7303fd554e.1526993857.git.joabreu@synopsys.com>

On Tue, May 22, 2018 at 01:58:40PM +0100, Jose Abreu wrote:
> This adds support for PPS output and Flexible PPS (which is equivalent
> to per_out output of PTP subsystem).

You forgot to Cc: the PTP maintainer, Richard Cochran <richardcochran@gmail.com>

    Andrew

^ permalink raw reply

* Re: [PATCH] net/mlx4: fix spelling mistake: "Inrerface" -> "Interface"
From: Tariq Toukan @ 2018-05-22 15:21 UTC (permalink / raw)
  To: Colin King, Tariq Toukan, David S . Miller, netdev, linux-rdma
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20180522083728.5874-1-colin.king@canonical.com>



On 22/05/2018 11:37 AM, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Trivial fix to spelling mistake in mlx4_dbg debug message
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---
>   drivers/net/ethernet/mellanox/mlx4/intf.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
> index 2edcce98ab2d..6bd4103265d2 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/intf.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
> @@ -172,7 +172,7 @@ int mlx4_do_bond(struct mlx4_dev *dev, bool enable)
>   		list_add_tail(&dev_ctx->list, &priv->ctx_list);
>   		spin_unlock_irqrestore(&priv->ctx_lock, flags);
>   
> -		mlx4_dbg(dev, "Inrerface for protocol %d restarted with when bonded mode is %s\n",
> +		mlx4_dbg(dev, "Interface for protocol %d restarted with when bonded mode is %s\n",

Thanks Colin.
I think there's one more thing to fix here.
It is redundant to say "with when", it was probably done by mistake. 
Let's rephrase, maybe this way?

restarted with bonded mode %s

>   			 dev_ctx->intf->protocol, enable ?
>   			 "enabled" : "disabled");
>   	}
> 

^ permalink raw reply

* Re: [PATCH] net/mlx4: fix spelling mistake: "Inrerface" -> "Interface"
From: Colin Ian King @ 2018-05-22 15:23 UTC (permalink / raw)
  To: Tariq Toukan, David S . Miller, netdev, linux-rdma
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <77704a43-2b78-e88b-04b9-9a2623e8b5fb@mellanox.com>

On 22/05/18 16:21, Tariq Toukan wrote:
> 
> 
> On 22/05/2018 11:37 AM, Colin King wrote:
>> From: Colin Ian King <colin.king@canonical.com>
>>
>> Trivial fix to spelling mistake in mlx4_dbg debug message
>>
>> Signed-off-by: Colin Ian King <colin.king@canonical.com>
>> ---
>>   drivers/net/ethernet/mellanox/mlx4/intf.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c
>> b/drivers/net/ethernet/mellanox/mlx4/intf.c
>> index 2edcce98ab2d..6bd4103265d2 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/intf.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
>> @@ -172,7 +172,7 @@ int mlx4_do_bond(struct mlx4_dev *dev, bool enable)
>>           list_add_tail(&dev_ctx->list, &priv->ctx_list);
>>           spin_unlock_irqrestore(&priv->ctx_lock, flags);
>>   -        mlx4_dbg(dev, "Inrerface for protocol %d restarted with
>> when bonded mode is %s\n",
>> +        mlx4_dbg(dev, "Interface for protocol %d restarted with when
>> bonded mode is %s\n",
> 
> Thanks Colin.
> I think there's one more thing to fix here.
> It is redundant to say "with when", it was probably done by mistake.
> Let's rephrase, maybe this way?
> 
> restarted with bonded mode %s

Sounds like a good idea, do you want me to send V2 of the patch with
this fix?

> 
>>                dev_ctx->intf->protocol, enable ?
>>                "enabled" : "disabled");
>>       }
>>
> 
> -- 
> To unsubscribe from this list: send the line "unsubscribe
> kernel-janitors" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] net/mlx4: fix spelling mistake: "Inrerface" -> "Interface"
From: Tariq Toukan @ 2018-05-22 15:25 UTC (permalink / raw)
  To: Colin Ian King, Tariq Toukan, David S . Miller, netdev,
	linux-rdma
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <297abbee-1f5c-6159-e559-44d83ddfb7ff@canonical.com>



On 22/05/2018 6:23 PM, Colin Ian King wrote:
> On 22/05/18 16:21, Tariq Toukan wrote:
>>
>>
>> On 22/05/2018 11:37 AM, Colin King wrote:
>>> From: Colin Ian King <colin.king@canonical.com>
>>>
>>> Trivial fix to spelling mistake in mlx4_dbg debug message
>>>
>>> Signed-off-by: Colin Ian King <colin.king@canonical.com>
>>> ---
>>>    drivers/net/ethernet/mellanox/mlx4/intf.c | 2 +-
>>>    1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c
>>> b/drivers/net/ethernet/mellanox/mlx4/intf.c
>>> index 2edcce98ab2d..6bd4103265d2 100644
>>> --- a/drivers/net/ethernet/mellanox/mlx4/intf.c
>>> +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
>>> @@ -172,7 +172,7 @@ int mlx4_do_bond(struct mlx4_dev *dev, bool enable)
>>>            list_add_tail(&dev_ctx->list, &priv->ctx_list);
>>>            spin_unlock_irqrestore(&priv->ctx_lock, flags);
>>>    -        mlx4_dbg(dev, "Inrerface for protocol %d restarted with
>>> when bonded mode is %s\n",
>>> +        mlx4_dbg(dev, "Interface for protocol %d restarted with when
>>> bonded mode is %s\n",
>>
>> Thanks Colin.
>> I think there's one more thing to fix here.
>> It is redundant to say "with when", it was probably done by mistake.
>> Let's rephrase, maybe this way?
>>
>> restarted with bonded mode %s
> 
> Sounds like a good idea, do you want me to send V2 of the patch with
> this fix?
> 

Yes please.

>>
>>>                 dev_ctx->intf->protocol, enable ?
>>>                 "enabled" : "disabled");
>>>        }
>>>
>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe
>> kernel-janitors" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH net-next v11 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Samudrala, Sridhar @ 2018-05-22 15:28 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh, aaron.f.brown, anjali.singhai
In-Reply-To: <20180522090853.GF2149@nanopsycho>


On 5/22/2018 2:08 AM, Jiri Pirko wrote:
> Tue, May 22, 2018 at 11:06:37AM CEST, jiri@resnulli.us wrote:
>> Tue, May 22, 2018 at 04:06:18AM CEST, sridhar.samudrala@intel.com wrote:
>>> Use the registration/notification framework supported by the generic
>>> failover infrastructure.
>>>
>>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> In previous patchset versions, the common code did
>> netdev_rx_handler_register() and netdev_upper_dev_link() etc
>> (netvsc_vf_join()). Now, this is still done in netvsc. Why?
>>
>> This should be part of the common "failover" code.

Based on Stephen's feedback on earlier patches, i tried to minimize the changes to
netvsc and only commonize the notifier and the main event handler routine.
Another complication is that netvsc does part of registration in a delayed workqueue.

It should be possible to move some of the code from net_failover.c to generic
failover.c in future if Stephen is ok with it.


>>
> Also note that in the current patchset you use IFF_FAILOVER flag for
> master, yet for the slave you use IFF_SLAVE. That is wrong.
> IFF_FAILOVER_SLAVE should be used.

Not sure which code you are referring to.  I only set IFF_FAILOVER_SLAVE
in patch 3.

^ permalink raw reply

* Re: [PATCH net-next v11 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Michael S. Tsirkin @ 2018-05-22 15:32 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Sridhar Samudrala, stephen, davem, netdev, virtualization,
	virtio-dev, jesse.brandeburg, alexander.h.duyck, kubakici,
	jasowang, loseweigh, aaron.f.brown, anjali.singhai
In-Reply-To: <20180522151343.GJ2149@nanopsycho>

On Tue, May 22, 2018 at 05:13:43PM +0200, Jiri Pirko wrote:
> Tue, May 22, 2018 at 03:39:33PM CEST, mst@redhat.com wrote:
> >On Tue, May 22, 2018 at 03:26:26PM +0200, Jiri Pirko wrote:
> >> Tue, May 22, 2018 at 03:17:37PM CEST, mst@redhat.com wrote:
> >> >On Tue, May 22, 2018 at 03:14:22PM +0200, Jiri Pirko wrote:
> >> >> Tue, May 22, 2018 at 03:12:40PM CEST, mst@redhat.com wrote:
> >> >> >On Tue, May 22, 2018 at 11:08:53AM +0200, Jiri Pirko wrote:
> >> >> >> Tue, May 22, 2018 at 11:06:37AM CEST, jiri@resnulli.us wrote:
> >> >> >> >Tue, May 22, 2018 at 04:06:18AM CEST, sridhar.samudrala@intel.com wrote:
> >> >> >> >>Use the registration/notification framework supported by the generic
> >> >> >> >>failover infrastructure.
> >> >> >> >>
> >> >> >> >>Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> >> >> >> >
> >> >> >> >In previous patchset versions, the common code did
> >> >> >> >netdev_rx_handler_register() and netdev_upper_dev_link() etc
> >> >> >> >(netvsc_vf_join()). Now, this is still done in netvsc. Why?
> >> >> >> >
> >> >> >> >This should be part of the common "failover" code.
> >> >> >> >
> >> >> >> 
> >> >> >> Also note that in the current patchset you use IFF_FAILOVER flag for
> >> >> >> master, yet for the slave you use IFF_SLAVE. That is wrong.
> >> >> >> IFF_FAILOVER_SLAVE should be used.
> >> >> >
> >> >> >Or drop IFF_FAILOVER_SLAVE and set both IFF_FAILOVER and IFF_SLAVE?
> >> >> 
> >> >> No. IFF_SLAVE is for bonding.
> >> >
> >> >What breaks if we reuse it for failover?
> >> 
> >> This is exposed to userspace. IFF_SLAVE is expected for bonding slaves.
> >> And failover slave is not a bonding slave.
> >
> >That does not really answer the question.  I'd claim it's sufficiently
> >like a bond slave for IFF_SLAVE to make sense.
> >
> >In fact you will find that netvsc already sets IFF_SLAVE, and so
> 
> netvsc does the whole failover thing in a wrong way. This patchset is
> trying to fix it.

Maybe, but we don't need gratuitous changes either, especially if they
break userspace.

> >does e.g. the eql driver.
> >
> >The advantage of using IFF_SLAVE is that userspace knows to skip it.  If
> 
> The userspace should know how to skip other types of slaves - team,
> bridge, ovs, etc.
> The "master link" should be the one to look at.
> 

How should existing userspace know which ones to skip and which one is
the master?  Right now userspace seems to assume whatever does not have
IFF_SLAVE should be looked at. Are you saying that's not the right thing
to do and userspace should be fixed? What should userspace do in
your opinion that will be forward compatible with future kernels?

> 
> >we don't set IFF_SLAVE existing userspace tries to use the lowerdev.
> 
> Each master type has a IFF_ master flag and IFF_ slave flag.

Could you give some examples please?

> In private
> flag. I don't see no reason to break this pattern here.

Other masters are setup from userspace, this one is set up automatically
by kernel. So the bar is higher, we need an interface that existing
userspace knows about.  We can't just say "oh if userspace set this up
it should know to skip lowerdevs".

Otherwise multiple interfaces with same mac tend to confuse userspace.

-- 
MST

^ permalink raw reply

* Re: [PATCH v3] mlx4_core: allocate ICM memory in page size chunks
From: Tariq Toukan @ 2018-05-22 15:33 UTC (permalink / raw)
  To: Qing Huang, Eric Dumazet, tariqt, davem, haakon.bugge, yanjun.zhu
  Cc: netdev, linux-rdma, linux-kernel, gi-oh.kim
In-Reply-To: <19b7818e-16f6-2349-dc34-245c2f215f6f@oracle.com>



On 18/05/2018 12:45 AM, Qing Huang wrote:
> 
> 
> On 5/17/2018 2:14 PM, Eric Dumazet wrote:
>> On 05/17/2018 01:53 PM, Qing Huang wrote:
>>> When a system is under memory presure (high usage with fragments),
>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>> memory management to enter slow path doing memory compact/migration
>>> ops in order to complete high order memory allocations.
>>>
>>> When that happens, user processes calling uverb APIs may get stuck
>>> for more than 120s easily even though there are a lot of free pages
>>> in smaller chunks available in the system.
>>>
>>> Syslog:
>>> ...
>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>> ...
>>>
>> NACK on this patch.
>>
>> You have been asked repeatedly to use kvmalloc()
>>
>> This is not a minor suggestion.
>>
>> Take a look 
>> athttps://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d8c13f2271ec5178c52fbde072ec7b562651ed9d 
>>
> 
> Would you please take a look at how table->icm is being used in the mlx4 
> driver? It's a meta data used for individual pointer variable referencing,
> not as data frag or in/out buffer. It has no need for contiguous phy. 
> memory.
> 
> Thanks.
> 

NACK.

This would cause a degradation when iterating the entries of table->icm.
For example, in mlx4_table_get_range.

Thanks,
Tariq

>> And you'll understand some people care about this.
>>
>> Strongly.
>>
>> Thanks.
>>
> 

^ permalink raw reply

* [PATCH net-next v2 0/2] udp gso fixes
From: Willem de Bruijn @ 2018-05-22 15:34 UTC (permalink / raw)
  To: netdev; +Cc: davem, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

A few small fixes:
- disallow segmentation with XFRM
- do not leak gso packets into the ingress path

Changes
  v1 -> v2
  - fix build failure in team.c
  - drop scatter-gather fix:
      this is now fixed by commit 113f99c33585 ("net: test tailroom
      before appending to linear skb"). After this patch gso skbs are
      built non-linear regardless of NETIF_F_SG and skb_segment builds
      linear segs.

Willem de Bruijn (4):
  udp: exclude gso from xfrm paths
  gso: limit udp gso to egress-only virtual devices

 drivers/net/bonding/bond_main.c | 5 +++--
 drivers/net/team/team.c         | 5 +++--
 include/linux/netdev_features.h | 1 -
 net/ipv4/udp.c                  | 3 ++-
 net/ipv6/udp.c                  | 3 ++-
 5 files changed, 10 insertions(+), 7 deletions(-)

-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply

* [PATCH net-next v2 1/2] udp: exclude gso from xfrm paths
From: Willem de Bruijn @ 2018-05-22 15:34 UTC (permalink / raw)
  To: netdev; +Cc: davem, Willem de Bruijn, Michal Kubecek
In-Reply-To: <20180522153440.204128-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

UDP GSO delays final datagram construction to the GSO layer. This
conflicts with protocol transformations.

Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT")
CC: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/ipv4/udp.c | 3 ++-
 net/ipv6/udp.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ff4d4ba67735..d71f1f3e1155 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -788,7 +788,8 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 			return -EINVAL;
 		if (sk->sk_no_check_tx)
 			return -EINVAL;
-		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
+		    dst_xfrm(skb_dst(skb)))
 			return -EIO;
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2839c1bd1e58..426c9d2b418d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1053,7 +1053,8 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 			return -EINVAL;
 		if (udp_sk(sk)->no_check6_tx)
 			return -EINVAL;
-		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
+		    dst_xfrm(skb_dst(skb)))
 			return -EIO;
 
 		skb_shinfo(skb)->gso_size = cork->gso_size;
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* [PATCH net-next v2 2/2] gso: limit udp gso to egress-only virtual devices
From: Willem de Bruijn @ 2018-05-22 15:34 UTC (permalink / raw)
  To: netdev; +Cc: davem, Willem de Bruijn, Alexander Duyck
In-Reply-To: <20180522153440.204128-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Until the udp receive stack supports large packets (UDP GRO), GSO
packets must not loop from the egress to the ingress path.

Revert the change that added NETIF_F_GSO_UDP_L4 to various virtual
devices through NETIF_F_GSO_ENCAP_ALL as this included devices that
may loop packets, such as veth and macvlan.

Instead add it to specific devices that forward to another device's
egress path, bonding and team.

Fixes: 83aa025f535f ("udp: add gso support to virtual devices")
CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 drivers/net/bonding/bond_main.c | 5 +++--
 drivers/net/team/team.c         | 5 +++--
 include/linux/netdev_features.h | 1 -
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 06efdf6a762b..fea17b92b1ae 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1107,7 +1107,8 @@ static void bond_compute_features(struct bonding *bond)
 
 done:
 	bond_dev->vlan_features = vlan_features;
-	bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL;
+	bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
+				    NETIF_F_GSO_UDP_L4;
 	bond_dev->gso_max_segs = gso_max_segs;
 	netif_set_gso_max_size(bond_dev, gso_max_size);
 
@@ -4268,7 +4269,7 @@ void bond_setup(struct net_device *bond_dev)
 				NETIF_F_HW_VLAN_CTAG_RX |
 				NETIF_F_HW_VLAN_CTAG_FILTER;
 
-	bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
+	bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL | NETIF_F_GSO_UDP_L4;
 	bond_dev->features |= bond_dev->hw_features;
 }
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 9dbd390ace34..d6ff881165d0 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1026,7 +1026,8 @@ static void __team_compute_features(struct team *team)
 	}
 
 	team->dev->vlan_features = vlan_features;
-	team->dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL;
+	team->dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
+				     NETIF_F_GSO_UDP_L4;
 	team->dev->hard_header_len = max_hard_header_len;
 
 	team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
@@ -2117,7 +2118,7 @@ static void team_setup(struct net_device *dev)
 			   NETIF_F_HW_VLAN_CTAG_RX |
 			   NETIF_F_HW_VLAN_CTAG_FILTER;
 
-	dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
+	dev->hw_features |= NETIF_F_GSO_ENCAP_ALL | NETIF_F_GSO_UDP_L4;
 	dev->features |= dev->hw_features;
 }
 
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index c87c3a3453c1..623bb8ced060 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -220,7 +220,6 @@ enum {
 				 NETIF_F_GSO_GRE_CSUM |			\
 				 NETIF_F_GSO_IPXIP4 |			\
 				 NETIF_F_GSO_IPXIP6 |			\
-				 NETIF_F_GSO_UDP_L4 |			\
 				 NETIF_F_GSO_UDP_TUNNEL |		\
 				 NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* [PATCH] net: vxge: fix spelling mistake in macro VXGE_HW_ERR_PRIVILAGED_OPEARATION
From: Colin King @ 2018-05-22 15:36 UTC (permalink / raw)
  To: Jon Mason, David S . Miller, netdev; +Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Rename VXGE_HW_ERR_PRIVILAGED_OPEARATION to VXGE_HW_ERR_PRIVILAGED_OPERATION
to fix spelling mistake.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/net/ethernet/neterion/vxge/vxge-config.c  | 12 ++++++------
 drivers/net/ethernet/neterion/vxge/vxge-config.h  |  2 +-
 drivers/net/ethernet/neterion/vxge/vxge-ethtool.c |  2 +-
 drivers/net/ethernet/neterion/vxge/vxge-main.c    |  4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.c b/drivers/net/ethernet/neterion/vxge/vxge-config.c
index 6223930a8155..8656fcc9f2a0 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-config.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-config.c
@@ -693,7 +693,7 @@ __vxge_hw_device_is_privilaged(u32 host_type, u32 func_id)
 		VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)
 		return VXGE_HW_OK;
 	else
-		return VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+		return VXGE_HW_ERR_PRIVILAGED_OPERATION;
 }
 
 /*
@@ -1920,7 +1920,7 @@ enum vxge_hw_status vxge_hw_device_getpause_data(struct __vxge_hw_device *hldev,
 	}
 
 	if (!(hldev->access_rights & VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)) {
-		status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+		status = VXGE_HW_ERR_PRIVILAGED_OPERATION;
 		goto exit;
 	}
 
@@ -3153,7 +3153,7 @@ vxge_hw_mgmt_reg_read(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_mrpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILAGED_OPERATION;
 			break;
 		}
 		if (offset > sizeof(struct vxge_hw_mrpcim_reg) - 8) {
@@ -3165,7 +3165,7 @@ vxge_hw_mgmt_reg_read(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_srpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_SRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILAGED_OPERATION;
 			break;
 		}
 		if (index > VXGE_HW_TITAN_SRPCIM_REG_SPACES - 1) {
@@ -3279,7 +3279,7 @@ vxge_hw_mgmt_reg_write(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_mrpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_MRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILAGED_OPERATION;
 			break;
 		}
 		if (offset > sizeof(struct vxge_hw_mrpcim_reg) - 8) {
@@ -3291,7 +3291,7 @@ vxge_hw_mgmt_reg_write(struct __vxge_hw_device *hldev,
 	case vxge_hw_mgmt_reg_type_srpcim:
 		if (!(hldev->access_rights &
 			VXGE_HW_DEVICE_ACCESS_RIGHT_SRPCIM)) {
-			status = VXGE_HW_ERR_PRIVILAGED_OPEARATION;
+			status = VXGE_HW_ERR_PRIVILAGED_OPERATION;
 			break;
 		}
 		if (index > VXGE_HW_TITAN_SRPCIM_REG_SPACES - 1) {
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-config.h b/drivers/net/ethernet/neterion/vxge/vxge-config.h
index cfa970417f81..5ebdbfedc269 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-config.h
+++ b/drivers/net/ethernet/neterion/vxge/vxge-config.h
@@ -127,7 +127,7 @@ enum vxge_hw_status {
 	VXGE_HW_ERR_INVALID_TCODE 		  = VXGE_HW_BASE_ERR + 14,
 	VXGE_HW_ERR_INVALID_BLOCK_SIZE		  = VXGE_HW_BASE_ERR + 15,
 	VXGE_HW_ERR_INVALID_STATE		  = VXGE_HW_BASE_ERR + 16,
-	VXGE_HW_ERR_PRIVILAGED_OPEARATION	  = VXGE_HW_BASE_ERR + 17,
+	VXGE_HW_ERR_PRIVILAGED_OPERATION	  = VXGE_HW_BASE_ERR + 17,
 	VXGE_HW_ERR_INVALID_PORT 		  = VXGE_HW_BASE_ERR + 18,
 	VXGE_HW_ERR_FIFO		 	  = VXGE_HW_BASE_ERR + 19,
 	VXGE_HW_ERR_VPATH			  = VXGE_HW_BASE_ERR + 20,
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c b/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
index 0452848d1316..2f1bde500420 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-ethtool.c
@@ -276,7 +276,7 @@ static void vxge_get_ethtool_stats(struct net_device *dev,
 	*ptr++ = 0;
 	status = vxge_hw_device_xmac_stats_get(hldev, xmac_stats);
 	if (status != VXGE_HW_OK) {
-		if (status != VXGE_HW_ERR_PRIVILAGED_OPEARATION) {
+		if (status != VXGE_HW_ERR_PRIVILAGED_OPERATION) {
 			vxge_debug_init(VXGE_ERR,
 				"%s : %d Failure in getting xmac stats",
 				__func__, __LINE__);
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index b2299f2b2155..64fa94f8b471 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -3484,11 +3484,11 @@ static int vxge_device_register(struct __vxge_hw_device *hldev,
 				0,
 				&stat);
 
-	if (status == VXGE_HW_ERR_PRIVILAGED_OPEARATION)
+	if (status == VXGE_HW_ERR_PRIVILAGED_OPERATION)
 		vxge_debug_init(
 			vxge_hw_device_trace_level_get(hldev),
 			"%s: device stats clear returns"
-			"VXGE_HW_ERR_PRIVILAGED_OPEARATION", ndev->name);
+			"VXGE_HW_ERR_PRIVILAGED_OPERATION", ndev->name);
 
 	vxge_debug_entryexit(vxge_hw_device_trace_level_get(hldev),
 		"%s: %s:%d  Exiting...",
-- 
2.17.0

^ permalink raw reply related

* Re: Shepherd request (P83): Multipath TCP: Present Use Cases and an Upstream Future
From: Jiri Pirko @ 2018-05-22 15:36 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh, aaron.f.brown, anjali.singhai
In-Reply-To: <39081bce-3913-5b07-3d07-0c476fca5e78@intel.com>

Tue, May 22, 2018 at 05:28:42PM CEST, sridhar.samudrala@intel.com wrote:
>
>On 5/22/2018 2:08 AM, Jiri Pirko wrote:
>> Tue, May 22, 2018 at 11:06:37AM CEST, jiri@resnulli.us wrote:
>> > Tue, May 22, 2018 at 04:06:18AM CEST, sridhar.samudrala@intel.com wrote:
>> > > Use the registration/notification framework supported by the generic
>> > > failover infrastructure.
>> > > 
>> > > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> > In previous patchset versions, the common code did
>> > netdev_rx_handler_register() and netdev_upper_dev_link() etc
>> > (netvsc_vf_join()). Now, this is still done in netvsc. Why?
>> > 
>> > This should be part of the common "failover" code.
>
>Based on Stephen's feedback on earlier patches, i tried to minimize the changes to
>netvsc and only commonize the notifier and the main event handler routine.
>Another complication is that netvsc does part of registration in a delayed workqueue.

:( This kind of degrades the whole efford of having single solution
in "failover" module. I think that common parts, as
netdev_rx_handler_register() and others certainly is should be inside
the common module. This is not a good time to minimize changes. Let's do
the thing properly and fix the netvsc mess now.


>
>It should be possible to move some of the code from net_failover.c to generic
>failover.c in future if Stephen is ok with it.
>
>
>> > 
>> Also note that in the current patchset you use IFF_FAILOVER flag for
>> master, yet for the slave you use IFF_SLAVE. That is wrong.
>> IFF_FAILOVER_SLAVE should be used.
>
>Not sure which code you are referring to.  I only set IFF_FAILOVER_SLAVE
>in patch 3.

The existing netvsc driver.

^ permalink raw reply

* Re: [net-next 1/6] net/dcb: Add dcbnl buffer attribute
From: Huy Nguyen @ 2018-05-22 15:36 UTC (permalink / raw)
  To: Jakub Kicinski, Saeed Mahameed
  Cc: David S. Miller, netdev, Jiri Pirko, Or Gerlitz, Parav Pandit
In-Reply-To: <20180521222026.4f54f479@cakuba>

On 5/22/2018 12:20 AM, Jakub Kicinski wrote:
> On Mon, 21 May 2018 14:04:57 -0700, Saeed Mahameed wrote:
>> From: Huy Nguyen <huyn@mellanox.com>
>>
>> In this patch, we add dcbnl buffer attribute to allow user
>> change the NIC's buffer configuration such as priority
>> to buffer mapping and buffer size of individual buffer.
>>
>> This attribute combined with pfc attribute allows advance user to
>> fine tune the qos setting for specific priority queue. For example,
>> user can give dedicated buffer for one or more prirorities or user
>> can give large buffer to certain priorities.
>>
>> We present an use case scenario where dcbnl buffer attribute configured
>> by advance user helps reduce the latency of messages of different sizes.
>>
>> Scenarios description:
>> On ConnectX-5, we run latency sensitive traffic with
>> small/medium message sizes ranging from 64B to 256KB and bandwidth sensitive
>> traffic with large messages sizes 512KB and 1MB. We group small, medium,
>> and large message sizes to their own pfc enables priorities as follow.
>>    Priorities 1 & 2 (64B, 256B and 1KB)
>>    Priorities 3 & 4 (4KB, 8KB, 16KB, 64KB, 128KB and 256KB)
>>    Priorities 5 & 6 (512KB and 1MB)
>>
>> By default, ConnectX-5 maps all pfc enabled priorities to a single
>> lossless fixed buffer size of 50% of total available buffer space. The
>> other 50% is assigned to lossy buffer. Using dcbnl buffer attribute,
>> we create three equal size lossless buffers. Each buffer has 25% of total
>> available buffer space. Thus, the lossy buffer size reduces to 25%. Priority
>> to lossless  buffer mappings are set as follow.
>>    Priorities 1 & 2 on lossless buffer #1
>>    Priorities 3 & 4 on lossless buffer #2
>>    Priorities 5 & 6 on lossless buffer #3
>>
>> We observe improvements in latency for small and medium message sizes
>> as follows. Please note that the large message sizes bandwidth performance is
>> reduced but the total bandwidth remains the same.
>>    256B message size (42 % latency reduction)
>>    4K message size (21% latency reduction)
>>    64K message size (16% latency reduction)
>>
>> Signed-off-by: Huy Nguyen <huyn@mellanox.com>
>> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> On a cursory look this bares a lot of resemblance to devlink shared
> buffer configuration ABI.  Did you look into using that?
>
> Just to be clear devlink shared buffer ABIs don't require representors
> and "switchdev mode".
> .
[HQN] Dear Jakub, there are several reasons that devlink shared buffer 
ABI cannot be used:
1. The devlink shared buffer ABI is written based on the switch cli 
which you can find out more
from this link https://community.mellanox.com/docs/DOC-2558.
2. The dcbnl interfaces have been used for QoS settings. In NIC, the 
buffer configuration are tied to
priority (ETS PFC). The buffer configuration are not tied to port like 
switch.
3. Shared buffer, alpha, threshold are switch specific terms.

Please let me know if you have any further question.
Regards,
Huy Nguyen

^ permalink raw reply

* Re: [PATCH net-next 08/12] amd-xgbe: Add ethtool show/set channels support
From: Tom Lendacky @ 2018-05-22 15:37 UTC (permalink / raw)
  To: Edward Cree, Jakub Kicinski; +Cc: netdev, David Miller
In-Reply-To: <ddce7e2b-ab4d-8eb3-9b56-acb192ae29a6@solarflare.com>

On 5/22/2018 8:29 AM, Edward Cree wrote:
> On 22/05/18 14:24, Tom Lendacky wrote:
>> The amd-xgbe driver is not designed to allocate separate IRQs for Rx and
>> Tx.  In general, there is one IRQ for a channel of which Tx and Rx are
>> shared.  You can have more Tx channels than Rx channels or vice-versa, but
>> the min() of those numbers will be a combined Tx/Rx with the excess being
>> Tx or Rx only: e.g. combined 0 tx 8 rx 10 results in 8 combined channels
>> plus two Rx only channels.
> If you cannot allocate the channels requested by the user, surely the correct
>  thing is not to fudge it into something similar, but rather to return an
>  error from the ethtool set_channels() op.

Ok, another vote on changing the logic.  I'll update it and submit a v2.

Thanks,
Tom

> 
> -Ed
> 

^ permalink raw reply

* Re: [PATCH net-next] net: stmmac: Add PPS and Flexible PPS support
From: Jose Abreu @ 2018-05-22 15:38 UTC (permalink / raw)
  To: Andrew Lunn, Jose Abreu, Richard Cochran
  Cc: netdev, David S. Miller, Joao Pinto, Vitor Soares,
	Giuseppe Cavallaro, Alexandre Torgue
In-Reply-To: <20180522151418.GA4396@lunn.ch>

On 22-05-2018 16:14, Andrew Lunn wrote:
> On Tue, May 22, 2018 at 01:58:40PM +0100, Jose Abreu wrote:
>> This adds support for PPS output and Flexible PPS (which is equivalent
>> to per_out output of PTP subsystem).
> You forgot to Cc: the PTP maintainer, Richard Cochran <richardcochran@gmail.com>

Thanks Andrew!

Richard,

Can you take a look at this patch? (I can cc you in the original
thread if you're not a netdev subscriber).

Thanks and Best Regards,
Jose Miguel Abreu

>
>     Andrew

^ permalink raw reply

* Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
From: Willem de Bruijn @ 2018-05-22 15:41 UTC (permalink / raw)
  To: Jon Rosen (jrosen)
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL], open list
In-Reply-To: <64c91d04479348cabbcdf1df0ff7d40f@XCH-RTP-016.cisco.com>

>>> I think the bigger issues as you've pointed out are the cost of
>>> the additional spin lock and should the additional state be
>>> stored in-band (fewer cache lines) or out-of band (less risk of
>>> breaking due to unpredictable application behavior).
>>
>> We don't need the spinlock if clearing the shadow byte after
>> setting the status to user.
>>
>> Worst case, user will set it back to kernel while the shadow
>> byte is not cleared yet and the next producer will drop a packet.
>> But next producers will make progress, so there is no deadlock
>> or corruption.
>
> I thought so too for a while but after spending more time than I
> care to admit I relized the following sequence was occuring:
>
>    Core A                       Core B
>    ------                       ------
>    - Enter spin_lock
>    -   Get tp_status of head (X)
>        tp_status == 0
>    -   Check inuse
>        inuse == 0
>    -   Allocate entry X
>        advance head (X+1)
>        set inuse=1
>    - Exit spin_lock
>
>      <very long delay>
>
>                                 <allocate N-1 entries
>                                 where N = size of ring>
>
>                                 - Enter spin_lock
>                                 -   get tp_status of head (X+N)
>                                     tp_status == 0 (but slot
>                                     in use for X on core A)
>
>    - write tp_status of         <--- trouble!
>      X = TP_STATUS_USER         <--- trouble!
>    - write inuse=0              <--- trouble!
>
>                                 -   Check inuse
>                                     inuse == 0
>                                 -   Allocate entry X+N
>                                     advance head (X+N+1)
>                                     set inuse=1
>                                 - Exit spin_lock
>
>
> At this point Core A just passed slot X to userspace with a
> packet and Core B has just been assigned slot X+N (same slot as
> X) for it's new packet. Both cores A and B end up filling in that
> slot.  Tracking ths donw was one of the reasons it took me a
> while to produce these updated diffs.

Is this not just an ordering issue? Since inuse is set after tp_status,
it has to be tested first (and barriers are needed to avoid reordering).

^ permalink raw reply

* [PATCH][V2] net/mlx4: fix spelling mistake: "Inrerface" -> "Interface" and rephrase message
From: Colin King @ 2018-05-22 15:42 UTC (permalink / raw)
  To: Tariq Toukan, David S . Miller, netdev, linux-rdma
  Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Trivial fix to spelling mistake in mlx4_dbg debug message and also
change the phrasing of the message so that is is more readable

Signed-off-by: Colin Ian King <colin.king@canonical.com>

---
V2: rephrase message, as helpfully suggested by Tariq Toukan
---
 drivers/net/ethernet/mellanox/mlx4/intf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
index 2edcce98ab2d..65482f004e50 100644
--- a/drivers/net/ethernet/mellanox/mlx4/intf.c
+++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
@@ -172,7 +172,7 @@ int mlx4_do_bond(struct mlx4_dev *dev, bool enable)
 		list_add_tail(&dev_ctx->list, &priv->ctx_list);
 		spin_unlock_irqrestore(&priv->ctx_lock, flags);
 
-		mlx4_dbg(dev, "Inrerface for protocol %d restarted with when bonded mode is %s\n",
+		mlx4_dbg(dev, "Interface for protocol %d restarted with bonded mode %s\n",
 			 dev_ctx->intf->protocol, enable ?
 			 "enabled" : "disabled");
 	}
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH] net: vxge: fix spelling mistake in macro VXGE_HW_ERR_PRIVILAGED_OPEARATION
From: Edward Cree @ 2018-05-22 15:44 UTC (permalink / raw)
  To: Colin King, Jon Mason, David S . Miller, netdev
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20180522153616.13980-1-colin.king@canonical.com>

On 22/05/18 16:36, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
>
> Rename VXGE_HW_ERR_PRIVILAGED_OPEARATION to VXGE_HW_ERR_PRIVILAGED_OPERATION
> to fix spelling mistake.
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
"Privilaged" doesn't look right either, maybe fix both at once?
 -> VXGE_HW_PRIVILEGED_OPERATION

-Ed

^ permalink raw reply

* Re: [PATCH net-next v11 2/5] netvsc: refactor notifier/event handling code to use the failover framework
From: Jiri Pirko @ 2018-05-22 15:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: alexander.h.duyck, virtio-dev, kubakici, Sridhar Samudrala,
	virtualization, loseweigh, netdev, anjali.singhai, aaron.f.brown,
	davem
In-Reply-To: <20180522181804-mutt-send-email-mst@kernel.org>

Tue, May 22, 2018 at 05:32:30PM CEST, mst@redhat.com wrote:
>On Tue, May 22, 2018 at 05:13:43PM +0200, Jiri Pirko wrote:
>> Tue, May 22, 2018 at 03:39:33PM CEST, mst@redhat.com wrote:
>> >On Tue, May 22, 2018 at 03:26:26PM +0200, Jiri Pirko wrote:
>> >> Tue, May 22, 2018 at 03:17:37PM CEST, mst@redhat.com wrote:
>> >> >On Tue, May 22, 2018 at 03:14:22PM +0200, Jiri Pirko wrote:
>> >> >> Tue, May 22, 2018 at 03:12:40PM CEST, mst@redhat.com wrote:
>> >> >> >On Tue, May 22, 2018 at 11:08:53AM +0200, Jiri Pirko wrote:
>> >> >> >> Tue, May 22, 2018 at 11:06:37AM CEST, jiri@resnulli.us wrote:
>> >> >> >> >Tue, May 22, 2018 at 04:06:18AM CEST, sridhar.samudrala@intel.com wrote:
>> >> >> >> >>Use the registration/notification framework supported by the generic
>> >> >> >> >>failover infrastructure.
>> >> >> >> >>
>> >> >> >> >>Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> >> >> >> >
>> >> >> >> >In previous patchset versions, the common code did
>> >> >> >> >netdev_rx_handler_register() and netdev_upper_dev_link() etc
>> >> >> >> >(netvsc_vf_join()). Now, this is still done in netvsc. Why?
>> >> >> >> >
>> >> >> >> >This should be part of the common "failover" code.
>> >> >> >> >
>> >> >> >> 
>> >> >> >> Also note that in the current patchset you use IFF_FAILOVER flag for
>> >> >> >> master, yet for the slave you use IFF_SLAVE. That is wrong.
>> >> >> >> IFF_FAILOVER_SLAVE should be used.
>> >> >> >
>> >> >> >Or drop IFF_FAILOVER_SLAVE and set both IFF_FAILOVER and IFF_SLAVE?
>> >> >> 
>> >> >> No. IFF_SLAVE is for bonding.
>> >> >
>> >> >What breaks if we reuse it for failover?
>> >> 
>> >> This is exposed to userspace. IFF_SLAVE is expected for bonding slaves.
>> >> And failover slave is not a bonding slave.
>> >
>> >That does not really answer the question.  I'd claim it's sufficiently
>> >like a bond slave for IFF_SLAVE to make sense.
>> >
>> >In fact you will find that netvsc already sets IFF_SLAVE, and so
>> 
>> netvsc does the whole failover thing in a wrong way. This patchset is
>> trying to fix it.
>
>Maybe, but we don't need gratuitous changes either, especially if they
>break userspace.

What do you mean by the "break"? It was a mistake to reuse IFF_SLAVE at
the first place, lets fix it. If some userspace depends on that flag, it
is broken anyway.


>
>> >does e.g. the eql driver.
>> >
>> >The advantage of using IFF_SLAVE is that userspace knows to skip it.  If
>> 
>> The userspace should know how to skip other types of slaves - team,
>> bridge, ovs, etc.
>> The "master link" should be the one to look at.
>> 
>
>How should existing userspace know which ones to skip and which one is
>the master?  Right now userspace seems to assume whatever does not have
>IFF_SLAVE should be looked at. Are you saying that's not the right thing

Why do you say so? What do you mean by "looked at"? Certainly not.
IFLA_MASTER is the attribute that should be looked at, nothing else.


>to do and userspace should be fixed? What should userspace do in
>your opinion that will be forward compatible with future kernels?
>
>> 
>> >we don't set IFF_SLAVE existing userspace tries to use the lowerdev.
>> 
>> Each master type has a IFF_ master flag and IFF_ slave flag.
>
>Could you give some examples please?

enum netdev_priv_flags {
        IFF_EBRIDGE                     = 1<<1,
        IFF_BRIDGE_PORT                 = 1<<9,
        IFF_OPENVSWITCH                 = 1<<20,
        IFF_OVS_DATAPATH                = 1<<10,
	IFF_L3MDEV_MASTER               = 1<<18,
        IFF_L3MDEV_SLAVE                = 1<<21,
        IFF_TEAM                        = 1<<22,
        IFF_TEAM_PORT                   = 1<<13,
};


>
>> In private
>> flag. I don't see no reason to break this pattern here.
>
>Other masters are setup from userspace, this one is set up automatically
>by kernel. So the bar is higher, we need an interface that existing
>userspace knows about.  We can't just say "oh if userspace set this up
>it should know to skip lowerdevs".
>
>Otherwise multiple interfaces with same mac tend to confuse userspace.

No difference, really.
Regardless who does the setup, and independent userspace deamon should
react accordingly.

^ permalink raw reply

* Re: [PATCH net-next v4 1/3] ipv4: support sport, dport and ip_proto in RTM_GETROUTE
From: Roopa Prabhu @ 2018-05-22 15:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, netdev, Nikolay Aleksandrov, David Ahern,
	Ido Schimmel
In-Reply-To: <9e2a1bc7-e80a-1642-3b83-2341fddc6097@gmail.com>

On Tue, May 22, 2018 at 8:04 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
> On 05/22/2018 07:51 AM, Roopa Prabhu wrote:
>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>
>>
>
> ...
>
>> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
>> index 4d622112..cf5cfc5 100644
>> --- a/net/ipv4/fib_frontend.c
>> +++ b/net/ipv4/fib_frontend.c
>> @@ -649,6 +649,10 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
>>       [RTA_ENCAP]             = { .type = NLA_NESTED },
>>       [RTA_UID]               = { .type = NLA_U32 },
>>       [RTA_MARK]              = { .type = NLA_U32 },
>> +     [RTA_TABLE]             = { .type = NLA_U32 },
>> +     [RTA_IP_PROTO]          = { .type = NLA_U8 },
>> +     [RTA_SPORT]             = { .type = NLA_U16 },
>> +     [RTA_DPORT]             = { .type = NLA_U16 },
>>  };
>
> Hi Roopa
>
> RTA_TABLE addition looks like a bug fix for net tree ?
>
> This should be sent as an independent patch IMO.
>
> Thanks.

sure, I can do that. thanks.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox