Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next v4 2/5] bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
From: Peter Oskolkov @ 2019-01-30 19:48 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Peter Oskolkov
In-Reply-To: <20190130194811.239760-1-posk@google.com>

This patch implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
to packets (e.g. IP/GRE, GUE, IPIP).

This is useful when thousands of different short-lived flows should be
encapped, each with a different and dynamically determined destination.
Although lwtunnels can be used in some of these scenarios, the ability
to dynamically generate encap headers adds more flexibility, e.g.
when routing depends on the state of the host (reflected in global bpf
maps).

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/net/lwtunnel.h |  3 +++
 net/core/filter.c      |  3 ++-
 net/core/lwt_bpf.c     | 59 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 33fd9ba7e0e5..f0973eca8036 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
 int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int lwtunnel_input(struct sk_buff *skb);
 int lwtunnel_xmit(struct sk_buff *skb);
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+			  bool ingress);
 
 static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 {
@@ -138,6 +140,7 @@ static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 		dst->input = lwtunnel_input;
 	}
 }
+
 #else
 
 static inline void lwtstate_free(struct lwtunnel_state *lws)
diff --git a/net/core/filter.c b/net/core/filter.c
index 27d3fbe4b77b..de6bd4b4e0a3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -73,6 +73,7 @@
 #include <linux/seg6_local.h>
 #include <net/seg6.h>
 #include <net/seg6_local.h>
+#include <net/lwtunnel.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4804,7 +4805,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
 static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
 			     bool ingress)
 {
-	return -EINVAL;  /* Implemented in the next patch. */
+	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
 }
 
 BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index a648568c5e8f..6a6e9acab73d 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -390,6 +390,65 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
 	.owner		= THIS_MODULE,
 };
 
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+	struct iphdr *iph;
+	bool ipv4;
+	int err;
+
+	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+		return -EINVAL;
+
+	/* validate protocol and length */
+	iph = (struct iphdr *)hdr;
+	if (iph->version == 4) {
+		ipv4 = true;
+		if (unlikely(len < iph->ihl * 4))
+			return -EINVAL;
+	} else if (iph->version == 6) {
+		ipv4 = false;
+		if (unlikely(len < sizeof(struct ipv6hdr)))
+			return -EINVAL;
+	} else {
+		return -EINVAL;
+	}
+
+	if (ingress)
+		err = skb_cow_head(skb, len + skb->mac_len);
+	else
+		err = skb_cow_head(skb,
+				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
+	if (unlikely(err))
+		return err;
+
+	/* push the encap headers and fix pointers */
+	skb_reset_inner_headers(skb);
+	skb->encapsulation = 1;
+	skb_push(skb, len);
+	if (ingress)
+		skb_postpush_rcsum(skb, iph, len);
+	skb_reset_network_header(skb);
+	memcpy(skb_network_header(skb), hdr, len);
+	bpf_compute_data_pointers(skb);
+
+	if (ipv4) {
+		skb->protocol = htons(ETH_P_IP);
+		iph = ip_hdr(skb);
+		if (iph->ihl * 4 < len)
+			skb_set_transport_header(skb, iph->ihl * 4);
+
+		if (!iph->check)
+			iph->check = ip_fast_csum((unsigned char *)iph,
+						  iph->ihl);
+	} else {
+		skb->protocol = htons(ETH_P_IPV6);
+		if (sizeof(struct ipv6hdr) < len)
+			skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+	}
+
+	return 0;
+}
+
 static int __init bpf_lwt_init(void)
 {
 	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
-- 
2.20.1.495.gaa96b0ce6b-goog


^ permalink raw reply related

* [PATCH bpf-next v4 3/5] bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c
From: Peter Oskolkov @ 2019-01-30 19:48 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Peter Oskolkov
In-Reply-To: <20190130194811.239760-1-posk@google.com>

This patch builds on top of the previous patch in the patchset,
which added BPF_LWT_ENCAP_IP mode to bpf_lwt_push_encap. As the
encapping can result in the skb needing to go via a different
interface/route/dst, bpf programs can indicate this by returning
BPF_LWT_REROUTE, which triggers a new route lookup for the skb.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 net/core/lwt_bpf.c | 116 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 6a6e9acab73d..21ffafd4eabb 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <net/lwtunnel.h>
+#include <net/ip6_route.h>
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
@@ -55,6 +56,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 
 	switch (ret) {
 	case BPF_OK:
+	case BPF_LWT_REROUTE:
 		break;
 
 	case BPF_REDIRECT:
@@ -87,6 +89,32 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 	return ret;
 }
 
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+	int err = -EINVAL;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *iph = ip_hdr(skb);
+
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, skb_dst(skb)->dev);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ip6_route_input(skb);
+		err = skb_dst(skb)->error;
+	} else {
+		pr_warn_once("BPF_LWT_REROUTE input: unsupported proto %d\n",
+			     skb->protocol);
+	}
+
+	if (err)
+		goto err;
+	return dst_input(skb);
+
+err:
+	kfree_skb(skb);
+	return err;
+}
+
 static int bpf_input(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -98,6 +126,8 @@ static int bpf_input(struct sk_buff *skb)
 		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
 		if (ret < 0)
 			return ret;
+		if (ret == BPF_LWT_REROUTE)
+			return bpf_lwt_input_reroute(skb);
 	}
 
 	if (unlikely(!dst->lwtstate->orig_input)) {
@@ -147,6 +177,90 @@ static int xmit_check_hhlen(struct sk_buff *skb)
 	return 0;
 }
 
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+	int oif = l3mdev ? l3mdev->ifindex : 0;
+	struct dst_entry *dst = NULL;
+	struct sock *sk;
+	struct net *net;
+	bool ipv4;
+	int err;
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		ipv4 = true;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ipv4 = false;
+	} else {
+		pr_warn_once("BPF_LWT_REROUTE xmit: unsupported proto %d\n",
+			     skb->protocol);
+		return -EINVAL;
+	}
+
+	sk = sk_to_full_sk(skb->sk);
+	if (sk) {
+		if (sk->sk_bound_dev_if)
+			oif = sk->sk_bound_dev_if;
+		net = sock_net(sk);
+	} else {
+		net = dev_net(skb_dst(skb)->dev);
+	}
+
+	if (ipv4) {
+		struct iphdr *iph = ip_hdr(skb);
+		struct flowi4 fl4 = {0};
+		struct rtable *rt;
+
+		fl4.flowi4_oif = oif;
+		fl4.flowi4_mark = skb->mark;
+		fl4.flowi4_uid = sock_net_uid(net, sk);
+		fl4.flowi4_tos = RT_TOS(iph->tos);
+		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+		fl4.flowi4_proto = iph->protocol;
+		fl4.daddr = iph->daddr;
+		fl4.saddr = iph->saddr;
+
+		rt = ip_route_output_key(net, &fl4);
+		if (IS_ERR(rt) || rt->dst.error)
+			return -EINVAL;
+		dst = &rt->dst;
+	} else {
+		struct ipv6hdr *iph6 = ipv6_hdr(skb);
+		struct flowi6 fl6 = {0};
+
+		fl6.flowi6_oif = oif;
+		fl6.flowi6_mark = skb->mark;
+		fl6.flowi6_uid = sock_net_uid(net, sk);
+		fl6.flowlabel = ip6_flowinfo(iph6);
+		fl6.flowi6_proto = iph6->nexthdr;
+		fl6.daddr = iph6->daddr;
+		fl6.saddr = iph6->saddr;
+
+		dst = ip6_route_output(net, skb->sk, &fl6);
+		if (IS_ERR(dst) || dst->error)
+			return -EINVAL;
+	}
+
+	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+	 * was done for the previous dst, so we are doing it here again, in
+	 * case the new dst needs much more space. The call below is a noop
+	 * if there is enough header space in skb.
+	 */
+	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+	if (unlikely(err))
+		return err;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+	if (unlikely(err))
+		return err;
+
+	/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+	return LWTUNNEL_XMIT_DONE;
+}
+
 static int bpf_xmit(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -169,6 +283,8 @@ static int bpf_xmit(struct sk_buff *skb)
 			return LWTUNNEL_XMIT_CONTINUE;
 		case BPF_REDIRECT:
 			return LWTUNNEL_XMIT_DONE;
+		case BPF_LWT_REROUTE:
+			return bpf_lwt_xmit_reroute(skb);
 		default:
 			return ret;
 		}
-- 
2.20.1.495.gaa96b0ce6b-goog


^ permalink raw reply related

* [PATCH bpf-next v4 4/5] bpf: sync <kdir>/<uapi>/bpf.h with tools/<uapi>/bpf.h
From: Peter Oskolkov @ 2019-01-30 19:48 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Peter Oskolkov
In-Reply-To: <20190130194811.239760-1-posk@google.com>

This patch copies changes in bpf.h done by a previous patch
in this patchset from the kernel uapi include dir into tools
uapi include dir.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 tools/include/uapi/linux/bpf.h | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 60b99b730a41..911c15585fab 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2015,6 +2015,16 @@ union bpf_attr {
  *			Only works if *skb* contains an IPv6 packet. Insert a
  *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
  *			the IPv6 header.
+ *		**BPF_LWT_ENCAP_IP**
+ *			IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ *			must be IPv4 or IPv6, followed by zero or more
+ *			additional headers, up to LWT_BPF_MAX_HEADROOM total
+ *			bytes in all prepended headers.
+ *
+ *		BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of
+ *		type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called
+ *		by bpf programs of types BPF_PROG_TYPE_LWT_IN and
+ *		BPF_PROG_TYPE_LWT_XMIT.
  *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
@@ -2495,7 +2505,8 @@ enum bpf_hdr_start_off {
 /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
 enum bpf_lwt_encap_mode {
 	BPF_LWT_ENCAP_SEG6,
-	BPF_LWT_ENCAP_SEG6_INLINE
+	BPF_LWT_ENCAP_SEG6_INLINE,
+	BPF_LWT_ENCAP_IP,
 };
 
 #define __bpf_md_ptr(type, name)	\
@@ -2583,7 +2594,15 @@ enum bpf_ret_code {
 	BPF_DROP = 2,
 	/* 3-6 reserved */
 	BPF_REDIRECT = 7,
-	/* >127 are reserved for prog type specific return codes */
+	/* >127 are reserved for prog type specific return codes.
+	 *
+	 * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
+	 *    BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
+	 *    changed and should be routed based on its new L3 header.
+	 *    (This is an L3 redirect, as opposed to L2 redirect
+	 *    represented by BPF_REDIRECT above).
+	 */
+	BPF_LWT_REROUTE = 128,
 };
 
 struct bpf_sock {
-- 
2.20.1.495.gaa96b0ce6b-goog


^ permalink raw reply related

* [PATCH bpf-next v4 5/5] selftests: bpf: add test_lwt_ip_encap selftest
From: Peter Oskolkov @ 2019-01-30 19:48 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Peter Oskolkov
In-Reply-To: <20190130194811.239760-1-posk@google.com>

This patch adds a bpf self-test to cover BPF_LWT_ENCAP_IP mode
in bpf_lwt_push_encap.

Covered:
- encapping in LWT_IN and LWT_XMIT
- IPv4 and IPv6

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 tools/testing/selftests/bpf/Makefile          |   5 +-
 .../testing/selftests/bpf/test_lwt_ip_encap.c |  84 +++++
 .../selftests/bpf/test_lwt_ip_encap.sh        | 311 ++++++++++++++++++
 3 files changed, 398 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_lwt_ip_encap.c
 create mode 100755 tools/testing/selftests/bpf/test_lwt_ip_encap.sh

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 8993e9c8f410..28aa3b3e297e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,7 +35,7 @@ BPF_OBJ_FILES = \
 	sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
 	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_xdp_vlan.o \
-	xdp_dummy.o test_map_in_map.o
+	xdp_dummy.o test_map_in_map.o test_lwt_ip_encap.o
 
 # Objects are built with default compilation flags and with sub-register
 # code-gen enabled.
@@ -73,7 +73,8 @@ TEST_PROGS := test_kmod.sh \
 	test_lirc_mode2.sh \
 	test_skb_cgroup_id.sh \
 	test_flow_dissector.sh \
-	test_xdp_vlan.sh
+	test_xdp_vlan.sh \
+	test_lwt_ip_encap.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh \
 	with_tunnels.sh \
diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/test_lwt_ip_encap.c
new file mode 100644
index 000000000000..2cd6bf9dd7e8
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+struct grehdr {
+	__be16 flags;
+	__be16 protocol;
+};
+
+SEC("encap_gre")
+int bpf_lwt_encap_gre(struct __sk_buff *skb)
+{
+	struct encap_hdr {
+		struct iphdr iph;
+		struct grehdr greh;
+	} hdr;
+	int err;
+
+	memset(&hdr, 0, sizeof(struct encap_hdr));
+
+	hdr.iph.ihl = 5;
+	hdr.iph.version = 4;
+	hdr.iph.ttl = 0x40;
+	hdr.iph.protocol = 47;  /* IPPROTO_GRE */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	hdr.iph.saddr = 0x640110ac;  /* 172.16.1.100 */
+	hdr.iph.daddr = 0x641010ac;  /* 172.16.16.100 */
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	hdr.iph.saddr = 0xac100164;  /* 172.16.1.100 */
+	hdr.iph.daddr = 0xac101064;  /* 172.16.16.100 */
+#else
+#error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+	hdr.iph.tot_len = bpf_htons(skb->len + sizeof(struct encap_hdr));
+
+	hdr.greh.protocol = skb->protocol;
+
+	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
+				 sizeof(struct encap_hdr));
+	if (err)
+		return BPF_DROP;
+	return BPF_LWT_REROUTE;
+}
+
+SEC("encap_gre6")
+int bpf_lwt_encap_gre6(struct __sk_buff *skb)
+{
+	struct encap_hdr {
+		struct ipv6hdr ip6hdr;
+		struct grehdr greh;
+	} hdr;
+	int err;
+
+	memset(&hdr, 0, sizeof(struct encap_hdr));
+
+	hdr.ip6hdr.version = 6;
+	hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(struct grehdr));
+	hdr.ip6hdr.nexthdr = 47;  /* IPPROTO_GRE */
+	hdr.ip6hdr.hop_limit = 0x40;
+	/* fb01::1 */
+	hdr.ip6hdr.saddr.s6_addr[0] = 0xfb;
+	hdr.ip6hdr.saddr.s6_addr[1] = 1;
+	hdr.ip6hdr.saddr.s6_addr[15] = 1;
+	/* fb10::1 */
+	hdr.ip6hdr.daddr.s6_addr[0] = 0xfb;
+	hdr.ip6hdr.daddr.s6_addr[1] = 0x10;
+	hdr.ip6hdr.daddr.s6_addr[15] = 1;
+
+	hdr.greh.protocol = skb->protocol;
+
+	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
+				 sizeof(struct encap_hdr));
+	if (err)
+		return BPF_DROP;
+
+	return BPF_LWT_REROUTE;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
new file mode 100755
index 000000000000..4ca714e23ab0
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
@@ -0,0 +1,311 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Setup/topology:
+#
+#    NS1             NS2             NS3
+#   veth1 <---> veth2   veth3 <---> veth4 (the top route)
+#   veth5 <---> veth6   veth7 <---> veth8 (the bottom route)
+#
+#   each vethN gets IPv[4|6]_N address
+#
+#   IPv*_SRC = IPv*_1
+#   IPv*_DST = IPv*_4
+#
+#   all tests test pings from IPv*_SRC to IPv*_DST
+#
+#   by default, routes are configured to allow packets to go
+#   IP*_1 <=> IP*_2 <=> IP*_3 <=> IP*_4 (the top route)
+#
+#   a GRE device is installed in NS3 with IPv*_GRE, and
+#   NS1/NS2 are configured to route packets to IPv*_GRE via IP*_8
+#   (the bottom route)
+#
+# Tests:
+#
+#   1. routes NS2->IPv*_DST are brought down, so the only way a ping
+#      from IP*_SRC to IP*_DST can work is via IPv*_GRE
+#
+#   2a. in an egress test, a bpf LWT_XMIT program is installed on veth1
+#       that encaps the packets with an IP/GRE header to route to IPv*_GRE
+#
+#       ping: SRC->[encap at veth1:egress]->GRE:decap->DST
+#       ping replies go DST->SRC directly
+#
+#   2b. in an ingress test, a bpf LWT_IN program is installed on veth2
+#       that encaps the packets with an IP/GRE header to route to IPv*_GRE
+#
+#       ping: SRC->[encap at veth2:ingress]->GRE:decap->DST
+#       ping replies go DST->SRC directly
+
+set -e  # exit on error
+
+if [[ $EUID -ne 0 ]]; then
+	echo "This script must be run as root"
+	echo "FAIL"
+	exit 1
+fi
+
+readonly NS1="ns1-$(mktemp -u XXXXXX)"
+readonly NS2="ns2-$(mktemp -u XXXXXX)"
+readonly NS3="ns3-$(mktemp -u XXXXXX)"
+
+readonly IPv4_1="172.16.1.100"
+readonly IPv4_2="172.16.2.100"
+readonly IPv4_3="172.16.3.100"
+readonly IPv4_4="172.16.4.100"
+readonly IPv4_5="172.16.5.100"
+readonly IPv4_6="172.16.6.100"
+readonly IPv4_7="172.16.7.100"
+readonly IPv4_8="172.16.8.100"
+readonly IPv4_GRE="172.16.16.100"
+
+readonly IPv4_SRC=$IPv4_1
+readonly IPv4_DST=$IPv4_4
+
+readonly IPv6_1="fb01::1"
+readonly IPv6_2="fb02::1"
+readonly IPv6_3="fb03::1"
+readonly IPv6_4="fb04::1"
+readonly IPv6_5="fb05::1"
+readonly IPv6_6="fb06::1"
+readonly IPv6_7="fb07::1"
+readonly IPv6_8="fb08::1"
+readonly IPv6_GRE="fb10::1"
+
+readonly IPv6_SRC=$IPv6_1
+readonly IPv6_DST=$IPv6_4
+
+setup() {
+set -e  # exit on error
+	# create devices and namespaces
+	ip netns add "${NS1}"
+	ip netns add "${NS2}"
+	ip netns add "${NS3}"
+
+	ip link add veth1 type veth peer name veth2
+	ip link add veth3 type veth peer name veth4
+	ip link add veth5 type veth peer name veth6
+	ip link add veth7 type veth peer name veth8
+
+	ip netns exec ${NS2} sysctl -wq net.ipv4.ip_forward=1
+	ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.forwarding=1
+
+	ip link set veth1 netns ${NS1}
+	ip link set veth2 netns ${NS2}
+	ip link set veth3 netns ${NS2}
+	ip link set veth4 netns ${NS3}
+	ip link set veth5 netns ${NS1}
+	ip link set veth6 netns ${NS2}
+	ip link set veth7 netns ${NS2}
+	ip link set veth8 netns ${NS3}
+
+	# configure addesses: the top route (1-2-3-4)
+	ip -netns ${NS1}    addr add ${IPv4_1}/24  dev veth1
+	ip -netns ${NS2}    addr add ${IPv4_2}/24  dev veth2
+	ip -netns ${NS2}    addr add ${IPv4_3}/24  dev veth3
+	ip -netns ${NS3}    addr add ${IPv4_4}/24  dev veth4
+	ip -netns ${NS1} -6 addr add ${IPv6_1}/128 nodad dev veth1
+	ip -netns ${NS2} -6 addr add ${IPv6_2}/128 nodad dev veth2
+	ip -netns ${NS2} -6 addr add ${IPv6_3}/128 nodad dev veth3
+	ip -netns ${NS3} -6 addr add ${IPv6_4}/128 nodad dev veth4
+
+	# configure addresses: the bottom route (5-6-7-8)
+	ip -netns ${NS1}    addr add ${IPv4_5}/24  dev veth5
+	ip -netns ${NS2}    addr add ${IPv4_6}/24  dev veth6
+	ip -netns ${NS2}    addr add ${IPv4_7}/24  dev veth7
+	ip -netns ${NS3}    addr add ${IPv4_8}/24  dev veth8
+	ip -netns ${NS1} -6 addr add ${IPv6_5}/128 nodad dev veth5
+	ip -netns ${NS2} -6 addr add ${IPv6_6}/128 nodad dev veth6
+	ip -netns ${NS2} -6 addr add ${IPv6_7}/128 nodad dev veth7
+	ip -netns ${NS3} -6 addr add ${IPv6_8}/128 nodad dev veth8
+
+
+	ip -netns ${NS1} link set dev veth1 up
+	ip -netns ${NS2} link set dev veth2 up
+	ip -netns ${NS2} link set dev veth3 up
+	ip -netns ${NS3} link set dev veth4 up
+	ip -netns ${NS1} link set dev veth5 up
+	ip -netns ${NS2} link set dev veth6 up
+	ip -netns ${NS2} link set dev veth7 up
+	ip -netns ${NS3} link set dev veth8 up
+
+	# configure routes: IP*_SRC -> veth1/IP*_2 (= top route) default;
+	# the bottom route to specific bottom addresses
+
+	# NS1
+	# top route
+	ip -netns ${NS1}    route add ${IPv4_2}/32  dev veth1
+	ip -netns ${NS1}    route add default dev veth1 via ${IPv4_2}  # go top by default
+	ip -netns ${NS1} -6 route add ${IPv6_2}/128 dev veth1
+	ip -netns ${NS1} -6 route add default dev veth1 via ${IPv6_2}  # go top by default
+	# bottom route
+	ip -netns ${NS1}    route add ${IPv4_6}/32  dev veth5
+	ip -netns ${NS1}    route add ${IPv4_7}/32  dev veth5 via ${IPv4_6}
+	ip -netns ${NS1}    route add ${IPv4_8}/32  dev veth5 via ${IPv4_6}
+	ip -netns ${NS1} -6 route add ${IPv6_6}/128 dev veth5
+	ip -netns ${NS1} -6 route add ${IPv6_7}/128 dev veth5 via ${IPv6_6}
+	ip -netns ${NS1} -6 route add ${IPv6_8}/128 dev veth5 via ${IPv6_6}
+
+	# NS2
+	# top route
+	ip -netns ${NS2}    route add ${IPv4_1}/32  dev veth2
+	ip -netns ${NS2}    route add ${IPv4_4}/32  dev veth3
+	ip -netns ${NS2} -6 route add ${IPv6_1}/128 dev veth2
+	ip -netns ${NS2} -6 route add ${IPv6_4}/128 dev veth3
+	# bottom route
+	ip -netns ${NS2}    route add ${IPv4_5}/32  dev veth6
+	ip -netns ${NS2}    route add ${IPv4_8}/32  dev veth7
+	ip -netns ${NS2} -6 route add ${IPv6_5}/128 dev veth6
+	ip -netns ${NS2} -6 route add ${IPv6_8}/128 dev veth7
+
+	# NS3
+	# top route
+	ip -netns ${NS3}    route add ${IPv4_3}/32  dev veth4
+	ip -netns ${NS3}    route add ${IPv4_1}/32  dev veth4 via ${IPv4_3}
+	ip -netns ${NS3}    route add ${IPv4_2}/32  dev veth4 via ${IPv4_3}
+	ip -netns ${NS3} -6 route add ${IPv6_3}/128 dev veth4
+	ip -netns ${NS3} -6 route add ${IPv6_1}/128 dev veth4 via ${IPv6_3}
+	ip -netns ${NS3} -6 route add ${IPv6_2}/128 dev veth4 via ${IPv6_3}
+	# bottom route
+	ip -netns ${NS3}    route add ${IPv4_7}/32  dev veth8
+	ip -netns ${NS3}    route add ${IPv4_5}/32  dev veth8 via ${IPv4_7}
+	ip -netns ${NS3}    route add ${IPv4_6}/32  dev veth8 via ${IPv4_7}
+	ip -netns ${NS3} -6 route add ${IPv6_7}/128 dev veth8
+	ip -netns ${NS3} -6 route add ${IPv6_5}/128 dev veth8 via ${IPv6_7}
+	ip -netns ${NS3} -6 route add ${IPv6_6}/128 dev veth8 via ${IPv6_7}
+
+	# configure IPv4 GRE device in NS3, and a route to it via the "bottom" route
+	ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255
+	ip -netns ${NS3} link set gre_dev up
+	ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev
+	ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6}
+	ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8}
+
+
+	# configure IPv6 GRE device in NS3, and a route to it via the "bottom" route
+	ip -netns ${NS3} -6 tunnel add name gre6_dev mode ip6gre remote ${IPv6_1} local ${IPv6_GRE} ttl 255
+	ip -netns ${NS3} link set gre6_dev up
+	ip -netns ${NS3} -6 addr add ${IPv6_GRE} nodad dev gre6_dev
+	ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6}
+	ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8}
+
+	# rp_filter gets confused by what these tests are doing, so disable it
+	ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0
+	ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0
+	ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0
+}
+
+cleanup() {
+	ip netns del ${NS1} 2> /dev/null
+	ip netns del ${NS2} 2> /dev/null
+	ip netns del ${NS3} 2> /dev/null
+}
+
+trap cleanup EXIT
+
+test_ping() {
+	local readonly PROTO=$1
+	local readonly EXPECTED=$2
+	local RET=0
+
+	set +e
+	if [ "${PROTO}" == "IPv4" ] ; then
+		ip netns exec ${NS1} ping  -c 1 -W 1 -I ${IPv4_SRC} ${IPv4_DST} 2>&1 > /dev/null
+		RET=$?
+	elif [ "${PROTO}" == "IPv6" ] ; then
+		ip netns exec ${NS1} ping6 -c 1 -W 6 -I ${IPv6_SRC} ${IPv6_DST} 2>&1 > /dev/null
+		RET=$?
+	else
+		echo "test_ping: unknown PROTO: ${PROTO}"
+		exit 1
+	fi
+	set -e
+
+	if [ "0" != "${RET}" ]; then
+		RET=1
+	fi
+
+	if [ "${EXPECTED}" != "${RET}" ] ; then
+		echo "FAIL: test_ping: ${RET}"
+		exit 1
+	fi
+}
+
+test_egress() {
+	local readonly ENCAP=$1
+	echo "starting egress ${ENCAP} encap test"
+	setup
+
+	# need to wait a bit for IPv6 to autoconf, otherwise
+	# ping6 sometimes fails with "unable to bind to address"
+
+	# by default, pings work
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	# remove NS2->DST routes, ping fails
+	ip -netns ${NS2}    route del ${IPv4_DST}/32  dev veth3
+	ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
+	test_ping IPv4 1
+	test_ping IPv6 1
+
+	# install replacement routes (LWT/eBPF), pings succeed
+	if [ "${ENCAP}" == "IPv4" ] ; then
+		ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
+		ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1
+	elif [ "${ENCAP}" == "IPv6" ] ; then
+		ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
+		ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1
+	else
+		echo "FAIL: unknown encap ${ENCAP}"
+	fi
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	cleanup
+	echo "PASS"
+}
+
+test_ingress() {
+	local readonly ENCAP=$1
+	echo "starting ingress ${ENCAP} encap test"
+	setup
+
+	# need to wait a bit for IPv6 to autoconf, otherwise
+	# ping6 sometimes fails with "unable to bind to address"
+
+	# by default, pings work
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	# remove NS2->DST routes, pings fail
+	ip -netns ${NS2}    route del ${IPv4_DST}/32  dev veth3
+	ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3
+	test_ping IPv4 1
+	test_ping IPv6 1
+
+	# install replacement routes (LWT/eBPF), pings succeed
+	if [ "${ENCAP}" == "IPv4" ] ; then
+		ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
+		ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2
+	elif [ "${ENCAP}" == "IPv6" ] ; then
+		ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
+		ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2
+	else
+		echo "FAIL: unknown encap ${ENCAP}"
+	fi
+	test_ping IPv4 0
+	test_ping IPv6 0
+
+	cleanup
+	echo "PASS"
+}
+
+test_egress IPv4
+test_egress IPv6
+
+test_ingress IPv4
+test_ingress IPv6
+
+echo "all tests passed"
-- 
2.20.1.495.gaa96b0ce6b-goog


^ permalink raw reply related

* Re: bpf memory model. Was: [PATCH v4 bpf-next 1/9] bpf: introduce bpf_spin_lock
From: Alexei Starovoitov @ 2019-01-30 19:50 UTC (permalink / raw)
  To: Will Deacon
  Cc: Peter Zijlstra, Alexei Starovoitov, davem, daniel, jakub.kicinski,
	netdev, kernel-team, mingo, Paul McKenney, jannh
In-Reply-To: <20190130181100.GA18558@fuggles.cambridge.arm.com>

On Wed, Jan 30, 2019 at 06:11:00PM +0000, Will Deacon wrote:
> Assuming that a desirable property of an eBPF program is portability between
> CPU architectures, then you're effectively forcing the programmer to "assume

that is a fundamental misunderstanding that is being thrown around in this thread.
bpf is not fixated on portability.
All projects that tried to come up with universal byte code miserably failed.
bpf program compiled for big endian won't load on little.
bpf program designed to be used on x86 will work horribly slow on nfp.
It will work, but will be inefficient. Hence we have alu32 mode in llvm.
More so maps don't map one to one to all archs either.
per-cpu map doesn't exist on nfp. we're still figuring out
an equivalent for it for nfp.
So, no, programs are not portable across architectures.
The programmer cannot assume that.
They could be portable in some cases and we're trying to keep portability
as much as possible, but it's not a "desirable property" that we're going
to sacrifice performance and usability for it.
If it helps, look at bpf as a safe kernel module.
Does a given kernel module work on all archs? No. Sometimes users only need
to recompile it and sometimes they have to make heavy changes. smp_mb and
load acquire are the last things to worry about when folks are trying to make
such a 'safe kernel module' work on different archs.


^ permalink raw reply

* Re: bpf memory model. Was: [PATCH v4 bpf-next 1/9] bpf: introduce bpf_spin_lock
From: Alexei Starovoitov @ 2019-01-30 19:51 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Will Deacon, Peter Zijlstra, Alexei Starovoitov, davem, daniel,
	jakub.kicinski, netdev, kernel-team, mingo, jannh
In-Reply-To: <20190130183618.GX4240@linux.ibm.com>

On Wed, Jan 30, 2019 at 10:36:18AM -0800, Paul E. McKenney wrote:
> On Wed, Jan 30, 2019 at 06:11:00PM +0000, Will Deacon wrote:
> > Hi Alexei,
> > 
> > On Mon, Jan 28, 2019 at 01:56:24PM -0800, Alexei Starovoitov wrote:
> > > On Mon, Jan 28, 2019 at 10:24:08AM +0100, Peter Zijlstra wrote:
> > > > On Fri, Jan 25, 2019 at 04:17:26PM -0800, Alexei Starovoitov wrote:
> > > > > What I want to avoid is to define the whole execution ordering model upfront.
> > > > > We cannot say that BPF ISA is weakly ordered like alpha.
> > > > > Most of the bpf progs are written and running on x86. We shouldn't
> > > > > twist bpf developer's arm by artificially relaxing memory model.
> > > > > BPF memory model is equal to memory model of underlying architecture.
> > > > > What we can do is to make it bpf progs a bit more portable with
> > > > > smp_rmb instructions, but we must not force weak execution on the developer.
> > > > 
> > > > Well, I agree with only introducing bits you actually need, and my
> > > > smp_rmb() example might have been poorly chosen, smp_load_acquire() /
> > > > smp_store_release() might have been a far more useful example.
> > > > 
> > > > But I disagree with the last part; we have to pick a model now;
> > > > otherwise you'll pain yourself into a corner.
> > > > 
> > > > Also; Alpha isn't very relevant these days; however ARM64 does seem to
> > > > be gaining a lot of attention and that is very much a weak architecture.
> > > > Adding strongly ordered assumptions to BPF now, will penalize them in
> > > > the long run.
> > > 
> > > arm64 is gaining attention just like riscV is gaining it too.
> > > BPF jit for arm64 is very solid, while BPF jit for riscV is being worked on.
> > > BPF is not picking sides in CPU HW and ISA battles.
> > 
> > It's not about picking a side, it's about providing an abstraction of the
> > various CPU architectures out there so that the programmer doesn't need to
> > worry about where their program may run. Hell, even if you just said "eBPF
> > follows x86 semantics" that would be better than saying nothing (and then we
> > could have a discussion about whether x86 semantics are really what you
> > want).
> 
> To reinforce this point, the Linux-kernel memory model (tools/memory-model)
> is that abstraction for the Linux kernel.  Why not just use that for BPF?

I already answered this earlier in the thread.
tldr: not going to sacrifice performance.


^ permalink raw reply

* Re: [PATCH bpf-next 1/4] bpf: fix lockdep false positive in percpu_freelist
From: Peter Zijlstra @ 2019-01-30 19:53 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, davem, daniel, edumazet, jannh, netdev,
	kernel-team
In-Reply-To: <20190130192752.rvpi2ul26543oax7@ast-mbp.dhcp.thefacebook.com>

On Wed, Jan 30, 2019 at 11:27:54AM -0800, Alexei Starovoitov wrote:
> On Wed, Jan 30, 2019 at 11:21:26AM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 29, 2019 at 08:04:55PM -0800, Alexei Starovoitov wrote:
> > > 
> > > It has been explained that is a false positive here:
> > > https://lkml.org/lkml/2018/7/25/756
> > 
> > Please, no external references like that. The best option is to fully
> 
> I strongly disagree.
> We allowed all kinds of external links in bpf tree in the past and
> going to continue doing so in the future.
> I'm perfectly aware that some of them will go stale in a day or
> in a year.

What's the point of adding URLs if you know they'll not be useful later?

Anyway, your tree, so you get to make the rules, but personally I've
cursed about this exact issue a fair few times.

See for example the x86 tree policy of creating BZ entries to store
Intel documents to refer to them from commits because the Intel website
is notoriously flaky wrt persistence.

There's nothing worse than references to a document you can no longer
find while trying to make sense of this 3 year old code that suddenly
comes apart.

^ permalink raw reply

* Re: [PATCH bpf-next 2/4] bpf: fix lockdep false positive in stackmap
From: Waiman Long @ 2019-01-30 20:05 UTC (permalink / raw)
  To: Peter Zijlstra, Alexei Starovoitov
  Cc: Alexei Starovoitov, davem, daniel, edumazet, jannh, netdev,
	kernel-team
In-Reply-To: <20190130194407.GA3085@hirez.programming.kicks-ass.net>

On 01/30/2019 02:44 PM, Peter Zijlstra wrote:
> On Wed, Jan 30, 2019 at 11:30:41AM -0800, Alexei Starovoitov wrote:
>> On Wed, Jan 30, 2019 at 11:15:30AM +0100, Peter Zijlstra wrote:
>>> On Tue, Jan 29, 2019 at 08:04:56PM -0800, Alexei Starovoitov wrote:
>>>> Lockdep warns about false positive:
>>> This is not a false positive, and you probably also need to use
>>> down_read_non_owner() to match this up_read_non_owner().
>>>
>>> {up,down}_read() and {up,down}_read_non_owner() are not only different
>>> in the lockdep annotation; there is also optimistic spin stuff that
>>> relies on 'owner' tracking.
>> Can you point out in the code the spin bit?
> Hurmph, looks like you're right. I got lost in that stuff again. I hate
> that rwsem code :/

It is actually related to how the lockdep code tracks whether a lock is
acquired or released. It is not actually related to how the rwsem code works.

>
> Rewriting that all is on the todo list somewhere, but it's far down :/
>

I am actually planning to rewrite it. Hopefully, I can send out the
patch soon.

Cheers,
Longman

^ permalink raw reply

* Re: [PATCH bpf-next 2/4] bpf: fix lockdep false positive in stackmap
From: Alexei Starovoitov @ 2019-01-30 20:10 UTC (permalink / raw)
  To: Waiman Long
  Cc: Peter Zijlstra, Alexei Starovoitov, davem, daniel, edumazet,
	jannh, netdev, kernel-team
In-Reply-To: <9246a76f-e8d4-66fc-d901-374169c3e709@redhat.com>

On Wed, Jan 30, 2019 at 02:42:23PM -0500, Waiman Long wrote:
> On 01/30/2019 02:30 PM, Alexei Starovoitov wrote:
> > On Wed, Jan 30, 2019 at 11:15:30AM +0100, Peter Zijlstra wrote:
> >> On Tue, Jan 29, 2019 at 08:04:56PM -0800, Alexei Starovoitov wrote:
> >>> Lockdep warns about false positive:
> >> This is not a false positive, and you probably also need to use
> >> down_read_non_owner() to match this up_read_non_owner().
> >>
> >> {up,down}_read() and {up,down}_read_non_owner() are not only different
> >> in the lockdep annotation; there is also optimistic spin stuff that
> >> relies on 'owner' tracking.
> > Can you point out in the code the spin bit?
> > As far as I can see sem->owner is debug only feature.
> > All owner checks are done under CONFIG_DEBUG_RWSEMS.
> 
> No, sem->owner is mainly for performing optimistic spinning which is a
> performance feature to make rwsem writer-lock performs similar to mutex.
> The debugging part is just an add-on. It is not the reason for the
> presence of sem->owner.

I see. Got it.

> > Also there is no down_read_trylock_non_owner() at the moment.
> > We can argue about it for -next, but I'd rather silence lockdep
> > with this patch today.
> >
> We can add down_read_trylock_non_owner() if there is a need for it. It
> should be easy to do.

Yes, but looking through the code it's not clear to me that it's safe
to mix non_owner() versions with regular.
bpf/stackmap.c does down_read_trylock + up_read.
If we add a new down_read_trylock_non_owner that sets the owner to
NULL | RWSEM_* bits, is this safe with concurrent readers/writers
that use the regular versions?
rwsem_can_spin_on_owner() does:
        if (owner) {
                ret = is_rwsem_owner_spinnable(owner) &&
                      owner_on_cpu(owner);
that looks correct.
For a second I thought there could be fault here due to non_owner.
But there could be other places where it's assumed that owner
is never null?

Maybe we should live with this lockdep warning in the bpf tree
and fix it only in bpf-next?


^ permalink raw reply

* Re: [PATCH bpf-next 1/4] bpf: fix lockdep false positive in percpu_freelist
From: Alexei Starovoitov @ 2019-01-30 20:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Alexei Starovoitov, davem, daniel, edumazet, jannh, netdev,
	kernel-team
In-Reply-To: <20190130195313.GC3085@hirez.programming.kicks-ass.net>

On Wed, Jan 30, 2019 at 08:53:13PM +0100, Peter Zijlstra wrote:
> On Wed, Jan 30, 2019 at 11:27:54AM -0800, Alexei Starovoitov wrote:
> > On Wed, Jan 30, 2019 at 11:21:26AM +0100, Peter Zijlstra wrote:
> > > On Tue, Jan 29, 2019 at 08:04:55PM -0800, Alexei Starovoitov wrote:
> > > > 
> > > > It has been explained that is a false positive here:
> > > > https://lkml.org/lkml/2018/7/25/756
> > > 
> > > Please, no external references like that. The best option is to fully
> > 
> > I strongly disagree.
> > We allowed all kinds of external links in bpf tree in the past and
> > going to continue doing so in the future.
> > I'm perfectly aware that some of them will go stale in a day or
> > in a year.
> 
> What's the point of adding URLs if you know they'll not be useful later?
> 
> Anyway, your tree, so you get to make the rules, but personally I've
> cursed about this exact issue a fair few times.
> 
> See for example the x86 tree policy of creating BZ entries to store
> Intel documents to refer to them from commits because the Intel website
> is notoriously flaky wrt persistence.

well. yes. it's case by case.
Not every link would be acceptable.
A URL to a news website wouldn't be appropriate for a git tree :)
but a pointer to github.com or a corporate website is likely going to be alive
for some time.

> There's nothing worse than references to a document you can no longer
> find while trying to make sense of this 3 year old code that suddenly
> comes apart.

true. I'm not saying that it's ok to put all info on the website
and keep commit minimal. Quite the opposite.
The commit log should be descriptive and contain all information
to explain the change. Extra url is an additional info.

^ permalink raw reply

* Re: [PATCH] Revert "ethtool: change to new sane powerpc64 kernel headers"
From: John W. Linville @ 2019-01-30 20:41 UTC (permalink / raw)
  To: Maciej Żenczykowski; +Cc: Maciej Żenczykowski, netdev
In-Reply-To: <20190130024849.24997-1-zenczykowski@gmail.com>

Cool, thanks -- queued for next release.

John

On Tue, Jan 29, 2019 at 06:48:49PM -0800, Maciej Żenczykowski wrote:
> From: Maciej Żenczykowski <maze@google.com>
> 
> This reverts commit 4df55c81996dfb1dbe98c93ee62d8067ed5073a9.
> 
> It turns out this is not needed due to:
>     commit c0a2c04b3cbf6d399a2551654401957ddb529a50
>     internal.h: change to new sane kernel headers on 64-bit archs
> which I apparently entirely forgot about while trying
> to synchronize internal and upstream git repositories.
> 
> Change-Id: I56d90a3c1e9b66c30526824fb7bc41aab01d85d1
> Signed-off-by: Maciej Żenczykowski <maze@google.com>
> ---
>  ethtool-copy.h | 6 ------
>  1 file changed, 6 deletions(-)
> 
> diff --git a/ethtool-copy.h b/ethtool-copy.h
> index 7772a4970987..6bfbb85f9402 100644
> --- a/ethtool-copy.h
> +++ b/ethtool-copy.h
> @@ -14,12 +14,6 @@
>  #ifndef _LINUX_ETHTOOL_H
>  #define _LINUX_ETHTOOL_H
>  
> -#ifdef __powerpc64__
> -/* Powerpc needs __SANE_USERSPACE_TYPES__ before <linux/types.h> to select
> - * 'int-ll64.h' and avoid compile warnings when printing __u64 with %llu.
> - */
> -#define __SANE_USERSPACE_TYPES__
> -#endif
>  #include <linux/kernel.h>
>  #include <linux/types.h>
>  #include <linux/if_ether.h>
> -- 
> 2.20.1.495.gaa96b0ce6b-goog
> 
> 

-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply

* [PATCH] ipmr: ip6mr: Create new sockopt to clear mfc cache only
From: Callum Sinclair @ 2019-01-30 20:52 UTC (permalink / raw)
  To: davem, kuznet, yoshfuji, nikolay, netdev, linux-kernel; +Cc: Callum Sinclair
In-Reply-To: <20190130205209.18183-1-callum.sinclair@alliedtelesis.co.nz>

Currently the only way to clear the mfc cache was to delete the entries
one by one using the MRT_DEL_MFC socket option or to destroy and
recreate the socket.

Create a new socket option which will clear the multicast forwarding
cache on the socket without destroying the socket.

Signed-off-by: Callum Sinclair <callum.sinclair@alliedtelesis.co.nz>
---
 include/uapi/linux/mroute.h  |  3 ++-
 include/uapi/linux/mroute6.h |  3 ++-
 net/ipv4/ipmr.c              | 40 +++++++++++++++++++++----------
 net/ipv6/ip6mr.c             | 46 +++++++++++++++++++++++-------------
 4 files changed, 61 insertions(+), 31 deletions(-)

diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h
index 5d37a9ccce63..8a0beb885cd9 100644
--- a/include/uapi/linux/mroute.h
+++ b/include/uapi/linux/mroute.h
@@ -28,7 +28,8 @@
 #define MRT_TABLE	(MRT_BASE+9)	/* Specify mroute table ID		*/
 #define MRT_ADD_MFC_PROXY	(MRT_BASE+10)	/* Add a (*,*|G) mfc entry	*/
 #define MRT_DEL_MFC_PROXY	(MRT_BASE+11)	/* Del a (*,*|G) mfc entry	*/
-#define MRT_MAX		(MRT_BASE+11)
+#define MRT_DEL_MFC_ALL		(MRT_BASE+12)	/* Del all multicast entries	*/
+#define MRT_MAX		(MRT_BASE+12)
 
 #define SIOCGETVIFCNT	SIOCPROTOPRIVATE	/* IP protocol privates */
 #define SIOCGETSGCNT	(SIOCPROTOPRIVATE+1)
diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h
index 9999cc006390..7def70cdf571 100644
--- a/include/uapi/linux/mroute6.h
+++ b/include/uapi/linux/mroute6.h
@@ -31,7 +31,8 @@
 #define MRT6_TABLE	(MRT6_BASE+9)	/* Specify mroute table ID		*/
 #define MRT6_ADD_MFC_PROXY	(MRT6_BASE+10)	/* Add a (*,*|G) mfc entry	*/
 #define MRT6_DEL_MFC_PROXY	(MRT6_BASE+11)	/* Del a (*,*|G) mfc entry	*/
-#define MRT6_MAX	(MRT6_BASE+11)
+#define MRT6_DEL_MFC_ALL	(MRT6_BASE+12)	/* Del all multicast entries	*/
+#define MRT6_MAX	(MRT6_BASE+12)
 
 #define SIOCGETMIFCNT_IN6	SIOCPROTOPRIVATE	/* IP protocol privates */
 #define SIOCGETSGCNT_IN6	(SIOCPROTOPRIVATE+1)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ddbf8c9a1abb..b996d0f70e5c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1298,22 +1298,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 	return 0;
 }
 
-/* Close the multicast socket, and clear the vif tables etc */
-static void mroute_clean_tables(struct mr_table *mrt, bool all)
+/* Clear the multicast forwarding cache entries */
+static void mroute_clean_cache(struct mr_table *mrt, bool all)
 {
-	struct net *net = read_pnet(&mrt->net);
 	struct mr_mfc *c, *tmp;
 	struct mfc_cache *cache;
-	LIST_HEAD(list);
-	int i;
-
-	/* Shut down all active vif entries */
-	for (i = 0; i < mrt->maxvif; i++) {
-		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
-			continue;
-		vif_delete(mrt, i, 0, &list);
-	}
-	unregister_netdevice_many(&list);
+	struct net *net = read_pnet(&mrt->net);
 
 	/* Wipe the cache */
 	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
@@ -1340,6 +1330,23 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 	}
 }
 
+/* Close the multicast socket, and clear the vif tables etc */
+static void mroute_clean_tables(struct mr_table *mrt, bool all)
+{
+	LIST_HEAD(list);
+	int i;
+
+	/* Shut down all active vif entries */
+	for (i = 0; i < mrt->maxvif; i++) {
+		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
+			continue;
+		vif_delete(mrt, i, 0, &list);
+	}
+	unregister_netdevice_many(&list);
+
+	mroute_clean_cache(mrt, all);
+}
+
 /* called from ip_ra_control(), before an RCU grace period,
  * we dont need to call synchronize_rcu() here
  */
@@ -1482,6 +1489,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 					   sk == rtnl_dereference(mrt->mroute_sk),
 					   parent);
 		break;
+	case MRT_DEL_MFC_ALL:
+		rtnl_lock();
+		ipmr_for_each_table(mrt, net) {
+			mroute_clean_cache(mrt, true);
+		}
+		rtnl_unlock();
+		break;
 	/* Control PIM assert. */
 	case MRT_ASSERT:
 		if (optlen != sizeof(val)) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 30337b38274b..0168420d217b 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1492,25 +1492,11 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
 	return 0;
 }
 
-/*
- *	Close the multicast socket, and clear the vif tables etc
- */
-
-static void mroute_clean_tables(struct mr_table *mrt, bool all)
+/* Clear the multicast forwarding cache entries */
+static void mroute_clean_cache(struct mr_table *mrt, bool all)
 {
 	struct mr_mfc *c, *tmp;
-	LIST_HEAD(list);
-	int i;
-
-	/* Shut down all active vif entries */
-	for (i = 0; i < mrt->maxvif; i++) {
-		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
-			continue;
-		mif6_delete(mrt, i, 0, &list);
-	}
-	unregister_netdevice_many(&list);
 
-	/* Wipe the cache */
 	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
 		if (!all && (c->mfc_flags & MFC_STATIC))
 			continue;
@@ -1536,6 +1522,27 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 	}
 }
 
+/*
+ *	Close the multicast socket, and clear the vif tables etc
+ */
+
+static void mroute_clean_tables(struct mr_table *mrt, bool all)
+{
+	LIST_HEAD(list);
+	int i;
+
+	/* Shut down all active vif entries */
+	for (i = 0; i < mrt->maxvif; i++) {
+		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
+			continue;
+		mif6_delete(mrt, i, 0, &list);
+	}
+	unregister_netdevice_many(&list);
+
+	/* Wipe the cache */
+	mroute_clean_cache(mrt, all);
+}
+
 static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
 {
 	int err = 0;
@@ -1703,6 +1710,13 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 					    parent);
 		rtnl_unlock();
 		return ret;
+	case MRT6_DEL_MFC_ALL:
+		rtnl_lock();
+		ip6mr_for_each_table(mrt, net) {
+			mroute_clean_cache(mrt, true);
+		}
+		rtnl_unlock();
+		return 0;
 
 	/*
 	 *	Control PIM assert (to activate pim will activate assert)
-- 
2.20.1


^ permalink raw reply related

* [PATCH] ipmr: ip6mr: Create new sockopt to clear mfc cache only
From: Callum Sinclair @ 2019-01-30 20:52 UTC (permalink / raw)
  To: davem, kuznet, yoshfuji, nikolay, netdev, linux-kernel; +Cc: Callum Sinclair

Created a way to clear the multicast forwarding cache on a socket
without having to either remove the entries manually using the delete
entry socket option or destroy and recreate the multicast socket.

Patch Set 2:
  - Fix Compile Errors

Callum Sinclair (1):
  ipmr: ip6mr: Create new sockopt to clear mfc cache only

 include/uapi/linux/mroute.h  |  3 ++-
 include/uapi/linux/mroute6.h |  3 ++-
 net/ipv4/ipmr.c              | 40 +++++++++++++++++++++----------
 net/ipv6/ip6mr.c             | 46 +++++++++++++++++++++++-------------
 4 files changed, 61 insertions(+), 31 deletions(-)

-- 
2.20.1


^ permalink raw reply

* [PATCH net-next 12/12] net: hns3: keep flow director state unchanged when reset
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Jian Shen, Peng Li, Huazhong Tan
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

From: Jian Shen <shenjian15@huawei.com>

In the original code, the driver always enables flow director when
initializing. When the user disables flow director with the command
ethtool -K, the flow director will be enabled again after
resetting.

This patch fixes it by only enabling it on first initialization.

Fixes: 6871af29b3ab ("net: hns3: Add reset handle for flow director")
Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c    | 10 ++++++----
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h    |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index b531eac12fea..2ffbf07ff829 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1007,6 +1007,9 @@ static int hclge_configure(struct hclge_dev *hdev)
 	hdev->tm_info.hw_pfc_map = 0;
 	hdev->wanted_umv_size = cfg.umv_space;
 
+	if (hnae3_dev_fd_supported(hdev))
+		hdev->fd_en = true;
+
 	ret = hclge_parse_speed(cfg.default_speed, &hdev->hw.mac.speed);
 	if (ret) {
 		dev_err(&hdev->pdev->dev, "Get wrong speed ret=%d.\n", ret);
@@ -3973,7 +3976,6 @@ static int hclge_init_fd_config(struct hclge_dev *hdev)
 		return -EOPNOTSUPP;
 	}
 
-	hdev->fd_cfg.fd_en = true;
 	hdev->fd_cfg.proto_support =
 		TCP_V4_FLOW | UDP_V4_FLOW | SCTP_V4_FLOW | TCP_V6_FLOW |
 		UDP_V6_FLOW | SCTP_V6_FLOW | IPV4_USER_FLOW | IPV6_USER_FLOW;
@@ -4731,7 +4733,7 @@ static int hclge_add_fd_entry(struct hnae3_handle *handle,
 	if (!hnae3_dev_fd_supported(hdev))
 		return -EOPNOTSUPP;
 
-	if (!hdev->fd_cfg.fd_en) {
+	if (!hdev->fd_en) {
 		dev_warn(&hdev->pdev->dev,
 			 "Please enable flow director first\n");
 		return -EOPNOTSUPP;
@@ -4884,7 +4886,7 @@ static int hclge_restore_fd_entries(struct hnae3_handle *handle)
 		return 0;
 
 	/* if fd is disabled, should not restore it when reset */
-	if (!hdev->fd_cfg.fd_en)
+	if (!hdev->fd_en)
 		return 0;
 
 	hlist_for_each_entry_safe(rule, node, &hdev->fd_rule_list, rule_node) {
@@ -5170,7 +5172,7 @@ static void hclge_enable_fd(struct hnae3_handle *handle, bool enable)
 	struct hclge_vport *vport = hclge_get_vport(handle);
 	struct hclge_dev *hdev = vport->back;
 
-	hdev->fd_cfg.fd_en = enable;
+	hdev->fd_en = enable;
 	if (!enable)
 		hclge_del_all_fd_entries(handle, false);
 	else
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 2c413c63c6c9..c939f4a7f5f0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -583,7 +583,6 @@ struct hclge_fd_key_cfg {
 
 struct hclge_fd_cfg {
 	u8 fd_mode;
-	u8 fd_en;
 	u16 max_key_length;
 	u32 proto_support;
 	u32 rule_num[2]; /* rule entry number */
@@ -758,6 +757,7 @@ struct hclge_dev {
 	struct hclge_fd_cfg fd_cfg;
 	struct hlist_head fd_rule_list;
 	u16 hclge_fd_rule_num;
+	u8 fd_en;
 
 	u16 wanted_umv_size;
 	/* max available unicast mac vlan space */
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 06/12] net: hns3: Fix NULL deref when unloading driver
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Huazhong Tan, Peng Li
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

When the driver is unloading, if a call to ndo_open occurs
between phy_disconnect() and unregister_netdev(), it will end up
causing the kernel to eventually hit a NULL deref:

[14942.417828] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000048
[14942.529878] Mem abort info:
[14942.551166]   ESR = 0x96000006
[14942.567070]   Exception class = DABT (current EL), IL = 32 bits
[14942.623081]   SET = 0, FnV = 0
[14942.639112]   EA = 0, S1PTW = 0
[14942.643628] Data abort info:
[14942.659227]   ISV = 0, ISS = 0x00000006
[14942.674870]   CM = 0, WnR = 0
[14942.679449] user pgtable: 4k pages, 48-bit VAs, pgdp = 00000000224ad6ad
[14942.695595] [0000000000000048] pgd=00000021e6673003, pud=00000021dbf01003, pmd=0000000000000000
[14942.723163] Internal error: Oops: 96000006 [#1] PREEMPT SMP
[14942.729358] Modules linked in: hns3(O) hclge(O) pv680_mii(O) hnae3(O) [last unloaded: hclge]
[14942.738907] CPU: 1 PID: 26629 Comm: kworker/u4:13 Tainted: G           O      4.18.0-rc1-12928-ga960791-dirty #145
[14942.749491] Hardware name: Huawei Technologies Co., Ltd. D05/D05, BIOS Hi1620 FPGA TB BOOT BIOS B763 08/17/2018
[14942.760392] Workqueue: events_power_efficient phy_state_machine
[14942.766644] pstate: 80c00009 (Nzcv daif +PAN +UAO)
[14942.771918] pc : test_and_set_bit+0x18/0x38
[14942.776589] lr : netif_carrier_off+0x24/0x70
[14942.781033] sp : ffff0000121abd20
[14942.784518] x29: ffff0000121abd20 x28: 0000000000000000
[14942.790208] x27: ffff0000164d3cd8 x26: ffff8021da68b7b8
[14942.795832] x25: 0000000000000000 x24: ffff8021eb407800
[14942.801445] x23: 0000000000000000 x22: 0000000000000000
[14942.807046] x21: 0000000000000001 x20: 0000000000000000
[14942.812672] x19: 0000000000000000 x18: ffff000009781708
[14942.818284] x17: 00000000004970e8 x16: ffff00000816ad48
[14942.823900] x15: 0000000000000000 x14: 0000000000000008
[14942.829528] x13: 0000000000000000 x12: 0000000000000f65
[14942.835149] x11: 0000000000000001 x10: 00000000000009d0
[14942.840753] x9 : ffff0000121abaa0 x8 : 0000000000000000
[14942.846360] x7 : ffff000009781708 x6 : 0000000000000003
[14942.851970] x5 : 0000000000000020 x4 : 0000000000000004
[14942.857575] x3 : 0000000000000002 x2 : 0000000000000001
[14942.863180] x1 : 0000000000000048 x0 : 0000000000000000
[14942.868875] Process kworker/u4:13 (pid: 26629, stack limit = 0x00000000c909dbf3)
[14942.876464] Call trace:
[14942.879200]  test_and_set_bit+0x18/0x38
[14942.883376]  phy_link_change+0x38/0x78
[14942.887378]  phy_state_machine+0x3dc/0x4f8
[14942.891968]  process_one_work+0x158/0x470
[14942.896223]  worker_thread+0x50/0x470
[14942.900219]  kthread+0x104/0x130
[14942.903905]  ret_from_fork+0x10/0x1c
[14942.907755] Code: d2800022 8b400c21 f9800031 9ac32044 (c85f7c22)
[14942.914185] ---[ end trace 968c9e12eb740b23 ]---

So this patch fixes it by changing when phy_connect_direct()
and phy_disconnect() are called.

Fixes: 256727da7395 ("net: hns3: Add MDIO support to HNS3 Ethernet driver for hip08 SoC")
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h   |  2 ++
 .../net/ethernet/hisilicon/hns3/hns3_enet.c   | 28 +++++++++++++++++++
 .../hisilicon/hns3/hns3pf/hclge_main.c        | 20 ++-----------
 .../hisilicon/hns3/hns3pf/hclge_mdio.c        |  8 ++++--
 .../hisilicon/hns3/hns3pf/hclge_mdio.h        |  4 +--
 5 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index c29f82aa5ba1..e05b4926feb2 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -464,6 +464,8 @@ struct hnae3_ae_ops {
 	int (*set_gro_en)(struct hnae3_handle *handle, int enable);
 	u16 (*get_global_queue_id)(struct hnae3_handle *handle, u16 queue_id);
 	void (*set_timer_task)(struct hnae3_handle *handle, bool enable);
+	int (*mac_connect_phy)(struct hnae3_handle *handle);
+	void (*mac_disconnect_phy)(struct hnae3_handle *handle);
 };
 
 struct hnae3_dcb_ops {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 6520e1c1a837..ac9b0aa258ec 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3531,6 +3531,25 @@ static int hns3_init_mac_addr(struct net_device *netdev, bool init)
 	return ret;
 }
 
+static int hns3_init_phy(struct net_device *netdev)
+{
+	struct hnae3_handle *h = hns3_get_handle(netdev);
+	int ret = 0;
+
+	if (h->ae_algo->ops->mac_connect_phy)
+		ret = h->ae_algo->ops->mac_connect_phy(h);
+
+	return ret;
+}
+
+static void hns3_uninit_phy(struct net_device *netdev)
+{
+	struct hnae3_handle *h = hns3_get_handle(netdev);
+
+	if (h->ae_algo->ops->mac_disconnect_phy)
+		h->ae_algo->ops->mac_disconnect_phy(h);
+}
+
 static int hns3_restore_fd_rules(struct net_device *netdev)
 {
 	struct hnae3_handle *h = hns3_get_handle(netdev);
@@ -3640,6 +3659,10 @@ static int hns3_client_init(struct hnae3_handle *handle)
 		goto out_init_ring_data;
 	}
 
+	ret = hns3_init_phy(netdev);
+	if (ret)
+		goto out_init_phy;
+
 	ret = register_netdev(netdev);
 	if (ret) {
 		dev_err(priv->dev, "probe register netdev fail!\n");
@@ -3664,6 +3687,9 @@ static int hns3_client_init(struct hnae3_handle *handle)
 	return ret;
 
 out_reg_netdev_fail:
+	hns3_uninit_phy(netdev);
+out_init_phy:
+	hns3_uninit_all_ring(priv);
 out_init_ring_data:
 	(void)hns3_nic_uninit_vector_data(priv);
 out_init_vector_data:
@@ -3698,6 +3724,8 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset)
 
 	hns3_force_clear_all_rx_ring(handle);
 
+	hns3_uninit_phy(netdev);
+
 	ret = hns3_nic_uninit_vector_data(priv);
 	if (ret)
 		netdev_err(netdev, "uninit vector error\n");
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index ae8336c18264..795ebedde284 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -7050,16 +7050,6 @@ static void hclge_get_mdix_mode(struct hnae3_handle *handle,
 		*tp_mdix = ETH_TP_MDI;
 }
 
-static int hclge_init_instance_hw(struct hclge_dev *hdev)
-{
-	return hclge_mac_connect_phy(hdev);
-}
-
-static void hclge_uninit_instance_hw(struct hclge_dev *hdev)
-{
-	hclge_mac_disconnect_phy(hdev);
-}
-
 static int hclge_init_client_instance(struct hnae3_client *client,
 				      struct hnae3_ae_dev *ae_dev)
 {
@@ -7079,13 +7069,6 @@ static int hclge_init_client_instance(struct hnae3_client *client,
 			if (ret)
 				goto clear_nic;
 
-			ret = hclge_init_instance_hw(hdev);
-			if (ret) {
-			        client->ops->uninit_instance(&vport->nic,
-			                                     0);
-				goto clear_nic;
-			}
-
 			hnae3_set_client_init_flag(client, ae_dev, 1);
 
 			if (hdev->roce_client &&
@@ -7170,7 +7153,6 @@ static void hclge_uninit_client_instance(struct hnae3_client *client,
 		if (client->type == HNAE3_CLIENT_ROCE)
 			return;
 		if (hdev->nic_client && client->ops->uninit_instance) {
-			hclge_uninit_instance_hw(hdev);
 			client->ops->uninit_instance(&vport->nic, 0);
 			hdev->nic_client = NULL;
 			vport->nic.client = NULL;
@@ -8076,6 +8058,8 @@ static const struct hnae3_ae_ops hclge_ops = {
 	.set_gro_en = hclge_gro_en,
 	.get_global_queue_id = hclge_covert_handle_qid_global,
 	.set_timer_task = hclge_set_timer_task,
+	.mac_connect_phy = hclge_mac_connect_phy,
+	.mac_disconnect_phy = hclge_mac_disconnect_phy,
 };
 
 static struct hnae3_ae_algo ae_algo = {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
index dabb8437f8dc..84f28785ba28 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
@@ -195,8 +195,10 @@ static void hclge_mac_adjust_link(struct net_device *netdev)
 		netdev_err(netdev, "failed to configure flow control.\n");
 }
 
-int hclge_mac_connect_phy(struct hclge_dev *hdev)
+int hclge_mac_connect_phy(struct hnae3_handle *handle)
 {
+	struct hclge_vport *vport = hclge_get_vport(handle);
+	struct hclge_dev *hdev = vport->back;
 	struct net_device *netdev = hdev->vport[0].nic.netdev;
 	struct phy_device *phydev = hdev->hw.mac.phydev;
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
@@ -229,8 +231,10 @@ int hclge_mac_connect_phy(struct hclge_dev *hdev)
 	return 0;
 }
 
-void hclge_mac_disconnect_phy(struct hclge_dev *hdev)
+void hclge_mac_disconnect_phy(struct hnae3_handle *handle)
 {
+	struct hclge_vport *vport = hclge_get_vport(handle);
+	struct hclge_dev *hdev = vport->back;
 	struct phy_device *phydev = hdev->hw.mac.phydev;
 
 	if (!phydev)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.h
index 5fbf7dddb5d9..ef095d9c566f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.h
@@ -5,8 +5,8 @@
 #define __HCLGE_MDIO_H
 
 int hclge_mac_mdio_config(struct hclge_dev *hdev);
-int hclge_mac_connect_phy(struct hclge_dev *hdev);
-void hclge_mac_disconnect_phy(struct hclge_dev *hdev);
+int hclge_mac_connect_phy(struct hnae3_handle *handle);
+void hclge_mac_disconnect_phy(struct hnae3_handle *handle);
 void hclge_mac_start_phy(struct hclge_dev *hdev);
 void hclge_mac_stop_phy(struct hclge_dev *hdev);
 
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 05/12] net: hns3: only support tc 0 for VF
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Yunsheng Lin, Peng Li, Huazhong Tan
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

From: Yunsheng Lin <linyunsheng@huawei.com>

When the VF shares the same TC config as PF, the business
running on PF and VF must have similar module.

For simplicity, we are not considering the use case of a VF sharing
the same TC configuration as the PF, so this patch removes the support
of TC configuration from the VF and forces the VF to use just a single
TC.

Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 .../hisilicon/hns3/hns3pf/hclge_dcb.c         | 12 +++++-----
 .../hisilicon/hns3/hns3pf/hclge_mbx.c         | 10 ++++++---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 22 ++++++++++++++-----
 3 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
index 961aedb0e20f..1161361a973b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c
@@ -93,13 +93,11 @@ static int hclge_dcb_common_validate(struct hclge_dev *hdev, u8 num_tc,
 		}
 	}
 
-	for (i = 0; i < hdev->num_alloc_vport; i++) {
-		if (num_tc > hdev->vport[i].alloc_tqps) {
-			dev_err(&hdev->pdev->dev,
-				"allocated tqp(%u) checking failed, %u > tqp(%u)\n",
-				i, num_tc, hdev->vport[i].alloc_tqps);
-			return -EINVAL;
-		}
+	if (num_tc > hdev->vport[0].alloc_tqps) {
+		dev_err(&hdev->pdev->dev,
+			"allocated tqp checking failed, %u > tqp(%u)\n",
+			num_tc, hdev->vport[0].alloc_tqps);
+		return -EINVAL;
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index 3603034aa45c..6afb0a4b73f7 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -319,10 +319,14 @@ static int hclge_get_vf_tcinfo(struct hclge_vport *vport,
 			       struct hclge_mbx_vf_to_pf_cmd *mbx_req,
 			       bool gen_resp)
 {
-	struct hclge_dev *hdev = vport->back;
-	int ret;
+	struct hnae3_knic_private_info *kinfo = &vport->nic.kinfo;
+	u8 vf_tc_map = 0;
+	int i, ret;
+
+	for (i = 0; i < kinfo->num_tc; i++)
+		vf_tc_map |= BIT(i);
 
-	ret = hclge_gen_resp_to_vf(vport, mbx_req, 0, &hdev->hw_tc_map,
+	ret = hclge_gen_resp_to_vf(vport, mbx_req, 0, &vf_tc_map,
 				   sizeof(u8));
 
 	return ret;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
index 9f4069fb786b..aafc69f4bfdd 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
@@ -520,8 +520,14 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport)
 	u16 max_rss_size;
 	u8 i;
 
-	vport->bw_limit = hdev->tm_info.pg_info[0].bw_limit;
-	kinfo->num_tc = min_t(u16, vport->alloc_tqps, hdev->tm_info.num_tc);
+	/* TC configuration is shared by PF/VF in one port, only allow
+	 * one tc for VF for simplicity. VF's vport_id is non zero.
+	 */
+	kinfo->num_tc = vport->vport_id ? 1 :
+			min_t(u16, vport->alloc_tqps, hdev->tm_info.num_tc);
+	vport->qs_offset = (vport->vport_id ? hdev->tm_info.num_tc : 0) +
+				(vport->vport_id ? (vport->vport_id - 1) : 0);
+
 	max_rss_size = min_t(u16, hdev->rss_size_max,
 			     vport->alloc_tqps / kinfo->num_tc);
 
@@ -538,12 +544,12 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport)
 	}
 
 	kinfo->num_tqps = kinfo->num_tc * kinfo->rss_size;
-	vport->qs_offset = hdev->tm_info.num_tc * vport->vport_id;
 	vport->dwrr = 100;  /* 100 percent as init */
 	vport->alloc_rss_size = kinfo->rss_size;
+	vport->bw_limit = hdev->tm_info.pg_info[0].bw_limit;
 
 	for (i = 0; i < HNAE3_MAX_TC; i++) {
-		if (hdev->hw_tc_map & BIT(i)) {
+		if (hdev->hw_tc_map & BIT(i) && i < kinfo->num_tc) {
 			kinfo->tc_info[i].enable = true;
 			kinfo->tc_info[i].tqp_offset = i * kinfo->rss_size;
 			kinfo->tc_info[i].tqp_count = kinfo->rss_size;
@@ -766,13 +772,17 @@ static int hclge_tm_pri_q_qs_cfg(struct hclge_dev *hdev)
 
 	if (hdev->tx_sch_mode == HCLGE_FLAG_TC_BASE_SCH_MODE) {
 		/* Cfg qs -> pri mapping, one by one mapping */
-		for (k = 0; k < hdev->num_alloc_vport; k++)
-			for (i = 0; i < hdev->tm_info.num_tc; i++) {
+		for (k = 0; k < hdev->num_alloc_vport; k++) {
+			struct hnae3_knic_private_info *kinfo =
+				&vport[k].nic.kinfo;
+
+			for (i = 0; i < kinfo->num_tc; i++) {
 				ret = hclge_tm_qs_to_pri_map_cfg(
 					hdev, vport[k].qs_offset + i, i);
 				if (ret)
 					return ret;
 			}
+		}
 	} else if (hdev->tx_sch_mode == HCLGE_FLAG_VNET_BASE_SCH_MODE) {
 		/* Cfg qs -> pri mapping,  qs = tc, pri = vf, 8 qs -> 1 pri */
 		for (k = 0; k < hdev->num_alloc_vport; k++)
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 10/12] net: hns3: fix an issue for hclgevf_ae_get_hdev
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Peng Li, Huazhong Tan
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

From: Peng Li <lipeng321@huawei.com>

The HNS3 VF driver supports both NIC and RoCE, and hdev stores both
a NIC handle and a RoCE handle, so hclgevf_ae_get_hdev() should pass
the member that matches the handle's client type to container_of().

Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 93f306f42cf4..95f926971e3b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -86,7 +86,12 @@ static const u32 tqp_intr_reg_addr_list[] = {HCLGEVF_TQP_INTR_CTRL_REG,
 static inline struct hclgevf_dev *hclgevf_ae_get_hdev(
 	struct hnae3_handle *handle)
 {
-	return container_of(handle, struct hclgevf_dev, nic);
+	if (!handle->client)
+		return container_of(handle, struct hclgevf_dev, nic);
+	else if (handle->client->type == HNAE3_CLIENT_ROCE)
+		return container_of(handle, struct hclgevf_dev, roce);
+	else
+		return container_of(handle, struct hclgevf_dev, nic);
 }
 
 static int hclgevf_tqps_update_stats(struct hnae3_handle *handle)
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 09/12] net: hns3: fix improper error handling in the hclge_init_ae_dev()
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Huazhong Tan, Peng Li
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

When hclge_init_umv_space() fails in hclge_init_ae_dev(), we should
undo all the operations that have already completed successfully.
The last successful operation may be hclge_mac_mdio_config(), so if
hclge_init_umv_space() fails, we also need to undo it.

Fixes: 288475b2ad01 ("{topost} net: hns3: refine umv space allocation")
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 42f0f8824b47..b531eac12fea 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -7383,7 +7383,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	ret = hclge_init_umv_space(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "umv space init error, ret=%d.\n", ret);
-		goto err_msi_irq_uninit;
+		goto err_mdiobus_unreg;
 	}
 
 	ret = hclge_mac_init(hdev);
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 07/12] net: hns3: fix netif_napi_del() not do problem when unloading
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Huazhong Tan, Peng Li
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

When the driver is unloading, if a global reset occurs,
unmap_ring_from_vector() in hns3_nic_uninit_vector_data() will
fail, and hns3_nic_uninit_vector_data() just returns. As a result,
some netif_napi_del() calls may not be made.

Since the hardware unmaps all rings while resetting,
hns3_nic_uninit_vector_data() should ignore this error and do the
rest of the uninitialization.

Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC")
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3_enet.c   | 27 +++++--------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index ac9b0aa258ec..c546b874d659 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3183,12 +3183,12 @@ static void hns3_clear_ring_group(struct hns3_enet_ring_group *group)
 	group->count = 0;
 }
 
-static int hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv)
+static void hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv)
 {
 	struct hnae3_ring_chain_node vector_ring_chain;
 	struct hnae3_handle *h = priv->ae_handle;
 	struct hns3_enet_tqp_vector *tqp_vector;
-	int i, ret;
+	int i;
 
 	for (i = 0; i < priv->vector_num; i++) {
 		tqp_vector = &priv->tqp_vector[i];
@@ -3196,15 +3196,10 @@ static int hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv)
 		if (!tqp_vector->rx_group.ring && !tqp_vector->tx_group.ring)
 			continue;
 
-		ret = hns3_get_vector_ring_chain(tqp_vector,
-						 &vector_ring_chain);
-		if (ret)
-			return ret;
+		hns3_get_vector_ring_chain(tqp_vector, &vector_ring_chain);
 
-		ret = h->ae_algo->ops->unmap_ring_from_vector(h,
+		h->ae_algo->ops->unmap_ring_from_vector(h,
 			tqp_vector->vector_irq, &vector_ring_chain);
-		if (ret)
-			return ret;
 
 		hns3_free_vector_ring_chain(tqp_vector, &vector_ring_chain);
 
@@ -3220,8 +3215,6 @@ static int hns3_nic_uninit_vector_data(struct hns3_nic_priv *priv)
 		hns3_clear_ring_group(&tqp_vector->tx_group);
 		netif_napi_del(&priv->tqp_vector[i].napi);
 	}
-
-	return 0;
 }
 
 static int hns3_nic_dealloc_vector_data(struct hns3_nic_priv *priv)
@@ -3691,7 +3684,7 @@ static int hns3_client_init(struct hnae3_handle *handle)
 out_init_phy:
 	hns3_uninit_all_ring(priv);
 out_init_ring_data:
-	(void)hns3_nic_uninit_vector_data(priv);
+	hns3_nic_uninit_vector_data(priv);
 out_init_vector_data:
 	hns3_nic_dealloc_vector_data(priv);
 out_alloc_vector_data:
@@ -3726,9 +3719,7 @@ static void hns3_client_uninit(struct hnae3_handle *handle, bool reset)
 
 	hns3_uninit_phy(netdev);
 
-	ret = hns3_nic_uninit_vector_data(priv);
-	if (ret)
-		netdev_err(netdev, "uninit vector error\n");
+	hns3_nic_uninit_vector_data(priv);
 
 	ret = hns3_nic_dealloc_vector_data(priv);
 	if (ret)
@@ -4121,11 +4112,7 @@ static int hns3_reset_notify_uninit_enet(struct hnae3_handle *handle)
 
 	hns3_force_clear_all_rx_ring(handle);
 
-	ret = hns3_nic_uninit_vector_data(priv);
-	if (ret) {
-		netdev_err(netdev, "uninit vector error\n");
-		return ret;
-	}
+	hns3_nic_uninit_vector_data(priv);
 
 	hns3_store_coal(priv);
 
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 08/12] net: hns3: fix for rss result nonuniform
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Jian Shen, Peng Li, Huazhong Tan
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

From: Jian Shen <shenjian15@huawei.com>

The RSS result is more uniform when using the recommended hash key
from Microsoft instead of the one generated by netdev_rss_key_fill().
Also, using the hash algorithm "xor" is better than "toeplitz".

This patch modifies the default hash key and hash algorithm.

Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 .../hisilicon/hns3/hns3pf/hclge_main.c         | 18 +++++++++++++++---
 .../hisilicon/hns3/hns3vf/hclgevf_main.c       | 14 +++++++++++---
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 795ebedde284..42f0f8824b47 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -295,6 +295,14 @@ static const struct hclge_mac_mgr_tbl_entry_cmd hclge_mgr_table[] = {
 	},
 };
 
+static const u8 hclge_hash_key[] = {
+	0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
+	0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
+	0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
+	0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
+	0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA
+};
+
 static int hclge_mac_update_stats_defective(struct hclge_dev *hdev)
 {
 #define HCLGE_MAC_CMD_NUM 21
@@ -3652,8 +3660,11 @@ void hclge_rss_indir_init_cfg(struct hclge_dev *hdev)
 
 static void hclge_rss_init_cfg(struct hclge_dev *hdev)
 {
+	int i, rss_algo = HCLGE_RSS_HASH_ALGO_TOEPLITZ;
 	struct hclge_vport *vport = hdev->vport;
-	int i;
+
+	if (hdev->pdev->revision >= 0x21)
+		rss_algo = HCLGE_RSS_HASH_ALGO_SIMPLE;
 
 	for (i = 0; i < hdev->num_vmdq_vport + 1; i++) {
 		vport[i].rss_tuple_sets.ipv4_tcp_en =
@@ -3673,9 +3684,10 @@ static void hclge_rss_init_cfg(struct hclge_dev *hdev)
 		vport[i].rss_tuple_sets.ipv6_fragment_en =
 			HCLGE_RSS_INPUT_TUPLE_OTHER;
 
-		vport[i].rss_algo = HCLGE_RSS_HASH_ALGO_TOEPLITZ;
+		vport[i].rss_algo = rss_algo;
 
-		netdev_rss_key_fill(vport[i].rss_hash_key, HCLGE_RSS_KEY_SIZE);
+		memcpy(vport[i].rss_hash_key, hclge_hash_key,
+		       HCLGE_RSS_KEY_SIZE);
 	}
 
 	hclge_rss_indir_init_cfg(hdev);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index fc99a0c70164..93f306f42cf4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -21,6 +21,14 @@ static const struct pci_device_id ae_algovf_pci_tbl[] = {
 	{0, }
 };
 
+static const u8 hclgevf_hash_key[] = {
+	0x6D, 0x5A, 0x56, 0xDA, 0x25, 0x5B, 0x0E, 0xC2,
+	0x41, 0x67, 0x25, 0x3D, 0x43, 0xA3, 0x8F, 0xB0,
+	0xD0, 0xCA, 0x2B, 0xCB, 0xAE, 0x7B, 0x30, 0xB4,
+	0x77, 0xCB, 0x2D, 0xA3, 0x80, 0x30, 0xF2, 0x0C,
+	0x6A, 0x42, 0xB7, 0x3B, 0xBE, 0xAC, 0x01, 0xFA
+};
+
 MODULE_DEVICE_TABLE(pci, ae_algovf_pci_tbl);
 
 static const u32 cmdq_reg_addr_list[] = {HCLGEVF_CMDQ_TX_ADDR_L_REG,
@@ -1789,9 +1797,9 @@ static int hclgevf_rss_init_hw(struct hclgevf_dev *hdev)
 	rss_cfg->rss_size = hdev->rss_size_max;
 
 	if (hdev->pdev->revision >= 0x21) {
-		rss_cfg->hash_algo = HCLGEVF_RSS_HASH_ALGO_TOEPLITZ;
-		netdev_rss_key_fill(rss_cfg->rss_hash_key,
-				    HCLGEVF_RSS_KEY_SIZE);
+		rss_cfg->hash_algo = HCLGEVF_RSS_HASH_ALGO_SIMPLE;
+		memcpy(rss_cfg->rss_hash_key, hclgevf_hash_key,
+		       HCLGEVF_RSS_KEY_SIZE);
 
 		ret = hclgevf_set_rss_algo_key(hdev, rss_cfg->hash_algo,
 					       rss_cfg->rss_hash_key);
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 11/12] net: hns3: stop sending keep alive msg to PF when VF is resetting
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Jian Shen, Peng Li, Huazhong Tan
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

From: Jian Shen <shenjian15@huawei.com>

When the VF is resetting, it can't communicate with the PF via
mailbox msg. This patch adds a reset state check before sending the
keep alive msg to the PF.

Fixes: a6d818e31d08 ("net: hns3: Add vport alive state checking support")
Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 95f926971e3b..b9cdbd5dd6cb 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -1624,6 +1624,10 @@ static void hclgevf_keep_alive_task(struct work_struct *work)
 	int ret;
 
 	hdev = container_of(work, struct hclgevf_dev, keep_alive_task);
+
+	if (test_bit(HCLGEVF_STATE_RST_HANDLING, &hdev->state))
+		return;
+
 	ret = hclgevf_send_mbx_msg(hdev, HCLGE_MBX_KEEP_ALIVE, 0, NULL,
 				   0, false, &respmsg, sizeof(u8));
 	if (ret)
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 01/12] net: hns3: reuse the definition of l3 and l4 header info union
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, liyongxin, Peng Li, Huazhong Tan
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

From: liyongxin <liyongxin1@huawei.com>

Union l3_hdr_info and l4_hdr_info have already been defined in
the hns3_enet.h, so it is unnecessary to define them elsewhere.

This patch removes the redundant definition, and reuses the one
defined in the hns3_enet.h.

Signed-off-by: liyongxin <liyongxin1@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 .../net/ethernet/hisilicon/hns3/hns3_enet.c   | 32 +++----------------
 .../net/ethernet/hisilicon/hns3/hns3_enet.h   |  1 +
 2 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 4b38c37eef8f..048b5fbdf8fa 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -655,11 +655,7 @@ static int hns3_set_tso(struct sk_buff *skb, u32 *paylen,
 static int hns3_get_l4_protocol(struct sk_buff *skb, u8 *ol4_proto,
 				u8 *il4_proto)
 {
-	union {
-		struct iphdr *v4;
-		struct ipv6hdr *v6;
-		unsigned char *hdr;
-	} l3;
+	union l3_hdr_info l3;
 	unsigned char *l4_hdr;
 	unsigned char *exthdr;
 	u8 l4_proto_tmp;
@@ -712,17 +708,8 @@ static void hns3_set_l2l3l4_len(struct sk_buff *skb, u8 ol4_proto,
 				u8 il4_proto, u32 *type_cs_vlan_tso,
 				u32 *ol_type_vlan_len_msec)
 {
-	union {
-		struct iphdr *v4;
-		struct ipv6hdr *v6;
-		unsigned char *hdr;
-	} l3;
-	union {
-		struct tcphdr *tcp;
-		struct udphdr *udp;
-		struct gre_base_hdr *gre;
-		unsigned char *hdr;
-	} l4;
+	union l3_hdr_info l3;
+	union l4_hdr_info l4;
 	unsigned char *l2_hdr;
 	u8 l4_proto = ol4_proto;
 	u32 ol2_len;
@@ -821,12 +808,7 @@ static void hns3_set_l2l3l4_len(struct sk_buff *skb, u8 ol4_proto,
 static bool hns3_tunnel_csum_bug(struct sk_buff *skb)
 {
 #define IANA_VXLAN_PORT	4789
-	union {
-		struct tcphdr *tcp;
-		struct udphdr *udp;
-		struct gre_base_hdr *gre;
-		unsigned char *hdr;
-	} l4;
+	union l4_hdr_info l4;
 
 	l4.hdr = skb_transport_header(skb);
 
@@ -842,11 +824,7 @@ static int hns3_set_l3l4_type_csum(struct sk_buff *skb, u8 ol4_proto,
 				   u8 il4_proto, u32 *type_cs_vlan_tso,
 				   u32 *ol_type_vlan_len_msec)
 {
-	union {
-		struct iphdr *v4;
-		struct ipv6hdr *v6;
-		unsigned char *hdr;
-	} l3;
+	union l3_hdr_info l3;
 	u32 l4_proto = ol4_proto;
 
 	l3.hdr = skb_network_header(skb);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index f3d248626ab3..71ff8f4d6c18 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -574,6 +574,7 @@ union l3_hdr_info {
 union l4_hdr_info {
 	struct tcphdr *tcp;
 	struct udphdr *udp;
+	struct gre_base_hdr *gre;
 	unsigned char *hdr;
 };
 
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 03/12] net: hns3: use the correct interface to stop|open port
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Peng Li, Huazhong Tan
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

From: Peng Li <lipeng321@huawei.com>

dev_close() stops the netdev, and the services based on the netdev
will stop as well. But ndev->netdev_ops->ndo_stop() may only stop the
HW and stack queues, so the services based on the netdev can still work.

Fixes: 5668abda0931 ("net: hns3: add support for set_ringparam")
Signed-off-by: Peng Li <lipeng321@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 76ef06a7c261..63f5f56bda94 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -805,7 +805,7 @@ static int hns3_set_ringparam(struct net_device *ndev,
 		    old_desc_num, new_desc_num);
 
 	if (if_running)
-		dev_close(ndev);
+		ndev->netdev_ops->ndo_stop(ndev);
 
 	ret = hns3_uninit_all_ring(priv);
 	if (ret)
@@ -822,7 +822,7 @@ static int hns3_set_ringparam(struct net_device *ndev,
 	}
 
 	if (if_running)
-		ret = dev_open(ndev, NULL);
+		ret = ndev->netdev_ops->ndo_open(ndev);
 
 	return ret;
 }
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 04/12] net: hns3: change hnae3_register_ae_dev() to int
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Huazhong Tan, Peng Li
In-Reply-To: <20190130205552.8512-1-tanhuazhong@huawei.com>

hnae3_register_ae_dev() may fail, and it should return an error code
to its caller, so change the return type of hnae3_register_ae_dev()
to int.

Also, when hnae3_register_ae_dev() returns an error, hns3_probe()
should do some error handling and return the error code.

Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c     | 10 +++++++++-
 drivers/net/ethernet/hisilicon/hns3/hnae3.h     |  2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c |  8 ++++++--
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index 781e5dee3c70..50011aafbae4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(hnae3_unregister_ae_algo);
  * @ae_dev: the AE device
  * NOTE: the duplicated name will not be checked
  */
-void hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev)
+int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev)
 {
 	const struct pci_device_id *id;
 	struct hnae3_ae_algo *ae_algo;
@@ -259,6 +259,7 @@ void hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev)
 
 		if (!ae_dev->ops) {
 			dev_err(&ae_dev->pdev->dev, "ae_dev ops are null\n");
+			ret = -EOPNOTSUPP;
 			goto out_err;
 		}
 
@@ -285,8 +286,15 @@ void hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev)
 				ret);
 	}
 
+	mutex_unlock(&hnae3_common_lock);
+
+	return 0;
+
 out_err:
+	list_del(&ae_dev->node);
 	mutex_unlock(&hnae3_common_lock);
+
+	return ret;
 }
 EXPORT_SYMBOL(hnae3_register_ae_dev);
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 585800e634e6..c29f82aa5ba1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -587,7 +587,7 @@ struct hnae3_handle {
 #define hnae3_get_bit(origin, shift) \
 	hnae3_get_field((origin), (0x1 << (shift)), (shift))
 
-void hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev);
+int hnae3_register_ae_dev(struct hnae3_ae_dev *ae_dev);
 void hnae3_unregister_ae_dev(struct hnae3_ae_dev *ae_dev);
 
 void hnae3_unregister_ae_algo(struct hnae3_ae_algo *ae_algo);
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 048b5fbdf8fa..6520e1c1a837 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1752,9 +1752,13 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	hns3_get_dev_capability(pdev, ae_dev);
 	pci_set_drvdata(pdev, ae_dev);
 
-	hnae3_register_ae_dev(ae_dev);
+	ret = hnae3_register_ae_dev(ae_dev);
+	if (ret) {
+		devm_kfree(&pdev->dev, ae_dev);
+		pci_set_drvdata(pdev, NULL);
+	}
 
-	return 0;
+	return ret;
 }
 
 /* hns3_remove - Device removal routine
-- 
2.20.1



^ permalink raw reply related

* [PATCH net-next 00/12] code optimizations & bugfixes for HNS3 driver
From: Huazhong Tan @ 2019-01-30 20:55 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-kernel, huangdaode, yisen.zhuang, salil.mehta,
	linuxarm, Huazhong Tan

This patchset includes bugfixes and code optimizations for the HNS3
Ethernet controller driver.

Huazhong Tan (4):
  net: hns3: change hnae3_register_ae_dev() to int
  net: hns3: Fix NULL deref when unloading driver
  net: hns3: fix netif_napi_del() not do problem when unloading
  net: hns3: fix improper error handling in the hclge_init_ae_dev()

Jian Shen (4):
  net: hns3: fix VF dump register issue
  net: hns3: fix for rss result nonuniform
  net: hns3: stop sending keep alive msg to PF when VF is resetting
  net: hns3: keep flow director state unchanged when reset

Peng Li (2):
  net: hns3: use the correct interface to stop|open port
  net: hns3: fix an issue for hclgevf_ae_get_hdev

Yunsheng Lin (1):
  net: hns3: only support tc 0 for VF

liyongxin (1):
  net: hns3: reuse the definition of l3 and l4 header info union

 drivers/net/ethernet/hisilicon/hns3/hnae3.c   | 10 +-
 drivers/net/ethernet/hisilicon/hns3/hnae3.h   |  4 +-
 .../net/ethernet/hisilicon/hns3/hns3_enet.c   | 95 +++++++++----------
 .../net/ethernet/hisilicon/hns3/hns3_enet.h   |  1 +
 .../ethernet/hisilicon/hns3/hns3_ethtool.c    |  6 +-
 .../hisilicon/hns3/hns3pf/hclge_dcb.c         | 12 +--
 .../hisilicon/hns3/hns3pf/hclge_main.c        | 50 +++++-----
 .../hisilicon/hns3/hns3pf/hclge_main.h        |  2 +-
 .../hisilicon/hns3/hns3pf/hclge_mbx.c         | 10 +-
 .../hisilicon/hns3/hns3pf/hclge_mdio.c        |  8 +-
 .../hisilicon/hns3/hns3pf/hclge_mdio.h        |  4 +-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 22 +++--
 .../hisilicon/hns3/hns3vf/hclgevf_main.c      | 25 ++++-
 13 files changed, 145 insertions(+), 104 deletions(-)

-- 
2.20.1



^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox