Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next v3 5/7] selftests/bpf: support BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG
From: Stanislav Fomichev @ 2019-07-25 22:52 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Petar Penkov,
	Willem de Bruijn, Song Liu
In-Reply-To: <20190725225231.195090-1-sdf@google.com>

bpf_flow.c: exit early unless BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG is
passed in flags. Also, set ip_proto earlier, this makes sure we have
correct value with fragmented packets.

Add selftest cases to test ipv4/ipv6 fragments and skip eth_get_headlen
tests that don't have BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG flag.

eth_get_headlen calls flow dissector with
BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG flag so we can't run tests that
have different set of input flags against it.

v2:
 * sefltests -> selftests (Willem de Bruijn)
 * Reword a comment about eth_get_headlen flags (Song Liu)

Acked-by: Petar Penkov <ppenkov@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 .../selftests/bpf/prog_tests/flow_dissector.c | 133 +++++++++++++++++-
 tools/testing/selftests/bpf/progs/bpf_flow.c  |  29 +++-
 2 files changed, 155 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
index c938283ac232..8e8c18aced9b 100644
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
@@ -5,6 +5,10 @@
 #include <linux/if_tun.h>
 #include <sys/uio.h>
 
+#ifndef IP_MF
+#define IP_MF 0x2000
+#endif
+
 #define CHECK_FLOW_KEYS(desc, got, expected)				\
 	CHECK_ATTR(memcmp(&got, &expected, sizeof(got)) != 0,		\
 	      desc,							\
@@ -49,6 +53,18 @@ struct ipv6_pkt {
 	struct tcphdr tcp;
 } __packed;
 
+struct ipv6_frag_pkt {
+	struct ethhdr eth;
+	struct ipv6hdr iph;
+	struct frag_hdr {
+		__u8 nexthdr;
+		__u8 reserved;
+		__be16 frag_off;
+		__be32 identification;
+	} ipf;
+	struct tcphdr tcp;
+} __packed;
+
 struct dvlan_ipv6_pkt {
 	struct ethhdr eth;
 	__u16 vlan_tci;
@@ -65,9 +81,11 @@ struct test {
 		struct ipv4_pkt ipv4;
 		struct svlan_ipv4_pkt svlan_ipv4;
 		struct ipv6_pkt ipv6;
+		struct ipv6_frag_pkt ipv6_frag;
 		struct dvlan_ipv6_pkt dvlan_ipv6;
 	} pkt;
 	struct bpf_flow_keys keys;
+	__u32 flags;
 };
 
 #define VLAN_HLEN	4
@@ -143,6 +161,102 @@ struct test tests[] = {
 			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
 		},
 	},
+	{
+		.name = "ipv4-frag",
+		.pkt.ipv4 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_TCP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.frag_off = __bpf_constant_htons(IP_MF),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_frag = true,
+			.is_first_frag = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+		.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+	},
+	{
+		.name = "ipv4-no-frag",
+		.pkt.ipv4 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_TCP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.frag_off = __bpf_constant_htons(IP_MF),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_frag = true,
+			.is_first_frag = true,
+		},
+	},
+	{
+		.name = "ipv6-frag",
+		.pkt.ipv6_frag = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_FRAGMENT,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.ipf.nexthdr = IPPROTO_TCP,
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr) +
+				sizeof(struct frag_hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.is_frag = true,
+			.is_first_frag = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+		.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+	},
+	{
+		.name = "ipv6-no-frag",
+		.pkt.ipv6_frag = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_FRAGMENT,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.ipf.nexthdr = IPPROTO_TCP,
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr) +
+				sizeof(struct frag_hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.is_frag = true,
+			.is_first_frag = true,
+		},
+	},
 };
 
 static int create_tap(const char *ifname)
@@ -225,6 +339,13 @@ void test_flow_dissector(void)
 			.data_size_in = sizeof(tests[i].pkt),
 			.data_out = &flow_keys,
 		};
+		static struct bpf_flow_keys ctx = {};
+
+		if (tests[i].flags) {
+			tattr.ctx_in = &ctx;
+			tattr.ctx_size_in = sizeof(ctx);
+			ctx.flags = tests[i].flags;
+		}
 
 		err = bpf_prog_test_run_xattr(&tattr);
 		CHECK_ATTR(tattr.data_size_out != sizeof(flow_keys) ||
@@ -251,10 +372,20 @@ void test_flow_dissector(void)
 	CHECK(err, "ifup", "err %d errno %d\n", err, errno);
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
-		struct bpf_flow_keys flow_keys = {};
+		/* Keep in sync with 'flags' from eth_get_headlen. */
+		__u32 eth_get_headlen_flags =
+			BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
 		struct bpf_prog_test_run_attr tattr = {};
+		struct bpf_flow_keys flow_keys = {};
 		__u32 key = 0;
 
+		/* For skb-less case we can't pass input flags; run
+		 * only the tests that have a matching set of flags.
+		 */
+
+		if (tests[i].flags != eth_get_headlen_flags)
+			continue;
+
 		err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt));
 		CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno);
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c
index 5ae485a6af3f..f931cd3db9d4 100644
--- a/tools/testing/selftests/bpf/progs/bpf_flow.c
+++ b/tools/testing/selftests/bpf/progs/bpf_flow.c
@@ -153,7 +153,6 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 	struct tcphdr *tcp, _tcp;
 	struct udphdr *udp, _udp;
 
-	keys->ip_proto = proto;
 	switch (proto) {
 	case IPPROTO_ICMP:
 		icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp);
@@ -231,7 +230,6 @@ static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
 {
 	struct bpf_flow_keys *keys = skb->flow_keys;
 
-	keys->ip_proto = nexthdr;
 	switch (nexthdr) {
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_DSTOPTS:
@@ -266,6 +264,7 @@ PROG(IP)(struct __sk_buff *skb)
 	keys->addr_proto = ETH_P_IP;
 	keys->ipv4_src = iph->saddr;
 	keys->ipv4_dst = iph->daddr;
+	keys->ip_proto = iph->protocol;
 
 	keys->thoff += iph->ihl << 2;
 	if (data + keys->thoff > data_end)
@@ -273,13 +272,20 @@ PROG(IP)(struct __sk_buff *skb)
 
 	if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
 		keys->is_frag = true;
-		if (iph->frag_off & bpf_htons(IP_OFFSET))
+		if (iph->frag_off & bpf_htons(IP_OFFSET)) {
 			/* From second fragment on, packets do not have headers
 			 * we can parse.
 			 */
 			done = true;
-		else
+		} else {
 			keys->is_first_frag = true;
+			/* No need to parse fragmented packet unless
+			 * explicitly asked for.
+			 */
+			if (!(keys->flags &
+			      BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
+				done = true;
+		}
 	}
 
 	if (done)
@@ -301,6 +307,7 @@ PROG(IPV6)(struct __sk_buff *skb)
 	memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr));
 
 	keys->thoff += sizeof(struct ipv6hdr);
+	keys->ip_proto = ip6h->nexthdr;
 
 	return parse_ipv6_proto(skb, ip6h->nexthdr);
 }
@@ -317,7 +324,8 @@ PROG(IPV6OP)(struct __sk_buff *skb)
 	/* hlen is in 8-octets and does not include the first 8 bytes
 	 * of the header
 	 */
-	skb->flow_keys->thoff += (1 + ip6h->hdrlen) << 3;
+	keys->thoff += (1 + ip6h->hdrlen) << 3;
+	keys->ip_proto = ip6h->nexthdr;
 
 	return parse_ipv6_proto(skb, ip6h->nexthdr);
 }
@@ -333,9 +341,18 @@ PROG(IPV6FR)(struct __sk_buff *skb)
 
 	keys->thoff += sizeof(*fragh);
 	keys->is_frag = true;
-	if (!(fragh->frag_off & bpf_htons(IP6_OFFSET)))
+	keys->ip_proto = fragh->nexthdr;
+
+	if (!(fragh->frag_off & bpf_htons(IP6_OFFSET))) {
 		keys->is_first_frag = true;
 
+		/* No need to parse fragmented packet unless
+		 * explicitly asked for.
+		 */
+		if (!(keys->flags & BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
+			return export_flow_keys(keys, BPF_OK);
+	}
+
 	return parse_ipv6_proto(skb, fragh->nexthdr);
 }
 
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related

* [PATCH bpf-next v3 4/7] tools/bpf: sync bpf_flow_keys flags
From: Stanislav Fomichev @ 2019-07-25 22:52 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Petar Penkov,
	Willem de Bruijn, Song Liu
In-Reply-To: <20190725225231.195090-1-sdf@google.com>

Export bpf_flow_keys flags to tools/libbpf/selftests.

Acked-by: Petar Penkov <ppenkov@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 tools/include/uapi/linux/bpf.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4e455018da65..2e4b0848d795 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3504,6 +3504,10 @@ enum bpf_task_fd_type {
 	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
 };
 
+#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG		(1U << 0)
+#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL		(1U << 1)
+#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP		(1U << 2)
+
 struct bpf_flow_keys {
 	__u16	nhoff;
 	__u16	thoff;
@@ -3525,6 +3529,7 @@ struct bpf_flow_keys {
 			__u32	ipv6_dst[4];	/* in6_addr; network order */
 		};
 	};
+	__u32	flags;
 };
 
 struct bpf_func_info {
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related

* [PATCH bpf-next v3 3/7] bpf/flow_dissector: support flags in BPF_PROG_TEST_RUN
From: Stanislav Fomichev @ 2019-07-25 22:52 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Petar Penkov,
	Willem de Bruijn, Song Liu
In-Reply-To: <20190725225231.195090-1-sdf@google.com>

This will allow us to write tests for those flags.

v2:
* Swap kfree(data) and kfree(user_ctx) (Song Liu)

Acked-by: Petar Penkov <ppenkov@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 net/bpf/test_run.c | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 4e41d15a1098..1153bbcdff72 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -377,6 +377,22 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 	return ret;
 }
 
+static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx)
+{
+	/* make sure the fields we don't use are zeroed */
+	if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags)))
+		return -EINVAL;
+
+	/* flags is allowed */
+
+	if (!range_is_zero(ctx, offsetof(struct bpf_flow_keys, flags) +
+			   FIELD_SIZEOF(struct bpf_flow_keys, flags),
+			   sizeof(struct bpf_flow_keys)))
+		return -EINVAL;
+
+	return 0;
+}
+
 int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 				     const union bpf_attr *kattr,
 				     union bpf_attr __user *uattr)
@@ -384,9 +400,11 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	u32 size = kattr->test.data_size_in;
 	struct bpf_flow_dissector ctx = {};
 	u32 repeat = kattr->test.repeat;
+	struct bpf_flow_keys *user_ctx;
 	struct bpf_flow_keys flow_keys;
 	u64 time_start, time_spent = 0;
 	const struct ethhdr *eth;
+	unsigned int flags = 0;
 	u32 retval, duration;
 	void *data;
 	int ret;
@@ -395,9 +413,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR)
 		return -EINVAL;
 
-	if (kattr->test.ctx_in || kattr->test.ctx_out)
-		return -EINVAL;
-
 	if (size < ETH_HLEN)
 		return -EINVAL;
 
@@ -410,6 +425,18 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	if (!repeat)
 		repeat = 1;
 
+	user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys));
+	if (IS_ERR(user_ctx)) {
+		kfree(data);
+		return PTR_ERR(user_ctx);
+	}
+	if (user_ctx) {
+		ret = verify_user_bpf_flow_keys(user_ctx);
+		if (ret)
+			goto out;
+		flags = user_ctx->flags;
+	}
+
 	ctx.flow_keys = &flow_keys;
 	ctx.data = data;
 	ctx.data_end = (__u8 *)data + size;
@@ -419,7 +446,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	time_start = ktime_get_ns();
 	for (i = 0; i < repeat; i++) {
 		retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
-					  size, 0);
+					  size, flags);
 
 		if (signal_pending(current)) {
 			preempt_enable();
@@ -450,8 +477,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 
 	ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
 			      retval, duration);
+	if (!ret)
+		ret = bpf_ctx_finish(kattr, uattr, user_ctx,
+				     sizeof(struct bpf_flow_keys));
 
 out:
+	kfree(user_ctx);
 	kfree(data);
 	return ret;
 }
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related

* [PATCH bpf-next v3 2/7] bpf/flow_dissector: document flags
From: Stanislav Fomichev @ 2019-07-25 22:52 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Petar Penkov,
	Willem de Bruijn, Song Liu
In-Reply-To: <20190725225231.195090-1-sdf@google.com>

Describe what each input flag does and who uses it.

Acked-by: Petar Penkov <ppenkov@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 Documentation/bpf/prog_flow_dissector.rst | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Documentation/bpf/prog_flow_dissector.rst b/Documentation/bpf/prog_flow_dissector.rst
index ed343abe541e..a78bf036cadd 100644
--- a/Documentation/bpf/prog_flow_dissector.rst
+++ b/Documentation/bpf/prog_flow_dissector.rst
@@ -26,6 +26,7 @@ and output arguments.
   * ``nhoff`` - initial offset of the networking header
   * ``thoff`` - initial offset of the transport header, initialized to nhoff
   * ``n_proto`` - L3 protocol type, parsed out of L2 header
+  * ``flags`` - optional flags
 
 Flow dissector BPF program should fill out the rest of the ``struct
 bpf_flow_keys`` fields. Input arguments ``nhoff/thoff/n_proto`` should be
@@ -101,6 +102,23 @@ can be called for both cases and would have to be written carefully to
 handle both cases.
 
 
+Flags
+=====
+
+``flow_keys->flags`` might contain optional input flags that work as follows:
+
+* ``BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG`` - tells BPF flow dissector to
+  continue parsing first fragment; the default expected behavior is that
+  flow dissector returns as soon as it finds out that the packet is fragmented;
+  used by ``eth_get_headlen`` to estimate length of all headers for GRO.
+* ``BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL`` - tells BPF flow dissector to
+  stop parsing as soon as it reaches IPv6 flow label; used by
+  ``___skb_get_hash`` and ``__skb_get_hash_symmetric`` to get flow hash.
+* ``BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP`` - tells BPF flow dissector to stop
+  parsing as soon as it reaches encapsulated headers; used by routing
+  infrastructure.
+
+
 Reference Implementation
 ========================
 
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related

* [PATCH bpf-next v3 1/7] bpf/flow_dissector: pass input flags to BPF flow dissector program
From: Stanislav Fomichev @ 2019-07-25 22:52 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Petar Penkov,
	Willem de Bruijn, Song Liu
In-Reply-To: <20190725225231.195090-1-sdf@google.com>

C flow dissector supports input flags that tell it to customize parsing
by either stopping early or trying to parse as deep as possible. Pass
those flags to the BPF flow dissector so it can make the same
decisions. In the next commits I'll add support for those flags to
our reference bpf_flow.c

v3:
* Export copy of flow dissector flags instead of moving (Alexei Starovoitov)

Acked-by: Petar Penkov <ppenkov@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 include/linux/skbuff.h    |  2 +-
 include/uapi/linux/bpf.h  |  5 +++++
 net/bpf/test_run.c        |  2 +-
 net/core/flow_dissector.c | 12 ++++++++++--
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 718742b1c505..9b7a8038beec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1271,7 +1271,7 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
 
 struct bpf_flow_dissector;
 bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
-		      __be16 proto, int nhoff, int hlen);
+		      __be16 proto, int nhoff, int hlen, unsigned int flags);
 
 bool __skb_flow_dissect(const struct net *net,
 			const struct sk_buff *skb,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fa1c753dcdbc..88b9d743036f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3507,6 +3507,10 @@ enum bpf_task_fd_type {
 	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
 };
 
+#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG		(1U << 0)
+#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL		(1U << 1)
+#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP		(1U << 2)
+
 struct bpf_flow_keys {
 	__u16	nhoff;
 	__u16	thoff;
@@ -3528,6 +3532,7 @@ struct bpf_flow_keys {
 			__u32	ipv6_dst[4];	/* in6_addr; network order */
 		};
 	};
+	__u32	flags;
 };
 
 struct bpf_func_info {
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 80e6f3a6864d..4e41d15a1098 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -419,7 +419,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	time_start = ktime_get_ns();
 	for (i = 0; i < repeat; i++) {
 		retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
-					  size);
+					  size, 0);
 
 		if (signal_pending(current)) {
 			preempt_enable();
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3e6fedb57bc1..50ed1a688709 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -784,7 +784,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
 }
 
 bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
-		      __be16 proto, int nhoff, int hlen)
+		      __be16 proto, int nhoff, int hlen, unsigned int flags)
 {
 	struct bpf_flow_keys *flow_keys = ctx->flow_keys;
 	u32 result;
@@ -795,6 +795,14 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
 	flow_keys->nhoff = nhoff;
 	flow_keys->thoff = flow_keys->nhoff;
 
+	BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG !=
+		     (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG);
+	BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL !=
+		     (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+	BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP !=
+		     (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+	flow_keys->flags = flags;
+
 	preempt_disable();
 	result = BPF_PROG_RUN(prog, ctx);
 	preempt_enable();
@@ -914,7 +922,7 @@ bool __skb_flow_dissect(const struct net *net,
 			}
 
 			ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff,
-					       hlen);
+					       hlen, flags);
 			__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
 						 target_container);
 			rcu_read_unlock();
-- 
2.22.0.709.g102302147b-goog


^ permalink raw reply related

* [PATCH bpf-next v3 0/7] bpf/flow_dissector: support input flags
From: Stanislav Fomichev @ 2019-07-25 22:52 UTC (permalink / raw)
  To: netdev, bpf
  Cc: davem, ast, daniel, Stanislav Fomichev, Song Liu,
	Willem de Bruijn, Petar Penkov

C flow dissector supports input flags that tell it to customize parsing
by either stopping early or trying to parse as deep as possible.
BPF flow dissector always parses as deep as possible which is sub-optimal.
Pass input flags to the BPF flow dissector as well so it can make the same
decisions.

Series outline:
* remove unused FLOW_DISSECTOR_F_STOP_AT_L3 flag
* export FLOW_DISSECTOR_F_XXX flags as uapi and pass them to BPF
  flow dissector
* add documentation for the export flags
* support input flags in BPF_PROG_TEST_RUN via ctx_{in,out}
* sync uapi to tools
* support FLOW_DISSECTOR_F_PARSE_1ST_FRAG in selftest
* support FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL in kernel and selftest
* support FLOW_DISSECTOR_F_STOP_AT_ENCAP in selftest

Pros:
* makes BPF flow dissector faster by avoiding burning extra cycles
* existing BPF progs continue to work by ignoring the flags and always
  parsing as deep as possible

Cons:
* new UAPI which we need to support (OTOH, if we need to deprecate some
  flags, we can just stop setting them upon calling BPF programs)

Some numbers (with .repeat = 4000000 in test_flow_dissector):
        test_flow_dissector:PASS:ipv4-frag 35 nsec
        test_flow_dissector:PASS:ipv4-frag 35 nsec
        test_flow_dissector:PASS:ipv4-no-frag 32 nsec
        test_flow_dissector:PASS:ipv4-no-frag 32 nsec

        test_flow_dissector:PASS:ipv6-frag 39 nsec
        test_flow_dissector:PASS:ipv6-frag 39 nsec
        test_flow_dissector:PASS:ipv6-no-frag 36 nsec
        test_flow_dissector:PASS:ipv6-no-frag 36 nsec

        test_flow_dissector:PASS:ipv6-flow-label 36 nsec
        test_flow_dissector:PASS:ipv6-flow-label 36 nsec
        test_flow_dissector:PASS:ipv6-no-flow-label 33 nsec
        test_flow_dissector:PASS:ipv6-no-flow-label 33 nsec

        test_flow_dissector:PASS:ipip-encap 38 nsec
        test_flow_dissector:PASS:ipip-encap 38 nsec
        test_flow_dissector:PASS:ipip-no-encap 32 nsec
        test_flow_dissector:PASS:ipip-no-encap 32 nsec

The improvement is around 10%, but it's in a tight cache-hot
BPF_PROG_TEST_RUN loop.

Cc: Song Liu <songliubraving@fb.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Petar Penkov <ppenkov@google.com>

Stanislav Fomichev (7):
  bpf/flow_dissector: pass input flags to BPF flow dissector program
  bpf/flow_dissector: document flags
  bpf/flow_dissector: support flags in BPF_PROG_TEST_RUN
  tools/bpf: sync bpf_flow_keys flags
  selftests/bpf: support BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG
  bpf/flow_dissector: support ipv6 flow_label and
    BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL
  selftests/bpf: support BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP

 Documentation/bpf/prog_flow_dissector.rst     |  18 ++
 include/linux/skbuff.h                        |   2 +-
 include/uapi/linux/bpf.h                      |   6 +
 net/bpf/test_run.c                            |  39 ++-
 net/core/flow_dissector.c                     |  21 +-
 tools/include/uapi/linux/bpf.h                |   6 +
 .../selftests/bpf/prog_tests/flow_dissector.c | 243 +++++++++++++++++-
 tools/testing/selftests/bpf/progs/bpf_flow.c  |  47 +++-
 8 files changed, 368 insertions(+), 14 deletions(-)

-- 
2.22.0.709.g102302147b-goog

^ permalink raw reply

* Re: [PATCH net-next 07/11] net: hns3: adds debug messages to identify eth down cause
From: Saeed Mahameed @ 2019-07-25 21:59 UTC (permalink / raw)
  To: tanhuazhong@huawei.com, davem@davemloft.net,
	liuyonglong@huawei.com
  Cc: lipeng321@huawei.com, yisen.zhuang@huawei.com,
	salil.mehta@huawei.com, linuxarm@huawei.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <30483e38-5e4a-0111-f431-4742ceb1aa62@huawei.com>

On Thu, 2019-07-25 at 20:28 +0800, liuyonglong wrote:
> 
> On 2019/7/25 3:12, Saeed Mahameed wrote:
> > On Wed, 2019-07-24 at 11:18 +0800, Huazhong Tan wrote:
> > > From: Yonglong Liu <liuyonglong@huawei.com>
> > > 
> > > Some times just see the eth interface have been down/up via
> > > dmesg, but can not know why the eth down. So adds some debug
> > > messages to identify the cause for this.
> > > 
> > 
> > I really don't like this. your default msg lvl has NETIF_MSG_IFDOWN
> > turned on .. dumping every single operation that happens on your
> > device
> > by default to kernel log is too much ! 
> > 
> > We should really consider using trace buffers with well defined
> > structures for vendor specific events. so we can use bpf filters
> > and
> > state of the art tools for netdev debugging.
> > 
> 
> We do this because we can just see a link down message in dmesg, and
> had
> take a long time to found the cause of link down, just because
> another
> user changed the settings.
> 
> We can change the net_open/net_stop/dcbnl_ops to msg_drv (not default
> turned on),  and want to keep the others default print to kernel log,
> is it acceptable?
> 

acceptable as long as debug information are kept off by default and
your driver doens't spam the kernel log.

you should use dynamic debug [1] and/or "off by default" msg lvls for
debugging information..

I couldn't find any rules regarding what to put in kernel log, Maybe
someone can share ?. but i vaguely remember that the recommendation
for device drivers is to put nothing, only error/warning messages.

[1] 
https://www.kernel.org/doc/html/v4.15/admin-guide/dynamic-debug-howto.html

> > > @@ -1593,6 +1603,11 @@ static int hns3_ndo_set_vf_vlan(struct
> > > net_device *netdev, int vf, u16 vlan,
> > >  	struct hnae3_handle *h = hns3_get_handle(netdev);
> > >  	int ret = -EIO;
> > >  
> > > +	if (netif_msg_ifdown(h))
> > 
> > why msg_ifdown ? looks like netif_msg_drv is more appropriate, for
> > many
> > of the cases in this patch.
> > 
> 
> This operation may cause link down, so we use msg_ifdown.
> 

ifdown isn't link down.. 

to be honest, I couldn't find any documentation explaining how/when to
use msg lvls, (i didn't look too deep though), by looking at other
drivers, my interpretations is:

ifdup (open/boot up flow)
ifdwon (close/teardown flow)
drv (driver based or dynamic flows) 
etc .. 

-Saeed.

^ permalink raw reply

* Re: general protection fault in tls_trim_both_msgs
From: Jakub Kicinski @ 2019-07-25 21:39 UTC (permalink / raw)
  To: syzbot
  Cc: ast, aviadye, borisp, bpf, daniel, davejwatson, davem,
	john.fastabend, kafai, linux-kernel, netdev, songliubraving,
	syzkaller-bugs, yhs
In-Reply-To: <0000000000002b4896058e7abf78@google.com>

On Wed, 24 Jul 2019 22:32:07 -0700, syzbot wrote:
> Hello,
> 
> syzbot found the following crash on:
> 
> HEAD commit:    9e6dfe80 Add linux-next specific files for 20190724
> git tree:       linux-next
> console output: https://syzkaller.appspot.com/x/log.txt?x=1046971fa00000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=6cbb8fc2cf2842d7
> dashboard link: https://syzkaller.appspot.com/bug?extid=0e0fedcad708d12d3032
> compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
> 
> Unfortunately, I don't have any reproducer for this crash yet.

Looks very like the issue we mentioned in the cover letter for unhash
fixes. TX is waiting for mem, the connection dies, we free ctx, TX
wakes up with a now stale ctx pointer. I'm testing a fix for this,
Netronome team was actually able to trigger a NULL-deref on the RX
side, because there ctx is reloaded but not NULL-checked.

> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+0e0fedcad708d12d3032@syzkaller.appspotmail.com
> 
> kasan: CONFIG_KASAN_INLINE enabled
> kasan: GPF could be caused by NULL-ptr deref or user memory access
> general protection fault: 0000 [#1] PREEMPT SMP KASAN
> CPU: 1 PID: 15517 Comm: syz-executor.4 Not tainted 5.3.0-rc1-next-20190724  
> #50
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
> Google 01/01/2011
> RIP: 0010:tls_trim_both_msgs+0x54/0x130 net/tls/tls_sw.c:268
> Code: 48 c1 ea 03 80 3c 02 00 0f 85 e3 00 00 00 4d 8b b5 b0 06 00 00 48 b8  
> 00 00 00 00 00 fc ff df 49 8d 7e 28 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f  
> 85 b3 00 00 00 48 b8 00 00 00 00 00 fc ff df 49 8b
> RSP: 0018:ffff8880612cfac0 EFLAGS: 00010206
> RAX: dffffc0000000000 RBX: ffff8880a8794340 RCX: ffffc9000e7b9000
> RDX: 0000000000000005 RSI: ffffffff86298656 RDI: 0000000000000028
> RBP: ffff8880612cfae0 R08: ffff88805ae4c580 R09: fffffbfff14a8155
> R10: fffffbfff14a8154 R11: ffffffff8a540aa7 R12: 0000000000000000
> R13: ffff888061d82e00 R14: 0000000000000000 R15: 00000000ffffffe0
> FS:  00007f7d33516700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000001b2fa2f000 CR3: 000000009fcf1000 CR4: 00000000001406e0
> Call Trace:
>   tls_sw_sendmsg+0xe38/0x17b0 net/tls/tls_sw.c:1057
>   inet6_sendmsg+0x9e/0xe0 net/ipv6/af_inet6.c:576
>   sock_sendmsg_nosec net/socket.c:637 [inline]
>   sock_sendmsg+0xd7/0x130 net/socket.c:657
>   __sys_sendto+0x262/0x380 net/socket.c:1952
>   __do_sys_sendto net/socket.c:1964 [inline]
>   __se_sys_sendto net/socket.c:1960 [inline]
>   __x64_sys_sendto+0xe1/0x1a0 net/socket.c:1960
>   do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x459829
> Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7  
> 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
> ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> RSP: 002b:00007f7d33515c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
> RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 0000000000459829
> RDX: ffffffffffffffc1 RSI: 00000000200005c0 RDI: 0000000000000003
> RBP: 000000000075bf20 R08: 0000000000000000 R09: 1201000000003618
> R10: 0000000000000000 R11: 0000000000000246 R12: 00007f7d335166d4
> R13: 00000000004c7669 R14: 00000000004dcc70 R15: 00000000ffffffff
> Modules linked in:
> ---[ end trace 2dd728cceb39a185 ]---
> RIP: 0010:tls_trim_both_msgs+0x54/0x130 net/tls/tls_sw.c:268
> Code: 48 c1 ea 03 80 3c 02 00 0f 85 e3 00 00 00 4d 8b b5 b0 06 00 00 48 b8  
> 00 00 00 00 00 fc ff df 49 8d 7e 28 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f  
> 85 b3 00 00 00 48 b8 00 00 00 00 00 fc ff df 49 8b
> RSP: 0018:ffff8880612cfac0 EFLAGS: 00010206
> RAX: dffffc0000000000 RBX: ffff8880a8794340 RCX: ffffc9000e7b9000
> RDX: 0000000000000005 RSI: ffffffff86298656 RDI: 0000000000000028
> RBP: ffff8880612cfae0 R08: ffff88805ae4c580 R09: fffffbfff14a8155
> R10: fffffbfff14a8154 R11: ffffffff8a540aa7 R12: 0000000000000000
> R13: ffff888061d82e00 R14: 0000000000000000 R15: 00000000ffffffe0
> FS:  00007f7d33516700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00000000019dbe80 CR3: 000000009fcf1000 CR4: 00000000001406e0
> 
> 
> ---
> This bug is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
> 
> syzbot will keep track of this bug report. See:
> https://goo.gl/tpsmEJ#status for how to communicate with syzbot.


^ permalink raw reply

* Re: [PATCH net-next 1/2] net sched: update skbedit action for batched events operations
From: Roman Mashak @ 2019-07-25 21:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri
In-Reply-To: <20190713.192344.1454771658469437265.davem@davemloft.net>

David Miller <davem@davemloft.net> writes:

> From: Roman Mashak <mrv@mojatatu.com>
> Date: Fri, 12 Jul 2019 18:21:59 -0400
>
>> Add get_fill_size() routine used to calculate the action size
>> when building a batch of events.
>> 
>> Signed-off-by: Roman Mashak <mrv@mojatatu.com>
>
> Please add an appropriate Fixes: tag, and also when you respin this provide the
> required "[PATCH 0/N] " header posting explaining what the series is doing at
> a high level, how it is doing it, and why it is doing it that way.

Thanks for feedback, I've sent v2 of this patchset.

^ permalink raw reply

* Re: [PATCH net-next 06/11] net: hns3: modify firmware version display format
From: Saeed Mahameed @ 2019-07-25 21:32 UTC (permalink / raw)
  To: tanhuazhong@huawei.com, davem@davemloft.net
  Cc: lipeng321@huawei.com, yisen.zhuang@huawei.com,
	salil.mehta@huawei.com, linuxarm@huawei.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	moyufeng@huawei.com
In-Reply-To: <95783289-9b3b-f085-876b-49815b07d595@huawei.com>

On Thu, 2019-07-25 at 10:34 +0800, tanhuazhong wrote:
> 
> On 2019/7/25 2:34, Saeed Mahameed wrote:
> > On Wed, 2019-07-24 at 11:18 +0800, Huazhong Tan wrote:
> > > From: Yufeng Mo <moyufeng@huawei.com>
> > > 
> > > This patch modifies firmware version display format in
> > > hclge(vf)_cmd_init() and hns3_get_drvinfo(). Also, adds
> > > some optimizations for firmware version display format.
> > > 
> > > Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
> > > Signed-off-by: Peng Li <lipeng321@huawei.com>
> > > Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
> > > ---
> > >   drivers/net/ethernet/hisilicon/hns3/hnae3.h              |  9
> > > +++++++++
> > >   drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c       | 15
> > > +++++++++++++--
> > >   drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c   | 10
> > > +++++++++-
> > >   drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c | 11
> > > +++++++++--
> > >   4 files changed, 40 insertions(+), 5 deletions(-)
> > > 
> > > 

[...]

> > > --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
> > > +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
> > > @@ -419,7 +419,15 @@ int hclge_cmd_init(struct hclge_dev *hdev)
> > >   	}
> > >   	hdev->fw_version = version;
> > >   
> > > -	dev_info(&hdev->pdev->dev, "The firmware version is %08x\n",
> > > version);
> > > +	pr_info_once("The firmware version is %lu.%lu.%lu.%lu\n",
> > > +		     hnae3_get_field(version,
> > > HNAE3_FW_VERSION_BYTE3_MASK,
> > > +				     HNAE3_FW_VERSION_BYTE3_SHIFT),
> > > +		     hnae3_get_field(version,
> > > HNAE3_FW_VERSION_BYTE2_MASK,
> > > +				     HNAE3_FW_VERSION_BYTE2_SHIFT),
> > > +		     hnae3_get_field(version,
> > > HNAE3_FW_VERSION_BYTE1_MASK,
> > > +				     HNAE3_FW_VERSION_BYTE1_SHIFT),
> > > +		     hnae3_get_field(version,
> > > HNAE3_FW_VERSION_BYTE0_MASK,
> > > +				     HNAE3_FW_VERSION_BYTE0_SHIFT));
> > >   
> > 
> > Device name/string will not be printed now, what happens if i have
> > multiple devices ? at least print the device name as it was before
> > 
> Since on each board we only have one firmware, the firmware
> version is same per device, and will not change when running.
> So pr_info_once() looks good for this case.
> 

boards change too often to have such static assumption.

> BTW, maybe we should change below print in the end of
> hclge_init_ae_dev(), use dev_info() instead of pr_info(),
> then we can know that which device has already initialized.
> I will send other patch to do that, is it acceptable for you?
> 
> "pr_info("%s driver initialization finished.\n", HCLGE_DRIVER_NAME);"
> 

I would avoid using pr_info when i can ! if you have the option to
print with dev information as it was before that is preferable.

Thanks,
Saeed.


^ permalink raw reply

* Re: [PATCH net-next 2/2] mlx4/en_netdev: call notifiers when hw_enc_features change
From: Saeed Mahameed @ 2019-07-25 21:27 UTC (permalink / raw)
  To: dcaratti@redhat.com, davem@davemloft.net, Tariq Toukan,
	netdev@vger.kernel.org
  Cc: Eran Ben Elisha
In-Reply-To: <73cd7a2a29db5a32d669273d367566cdf6652f4e.camel@redhat.com>

On Thu, 2019-07-25 at 14:25 +0200, Davide Caratti wrote:
> On Wed, 2019-07-24 at 20:47 +0000, Saeed Mahameed wrote:
> > On Wed, 2019-07-24 at 16:02 +0200, Davide Caratti wrote:
> > > ensure to call netdev_features_change() when the driver flips its
> > > hw_enc_features bits.
> > > 
> > > Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> > 
> > The patch is correct, 
> 
> hello Saeed, and thanks for looking at this!
> 
> > but can you explain how did you come to this ? 
> > did you encounter any issue with the current code ?
> > 
> > I am asking just because i think the whole dynamic changing of dev-
> > > hw_enc_features is redundant since mlx4 has the featutres_check
> > callback.
> 
> we need it to ensure that vlan_transfer_features() updates
> the (new) value of hw_enc_features in the overlying vlan: otherwise,
> segmentation will happen anyway when skb passes from vxlan to vlan,
> if the
> vxlan is added after the vlan device has been created (see:
> 7dad9937e064
> ("net: vlan: add support for tunnel offload") ).
> 

but in previous patch you made sure that the vlan always sees the
correct hw_enc_features on driver load, we don't need to have this
dynamic update mechanism, features_check ndo should take care of
protocols we don't support.

> thanks!

^ permalink raw reply

* Re: [PATCH] net/mlx5e: Fix zero table prio set by user.
From: Saeed Mahameed @ 2019-07-25 21:22 UTC (permalink / raw)
  To: wenxu@ucloud.cn, Roi Dayan, Or Gerlitz, Mark Bloch, Paul Blakey
  Cc: netdev@vger.kernel.org
In-Reply-To: <1564053847-28756-1-git-send-email-wenxu@ucloud.cn>

On Thu, 2019-07-25 at 19:24 +0800, wenxu@ucloud.cn wrote:
> From: wenxu <wenxu@ucloud.cn>
> 
> The flow_cls_common_offload prio is zero
> 
> It leads the invalid table prio in hw.
> 
> Error: Could not process rule: Invalid argument
> 
> kernel log:
> mlx5_core 0000:81:00.0: E-Switch: Failed to create FDB Table err -22
> (table prio: 65535, level: 0, size: 4194304)
> 
> table_prio = (chain * FDB_MAX_PRIO) + prio - 1;
> should check (chain * FDB_MAX_PRIO) + prio is not 0
> 
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git
> a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> index 089ae4d..64ca90f 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> @@ -970,7 +970,9 @@ static int esw_add_fdb_miss_rule(struct 

this piece of code isn't in this function, weird how it got to the
diff, patch applies correctly though !

> mlx5_eswitch *esw)
>  		flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
>  			  MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
>  
> -	table_prio = (chain * FDB_MAX_PRIO) + prio - 1;
> +	table_prio = (chain * FDB_MAX_PRIO) + prio;
> +	if (table_prio)
> +		table_prio = table_prio - 1;
>  

This is black magic, even before this fix.
this -1 seems to be needed in order to call
create_next_size_table(table_prio) with the previous "table prio" ?
(table_prio - 1)  ?

The whole thing looks wrong to me since when prio is 0 and chain is 0,
there is not such thing table_prio - 1.

mlnx eswitch guys in the cc, please advise.

Thanks,
Saeed.

>  	/* create earlier levels for correct fs_core lookup when
>  	 * connecting tables

^ permalink raw reply

* Re: [PATCH net-next 2/2] qed: Add API for flashing the nvm attributes.
From: Saeed Mahameed @ 2019-07-25 20:56 UTC (permalink / raw)
  To: skalluru@marvell.com, davem@davemloft.net
  Cc: aelior@marvell.com, mkalderon@marvell.com, netdev@vger.kernel.org
In-Reply-To: <MN2PR18MB25283C800F4F96BB511118A0D3C10@MN2PR18MB2528.namprd18.prod.outlook.com>

On Thu, 2019-07-25 at 00:48 +0000, Sudarsana Reddy Kalluru wrote:
> > > 
> > -----Original Message-----
> > From: Saeed Mahameed <saeedm@mellanox.com>
> > Sent: Thursday, July 25, 2019 1:13 AM
> > To: Sudarsana Reddy Kalluru <skalluru@marvell.com>;
> > davem@davemloft.net
> > Cc: Ariel Elior <aelior@marvell.com>; Michal Kalderon
> > <mkalderon@marvell.com>; netdev@vger.kernel.org
> > Subject: [EXT] Re: [PATCH net-next 2/2] qed: Add API for flashing
> > the nvm
> > attributes.
> > 
> > External Email
> > 
> > -----------------------------------------------------------------
> > -----
> > On Tue, 2019-07-23 at 21:51 -0700, Sudarsana Reddy Kalluru wrote:
> > > The patch adds driver interface for reading the NVM config
> > > request and
> > > update the attributes on nvm config flash partition.
> > > 
> > 
> > You didn't not use the get_cfg API you added in previous patch.
> Thanks for your review. Will move this API to the next patch series
> which will plan to send shortly.
> 
> > Also can you please clarify how the user reads/write from/to NVM
> > config
> > ? i mean what UAPIs and tools are being used ?
> NVM config/partition will be updated using ethtool flash update
> command (i.e., ethtool -f) just like the update of 
> other flash partitions of qed device. Example code path,
>   ethool-flash_device --> qede_flash_device() --> qed_nvm_flash() --> 
> qed_nvm_flash_cfg_write()
> 

I see, thanks for the clarification.

^ permalink raw reply

* Re: [net 9/9] Documentation: TLS: fix stat counters description
From: Jakub Kicinski @ 2019-07-25 20:53 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: David S. Miller, netdev@vger.kernel.org, Tariq Toukan
In-Reply-To: <20190725203618.11011-10-saeedm@mellanox.com>

On Thu, 25 Jul 2019 20:36:52 +0000, Saeed Mahameed wrote:
> From: Tariq Toukan <tariqt@mellanox.com>
> 
> Add missing description of counters.
> Split tx_tls_encrypted counter into two, to give packets
> and bytes indications.
> 
> Fixes: f42c104f2ec9 ("Documentation: add TLS offload documentation")
> Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>

Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Thanks!

^ permalink raw reply

* Re: [net 8/9] nfp: tls: rename tls packet counters
From: Jakub Kicinski @ 2019-07-25 20:50 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: David S. Miller, netdev@vger.kernel.org, Tariq Toukan
In-Reply-To: <20190725203618.11011-9-saeedm@mellanox.com>

On Thu, 25 Jul 2019 20:36:50 +0000, Saeed Mahameed wrote:
> From: Tariq Toukan <tariqt@mellanox.com>
> 
> Align to the naming convention in TLS documentation.
> 
> Fixes: 51a5e563298d ("nfp: tls: add basic statistics")
> Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>

Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>

^ permalink raw reply

* Re: [net 7/9] net/mlx5e: kTLS, Call WARN_ONCE on netdev mismatch
From: Jakub Kicinski @ 2019-07-25 20:49 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: David S. Miller, netdev@vger.kernel.org, Tariq Toukan
In-Reply-To: <20190725203618.11011-8-saeedm@mellanox.com>

On Thu, 25 Jul 2019 20:36:48 +0000, Saeed Mahameed wrote:
> From: Tariq Toukan <tariqt@mellanox.com>
> 
> A netdev mismatch in the processed TLS SKB should not occur,
> and indicates a kernel bug.
> Add WARN_ONCE to spot such cases.
> 
> Fixes: d2ead1f360e8 ("net/mlx5e: Add kTLS TX HW offload support")
> Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
> index ea032f54197e..3766545ce259 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
> @@ -412,7 +412,7 @@ struct sk_buff *mlx5e_ktls_handle_tx_skb(struct net_device *netdev,
>  		goto out;
>  
>  	tls_ctx = tls_get_ctx(skb->sk);
> -	if (unlikely(tls_ctx->netdev != netdev))
> +	if (unlikely(WARN_ON_ONCE(tls_ctx->netdev != netdev)))

Ah, nit: the unlikely is probably unnecessary but that's no big deal.

#define WARN_ON_ONCE(condition) ({			\
	static int __warned;				\
	int __ret_warn_once = !!(condition);		\
							\
	if (unlikely(__ret_warn_once && !__warned)) {	\
		__warned = true;			\
		WARN_ON(1);				\
	}						\
	unlikely(__ret_warn_once);			\
})

>  		goto err_out;
>  
>  	priv_tx = mlx5e_get_ktls_tx_priv_ctx(tls_ctx);


^ permalink raw reply

* Re: [net 7/9] net/mlx5e: kTLS, Call WARN_ONCE on netdev mismatch
From: Jakub Kicinski @ 2019-07-25 20:48 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: David S. Miller, netdev@vger.kernel.org, Tariq Toukan
In-Reply-To: <20190725203618.11011-8-saeedm@mellanox.com>

On Thu, 25 Jul 2019 20:36:48 +0000, Saeed Mahameed wrote:
> From: Tariq Toukan <tariqt@mellanox.com>
> 
> A netdev mismatch in the processed TLS SKB should not occur,
> and indicates a kernel bug.
> Add WARN_ONCE to spot such cases.
> 
> Fixes: d2ead1f360e8 ("net/mlx5e: Add kTLS TX HW offload support")
> Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>

Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>

^ permalink raw reply

* [PATCH 0/2] Make ipmr queue length configurable
From: Brodie Greenfield @ 2019-07-25 20:42 UTC (permalink / raw)
  To: davem, stephen, kuznet, yoshfuji, netdev
  Cc: linux-kernel, chris.packham, luuk.paulussen, Brodie Greenfield

We want to have some more space in our queue for processing incoming
multicast packets, so we can process more of them without dropping
them prematurely. It is useful to be able to increase this limit on
higher-spec platforms that can handle more items.

For the particular use case here at Allied Telesis, we have linux
running on our switches and routers, with support for the number of
multicast groups being increased. Basically, this queue length affects
the time taken to fully learn all of the multicast streams. 

Changes in v3:
 - Corrected a v4 to v6 typo.

Changes in v2:
 - Tidy up a few unnecessary bits of code.
 - Put the sysctl inside ip multicast ifdef.
 - Included the IPv6 version.

Brodie Greenfield (2):
  ipmr: Make cache queue length configurable
  ip6mr: Make cache queue length configurable

 Documentation/networking/ip-sysctl.txt | 16 ++++++++++++++++
 include/net/netns/ipv4.h               |  1 +
 include/net/netns/ipv6.h               |  1 +
 net/ipv4/af_inet.c                     |  1 +
 net/ipv4/ipmr.c                        |  4 +++-
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv6/af_inet6.c                    |  1 +
 net/ipv6/ip6mr.c                       |  4 +++-
 net/ipv6/sysctl_net_ipv6.c             |  7 +++++++
 9 files changed, 40 insertions(+), 2 deletions(-)

-- 
2.21.0


^ permalink raw reply

* [PATCH 2/2] ip6mr: Make cache queue length configurable
From: Brodie Greenfield @ 2019-07-25 20:42 UTC (permalink / raw)
  To: davem, stephen, kuznet, yoshfuji, netdev
  Cc: linux-kernel, chris.packham, luuk.paulussen, Brodie Greenfield
In-Reply-To: <20190725204230.12229-1-brodie.greenfield@alliedtelesis.co.nz>

We want to be able to keep more spaces available in our queue for
processing incoming IPv6 multicast traffic (adding (S,G) entries) - this
lets us learn more groups faster, rather than dropping them at this stage.

Signed-off-by: Brodie Greenfield <brodie.greenfield@alliedtelesis.co.nz>
---
 Documentation/networking/ip-sysctl.txt | 8 ++++++++
 include/net/netns/ipv6.h               | 1 +
 net/ipv6/af_inet6.c                    | 1 +
 net/ipv6/ip6mr.c                       | 4 +++-
 net/ipv6/sysctl_net_ipv6.c             | 7 +++++++
 5 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 02f77e932adf..68eada3ca915 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1481,6 +1481,14 @@ skip_notify_on_dev_down - BOOLEAN
 	on userspace caches to track link events and evict routes.
 	Default: false (generate message)
 
+ip6_mr_cache_queue_length - INTEGER
+	Limit the number of multicast packets we can have in the queue to be
+	resolved.
+	Bear in mind that when an unresolved multicast packet is received,
+	there is an O(n) traversal of the queue. This should be considered
+	if increasing.
+	Default: 10
+
 IPv6 Fragmentation:
 
 ip6frag_high_thresh - INTEGER
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index ef1ed529f33c..84b58424c799 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -46,6 +46,7 @@ struct netns_sysctl_ipv6 {
 	int max_hbh_opts_len;
 	int seg6_flowlabel;
 	bool skip_notify_on_dev_down;
+	unsigned int ip6_mr_cache_queue_length;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d99753b5e39b..6551bb63e5a2 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -856,6 +856,7 @@ static int __net_init inet6_net_init(struct net *net)
 	net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT;
 	net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN;
 	net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN;
+	net->ipv6.sysctl.ip6_mr_cache_queue_length = 10;
 	atomic_set(&net->ipv6.fib6_sernum, 1);
 
 	err = ipv6_init_mibs(net);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index cc01aa3f2b5e..bb445871437e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1135,6 +1135,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
 static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
 				  struct sk_buff *skb, struct net_device *dev)
 {
+	struct net *net = dev_net(dev);
 	struct mfc6_cache *c;
 	bool found = false;
 	int err;
@@ -1153,7 +1154,8 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
 		 *	Create a new entry if allowable
 		 */
 
-		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
+		if (atomic_read(&mrt->cache_resolve_queue_len) >=
+		    net->ipv6.sysctl.ip6_mr_cache_queue_length ||
 		    (c = ip6mr_cache_alloc_unres()) == NULL) {
 			spin_unlock_bh(&mfc_unres_lock);
 
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index e15cd37024fd..a27299d4cc34 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -159,6 +159,13 @@ static struct ctl_table ipv6_table_template[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "ip6_mr_cache_queue_length",
+		.data		= &init_net.ipv6.sysctl.ip6_mr_cache_queue_length,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
-- 
2.21.0


^ permalink raw reply related

* [PATCH 1/2] ipmr: Make cache queue length configurable
From: Brodie Greenfield @ 2019-07-25 20:42 UTC (permalink / raw)
  To: davem, stephen, kuznet, yoshfuji, netdev
  Cc: linux-kernel, chris.packham, luuk.paulussen, Brodie Greenfield
In-Reply-To: <20190725204230.12229-1-brodie.greenfield@alliedtelesis.co.nz>

We want to be able to keep more spaces available in our queue for
processing incoming multicast traffic (adding (S,G) entries) - this lets
us learn more groups faster, rather than dropping them at this stage.

Signed-off-by: Brodie Greenfield <brodie.greenfield@alliedtelesis.co.nz>
---
 Documentation/networking/ip-sysctl.txt | 8 ++++++++
 include/net/netns/ipv4.h               | 1 +
 net/ipv4/af_inet.c                     | 1 +
 net/ipv4/ipmr.c                        | 4 +++-
 net/ipv4/sysctl_net_ipv4.c             | 7 +++++++
 5 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index acdfb5d2bcaa..02f77e932adf 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -887,6 +887,14 @@ ip_local_reserved_ports - list of comma separated ranges
 
 	Default: Empty
 
+ip_mr_cache_queue_length - INTEGER
+	Limit the number of multicast packets we can have in the queue to be
+	resolved.
+	Bear in mind that when an unresolved multicast packet is received,
+	there is an O(n) traversal of the queue. This should be considered
+	if increasing.
+	Default: 10
+
 ip_unprivileged_port_start - INTEGER
 	This is a per-namespace sysctl.  It defines the first
 	unprivileged port in the network namespace.  Privileged ports
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 104a6669e344..3411d3f18d51 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -187,6 +187,7 @@ struct netns_ipv4 {
 	int sysctl_igmp_max_msf;
 	int sysctl_igmp_llm_reports;
 	int sysctl_igmp_qrv;
+	unsigned int sysctl_ip_mr_cache_queue_length;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0dfb72c46671..8e25538bdb1e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1827,6 +1827,7 @@ static __net_init int inet_init_net(struct net *net)
 	net->ipv4.sysctl_igmp_llm_reports = 1;
 	net->ipv4.sysctl_igmp_qrv = 2;
 
+	net->ipv4.sysctl_ip_mr_cache_queue_length = 10;
 	return 0;
 }
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ddbf8c9a1abb..c6a6c3e453a9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1127,6 +1127,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 				 struct sk_buff *skb, struct net_device *dev)
 {
 	const struct iphdr *iph = ip_hdr(skb);
+	struct net *net = dev_net(dev);
 	struct mfc_cache *c;
 	bool found = false;
 	int err;
@@ -1142,7 +1143,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 
 	if (!found) {
 		/* Create a new entry if allowable */
-		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
+		if (atomic_read(&mrt->cache_resolve_queue_len) >=
+		    net->ipv4.sysctl_ip_mr_cache_queue_length ||
 		    (c = ipmr_cache_alloc_unres()) == NULL) {
 			spin_unlock_bh(&mfc_unres_lock);
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ba0fc4b18465..78ae86e8c6cb 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -784,6 +784,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 #ifdef CONFIG_IP_MULTICAST
+	{
+		.procname	= "ip_mr_cache_queue_length",
+		.data		= &init_net.ipv4.sysctl_ip_mr_cache_queue_length,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "igmp_qrv",
 		.data		= &init_net.ipv4.sysctl_igmp_qrv,
-- 
2.21.0


^ permalink raw reply related

* [net 9/9] Documentation: TLS: fix stat counters description
From: Saeed Mahameed @ 2019-07-25 20:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Jakub Kicinski, Tariq Toukan,
	Saeed Mahameed
In-Reply-To: <20190725203618.11011-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Add missing description of counters.
Split tx_tls_encrypted counter into two, to give packets
and bytes indications.

Fixes: f42c104f2ec9 ("Documentation: add TLS offload documentation")
Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 Documentation/networking/tls-offload.rst | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst
index 048e5ca44824..b70b70dc4524 100644
--- a/Documentation/networking/tls-offload.rst
+++ b/Documentation/networking/tls-offload.rst
@@ -424,13 +424,24 @@ Statistics
 Following minimum set of TLS-related statistics should be reported
 by the driver:
 
- * ``rx_tls_decrypted`` - number of successfully decrypted TLS segments
- * ``tx_tls_encrypted`` - number of in-order TLS segments passed to device
-   for encryption
+ * ``rx_tls_decrypted_packets`` - number of successfully decrypted RX packets
+   which were part of a TLS stream.
+ * ``rx_tls_decrypted_bytes`` - number of TLS payload bytes in RX packets
+   which were successfully decrypted.
+ * ``tx_tls_encrypted_packets`` - number of TX packets passed to the device
+   for encryption of their TLS payload.
+ * ``tx_tls_encrypted_bytes`` - number of TLS payload bytes in TX packets
+   passed to the device for encryption.
+ * ``tx_tls_ctx`` - number of TLS TX HW offload contexts added to device for
+   encryption.
  * ``tx_tls_ooo`` - number of TX packets which were part of a TLS stream
-   but did not arrive in the expected order
- * ``tx_tls_drop_no_sync_data`` - number of TX packets dropped because
-   they arrived out of order and associated record could not be found
+   but did not arrive in the expected order.
+ * ``tx_tls_drop_no_sync_data`` - number of TX packets which were part of
+   a TLS stream dropped, because they arrived out of order and associated
+   record could not be found.
+ * ``tx_tls_drop_bypass_req`` - number of TX packets which were part of a TLS
+   stream dropped, because they contain both data that has been encrypted by
+   software and data that expects hardware crypto offload.
 
 Notable corner cases, exceptions and additional requirements
 ============================================================
-- 
2.21.0


^ permalink raw reply related

* [net 8/9] nfp: tls: rename tls packet counters
From: Saeed Mahameed @ 2019-07-25 20:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Jakub Kicinski, Tariq Toukan,
	Saeed Mahameed
In-Reply-To: <20190725203618.11011-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Align to the naming convention in TLS documentation.

Fixes: 51a5e563298d ("nfp: tls: add basic statistics")
Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index d9cbe84ac6ad..1b840ee47339 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -444,12 +444,12 @@ static u8 *nfp_vnic_get_sw_stats_strings(struct net_device *netdev, u8 *data)
 	data = nfp_pr_et(data, "hw_rx_csum_complete");
 	data = nfp_pr_et(data, "hw_rx_csum_err");
 	data = nfp_pr_et(data, "rx_replace_buf_alloc_fail");
-	data = nfp_pr_et(data, "rx_tls_decrypted");
+	data = nfp_pr_et(data, "rx_tls_decrypted_packets");
 	data = nfp_pr_et(data, "hw_tx_csum");
 	data = nfp_pr_et(data, "hw_tx_inner_csum");
 	data = nfp_pr_et(data, "tx_gather");
 	data = nfp_pr_et(data, "tx_lso");
-	data = nfp_pr_et(data, "tx_tls_encrypted");
+	data = nfp_pr_et(data, "tx_tls_encrypted_packets");
 	data = nfp_pr_et(data, "tx_tls_ooo");
 	data = nfp_pr_et(data, "tx_tls_drop_no_sync_data");
 
-- 
2.21.0


^ permalink raw reply related

* [net 7/9] net/mlx5e: kTLS, Call WARN_ONCE on netdev mismatch
From: Saeed Mahameed @ 2019-07-25 20:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Jakub Kicinski, Tariq Toukan,
	Saeed Mahameed
In-Reply-To: <20190725203618.11011-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

A netdev mismatch in the processed TLS SKB should not occur,
and indicates a kernel bug.
Add WARN_ONCE to spot such cases.

Fixes: d2ead1f360e8 ("net/mlx5e: Add kTLS TX HW offload support")
Suggested-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
index ea032f54197e..3766545ce259 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
@@ -412,7 +412,7 @@ struct sk_buff *mlx5e_ktls_handle_tx_skb(struct net_device *netdev,
 		goto out;
 
 	tls_ctx = tls_get_ctx(skb->sk);
-	if (unlikely(tls_ctx->netdev != netdev))
+	if (unlikely(WARN_ON_ONCE(tls_ctx->netdev != netdev)))
 		goto err_out;
 
 	priv_tx = mlx5e_get_ktls_tx_priv_ctx(tls_ctx);
-- 
2.21.0


^ permalink raw reply related

* [net 6/9] net/mlx5e: Prevent encap flow counter update async to user query
From: Saeed Mahameed @ 2019-07-25 20:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Jakub Kicinski, Ariel Levkovich,
	Roi Dayan, Saeed Mahameed
In-Reply-To: <20190725203618.11011-1-saeedm@mellanox.com>

From: Ariel Levkovich <lariel@mellanox.com>

This patch prevents a race between user invoked cached counters
query and a neighbor last usage updater.

The cached flow counter stats can be queried by calling
"mlx5_fc_query_cached" which provides the number of bytes and
packets that passed via this flow since the last time this counter
was queried.
It does so by reducting the last saved stats from the current, cached
stats and then updating the last saved stats with the cached stats.
It also provide the lastuse value for that flow.

Since "mlx5e_tc_update_neigh_used_value" needs to retrieve the
last usage time of encapsulation flows, it calls the flow counter
query method periodically and async to user queries of the flow counter
using cls_flower.
This call is causing the driver to update the last reported bytes and
packets from the cache and therefore, future user queries of the flow
stats will return lower than expected number for bytes and packets
since the last saved stats in the driver was updated async to the last
saved stats in cls_flower.

This causes wrong stats presentation of encapsulation flows to user.

Since the neighbor usage updater only needs the lastuse stats from the
cached counter, the fix is to use a dedicated lastuse query call that
returns the lastuse value without synching between the cached stats and
the last saved stats.

Fixes: f6dfb4c3f216 ("net/mlx5e: Update neighbour 'used' state using HW flow rules counters")
Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c       | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 5 +++++
 include/linux/mlx5/fs.h                               | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index cc096f6011d9..7ecfc53cf5f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1230,13 +1230,13 @@ static struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow)
 void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 {
 	struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
-	u64 bytes, packets, lastuse = 0;
 	struct mlx5e_tc_flow *flow;
 	struct mlx5e_encap_entry *e;
 	struct mlx5_fc *counter;
 	struct neigh_table *tbl;
 	bool neigh_used = false;
 	struct neighbour *n;
+	u64 lastuse;
 
 	if (m_neigh->family == AF_INET)
 		tbl = &arp_tbl;
@@ -1256,7 +1256,7 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 					    encaps[efi->index]);
 			if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
 				counter = mlx5e_tc_get_counter(flow);
-				mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
+				lastuse = mlx5_fc_query_lastuse(counter);
 				if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
 					neigh_used = true;
 					break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index b3762123a69c..1834d9f3aa1c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -369,6 +369,11 @@ int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
 }
 EXPORT_SYMBOL(mlx5_fc_query);
 
+u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter)
+{
+	return counter->cache.lastuse;
+}
+
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
 			  u64 *bytes, u64 *packets, u64 *lastuse)
 {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 04a569568eac..f049af3f3cd8 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -220,6 +220,7 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler,
 
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
+u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter);
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
 			  u64 *bytes, u64 *packets, u64 *lastuse);
 int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
-- 
2.21.0


^ permalink raw reply related

* [net 5/9] net/mlx5e: Fix matching of speed to PRM link modes
From: Saeed Mahameed @ 2019-07-25 20:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Jakub Kicinski, Aya Levin, Saeed Mahameed
In-Reply-To: <20190725203618.11011-1-saeedm@mellanox.com>

From: Aya Levin <ayal@mellanox.com>

Speed translation is performed based on legacy or extended PTYS
register. Translate speed with respect to:
1) Capability bit of extended PTYS table.
2) User request:
 a) When auto-negotiation is turned on, inspect advertisement whether it
 contains extended link modes.
 b) When auto-negotiation is turned off, speed > 100Gbps (maximal
 speed supported in legacy mode).
With both conditions fulfilled translation is done with extended PTYS
table otherwise use legacy PTYS table.
Without this patch 25/50/100 Gbps speed cannot be set, since try to
configure in extended mode but read from legacy mode.

Fixes: dd1b9e09c12b ("net/mlx5: ethtool, Allow legacy link-modes configuration via non-extended ptys")
Signed-off-by: Aya Levin <ayal@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en/port.c | 27 +++++---
 .../net/ethernet/mellanox/mlx5/core/en/port.h |  6 +-
 .../ethernet/mellanox/mlx5/core/en_ethtool.c  | 67 +++++++++++++------
 3 files changed, 68 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
index d5e5afbdca6d..f777994f3005 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
@@ -78,9 +78,10 @@ static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = {
 };
 
 static void mlx5e_port_get_speed_arr(struct mlx5_core_dev *mdev,
-				     const u32 **arr, u32 *size)
+				     const u32 **arr, u32 *size,
+				     bool force_legacy)
 {
-	bool ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
+	bool ext = force_legacy ? false : MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
 
 	*size = ext ? ARRAY_SIZE(mlx5e_ext_link_speed) :
 		      ARRAY_SIZE(mlx5e_link_speed);
@@ -152,7 +153,8 @@ int mlx5_port_set_eth_ptys(struct mlx5_core_dev *dev, bool an_disable,
 			    sizeof(out), MLX5_REG_PTYS, 0, 1);
 }
 
-u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper)
+u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper,
+			  bool force_legacy)
 {
 	unsigned long temp = eth_proto_oper;
 	const u32 *table;
@@ -160,7 +162,7 @@ u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper)
 	u32 max_size;
 	int i;
 
-	mlx5e_port_get_speed_arr(mdev, &table, &max_size);
+	mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy);
 	i = find_first_bit(&temp, max_size);
 	if (i < max_size)
 		speed = table[i];
@@ -170,6 +172,7 @@ u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper)
 int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 {
 	struct mlx5e_port_eth_proto eproto;
+	bool force_legacy = false;
 	bool ext;
 	int err;
 
@@ -177,8 +180,13 @@ int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 	err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
 	if (err)
 		goto out;
-
-	*speed = mlx5e_port_ptys2speed(mdev, eproto.oper);
+	if (ext && !eproto.admin) {
+		force_legacy = true;
+		err = mlx5_port_query_eth_proto(mdev, 1, false, &eproto);
+		if (err)
+			goto out;
+	}
+	*speed = mlx5e_port_ptys2speed(mdev, eproto.oper, force_legacy);
 	if (!(*speed))
 		err = -EINVAL;
 
@@ -201,7 +209,7 @@ int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 	if (err)
 		return err;
 
-	mlx5e_port_get_speed_arr(mdev, &table, &max_size);
+	mlx5e_port_get_speed_arr(mdev, &table, &max_size, false);
 	for (i = 0; i < max_size; ++i)
 		if (eproto.cap & MLX5E_PROT_MASK(i))
 			max_speed = max(max_speed, table[i]);
@@ -210,14 +218,15 @@ int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 	return 0;
 }
 
-u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed)
+u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed,
+			       bool force_legacy)
 {
 	u32 link_modes = 0;
 	const u32 *table;
 	u32 max_size;
 	int i;
 
-	mlx5e_port_get_speed_arr(mdev, &table, &max_size);
+	mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy);
 	for (i = 0; i < max_size; ++i) {
 		if (table[i] == speed)
 			link_modes |= MLX5E_PROT_MASK(i);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
index 70f536ec51c4..4a7f4497692b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
@@ -48,10 +48,12 @@ void mlx5_port_query_eth_autoneg(struct mlx5_core_dev *dev, u8 *an_status,
 				 u8 *an_disable_cap, u8 *an_disable_admin);
 int mlx5_port_set_eth_ptys(struct mlx5_core_dev *dev, bool an_disable,
 			   u32 proto_admin, bool ext);
-u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper);
+u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper,
+			  bool force_legacy);
 int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
-u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed);
+u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed,
+			       bool force_legacy);
 
 int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out);
 int mlx5e_port_set_pbmc(struct mlx5_core_dev *mdev, void *in);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index ed25757ac5bd..03bed714bac3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -785,7 +785,7 @@ static void ptys2ethtool_supported_advertised_port(struct ethtool_link_ksettings
 }
 
 static void get_speed_duplex(struct net_device *netdev,
-			     u32 eth_proto_oper,
+			     u32 eth_proto_oper, bool force_legacy,
 			     struct ethtool_link_ksettings *link_ksettings)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -795,7 +795,7 @@ static void get_speed_duplex(struct net_device *netdev,
 	if (!netif_carrier_ok(netdev))
 		goto out;
 
-	speed = mlx5e_port_ptys2speed(priv->mdev, eth_proto_oper);
+	speed = mlx5e_port_ptys2speed(priv->mdev, eth_proto_oper, force_legacy);
 	if (!speed) {
 		speed = SPEED_UNKNOWN;
 		goto out;
@@ -914,8 +914,8 @@ int mlx5e_ethtool_get_link_ksettings(struct mlx5e_priv *priv,
 	/* Fields: eth_proto_admin and ext_eth_proto_admin  are
 	 * mutually exclusive. Hence try reading legacy advertising
 	 * when extended advertising is zero.
-	 * admin_ext indicates how eth_proto_admin should be
-	 * interpreted
+	 * admin_ext indicates which proto_admin (ext vs. legacy)
+	 * should be read and interpreted
 	 */
 	admin_ext = ext;
 	if (ext && !eth_proto_admin) {
@@ -924,7 +924,7 @@ int mlx5e_ethtool_get_link_ksettings(struct mlx5e_priv *priv,
 		admin_ext = false;
 	}
 
-	eth_proto_oper   = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
+	eth_proto_oper   = MLX5_GET_ETH_PROTO(ptys_reg, out, admin_ext,
 					      eth_proto_oper);
 	eth_proto_lp	    = MLX5_GET(ptys_reg, out, eth_proto_lp_advertise);
 	an_disable_admin    = MLX5_GET(ptys_reg, out, an_disable_admin);
@@ -939,7 +939,8 @@ int mlx5e_ethtool_get_link_ksettings(struct mlx5e_priv *priv,
 	get_supported(mdev, eth_proto_cap, link_ksettings);
 	get_advertising(eth_proto_admin, tx_pause, rx_pause, link_ksettings,
 			admin_ext);
-	get_speed_duplex(priv->netdev, eth_proto_oper, link_ksettings);
+	get_speed_duplex(priv->netdev, eth_proto_oper, !admin_ext,
+			 link_ksettings);
 
 	eth_proto_oper = eth_proto_oper ? eth_proto_oper : eth_proto_cap;
 
@@ -1016,45 +1017,69 @@ static u32 mlx5e_ethtool2ptys_ext_adver_link(const unsigned long *link_modes)
 	return ptys_modes;
 }
 
+static bool ext_link_mode_requested(const unsigned long *adver)
+{
+#define MLX5E_MIN_PTYS_EXT_LINK_MODE_BIT ETHTOOL_LINK_MODE_50000baseKR_Full_BIT
+	int size = __ETHTOOL_LINK_MODE_MASK_NBITS - MLX5E_MIN_PTYS_EXT_LINK_MODE_BIT;
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(modes);
+
+	bitmap_set(modes, MLX5E_MIN_PTYS_EXT_LINK_MODE_BIT, size);
+	return bitmap_intersects(modes, adver, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+static bool ext_speed_requested(u32 speed)
+{
+#define MLX5E_MAX_PTYS_LEGACY_SPEED 100000
+	return !!(speed > MLX5E_MAX_PTYS_LEGACY_SPEED);
+}
+
+static bool ext_requested(u8 autoneg, const unsigned long *adver, u32 speed)
+{
+	bool ext_link_mode = ext_link_mode_requested(adver);
+	bool ext_speed = ext_speed_requested(speed);
+
+	return  autoneg == AUTONEG_ENABLE ? ext_link_mode : ext_speed;
+}
+
 int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv,
 				     const struct ethtool_link_ksettings *link_ksettings)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_port_eth_proto eproto;
+	const unsigned long *adver;
 	bool an_changes = false;
 	u8 an_disable_admin;
 	bool ext_supported;
-	bool ext_requested;
 	u8 an_disable_cap;
 	bool an_disable;
 	u32 link_modes;
 	u8 an_status;
+	u8 autoneg;
 	u32 speed;
+	bool ext;
 	int err;
 
 	u32 (*ethtool2ptys_adver_func)(const unsigned long *adver);
 
-#define MLX5E_PTYS_EXT ((1ULL << ETHTOOL_LINK_MODE_50000baseKR_Full_BIT) - 1)
+	adver = link_ksettings->link_modes.advertising;
+	autoneg = link_ksettings->base.autoneg;
+	speed = link_ksettings->base.speed;
 
-	ext_requested = !!(link_ksettings->link_modes.advertising[0] >
-			MLX5E_PTYS_EXT ||
-			link_ksettings->link_modes.advertising[1]);
+	ext = ext_requested(autoneg, adver, speed),
 	ext_supported = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet);
-	ext_requested &= ext_supported;
+	if (!ext_supported && ext)
+		return -EOPNOTSUPP;
 
-	speed = link_ksettings->base.speed;
-	ethtool2ptys_adver_func = ext_requested ?
-				  mlx5e_ethtool2ptys_ext_adver_link :
+	ethtool2ptys_adver_func = ext ? mlx5e_ethtool2ptys_ext_adver_link :
 				  mlx5e_ethtool2ptys_adver_link;
-	err = mlx5_port_query_eth_proto(mdev, 1, ext_requested, &eproto);
+	err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
 	if (err) {
 		netdev_err(priv->netdev, "%s: query port eth proto failed: %d\n",
 			   __func__, err);
 		goto out;
 	}
-	link_modes = link_ksettings->base.autoneg == AUTONEG_ENABLE ?
-		ethtool2ptys_adver_func(link_ksettings->link_modes.advertising) :
-		mlx5e_port_speed2linkmodes(mdev, speed);
+	link_modes = autoneg == AUTONEG_ENABLE ? ethtool2ptys_adver_func(adver) :
+		mlx5e_port_speed2linkmodes(mdev, speed, !ext);
 
 	link_modes = link_modes & eproto.cap;
 	if (!link_modes) {
@@ -1067,14 +1092,14 @@ int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv,
 	mlx5_port_query_eth_autoneg(mdev, &an_status, &an_disable_cap,
 				    &an_disable_admin);
 
-	an_disable = link_ksettings->base.autoneg == AUTONEG_DISABLE;
+	an_disable = autoneg == AUTONEG_DISABLE;
 	an_changes = ((!an_disable && an_disable_admin) ||
 		      (an_disable && !an_disable_admin));
 
 	if (!an_changes && link_modes == eproto.admin)
 		goto out;
 
-	mlx5_port_set_eth_ptys(mdev, an_disable, link_modes, ext_requested);
+	mlx5_port_set_eth_ptys(mdev, an_disable, link_modes, ext);
 	mlx5_toggle_port_link(mdev);
 
 out:
-- 
2.21.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox