Netdev List
 help / color / mirror / Atom feed
* [RFCv2 bpf-next 11/12] libbpf: Add support for inet_lookup program type
From: Jakub Sitnicki @ 2019-08-28  7:22 UTC (permalink / raw)
  To: bpf, netdev; +Cc: kernel-team, Lorenz Bauer, Marek Majkowski
In-Reply-To: <20190828072250.29828-1-jakub@cloudflare.com>

Make libbpf aware of the newly added program type. Reserve a section name
for it.

Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
---
 tools/lib/bpf/libbpf.c        | 4 ++++
 tools/lib/bpf/libbpf.h        | 2 ++
 tools/lib/bpf/libbpf.map      | 2 ++
 tools/lib/bpf/libbpf_probes.c | 1 +
 4 files changed, 9 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 2233f919dd88..addb9762e965 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -3580,6 +3580,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_PERF_EVENT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_INET_LOOKUP:
 		return false;
 	case BPF_PROG_TYPE_KPROBE:
 	default:
@@ -4447,6 +4448,7 @@ BPF_PROG_TYPE_FNS(tracepoint, BPF_PROG_TYPE_TRACEPOINT);
 BPF_PROG_TYPE_FNS(raw_tracepoint, BPF_PROG_TYPE_RAW_TRACEPOINT);
 BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP);
 BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT);
+BPF_PROG_TYPE_FNS(inet_lookup, BPF_PROG_TYPE_INET_LOOKUP);
 
 void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 					   enum bpf_attach_type type)
@@ -4542,6 +4544,8 @@ static const struct {
 						BPF_CGROUP_GETSOCKOPT),
 	BPF_EAPROG_SEC("cgroup/setsockopt",	BPF_PROG_TYPE_CGROUP_SOCKOPT,
 						BPF_CGROUP_SETSOCKOPT),
+	BPF_EAPROG_SEC("inet_lookup",		BPF_PROG_TYPE_INET_LOOKUP,
+						BPF_INET_LOOKUP),
 };
 
 #undef BPF_PROG_SEC_IMPL
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index e8f70977d137..937d6da9430a 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -262,6 +262,7 @@ LIBBPF_API int bpf_program__set_sched_cls(struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_inet_lookup(struct bpf_program *prog);
 LIBBPF_API void bpf_program__set_type(struct bpf_program *prog,
 				      enum bpf_prog_type type);
 LIBBPF_API void
@@ -276,6 +277,7 @@ LIBBPF_API bool bpf_program__is_sched_cls(const struct bpf_program *prog);
 LIBBPF_API bool bpf_program__is_sched_act(const struct bpf_program *prog);
 LIBBPF_API bool bpf_program__is_xdp(const struct bpf_program *prog);
 LIBBPF_API bool bpf_program__is_perf_event(const struct bpf_program *prog);
+LIBBPF_API bool bpf_program__is_inet_lookup(const struct bpf_program *prog);
 
 /*
  * No need for __attribute__((packed)), all members of 'bpf_map_def'
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 664ce8e7a60e..57564ad458ba 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -67,6 +67,7 @@ LIBBPF_0.0.1 {
 		bpf_prog_test_run;
 		bpf_prog_test_run_xattr;
 		bpf_program__fd;
+		bpf_program__is_inet_lookup;
 		bpf_program__is_kprobe;
 		bpf_program__is_perf_event;
 		bpf_program__is_raw_tracepoint;
@@ -84,6 +85,7 @@ LIBBPF_0.0.1 {
 		bpf_program__priv;
 		bpf_program__set_expected_attach_type;
 		bpf_program__set_ifindex;
+		bpf_program__set_inet_lookup;
 		bpf_program__set_kprobe;
 		bpf_program__set_perf_event;
 		bpf_program__set_prep;
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 4b0b0364f5fc..c365223a2d1e 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -102,6 +102,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_INET_LOOKUP:
 	default:
 		break;
 	}
-- 
2.20.1


^ permalink raw reply related

* [RFCv2 bpf-next 07/12] inet6: Run inet_lookup bpf program on socket lookup
From: Jakub Sitnicki @ 2019-08-28  7:22 UTC (permalink / raw)
  To: bpf, netdev; +Cc: kernel-team, Lorenz Bauer, Marek Majkowski
In-Reply-To: <20190828072250.29828-1-jakub@cloudflare.com>

Following the IPv4 changes, run the BPF program attached to the network
namespace in whose context the socket lookup is performed, so that it can
redirect the skb to a socket of its choice. The program runs before the
listening socket lookup.

Suggested-by: Marek Majkowski <marek@cloudflare.com>
Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
---
 include/net/inet6_hashtables.h | 19 +++++++++++++++++++
 net/ipv6/inet6_hashtables.c    |  5 +++++
 2 files changed, 24 insertions(+)

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index fe96bf247aac..c2393d148d8d 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -104,6 +104,25 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
 			  const int dif);
 
 int inet6_hash(struct sock *sk);
+
+static inline struct sock *inet6_lookup_run_bpf(struct net *net, u8 proto,
+						const struct in6_addr *saddr,
+						__be16 sport,
+						const struct in6_addr *daddr,
+						unsigned short hnum)
+{
+	struct bpf_inet_lookup_kern ctx = {
+		.family		= AF_INET6,
+		.protocol	= proto,
+		.saddr6		= *saddr,
+		.sport		= sport,
+		.daddr6		= *daddr,
+		.hnum		= hnum,
+	};
+
+	return __inet_lookup_run_bpf(net, &ctx);
+}
+
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
 #define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif, __sdif) \
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index cf60fae9533b..40dd0a3d80ed 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -157,6 +157,11 @@ struct sock *inet6_lookup_listener(struct net *net,
 	struct sock *result = NULL;
 	unsigned int hash2;
 
+	result = inet6_lookup_run_bpf(net, hashinfo->protocol,
+				      saddr, sport, daddr, hnum);
+	if (result)
+		goto done;
+
 	hash2 = ipv6_portaddr_hash(net, daddr, hnum);
 	ilb2 = inet_lhash2_bucket(hashinfo, hash2);
 
-- 
2.20.1


^ permalink raw reply related

* [RFCv2 bpf-next 08/12] udp: Run inet_lookup bpf program on socket lookup
From: Jakub Sitnicki @ 2019-08-28  7:22 UTC (permalink / raw)
  To: bpf, netdev; +Cc: kernel-team, Lorenz Bauer, Marek Majkowski
In-Reply-To: <20190828072250.29828-1-jakub@cloudflare.com>

Following the TCP socket lookup changes, allow selecting the receiving
socket from BPF before searching for a bound socket by destination address
and port.

Because the lookup for connected sockets and for bound but unconnected
sockets currently happens in one step, split the lookup into two phases so
that BPF runs only after the lookup for a connected socket has missed. This
ensures that connected UDP sockets continue to work as expected in the
presence of a BPF inet_lookup program.

Suggested-by: Marek Majkowski <marek@cloudflare.com>
Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
---
 net/ipv4/udp.c | 44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9fffe9e9eec6..3a4b98f89249 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -353,7 +353,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
 static int compute_score(struct sock *sk, struct net *net,
 			 __be32 saddr, __be16 sport,
 			 __be32 daddr, unsigned short hnum,
-			 int dif, int sdif)
+			 int dif, int sdif, unsigned char state)
 {
 	int score;
 	struct inet_sock *inet;
@@ -364,6 +364,9 @@ static int compute_score(struct sock *sk, struct net *net,
 	    ipv6_only_sock(sk))
 		return -1;
 
+	if (state && sk->sk_state != state)
+		return -1;
+
 	if (sk->sk_rcv_saddr != daddr)
 		return -1;
 
@@ -411,7 +414,8 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 				     __be32 daddr, unsigned int hnum,
 				     int dif, int sdif,
 				     struct udp_hslot *hslot2,
-				     struct sk_buff *skb)
+				     struct sk_buff *skb,
+				     unsigned char state)
 {
 	struct sock *sk, *result;
 	int score, badness;
@@ -421,7 +425,7 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 	badness = 0;
 	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		score = compute_score(sk, net, saddr, sport,
-				      daddr, hnum, dif, sdif);
+				      daddr, hnum, dif, sdif, state);
 		if (score > badness) {
 			if (sk->sk_reuseport) {
 				hash = udp_ehashfn(net, daddr, hnum,
@@ -454,18 +458,34 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 	slot2 = hash2 & udptable->mask;
 	hslot2 = &udptable->hash2[slot2];
 
+	/* Lookup connected sockets */
 	result = udp4_lib_lookup2(net, saddr, sport,
 				  daddr, hnum, dif, sdif,
-				  hslot2, skb);
-	if (!result) {
-		hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
-		slot2 = hash2 & udptable->mask;
-		hslot2 = &udptable->hash2[slot2];
+				  hslot2, skb, TCP_ESTABLISHED);
+	if (result)
+		goto done;
 
-		result = udp4_lib_lookup2(net, saddr, sport,
-					  htonl(INADDR_ANY), hnum, dif, sdif,
-					  hslot2, skb);
-	}
+	/* Lookup redirect from BPF */
+	result = inet_lookup_run_bpf(net, udptable->protocol,
+				     saddr, sport, daddr, hnum);
+	if (result)
+		goto done;
+
+	/* Lookup bound sockets */
+	result = udp4_lib_lookup2(net, saddr, sport,
+				  daddr, hnum, dif, sdif,
+				  hslot2, skb, 0);
+	if (result)
+		goto done;
+
+	hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+	slot2 = hash2 & udptable->mask;
+	hslot2 = &udptable->hash2[slot2];
+
+	result = udp4_lib_lookup2(net, saddr, sport,
+				  htonl(INADDR_ANY), hnum, dif, sdif,
+				  hslot2, skb, 0);
+done:
 	if (IS_ERR(result))
 		return NULL;
 	return result;
-- 
2.20.1


^ permalink raw reply related

* [RFCv2 bpf-next 12/12] bpf: Test redirecting listening/receiving socket lookup
From: Jakub Sitnicki @ 2019-08-28  7:22 UTC (permalink / raw)
  To: bpf, netdev; +Cc: kernel-team, Lorenz Bauer, Marek Majkowski
In-Reply-To: <20190828072250.29828-1-jakub@cloudflare.com>

Check that a BPF inet_lookup program can steer packets targeted at a local
(address, port) that differs from the server's bind() address, and that this
works as expected for TCP and UDP over both IPv4 and IPv6. Also make sure
that IPv4 packets can be redirected to IPv6 sockets that are not V6-only.

Reviewed-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
---
 tools/testing/selftests/bpf/.gitignore        |   1 +
 tools/testing/selftests/bpf/Makefile          |   5 +-
 tools/testing/selftests/bpf/bpf_helpers.h     |   3 +
 .../selftests/bpf/progs/inet_lookup_progs.c   |  78 +++
 .../testing/selftests/bpf/test_inet_lookup.c  | 522 ++++++++++++++++++
 .../testing/selftests/bpf/test_inet_lookup.sh |  35 ++
 6 files changed, 642 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/inet_lookup_progs.c
 create mode 100644 tools/testing/selftests/bpf/test_inet_lookup.c
 create mode 100755 tools/testing/selftests/bpf/test_inet_lookup.sh

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 60c9338cd9b4..7442bd9166c7 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -44,3 +44,4 @@ test_sockopt_sk
 test_sockopt_multi
 test_sockopt_inherit
 test_tcp_rtt
+test_inet_lookup
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7a23d94fe6a9..89dbbc032c8f 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -65,7 +65,8 @@ TEST_PROGS := test_kmod.sh \
 	test_tcp_check_syncookie.sh \
 	test_tc_tunnel.sh \
 	test_tc_edt.sh \
-	test_xdping.sh
+	test_xdping.sh \
+	test_inet_lookup.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh \
 	with_tunnels.sh \
@@ -75,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
 # Compile but not part of 'make run_tests'
 TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \
 	flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
-	test_lirc_mode2_user
+	test_lirc_mode2_user test_inet_lookup
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 6c4930bc6e2e..dda00609098a 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -231,6 +231,9 @@ static int (*bpf_send_signal)(unsigned sig) = (void *)BPF_FUNC_send_signal;
 static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip,
 					  int ip_len, void *tcp, int tcp_len) =
 	(void *) BPF_FUNC_tcp_gen_syncookie;
+static int (*bpf_redirect_lookup)(void *ctx, void *map, void *key,
+				  __u64 flags) =
+	(void *) BPF_FUNC_redirect_lookup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/progs/inet_lookup_progs.c b/tools/testing/selftests/bpf/progs/inet_lookup_progs.c
new file mode 100644
index 000000000000..16b1b2e241e4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/inet_lookup_progs.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <sys/socket.h>
+
+#include "bpf_endian.h"
+#include "bpf_helpers.h"
+
+#define IP4(a, b, c, d)	((__u32)(		\
+	((__u32)((a) & (__u32)0xffUL) << 24) |	\
+	((__u32)((b) & (__u32)0xffUL) << 16) |	\
+	((__u32)((c) & (__u32)0xffUL) <<  8) |	\
+	((__u32)((d) & (__u32)0xffUL) <<  0)))
+
+#define REUSEPORT_ARRAY_SIZE 32
+
+struct {
+	__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
+	__uint(max_entries, REUSEPORT_ARRAY_SIZE);
+	__type(key, __u32);
+	__type(value, __u64);
+} redir_map SEC(".maps");
+
+static const __u32 DST_PORT = 7007;
+static const __u32 DST_IP4 = IP4(127, 0, 0, 1);
+static const __u32 DST_IP6[] = { 0xfd000000, 0x0, 0x0, 0x00000001 };
+
+/* Redirect packets destined for port DST_PORT to socket at redir_map[0]. */
+SEC("inet_lookup/redir_port")
+int redir_port(struct bpf_inet_lookup *ctx)
+{
+	__u32 index = 0;
+	__u64 flags = 0;
+
+	if (ctx->local_port != DST_PORT)
+		return BPF_OK;
+
+	return bpf_redirect_lookup(ctx, &redir_map, &index, flags);
+}
+
+/* Redirect packets destined for DST_IP4 address to socket at redir_map[0]. */
+SEC("inet_lookup/redir_ip4")
+int redir_ip4(struct bpf_inet_lookup *ctx)
+{
+	__u32 index = 0;
+	__u64 flags = 0;
+
+	if (ctx->family != AF_INET)
+		return BPF_OK;
+	if (ctx->local_port != DST_PORT)
+		return BPF_OK;
+	if (ctx->local_ip4 != bpf_htonl(DST_IP4))
+		return BPF_OK;
+
+	return bpf_redirect_lookup(ctx, &redir_map, &index, flags);
+}
+
+/* Redirect packets destined for DST_IP6 address to socket at redir_map[0]. */
+SEC("inet_lookup/redir_ip6")
+int redir_ip6(struct bpf_inet_lookup *ctx)
+{
+	__u32 index = 0;
+	__u64 flags = 0;
+
+	if (ctx->family != AF_INET6)
+		return BPF_OK;
+	if (ctx->local_port != DST_PORT)
+		return BPF_OK;
+	if (ctx->local_ip6[0] != bpf_htonl(DST_IP6[0]) ||
+	    ctx->local_ip6[1] != bpf_htonl(DST_IP6[1]) ||
+	    ctx->local_ip6[2] != bpf_htonl(DST_IP6[2]) ||
+	    ctx->local_ip6[3] != bpf_htonl(DST_IP6[3]))
+		return BPF_OK;
+
+	return bpf_redirect_lookup(ctx, &redir_map, &index, flags);
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/test_inet_lookup.c b/tools/testing/selftests/bpf/test_inet_lookup.c
new file mode 100644
index 000000000000..7e222488514c
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_inet_lookup.c
@@ -0,0 +1,522 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * L7 echo tests with the server listening/receiving at a different
+ * (address, port) than the client sends packets to.
+ *
+ * Traffic is steered to the server socket by redirecting the socket
+ * lookup with an eBPF inet_lookup program. The inet_lookup program
+ * selects a target listening/bound socket from a SOCKARRAY map based
+ * on the packet's 4-tuple.
+ */
+
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+
+#define BPF_FILE	"./inet_lookup_progs.o"
+#define MAX_ERROR_LEN	256
+
+/* External (address, port) pairs the client sends packets to. */
+#define EXT_IP4		"127.0.0.1"
+#define EXT_IP6		"fd00::1"
+#define EXT_PORT	7007
+
+/* Internal (address, port) pairs the server listens/receives at. */
+#define INT_IP4		"127.0.0.2"
+#define INT_IP4_V6	"::ffff:127.0.0.2"
+#define INT_IP6		"fd00::2"
+#define INT_PORT	8008
+
+#define REUSEPORT_ARRAY_SIZE 32
+
+struct inet_addr {
+	const char *ip;
+	unsigned short port;
+};
+
+struct test {
+	const char *desc;
+	const char *bpf_prog;
+
+	int socket_type;
+
+	struct inet_addr send_to;
+	struct inet_addr recv_at;
+};
+
+static const struct test tests[] = {
+	{
+		.desc		= "TCP IPv4 redir port",
+		.bpf_prog	= "inet_lookup/redir_port",
+		.socket_type	= SOCK_STREAM,
+		.send_to	= { EXT_IP4, EXT_PORT },
+		.recv_at	= { EXT_IP4, INT_PORT },
+	},
+	{
+		.desc		= "TCP IPv4 redir addr",
+		.bpf_prog	= "inet_lookup/redir_ip4",
+		.socket_type	= SOCK_STREAM,
+		.send_to	= { EXT_IP4, EXT_PORT },
+		.recv_at	= { INT_IP4, EXT_PORT },
+	},
+	{
+		.desc		= "TCP IPv6 redir port",
+		.bpf_prog	= "inet_lookup/redir_port",
+		.socket_type	= SOCK_STREAM,
+		.send_to	= { EXT_IP6, EXT_PORT },
+		.recv_at	= { EXT_IP6, INT_PORT },
+	},
+	{
+		.desc		= "TCP IPv6 redir addr",
+		.bpf_prog	= "inet_lookup/redir_ip6",
+		.socket_type	= SOCK_STREAM,
+		.send_to	= { EXT_IP6, EXT_PORT },
+		.recv_at	= { INT_IP6, EXT_PORT },
+	},
+	{
+		.desc		= "TCP IPv4->IPv6 redir port",
+		.bpf_prog	= "inet_lookup/redir_port",
+		.socket_type	= SOCK_STREAM,
+		.recv_at	= { INT_IP4_V6, INT_PORT },
+		.send_to	= { EXT_IP4, EXT_PORT },
+	},
+	{
+		.desc		= "UDP IPv4 redir port",
+		.bpf_prog	= "inet_lookup/redir_port",
+		.socket_type	= SOCK_DGRAM,
+		.send_to	= { EXT_IP4, EXT_PORT },
+		.recv_at	= { EXT_IP4, INT_PORT },
+	},
+	{
+		.desc		= "UDP IPv4 redir addr",
+		.bpf_prog	= "inet_lookup/redir_ip4",
+		.socket_type	= SOCK_DGRAM,
+		.send_to	= { EXT_IP4, EXT_PORT },
+		.recv_at	= { INT_IP4, EXT_PORT },
+	},
+	{
+		.desc		= "UDP IPv6 redir port",
+		.bpf_prog	= "inet_lookup/redir_port",
+		.socket_type	= SOCK_DGRAM,
+		.send_to	= { EXT_IP6, EXT_PORT },
+		.recv_at	= { EXT_IP6, INT_PORT },
+	},
+	{
+		.desc		= "UDP IPv6 redir addr",
+		.bpf_prog	= "inet_lookup/redir_ip6",
+		.socket_type	= SOCK_DGRAM,
+		.send_to	= { EXT_IP6, EXT_PORT },
+		.recv_at	= { INT_IP6, EXT_PORT },
+	},
+	{
+		.desc		= "UDP IPv4->IPv6 redir port",
+		.bpf_prog	= "inet_lookup/redir_port",
+		.socket_type	= SOCK_DGRAM,
+		.recv_at	= { INT_IP4_V6, INT_PORT },
+		.send_to	= { EXT_IP4, EXT_PORT },
+	},
+};
+
+static bool is_ipv6_addr(const char *ip)
+{
+	return !!strchr(ip, ':');
+}
+
+static void make_addr(int family, const char *ip, int port,
+		      struct sockaddr_storage *ss, int *sz)
+{
+	struct sockaddr_in *addr4;
+	struct sockaddr_in6 *addr6;
+
+	switch (family) {
+	case AF_INET:
+		addr4 = (struct sockaddr_in *)ss;
+		addr4->sin_family = AF_INET;
+		addr4->sin_port = htons(port);
+		if (!inet_pton(AF_INET, ip, &addr4->sin_addr))
+			error(1, errno, "inet_pton failed: %s", ip);
+		*sz = sizeof(*addr4);
+		break;
+	case AF_INET6:
+		addr6 = (struct sockaddr_in6 *)ss;
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_port = htons(port);
+		if (!inet_pton(AF_INET6, ip, &addr6->sin6_addr))
+			error(1, errno, "inet_pton failed: %s", ip);
+		*sz = sizeof(*addr6);
+		break;
+	default:
+		error(1, 0, "unsupported family %d", family);
+	}
+}
+
+static int make_server(int type, const char *ip, int port)
+{
+	struct sockaddr_storage ss = {0};
+	int fd, opt, sz;
+	int family;
+
+	family = is_ipv6_addr(ip) ? AF_INET6 : AF_INET;
+	make_addr(family, ip, port, &ss, &sz);
+
+	fd = socket(family, type, 0);
+	if (fd < 0)
+		error(1, errno, "failed to create listen socket");
+
+	opt = 1;
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)))
+		error(1, errno, "failed to set SO_REUSEPORT");
+	if (type == SOCK_DGRAM) {
+		if (setsockopt(fd, SOL_IP, IP_RECVORIGDSTADDR,
+			       &opt, sizeof(opt)))
+			error(1, errno, "failed to set IP_RECVORIGDSTADDR");
+	}
+	if (family == AF_INET6 && type == SOCK_DGRAM) {
+		if (setsockopt(fd, SOL_IPV6, IPV6_RECVORIGDSTADDR,
+			       &opt, sizeof(opt)))
+			error(1, errno, "failed to set IPV6_RECVORIGDSTADDR");
+	}
+
+	if (bind(fd, (struct sockaddr *)&ss, sz))
+		error(1, errno, "failed to bind listen socket");
+
+	if (type == SOCK_STREAM && listen(fd, 1))
+		error(1, errno, "failed to listen on port %d", port);
+
+	return fd;
+}
+
+static int make_client(int type, const char *ip, int port)
+{
+	struct sockaddr_storage ss = {0};
+	struct sockaddr *sa;
+	int family;
+	int fd, sz;
+
+	family = is_ipv6_addr(ip) ? AF_INET6 : AF_INET;
+	make_addr(family, ip, port, &ss, &sz);
+	sa = (struct sockaddr *)&ss;
+
+	fd = socket(family, type, 0);
+	if (fd < 0)
+		error(1, errno, "failed to create socket");
+
+	if (connect(fd, sa, sz))
+		error(1, errno, "failed to connect socket");
+
+	return fd;
+}
+
+static void send_byte(int fd)
+{
+	if (send(fd, "a", 1, 0) < 1)
+		error(1, errno, "failed to send message");
+}
+
+static void recv_byte(int fd)
+{
+	char buf[1];
+
+	if (recv(fd, buf, sizeof(buf), 0) < 1)
+		error(1, errno, "failed to receive message");
+}
+
+static void tcp_recv_send(int server_fd)
+{
+	char buf[1];
+	size_t len;
+	ssize_t n;
+	int fd;
+
+	fd = accept(server_fd, NULL, NULL);
+	if (fd < 0)
+		error(1, errno, "failed to accept");
+
+	len = sizeof(buf);
+	n = recv(fd, buf, len, 0);
+	if (n < 0)
+		error(1, errno, "failed to receive");
+	if (n < len)
+		error(1, 0, "partial receive");
+
+	n = send(fd, buf, len, 0);
+	if (n < 0)
+		error(1, errno, "failed to send");
+	if (n < len)
+		error(1, 0, "partial send");
+
+	close(fd);
+}
+
+static void udp_recv_send(int server_fd)
+{
+	char cmsg_buf[CMSG_SPACE(sizeof(struct sockaddr_storage))];
+	struct sockaddr_storage _src_addr = { 0 };
+	struct sockaddr_storage _dst_addr = { 0 };
+	struct sockaddr_storage *src_addr = &_src_addr;
+	struct sockaddr_storage *dst_addr = NULL;
+	struct msghdr msg = { 0 };
+	struct iovec iov = { 0 };
+	struct cmsghdr *cm;
+	char buf[1];
+	ssize_t n;
+	int fd;
+
+	iov.iov_base = buf;
+	iov.iov_len = sizeof(buf);
+
+	msg.msg_name = src_addr;
+	msg.msg_namelen = sizeof(*src_addr);
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	n = recvmsg(server_fd, &msg, 0);
+	if (n < 0)
+		error(1, errno, "failed to receive");
+	if (n < sizeof(buf))
+		error(1, 0, "partial receive");
+	if (msg.msg_flags & MSG_CTRUNC)
+		error(1, errno, "truncated cmsg");
+
+	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
+		if ((cm->cmsg_level == SOL_IP &&
+		     cm->cmsg_type == IP_ORIGDSTADDR) ||
+		    (cm->cmsg_level == SOL_IPV6 &&
+		     cm->cmsg_type == IPV6_ORIGDSTADDR)) {
+			dst_addr = (struct sockaddr_storage *)CMSG_DATA(cm);
+			break;
+		}
+		error(0, 0, "ignored cmsg at level %d type %d",
+		      cm->cmsg_level, cm->cmsg_type);
+	}
+	if (!dst_addr)
+		error(1, 0, "failed to get destination address");
+
+	/* Server bound to IPv4-mapped IPv6 address */
+	if (src_addr->ss_family != dst_addr->ss_family) {
+		assert(dst_addr->ss_family == AF_INET);
+
+		struct sockaddr_in *dst4 = (void *)dst_addr;
+		struct sockaddr_in6 *dst6 = (void *)&_dst_addr;
+
+		dst6->sin6_family = AF_INET6;
+		dst6->sin6_port = dst4->sin_port;
+
+		dst6->sin6_addr.s6_addr[10] = 0xff;
+		dst6->sin6_addr.s6_addr[11] = 0xff;
+		memcpy(&dst6->sin6_addr.s6_addr[12],
+		       &dst4->sin_addr.s_addr, sizeof(dst4->sin_addr.s_addr));
+
+		dst_addr = (void *)dst6;
+	}
+
+	/* Reply from original destination address. */
+	fd = socket(dst_addr->ss_family, SOCK_DGRAM, 0);
+	if (fd < 0)
+		error(1, errno, "failed to create socket");
+
+	if (bind(fd, (struct sockaddr *)dst_addr, sizeof(*dst_addr)))
+		error(1, errno, "failed to bind socket");
+
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	n = sendmsg(fd, &msg, 0);
+	if (n < 0)
+		error(1, errno, "failed to send");
+	if (n < sizeof(buf))
+		error(1, 0, "partial send");
+
+	close(fd);
+}
+
+static void tcp_echo(int client_fd, int server_fd)
+{
+	send_byte(client_fd);
+	tcp_recv_send(server_fd);
+	recv_byte(client_fd);
+}
+
+static void udp_echo(int client_fd, int server_fd)
+{
+	send_byte(client_fd);
+	udp_recv_send(server_fd);
+	recv_byte(client_fd);
+}
+
+static struct bpf_object *load_prog(void)
+{
+	char buf[MAX_ERROR_LEN];
+	struct bpf_object *obj;
+	int prog_fd;
+	int err;
+
+	err = bpf_prog_load(BPF_FILE, BPF_PROG_TYPE_UNSPEC, &obj, &prog_fd);
+	if (err) {
+		libbpf_strerror(err, buf, ARRAY_SIZE(buf));
+		error(1, 0, "failed to open bpf file '%s': %s", BPF_FILE, buf);
+	}
+
+	return obj;
+}
+
+static void attach_prog(struct bpf_object *obj, const char *sec)
+{
+	enum bpf_attach_type attach_type;
+	struct bpf_program *prog;
+	char buf[MAX_ERROR_LEN];
+	int target_fd = -1;
+	int prog_fd;
+	int err;
+
+	prog = bpf_object__find_program_by_title(obj, sec);
+	err = libbpf_get_error(prog);
+	if (err) {
+		libbpf_strerror(err, buf, ARRAY_SIZE(buf));
+		error(1, 0, "failed to find section \"%s\": %s", sec, buf);
+	}
+
+	err = libbpf_attach_type_by_name(sec, &attach_type);
+	if (err) {
+		libbpf_strerror(err, buf, ARRAY_SIZE(buf));
+		error(1, 0, "failed to identify attach type: %s", buf);
+	}
+
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0)
+		error(1, errno, "failed to get prog fd");
+
+	err = bpf_prog_attach(prog_fd, target_fd, attach_type, 0);
+	if (err)
+		error(1, -err, "failed to attach prog");
+}
+
+static void detach_prog(const char *sec)
+{
+	enum bpf_attach_type attach_type;
+	char buf[MAX_ERROR_LEN];
+	int target_fd = -1;
+	int err;
+
+	err = libbpf_attach_type_by_name(sec, &attach_type);
+	if (err) {
+		libbpf_strerror(err, buf, ARRAY_SIZE(buf));
+		error(1, 0, "failed to identify attach type: %s", buf);
+	}
+
+	err = bpf_prog_detach(target_fd, attach_type);
+	if (err && err != -EPERM)
+		error(1, -err, "failed to detach prog");
+}
+
+static void update_redir_map(int map_fd, int index, int sock_fd)
+{
+	uint64_t value;
+	int err;
+
+	value = (uint64_t)sock_fd;
+	err = bpf_map_update_elem(map_fd, &index, &value, BPF_NOEXIST);
+	if (err)
+		error(1, errno, "failed to update redir_map @ %d", index);
+}
+
+static void test_prog_query(void)
+{
+	__u32 attach_flags = 0;
+	__u32 prog_ids[1] = { 0 };
+	__u32 prog_cnt = 1;
+	int fd, err;
+
+	fd = open("/proc/self/ns/net", O_RDONLY);
+	if (fd < 0)
+		error(1, errno, "failed to open /proc/self/ns/net");
+
+	err = bpf_prog_query(fd, BPF_INET_LOOKUP, 0,
+			     &attach_flags, prog_ids, &prog_cnt);
+	if (err)
+		error(1, errno, "failed to query BPF_INET_LOOKUP prog");
+
+	assert(attach_flags == 0);
+	assert(prog_cnt == 1);
+	assert(prog_ids[0] != 0);
+
+	close(fd);
+}
+
+static void run_test(const struct test *t, struct bpf_object *obj,
+		     int redir_map)
+{
+	int client_fd, server_fd;
+
+	fprintf(stderr, "test %s\n", t->desc);
+
+	/* Clean up after any previous failed test runs */
+	detach_prog(t->bpf_prog);
+
+	attach_prog(obj, t->bpf_prog);
+	test_prog_query();
+
+	server_fd = make_server(t->socket_type,
+				t->recv_at.ip, t->recv_at.port);
+	update_redir_map(redir_map, 0, server_fd);
+
+	client_fd = make_client(t->socket_type,
+				t->send_to.ip, t->send_to.port);
+
+	if (t->socket_type == SOCK_STREAM)
+		tcp_echo(client_fd, server_fd);
+	else
+		udp_echo(client_fd, server_fd);
+
+	close(client_fd);
+	close(server_fd);
+
+	detach_prog(t->bpf_prog);
+}
+
+static int find_redir_map(struct bpf_object *obj)
+{
+	struct bpf_map *map;
+	int fd;
+
+	map = bpf_object__find_map_by_name(obj, "redir_map");
+	if (!map)
+		error(1, 0, "failed to find 'redir_map'");
+	fd = bpf_map__fd(map);
+	if (fd < 0)
+		error(1, 0, "failed to get 'redir_map' fd");
+
+	return fd;
+}
+
+int main(void)
+{
+	struct bpf_object *obj;
+	const struct test *t;
+	int redir_map;
+
+	obj = load_prog();
+	redir_map = find_redir_map(obj);
+
+	for (t = tests; t < tests + ARRAY_SIZE(tests); t++)
+		run_test(t, obj, redir_map);
+
+	close(redir_map);
+	bpf_object__unload(obj);
+
+	fprintf(stderr, "PASS\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_inet_lookup.sh b/tools/testing/selftests/bpf/test_inet_lookup.sh
new file mode 100755
index 000000000000..5efb42fbdf59
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_inet_lookup.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+if [[ $EUID -ne 0 ]]; then
+        echo "This script must be run as root"
+        echo "FAIL"
+        exit 1
+fi
+
+# Run the script in a dedicated network namespace.
+if [[ -z $(ip netns identify $$) ]]; then
+        ../net/in_netns.sh "$0" "$@"
+        exit $?
+fi
+
+readonly IP6_1="fd00::1"
+readonly IP6_2="fd00::2"
+
+setup()
+{
+        ip -6 addr add ${IP6_1}/128 dev lo
+        ip -6 addr add ${IP6_2}/128 dev lo
+}
+
+cleanup()
+{
+        ip -6 addr del ${IP6_1}/128 dev lo
+        ip -6 addr del ${IP6_2}/128 dev lo
+}
+
+trap cleanup EXIT
+setup
+
+./test_inet_lookup
+exit $?
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH 1/2] PTP: introduce new versions of IOCTLs
From: Felipe Balbi @ 2019-08-28  8:23 UTC (permalink / raw)
  To: Joe Perches, Richard Cochran; +Cc: Christopher S Hall, netdev, linux-kernel
In-Reply-To: <0f1487356ae2e9ff185ede2359381630007538c7.camel@perches.com>

[-- Attachment #1: Type: text/plain, Size: 1047 bytes --]


Hi,

Joe Perches <joe@perches.com> writes:

> On Mon, 2019-08-19 at 08:43 -0700, Richard Cochran wrote:
>> On Sun, Aug 18, 2019 at 03:07:18PM -0700, Joe Perches wrote:
>> > Also the original patch deletes 2 case entries for
>> > PTP_PIN_GETFUNC and PTP_PIN_SETFUNC and converts them to
>> > PTP_PIN_GETFUNC2 and PTP_PIN_SETFUNC2 but still uses tests
>> > for the deleted case label entries making part of the case
>> > code block unreachable.
>> > 
>> > That's at least a defect:
>> > 
>> > -	case PTP_PIN_GETFUNC:
>> > +	case PTP_PIN_GETFUNC2:
>> > 
>> > and
>> >  
>> > -	case PTP_PIN_SETFUNC:
>> > +	case PTP_PIN_SETFUNC2:
>> 
>> Good catch.  Felipe, please fix that!
>> 
>> (Regarding Joe's memset suggestion, I'll leave that to your discretion.)
>
> Not just how declarations are done or memset.
>
> Minimizing unnecessary stack consumption is generally good.

Originally I had memset only on the three cases where they were
needed. Richard, which do you prefer? I don't mind changing it back.

-- 
balbi

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* RE: [PATCH net-next v2 3/3] dpaa2-eth: Add pause frame support
From: Ioana Ciocoi Radulescu @ 2019-08-28  8:40 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev@vger.kernel.org, davem@davemloft.net, Ioana Ciornei
In-Reply-To: <20190827232132.GD26248@lunn.ch>

> -----Original Message-----
> From: Andrew Lunn <andrew@lunn.ch>
> Sent: Wednesday, August 28, 2019 2:22 AM
> To: Ioana Ciocoi Radulescu <ruxandra.radulescu@nxp.com>
> Cc: netdev@vger.kernel.org; davem@davemloft.net; Ioana Ciornei
> <ioana.ciornei@nxp.com>
> Subject: Re: [PATCH net-next v2 3/3] dpaa2-eth: Add pause frame support
> 
> On Tue, Aug 27, 2019 at 05:15:51PM +0300, Ioana Radulescu wrote:
> > Starting with firmware version MC10.18.0, we have support for
> > L2 flow control. Asymmetrical configuration (Rx or Tx only) is
> > supported, but not pause frame autonegotioation.
> 
> > +static int set_pause(struct dpaa2_eth_priv *priv)
> > +{
> > +	struct device *dev = priv->net_dev->dev.parent;
> > +	struct dpni_link_cfg link_cfg = {0};
> > +	int err;
> > +
> > +	/* Get the default link options so we don't override other flags */
> > +	err = dpni_get_link_cfg(priv->mc_io, 0, priv->mc_token, &link_cfg);
> > +	if (err) {
> > +		dev_err(dev, "dpni_get_link_cfg() failed\n");
> > +		return err;
> > +	}
> > +
> > +	link_cfg.options |= DPNI_LINK_OPT_PAUSE;
> > +	link_cfg.options &= ~DPNI_LINK_OPT_ASYM_PAUSE;
> > +	err = dpni_set_link_cfg(priv->mc_io, 0, priv->mc_token, &link_cfg);
> > +	if (err) {
> > +		dev_err(dev, "dpni_set_link_cfg() failed\n");
> > +		return err;
> > +	}
> > +
> > +	priv->link_state.options = link_cfg.options;
> > +
> > +	return 0;
> > +}
> > +
> >  /* Configure the DPNI object this interface is associated with */
> >  static int setup_dpni(struct fsl_mc_device *ls_dev)
> >  {
> > @@ -2500,6 +2562,13 @@ static int setup_dpni(struct fsl_mc_device
> *ls_dev)
> >
> >  	set_enqueue_mode(priv);
> >
> > +	/* Enable pause frame support */
> > +	if (dpaa2_eth_has_pause_support(priv)) {
> > +		err = set_pause(priv);
> > +		if (err)
> > +			goto close;
> 
> Hi Ioana
> 
> So by default you have the MAC do pause, not asym pause?  Generally,
> any MAC that can do asym pause does asym pause.

Clearing the ASYM_PAUSE flag only means we tell the firmware we want
both Rx and Tx pause to be enabled in the beginning. User can still set
an asymmetric config (i.e. only Rx pause or only Tx pause to be enabled)
if needed.

The truth table is like this:

PAUSE | ASYM_PAUSE | Rx pause | Tx pause
----------------------------------------
  0   |     0      | disabled | disabled
  0   |     1      | disabled | enabled
  1   |     0      | enabled  | enabled
  1   |     1      | enabled  | disabled

Thanks,
Ioana

^ permalink raw reply

* Re: [PATCH] arcnet: capmode: remove redundant assignment to pointer pkt
From: Sergei Shtylyov @ 2019-08-28  9:14 UTC (permalink / raw)
  To: Colin King, Michael Grzeschik, David S . Miller, netdev
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20190827112954.26677-1-colin.king@canonical.com>

Hello!

On 27.08.2019 14:29, Colin King wrote:

> From: Colin Ian King <colin.king@canonical.com>
> 
> Pointer pkt is being initialized with a value that is never read
> and pkg is being re-assigned a little later on. The assignment is
       ^^^ pkt

> redundant and hence can be removed.
> 
> Addresses-Coverity: ("Ununsed value")
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
[...]

MBR, Sergei

^ permalink raw reply

* Re: [PATCH] bridge:fragmented packets dropped by bridge
From: Rundong Ge @ 2019-08-28  9:21 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Florian Westphal, davem, kuznet, yoshfuji, netdev,
	Pablo Neira Ayuso, kadlec, Roopa Prabhu, netfilter-devel,
	coreteam, bridge, Nikolay Aleksandrov, linux-kernel
In-Reply-To: <nycvar.YFH.7.76.1908260955400.22383@n3.vanv.qr>

Jan Engelhardt <jengelh@inai.de> 于2019年8月26日周一 下午3:59写道:
>
>
> On Tuesday 2019-07-30 14:35, Florian Westphal wrote:
> >Rundong Ge <rdong.ge@gmail.com> wrote:
> >> Given following setup:
> >> -modprobe br_netfilter
> >> -echo '1' > /proc/sys/net/bridge/bridge-nf-call-iptables
> >> -brctl addbr br0
> >> -brctl addif br0 enp2s0
> >> -brctl addif br0 enp3s0
> >> -brctl addif br0 enp6s0
> >> -ifconfig enp2s0 mtu 1300
> >> -ifconfig enp3s0 mtu 1500
> >> -ifconfig enp6s0 mtu 1500
> >> -ifconfig br0 up
> >>
> >>                  multi-port
> >> mtu1500 - mtu1500|bridge|1500 - mtu1500
> >>   A                  |            B
> >>                    mtu1300
> >
> >How can a bridge forward a frame from A/B to mtu1300?
>
> There might be a misunderstanding here judging from the shortness of this
> thread.
>
> I understood it such that the bridge ports (eth0,eth1) have MTU 1500, yet br0
> (in essence the third bridge port if you so wish) itself has MTU 1300.
>
> Therefore, frame forwarding from eth0 to eth1 should succeed, since the
> 1300-byte MTU is only relevant if the bridge decides the packet needs to be
> locally delivered.

Under this setup, when I do "ping B -l 2000" from A, the fragmented
packets are dropped by the bridge.
When "/proc/sys/net/bridge/bridge-nf-call-iptables" is on, the bridge
defragments at PREROUTING and re-fragments at POSTROUTING. At the
re-fragment step the bridge checks, in br_nf_ip_fragment(), whether the
max frag size is larger than the bridge's MTU; if it is, the packets
are dropped.
This patch uses the outdev's MTU instead of the bridge's MTU when doing
br_nf_ip_fragment().

^ permalink raw reply

* Re: [PATCH net-next v4 3/3] dt-bindings: net: ethernet: Update mt7622 docs and dts to reflect the new phylink API
From: Matthias Brugger @ 2019-08-28  9:29 UTC (permalink / raw)
  To: René van Dorst, John Crispin, Sean Wang, Nelson Chang,
	David S . Miller
  Cc: netdev, linux-arm-kernel, linux-mediatek, linux-mips,
	Russell King, Frank Wunderlich, Stefan Roese
In-Reply-To: <20190825174341.20750-4-opensource@vdorst.com>

Hi David,

On 25/08/2019 19:43, René van Dorst wrote:
> This patch the removes the recently added mediatek,physpeed property.
> Use the fixed-link property speed = <2500> to set the phy in 2.5Gbit.
> See mt7622-bananapi-bpi-r64.dts for a working example.
> 
> Signed-off-by: René van Dorst <opensource@vdorst.com>
> --
> v3->v4:
> * no change
> v2->v3:
> * no change
> v1->v2:
> * SGMII port only support BASE-X at 2.5Gbit.
> ---
>  .../arm/mediatek/mediatek,sgmiisys.txt        |  2 --
>  .../dts/mediatek/mt7622-bananapi-bpi-r64.dts  | 28 +++++++++++++------
>  arch/arm64/boot/dts/mediatek/mt7622.dtsi      |  1 -
>  3 files changed, 19 insertions(+), 12 deletions(-)

Thanks for taking this patch. For the next time, please make sure that dts[i]
patches are independent from the binding description, as dts[i] should go
through my tree. No problem for this round, just saying for the future.

Regards,
Matthias

> 
> diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
> index f5518f26a914..30cb645c0e54 100644
> --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
> +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
> @@ -9,8 +9,6 @@ Required Properties:
>  	- "mediatek,mt7622-sgmiisys", "syscon"
>  	- "mediatek,mt7629-sgmiisys", "syscon"
>  - #clock-cells: Must be 1
> -- mediatek,physpeed: Should be one of "auto", "1000" or "2500" to match up
> -		     the capability of the target PHY.
>  
>  The SGMIISYS controller uses the common clk binding from
>  Documentation/devicetree/bindings/clock/clock-bindings.txt
> diff --git a/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts b/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts
> index 710c5c3d87d3..83e10591e0e5 100644
> --- a/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts
> +++ b/arch/arm64/boot/dts/mediatek/mt7622-bananapi-bpi-r64.dts
> @@ -115,24 +115,34 @@
>  };
>  
>  &eth {
> -	pinctrl-names = "default";
> -	pinctrl-0 = <&eth_pins>;
>  	status = "okay";
> +	gmac0: mac@0 {
> +		compatible = "mediatek,eth-mac";
> +		reg = <0>;
> +		phy-mode = "2500base-x";
> +
> +		fixed-link {
> +			speed = <2500>;
> +			full-duplex;
> +			pause;
> +		};
> +	};
>  
>  	gmac1: mac@1 {
>  		compatible = "mediatek,eth-mac";
>  		reg = <1>;
> -		phy-handle = <&phy5>;
> +		phy-mode = "rgmii";
> +
> +		fixed-link {
> +			speed = <1000>;
> +			full-duplex;
> +			pause;
> +		};
>  	};
>  
> -	mdio-bus {
> +	mdio: mdio-bus {
>  		#address-cells = <1>;
>  		#size-cells = <0>;
> -
> -		phy5: ethernet-phy@5 {
> -			reg = <5>;
> -			phy-mode = "sgmii";
> -		};
>  	};
>  };
>  
> diff --git a/arch/arm64/boot/dts/mediatek/mt7622.dtsi b/arch/arm64/boot/dts/mediatek/mt7622.dtsi
> index d1e13d340e26..dac51e98204c 100644
> --- a/arch/arm64/boot/dts/mediatek/mt7622.dtsi
> +++ b/arch/arm64/boot/dts/mediatek/mt7622.dtsi
> @@ -931,6 +931,5 @@
>  			     "syscon";
>  		reg = <0 0x1b128000 0 0x3000>;
>  		#clock-cells = <1>;
> -		mediatek,physpeed = "2500";
>  	};
>  };
> 

^ permalink raw reply

* Re: [PATCH net] netdevsim: Restore per-network namespace accounting for fib entries
From: Jiri Pirko @ 2019-08-28 10:37 UTC (permalink / raw)
  To: David Ahern; +Cc: davem, netdev, David Ahern
In-Reply-To: <20190806191517.8713-1-dsahern@kernel.org>

Tue, Aug 06, 2019 at 09:15:17PM CEST, dsahern@kernel.org wrote:
>From: David Ahern <dsahern@gmail.com>
>
>Prior to the commit in the fixes tag, the resource controller in netdevsim
>tracked fib entries and rules per network namespace. Restore that behavior.

David, please help me understand. If the counters are per-device, not
per-netns, they are both the same. If a device (devlink instance) is in
a netns and takes only things happening in this netns into account, it
should count exactly the same number of fib entries, shouldn't it?

I rethought the devlink netns patchset and currently I'm going in a
slightly different direction. I'm having netns as an attribute of
devlink reload. So all the port netdevices and everything get
re-instantiated into the new netns. Works fine with mlxsw. There we also
re-register the fib notifier.

I think that this can work for your usecase in netdevsim too:
1) devlink instance is registering a fib notifier to track all fib
   entries in a namespace it belongs to. The counters are per-device -
   counting fib entries in a namespace the device is in.
2) another devlink instance can do the same tracking in the same
   namespace. No problem, it's a separate counter, but the numbers are
   the same. One can set different limits to different devlink
   instances, but you can have only one. That is the behaviour you have
   now.
3) on devlink reload, netdevsim re-instantiates ports and re-registers
   fib notifier
4) on devlink reload with netns change, all should be fine as the
   re-registered fib notifier replays the entries. The ports are
   re-instantiated in new netns.

This way, we would get consistent behaviour between netdevsim and real
devices (mlxsw), correct devlink-netns implementation (you also
suggested to move ports to the namespace). Everyone should be happy.

What do you think?

^ permalink raw reply

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
From: kbuild test robot @ 2019-08-28 10:38 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: kbuild-all, luto, davem, daniel, netdev, bpf, kernel-team,
	linux-api
In-Reply-To: <20190827205213.456318-1-ast@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 1553 bytes --]

Hi Alexei,

I love your patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Alexei-Starovoitov/bpf-capabilities-introduce-CAP_BPF/20190828-142441
base:   https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: ia64-defconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 7.4.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.4.0 make.cross ARCH=ia64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   kernel/bpf/core.c: In function 'cap_bpf_tracing':
>> kernel/bpf/core.c:2110:31: error: implicit declaration of function 'perf_paranoid_tracepoint_raw' [-Werror=implicit-function-declaration]
            (capable(CAP_BPF) && !perf_paranoid_tracepoint_raw());
                                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +/perf_paranoid_tracepoint_raw +2110 kernel/bpf/core.c

  2106	
  2107	bool cap_bpf_tracing(void)
  2108	{
  2109		return capable(CAP_SYS_ADMIN) ||
> 2110		       (capable(CAP_BPF) && !perf_paranoid_tracepoint_raw());
  2111	}
  2112	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 19214 bytes --]

^ permalink raw reply

* RE: Query on possible bug in the can_create_echo_skb() API
From: Srinivas Neeli @ 2019-08-28 11:02 UTC (permalink / raw)
  To: Marc Kleine-Budde, wg@grandegger.com
  Cc: Srinivas Goud, Naga Sureshkumar Relli,
	Appana Durga Kedareswara Rao, linux-can@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <6bd3a657-dd8a-03a5-1e7c-bac532008f6e@pengutronix.de>

Hi,

Case 1:
can_put_echo_skb(); -> skb = can_create_echo_skb(skb); -> return skb;

In can_create_echo_skb() the skb is not shared, so we return the old skb.
The return value is stored in "skb", but that is only a local pointer; handing a new value back to the caller would need a double pointer.
A single pointer is used instead of a double pointer. In this scenario that is OK, because we return the same skb.

Case 2:
can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max); -> skb = can_create_echo_skb(skb); -> can_skb_set_owner(nskb, skb->sk); - Returning nskb;

shared_skb scenario:
In the shared-skb case, “can_create_echo_skb(skb);” returns a new skb. To store the new skb, a double pointer is needed.

Here is an example of how to overcome the above issue.
Example:
can_put_echo_skb(struct sk_buff **skb,struct net_device *dev,unsigned int idx);

If you are OK with this change, I will send a patch.


Thanks
Srinivas Neeli

> -----Original Message-----
> From: Marc Kleine-Budde <mkl@pengutronix.de>
> Sent: Wednesday, August 28, 2019 1:03 AM
> To: Srinivas Neeli <sneeli@xilinx.com>; wg@grandegger.com
> Cc: Srinivas Goud <sgoud@xilinx.com>; Naga Sureshkumar Relli
> <nagasure@xilinx.com>; Appana Durga Kedareswara Rao
> <appanad@xilinx.com>; linux-can@vger.kernel.org
> Subject: Re: Query on possible bug in the can_create_echo_skb() API
> 
> Hello Srinivas Neeli,
> 
> please don't send HTML messages to the kernel mailinglists.
> 
> On 8/21/19 12:51 PM, Srinivas Neeli wrote:
> > While walking through the CAN core layer dev.c file in the
> > can_put_echo_skb() API [1], Seems to be there is a race condition in
> > the
> > can_create_echo_skb() API, more details below
> >
> > If the skb is a shared skb, we are overwriting the skb pointer [2] in
> > the can_create_echo_skb() API and returning the new skb back.
> 
> Where and how is the skb pointer overwritten? Can you explain a bit more.
> 
> > If the core layer/drivers use this skb it is not valid any more (it
> > may lead to crash/oops).
> >
> >
> >
> > A possible solution for this issue would make the function input
> > argument should be double-pointer.
> >
> > Please correct me if my analyzation is wrong.
> 
> Can you provide a patch of your proposed changes?
> 
> regards,
> Marc
> 
> --
> Pengutronix e.K.                  | Marc Kleine-Budde           |
> Industrial Linux Solutions        | Phone: +49-231-2826-924     |
> Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
> Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


^ permalink raw reply

* multipath tcp MIB counter placement - share with tcp or extra?
From: Florian Westphal @ 2019-08-28 11:43 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

Hi Eric,

The out-of-tree multipath TCP stack adds a few MIB counters to track
(and debug) MPTCP behaviour.  Examples:

        SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE),
        SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE),
[..]
        SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
        SNMP_MIB_ITEM("MPFailRX", MPTCP_MIB_MPFAILRX),
        SNMP_MIB_ITEM("MPCsumFail", MPTCP_MIB_CSUMFAIL),

and so on.

I think that such MIB counters would be good to have in the 'upstreaming'
attempt as well.

The out-of-tree code keeps them separate from the tcp mib counters and also
exposes them in a different /proc file (/proc/net/mptcp_net/snmp).

Would you be ok with mptcp-upstreaming adding its MIB counters to the
existing TCP MIB instead?

This would make 'nstat' and other tools pick them up automatically.
It would also help TCP highlevel debugging to see if MPTCP is involved
in any way.

Let me know -- I can go with a separate MIB, it's no problem, I just want
to avoid going down the wrong path.

Thanks,
Florian

^ permalink raw reply

* Re: [PATCH net-next v2 3/3] dpaa2-eth: Add pause frame support
From: Andrew Lunn @ 2019-08-28 11:52 UTC (permalink / raw)
  To: Ioana Ciocoi Radulescu
  Cc: netdev@vger.kernel.org, davem@davemloft.net, Ioana Ciornei
In-Reply-To: <AM0PR04MB499496AC09FD7BE58AE7B9C394A30@AM0PR04MB4994.eurprd04.prod.outlook.com>

> Clearing the ASYM_PAUSE flag only means we tell the firmware we want
> both Rx and Tx pause to be enabled in the beginning. User can still set
> an asymmetric config (i.e. only Rx pause or only Tx pause to be enabled)
> if needed.
> 
> The truth table is like this:
> 
> PAUSE | ASYM_PAUSE | Rx pause | Tx pause
> ----------------------------------------
>   0   |     0      | disabled | disabled
>   0   |     1      | disabled | enabled
>   1   |     0      | enabled  | enabled
>   1   |     1      | enabled  | disabled

Hi Ioana

Ah, that is not intuitive. Please add a comment, and maybe this table
to the commit message.

Thanks
	Andrew

^ permalink raw reply

* Re: [PATCH net-next v3 05/10] net: sched: add API for registering unlocked offload block callbacks
From: tanhuazhong @ 2019-08-28 11:53 UTC (permalink / raw)
  To: Vlad Buslov, netdev
  Cc: jhs, xiyou.wangcong, jiri, davem, jakub.kicinski, pablo,
	Jiri Pirko
In-Reply-To: <20190826134506.9705-6-vladbu@mellanox.com>



On 2019/8/26 21:45, Vlad Buslov wrote:
> Extend struct flow_block_offload with "unlocked_driver_cb" flag to allow
> registering and unregistering block hardware offload callbacks that do not
> require caller to hold rtnl lock. Extend tcf_block with additional
> lockeddevcnt counter that is incremented for each non-unlocked driver
> callback attached to device. This counter is necessary to conditionally
> obtain rtnl lock before calling hardware callbacks in following patches.
> 
> Register mlx5 tc block offload callbacks as "unlocked".
> 
> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> Acked-by: Jiri Pirko <jiri@mellanox.com>
> ---
>   drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 ++
>   drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  | 3 +++
>   include/net/flow_offload.h                        | 1 +
>   include/net/sch_generic.h                         | 1 +
>   net/sched/cls_api.c                               | 6 ++++++
>   5 files changed, 13 insertions(+)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index fa4bf2d4bcd4..8592b98d0e70 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -3470,10 +3470,12 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
>   			  void *type_data)
>   {
>   	struct mlx5e_priv *priv = netdev_priv(dev);
> +	struct flow_block_offload *f = type_data;
>   
>   	switch (type) {
>   #ifdef CONFIG_MLX5_ESWITCH
>   	case TC_SETUP_BLOCK:
> +		f->unlocked_driver_cb = true;
>   		return flow_block_cb_setup_simple(type_data,
>   						  &mlx5e_block_cb_list,
>   						  mlx5e_setup_tc_block_cb,
Hi,

I got the warning below when compiling the latest net-next:
drivers/net/ethernet/mellanox//mlx5/core/en_main.c:3473:29: warning: 
unused variable ‘f’ [-Wunused-variable]
   struct flow_block_offload *f = type_data;

Could this variable be defined within "#ifdef CONFIG_MLX5_ESWITCH"?
BTW, it seems variable f is not used anywhere apart from
assigning true to its member unlocked_driver_cb in "case
TC_SETUP_BLOCK:". Maybe I have missed something about it :).

Huazhong.
Thanks.

> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
> index 3c0d36b2b91c..e7ac6233037d 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
> @@ -763,6 +763,7 @@ mlx5e_rep_indr_setup_tc_block(struct net_device *netdev,
>   	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
>   		return -EOPNOTSUPP;
>   
> +	f->unlocked_driver_cb = true;
>   	f->driver_block_list = &mlx5e_block_cb_list;
>   
>   	switch (f->command) {
> @@ -1245,9 +1246,11 @@ static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
>   			      void *type_data)
>   {
>   	struct mlx5e_priv *priv = netdev_priv(dev);
> +	struct flow_block_offload *f = type_data;
>   
>   	switch (type) {
>   	case TC_SETUP_BLOCK:
> +		f->unlocked_driver_cb = true;
>   		return flow_block_cb_setup_simple(type_data,
>   						  &mlx5e_rep_block_cb_list,
>   						  mlx5e_rep_setup_tc_cb,
> diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
> index 757fa84de654..fc881875f856 100644
> --- a/include/net/flow_offload.h
> +++ b/include/net/flow_offload.h
> @@ -284,6 +284,7 @@ struct flow_block_offload {
>   	enum flow_block_command command;
>   	enum flow_block_binder_type binder_type;
>   	bool block_shared;
> +	bool unlocked_driver_cb;
>   	struct net *net;
>   	struct flow_block *block;
>   	struct list_head cb_list;
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index c4fbbaff30a2..43f5b7ed02bd 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -408,6 +408,7 @@ struct tcf_block {
>   	bool keep_dst;
>   	atomic_t offloadcnt; /* Number of oddloaded filters */
>   	unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
> +	unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */
>   	struct {
>   		struct tcf_chain *chain;
>   		struct list_head filter_chain_list;
> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index 8b807e75fae2..1a39779bdbad 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -1418,6 +1418,8 @@ static int tcf_block_bind(struct tcf_block *block,
>   						  bo->extack);
>   		if (err)
>   			goto err_unroll;
> +		if (!bo->unlocked_driver_cb)
> +			block->lockeddevcnt++;
>   
>   		i++;
>   	}
> @@ -1433,6 +1435,8 @@ static int tcf_block_bind(struct tcf_block *block,
>   						    block_cb->cb_priv, false,
>   						    tcf_block_offload_in_use(block),
>   						    NULL);
> +			if (!bo->unlocked_driver_cb)
> +				block->lockeddevcnt--;
>   		}
>   		flow_block_cb_free(block_cb);
>   	}
> @@ -1454,6 +1458,8 @@ static void tcf_block_unbind(struct tcf_block *block,
>   					    NULL);
>   		list_del(&block_cb->list);
>   		flow_block_cb_free(block_cb);
> +		if (!bo->unlocked_driver_cb)
> +			block->lockeddevcnt--;
>   	}
>   }
>   
> 


^ permalink raw reply

* Re: Query on possible bug in the can_create_echo_skb() API
From: Marc Kleine-Budde @ 2019-08-28 12:15 UTC (permalink / raw)
  To: Srinivas Neeli, wg@grandegger.com
  Cc: Srinivas Goud, Naga Sureshkumar Relli,
	Appana Durga Kedareswara Rao, linux-can@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <BYAPR02MB546403642B2233DDD5C456C4AFA30@BYAPR02MB5464.namprd02.prod.outlook.com>


[-- Attachment #1.1: Type: text/plain, Size: 1442 bytes --]

On 8/28/19 1:02 PM, Srinivas Neeli wrote:
> Case 1:
> can_put_echo_skb(); -> skb = can_create_echo_skb(skb); -> return skb;
> 
> In can_create_echo_skb() not using the shared_skb, so we are returning the old skb.
> Storing the return value in "skb". But it's a pointer, for storing that need double pointer.
> Instead of double-pointer using a single pointer. In this scenario it's ok , we are returning the same SKB.
> 
> Case 2:
> can_put_echo_skb(skb, ndev, priv->tx_head % priv->tx_max); -> skb = can_create_echo_skb(skb); -> can_skb_set_owner(nskb, skb->sk); - Returning nskb;
> 
> shared_skb scenario:
> In share-skb case “can_create_echo_skb(skb);”  returning "new skb". For storing new skb need a double pointer.
> 
> Providing an example for overcoming above issue.
> Example:
> can_put_echo_skb(struct sk_buff **skb,struct net_device *dev,unsigned int idx);

Now I get what you mean.

> If you ok with this change, I will send a patch.

I think the can_put_echo_skb() API needs clarification. The driver is
not allowed to touch the skb any more after can_put_echo_skb(). We
should review the driver for this.

thanks,
Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: multipath tcp MIB counter placement - share with tcp or extra?
From: Eric Dumazet @ 2019-08-28 12:27 UTC (permalink / raw)
  To: Florian Westphal, Eric Dumazet; +Cc: netdev
In-Reply-To: <20190828114321.GG20113@breakpoint.cc>



On 8/28/19 1:43 PM, Florian Westphal wrote:
> Hi Eric,
> 
> The out-of-tree multipath TCP stack adds a few MIB counters to track
> (and debug) MTPCP behaviour.  Examples:
> 
>         SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE),
>         SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE),
> [..]
>         SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
>         SNMP_MIB_ITEM("MPFailRX", MPTCP_MIB_MPFAILRX),
>         SNMP_MIB_ITEM("MPCsumFail", MPTCP_MIB_CSUMFAIL),
> 
> and so on.
> 
> I think that such MIB counters would be good to have in the 'upstreaming'
> attempt as well.
> 
> The out-of-tree code keeps them separate from the tcp mib counters and also
> exposes them in a different /proc file (/proc/net/mptcp_net/snmp).
> 
> Would you be ok with mptcp-upstreaming adding its MIB counters to the
> existing TCP MIB instead?
> 
> This would make 'nstat' and other tools pick them up automatically.
> It would also help TCP highlevel debugging to see if MPTCP is involved
> in any way.
> 
> Let me know -- I can go with a separate MIB, its no problem, I just want
> to avoid going down the wrong path.

There are about 40 counters.

Space for that will be per netns : num_possible_cpus * 40 * 8  bytes

The cost of folding all the values will make nstat slower even if MPTCP is not used.

Maybe find a way to not have to fold the MPTCP percpu counters if MPTCP is not loaded ?

^ permalink raw reply

* Re: multipath tcp MIB counter placement - share with tcp or extra?
From: Florian Westphal @ 2019-08-28 12:32 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Florian Westphal, Eric Dumazet, netdev
In-Reply-To: <deb00e41-0188-0ca9-ccb3-b74b34a4cc5d@gmail.com>

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Let me know -- I can go with a separate MIB, its no problem, I just want
> > to avoid going down the wrong path.
> 
> There are about 40 counters.
> 
> Space for that will be per netns : num_possible_cpus * 40 * 8  bytes
> 
> The cost of folding all the values will make nstat slower even if MPTCP is not used.

Ok, so 'same proc file' would be fine but 'increase pcpu mem cost
unconditionally' isn't.

> Maybe find a way to not having to fold the MPTCP percpu counters if MPTCP is not loaded ?

MPTCP is builtin (bool).

However, we might be able to delay allocation until first mptcp socket
is requested, I will see if this can be done somehow.

Thanks Eric!

^ permalink raw reply

* [PATCH net v4 1/2] Revert "r8152: napi hangup fix after disconnect"
From: Hayes Wang @ 2019-08-28 12:56 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, Hayes Wang
In-Reply-To: <1394712342-15778-323-Taiwan-albertk@realtek.com>

This reverts commit 0ee1f4734967af8321ecebaf9c74221ace34f2d5.

Commit 0ee1f4734967 ("r8152: napi hangup fix after
disconnect") adds a check of RTL8152_UNPLUG to determine
whether calling napi_disable() is invalid in rtl8152_close()
when rtl8152_disconnect() is called. This avoids using
napi_disable() after calling netif_napi_del().

However, commit ffa9fec30ca0 ("r8152: set RTL8152_UNPLUG
only for real disconnection") means that RTL8152_UNPLUG
is not always set when calling rtl8152_disconnect().
Therefore, I have to revert commit 0ee1f4734967 ("r8152:
napi hangup fix after disconnect") first, and then submit
another patch to fix it.

Fixes: ffa9fec30ca0 ("r8152: set RTL8152_UNPLUG only for real disconnection")
Signed-off-by: Hayes Wang <hayeswang@realtek.com>
---
 drivers/net/usb/r8152.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index eee0f5007ee3..ad3abe26b51b 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -4021,8 +4021,7 @@ static int rtl8152_close(struct net_device *netdev)
 #ifdef CONFIG_PM_SLEEP
 	unregister_pm_notifier(&tp->pm_notifier);
 #endif
-	if (!test_bit(RTL8152_UNPLUG, &tp->flags))
-		napi_disable(&tp->napi);
+	napi_disable(&tp->napi);
 	clear_bit(WORK_ENABLE, &tp->flags);
 	usb_kill_urb(tp->intr_urb);
 	cancel_delayed_work_sync(&tp->schedule);
-- 
2.21.0


^ permalink raw reply related

* [PATCH net v4 2/2] r8152: remove calling netif_napi_del
From: Hayes Wang @ 2019-08-28 12:56 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, Hayes Wang
In-Reply-To: <1394712342-15778-323-Taiwan-albertk@realtek.com>

Remove the unnecessary use of netif_napi_del(). This also avoids calling
napi_disable() after netif_napi_del().

Fixes: ffa9fec30ca0 ("r8152: set RTL8152_UNPLUG only for real disconnection")
Signed-off-by: Hayes Wang <hayeswang@realtek.com>
---
 drivers/net/usb/r8152.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index ad3abe26b51b..04137ac373b0 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -5352,7 +5352,6 @@ static int rtl8152_probe(struct usb_interface *intf,
 	return 0;
 
 out1:
-	netif_napi_del(&tp->napi);
 	usb_set_intfdata(intf, NULL);
 out:
 	free_netdev(netdev);
@@ -5367,7 +5366,6 @@ static void rtl8152_disconnect(struct usb_interface *intf)
 	if (tp) {
 		rtl_set_unplug(tp);
 
-		netif_napi_del(&tp->napi);
 		unregister_netdev(tp->netdev);
 		cancel_delayed_work_sync(&tp->hw_phy_work);
 		tp->rtl_ops.unload(tp);
-- 
2.21.0


^ permalink raw reply related

* [PATCH net v4 0/2] r8152: fix side effect
From: Hayes Wang @ 2019-08-28 12:56 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, Hayes Wang
In-Reply-To: <1394712342-15778-314-Taiwan-albertk@realtek.com>

v4:
Add Fixes tag for both patch #1 and #2.

v3:
Update the commit message for patch #1.

v2:
Replace patch #2 with "r8152: remove calling netif_napi_del".

v1:
Commit 0ee1f4734967 ("r8152: napi hangup fix after disconnect")
adds a check to avoid using napi_disable() after netif_napi_del(). However,
commit ffa9fec30ca0 ("r8152: set RTL8152_UNPLUG only for real
disconnection") makes that check useless.

Therefore, I revert commit 0ee1f4734967 ("r8152: napi hangup fix
after disconnect") first, and add another patch to fix it.

Hayes Wang (2):
  Revert "r8152: napi hangup fix after disconnect"
  r8152: remove calling netif_napi_del

 drivers/net/usb/r8152.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

-- 
2.21.0


^ permalink raw reply

* Re: [PATCH 1/2] PTP: introduce new versions of IOCTLs
From: Richard Cochran @ 2019-08-28 12:57 UTC (permalink / raw)
  To: Felipe Balbi; +Cc: Joe Perches, Christopher S Hall, netdev, linux-kernel
In-Reply-To: <87k1axwvei.fsf@gmail.com>

On Wed, Aug 28, 2019 at 11:23:33AM +0300, Felipe Balbi wrote:
> Originally I had memset only on the three cases where they were
> needed. Richard, which do you prefer? I don't mind changing it back.

Go ahead and change it back.

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH REPOST 1/2] can: flexcan: fix deadlock when using self wakeup
From: Sean Nyekjaer @ 2019-08-28 13:24 UTC (permalink / raw)
  To: Joakim Zhang, mkl@pengutronix.de, linux-can@vger.kernel.org
  Cc: wg@grandegger.com, netdev@vger.kernel.org, dl-linux-imx,
	Martin Hundebøll
In-Reply-To: <35190c5b-f8be-8784-5b4f-32a691a6cffe@geanix.com>



On 20/08/2019 13.55, Sean Nyekjaer wrote:
> 
> I have added some more debug, same test setup:
> https://gist.github.com/sknsean/81208714de23aa3639d3e31dccb2f3e0
> 
> root@iwg26:~# systemctl suspend
> 
> ...
> https://gist.github.com/sknsean/2a786f1543305056d4de03d387872403
> 
> /Sean

Any luck reproducing this?

/Sean

^ permalink raw reply

* RE: [PATCH net-next v2 3/3] dpaa2-eth: Add pause frame support
From: Ioana Ciocoi Radulescu @ 2019-08-28 13:31 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev@vger.kernel.org, davem@davemloft.net, Ioana Ciornei
In-Reply-To: <20190828115250.GA32178@lunn.ch>

> -----Original Message-----
> From: Andrew Lunn <andrew@lunn.ch>
> Sent: Wednesday, August 28, 2019 2:53 PM
> To: Ioana Ciocoi Radulescu <ruxandra.radulescu@nxp.com>
> Cc: netdev@vger.kernel.org; davem@davemloft.net; Ioana Ciornei
> <ioana.ciornei@nxp.com>
> Subject: Re: [PATCH net-next v2 3/3] dpaa2-eth: Add pause frame support
> 
> > Clearing the ASYM_PAUSE flag only means we tell the firmware we want
> > both Rx and Tx pause to be enabled in the beginning. User can still set
> > an asymmetric config (i.e. only Rx pause or only Tx pause to be enabled)
> > if needed.
> >
> > The truth table is like this:
> >
> > PAUSE | ASYM_PAUSE | Rx pause | Tx pause
> > ----------------------------------------
> >   0   |     0      | disabled | disabled
> >   0   |     1      | disabled | enabled
> >   1   |     0      | enabled  | enabled
> >   1   |     1      | enabled  | disabled
> 
> Hi Ioana
> 
> Ah, that is not intuitive. Please add a comment, and maybe this table
> to the commit message.

I think firmware tried to mirror the ASM_DIR bit (see
http://www.ieee802.org/3/z/public/presentations/nov1996/asym.pdf),
but I agree it's not really user friendly. Will add comment in v3.

Thanks,
Ioana




^ permalink raw reply

* [PATCH net-next 11/15] net: sgi: ioc3-eth: use dma-direct for dma allocations
From: Thomas Bogendoerfer @ 2019-08-28 14:03 UTC (permalink / raw)
  To: Ralf Baechle, Paul Burton, James Hogan, David S. Miller,
	linux-mips, linux-kernel, netdev
In-Reply-To: <20190828140315.17048-1-tbogendoerfer@suse.de>

Replace the homegrown DMA memory allocation, which only works on
SGI-IP27 machines, with the generic dma allocations.

Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/sgi/ioc3-eth.c | 146 ++++++++++++++++++++++++++++--------
 1 file changed, 114 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index ecd3bc135cc0..aea3fecfac24 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -36,7 +36,6 @@
 #include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
-#include <linux/dma-mapping.h>
 #include <linux/gfp.h>
 
 #ifdef CONFIG_SERIAL_8250
@@ -49,6 +48,8 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
 #include <linux/skbuff.h>
+#include <linux/dma-direct.h>
+
 #include <net/ip.h>
 
 #include <asm/byteorder.h>
@@ -64,10 +65,12 @@
 #define RX_BUFFS		64
 #define RX_RING_ENTRIES		512		/* fixed in hardware */
 #define RX_RING_MASK		(RX_RING_ENTRIES - 1)
+#define RX_RING_SIZE		(RX_RING_ENTRIES * sizeof(u64))
 
 /* 128 TX buffers (not tunable) */
 #define TX_RING_ENTRIES		128
 #define TX_RING_MASK		(TX_RING_ENTRIES - 1)
+#define TX_RING_SIZE		(TX_RING_ENTRIES * sizeof(struct ioc3_etxd))
 
 /* IOC3 does dma transfers in 128 byte blocks */
 #define IOC3_DMA_XFER_LEN	128UL
@@ -83,9 +86,12 @@
 struct ioc3_private {
 	struct ioc3_ethregs *regs;
 	struct ioc3 *all_regs;
+	struct device *dma_dev;
 	u32 *ssram;
 	unsigned long *rxr;		/* pointer to receiver ring */
 	struct ioc3_etxd *txr;
+	dma_addr_t rxr_dma;
+	dma_addr_t txr_dma;
 	struct sk_buff *rx_skbs[RX_RING_ENTRIES];
 	struct sk_buff *tx_skbs[TX_RING_ENTRIES];
 	int rx_ci;			/* RX consumer index */
@@ -125,9 +131,11 @@ static inline unsigned long aligned_rx_skb_addr(unsigned long addr)
 	return (~addr + 1) & (IOC3_DMA_XFER_LEN - 1UL);
 }
 
-static inline int ioc3_alloc_skb(struct sk_buff **skb, struct ioc3_erxbuf **rxb)
+static inline int ioc3_alloc_skb(struct ioc3_private *ip, struct sk_buff **skb,
+				 struct ioc3_erxbuf **rxb, dma_addr_t *rxb_dma)
 {
 	struct sk_buff *new_skb;
+	dma_addr_t d;
 	int offset;
 
 	new_skb = alloc_skb(RX_BUF_SIZE + IOC3_DMA_XFER_LEN - 1, GFP_ATOMIC);
@@ -139,6 +147,14 @@ static inline int ioc3_alloc_skb(struct sk_buff **skb, struct ioc3_erxbuf **rxb)
 	if (offset)
 		skb_reserve(new_skb, offset);
 
+	d = dma_map_single(ip->dma_dev, new_skb->data,
+			   RX_BUF_SIZE, DMA_FROM_DEVICE);
+
+	if (dma_mapping_error(ip->dma_dev, d)) {
+		dev_kfree_skb_any(new_skb);
+		return -ENOMEM;
+	}
+	*rxb_dma = d;
 	*rxb = (struct ioc3_erxbuf *)new_skb->data;
 	skb_reserve(new_skb, RX_OFFSET);
 	*skb = new_skb;
@@ -146,17 +162,22 @@ static inline int ioc3_alloc_skb(struct sk_buff **skb, struct ioc3_erxbuf **rxb)
 	return 0;
 }
 
-static inline unsigned long ioc3_map(void *ptr, unsigned long vdev)
+#ifdef CONFIG_PCI_XTALK_BRIDGE
+static inline unsigned long ioc3_map(dma_addr_t addr, unsigned long attr)
 {
-#ifdef CONFIG_SGI_IP27
-	vdev <<= 57;   /* Shift to PCI64_ATTR_VIRTUAL */
+	return (addr & ~PCI64_ATTR_BAR) | attr;
+}
 
-	return vdev | (0xaUL << PCI64_ATTR_TARG_SHFT) | PCI64_ATTR_PREF |
-	       ((unsigned long)ptr & TO_PHYS_MASK);
+#define ERBAR_VAL	(ERBAR_BARRIER_BIT << ERBAR_RXBARR_SHIFT)
 #else
-	return virt_to_bus(ptr);
-#endif
+static inline unsigned long ioc3_map(dma_addr_t addr, unsigned long attr)
+{
+	return addr;
 }
+
+#define ERBAR_VAL	0
+#endif
+
 #define IOC3_SIZE 0x100000
 
 static inline u32 mcr_pack(u32 pulse, u32 sample)
@@ -523,6 +544,7 @@ static inline void ioc3_rx(struct net_device *dev)
 	int rx_entry, n_entry, len;
 	struct ioc3_erxbuf *rxb;
 	unsigned long *rxr;
+	dma_addr_t d;
 	u32 w0, err;
 
 	rxr = ip->rxr;		/* Ring base */
@@ -540,12 +562,13 @@ static inline void ioc3_rx(struct net_device *dev)
 			skb_put(skb, len);
 			skb->protocol = eth_type_trans(skb, dev);
 
-			if (ioc3_alloc_skb(&new_skb, &rxb)) {
+			if (ioc3_alloc_skb(ip, &new_skb, &rxb, &d)) {
 				/* Ouch, drop packet and just recycle packet
 				 * to keep the ring filled.
 				 */
 				dev->stats.rx_dropped++;
 				new_skb = skb;
+				d = rxr[rx_entry];
 				goto next;
 			}
 
@@ -554,6 +577,9 @@ static inline void ioc3_rx(struct net_device *dev)
 						     w0 & ERXBUF_IPCKSUM_MASK,
 						     len);
 
+			dma_unmap_single(ip->dma_dev, rxr[rx_entry],
+					 RX_BUF_SIZE, DMA_FROM_DEVICE);
+
 			netif_rx(skb);
 
 			ip->rx_skbs[rx_entry] = NULL;	/* Poison  */
@@ -566,15 +592,17 @@ static inline void ioc3_rx(struct net_device *dev)
 			 * recycle it.
 			 */
 			new_skb = skb;
+			d = rxr[rx_entry];
 			dev->stats.rx_errors++;
 		}
 		if (err & ERXBUF_CRCERR)	/* Statistics */
 			dev->stats.rx_crc_errors++;
 		if (err & ERXBUF_FRAMERR)
 			dev->stats.rx_frame_errors++;
+
 next:
 		ip->rx_skbs[n_entry] = new_skb;
-		rxr[n_entry] = cpu_to_be64(ioc3_map(rxb, 1));
+		rxr[n_entry] = cpu_to_be64(ioc3_map(d, PCI64_ATTR_BAR));
 		rxb->w0 = 0;				/* Clear valid flag */
 		n_entry = (n_entry + 1) & RX_RING_MASK;	/* Update erpir */
 
@@ -767,6 +795,26 @@ static void ioc3_mii_start(struct ioc3_private *ip)
 	add_timer(&ip->ioc3_timer);
 }
 
+static inline void ioc3_tx_unmap(struct ioc3_private *ip, int entry)
+{
+	struct ioc3_etxd *desc;
+	u32 cmd, bufcnt, len;
+
+	desc = &ip->txr[entry];
+	cmd = be32_to_cpu(desc->cmd);
+	bufcnt = be32_to_cpu(desc->bufcnt);
+	if (cmd & ETXD_B1V) {
+		len = (bufcnt & ETXD_B1CNT_MASK) >> ETXD_B1CNT_SHIFT;
+		dma_unmap_single(ip->dma_dev, be64_to_cpu(desc->p1),
+				 len, DMA_TO_DEVICE);
+	}
+	if (cmd & ETXD_B2V) {
+		len = (bufcnt & ETXD_B2CNT_MASK) >> ETXD_B2CNT_SHIFT;
+		dma_unmap_single(ip->dma_dev, be64_to_cpu(desc->p2),
+				 len, DMA_TO_DEVICE);
+	}
+}
+
 static inline void ioc3_clean_tx_ring(struct ioc3_private *ip)
 {
 	struct sk_buff *skb;
@@ -775,6 +823,7 @@ static inline void ioc3_clean_tx_ring(struct ioc3_private *ip)
 	for (i = 0; i < TX_RING_ENTRIES; i++) {
 		skb = ip->tx_skbs[i];
 		if (skb) {
+			ioc3_tx_unmap(ip, i);
 			ip->tx_skbs[i] = NULL;
 			dev_kfree_skb_any(skb);
 		}
@@ -794,8 +843,12 @@ static void ioc3_free_rx_bufs(struct ioc3_private *ip)
 
 	while (n_entry != rx_entry) {
 		skb = ip->rx_skbs[n_entry];
-		if (skb)
+		if (skb) {
+			dma_unmap_single(ip->dma_dev,
+					 be64_to_cpu(ip->rxr[n_entry]),
+					 RX_BUF_SIZE, DMA_FROM_DEVICE);
 			dev_kfree_skb_any(skb);
+		}
 
 		n_entry = (n_entry + 1) & RX_RING_MASK;
 	}
@@ -805,6 +858,7 @@ static int ioc3_alloc_rx_bufs(struct net_device *dev)
 {
 	struct ioc3_private *ip = netdev_priv(dev);
 	struct ioc3_erxbuf *rxb;
+	dma_addr_t d;
 	int i;
 
 	/* Now the rx buffers.  The RX ring may be larger but
@@ -812,10 +866,10 @@ static int ioc3_alloc_rx_bufs(struct net_device *dev)
 	 * this for performance and memory later.
 	 */
 	for (i = 0; i < RX_BUFFS; i++) {
-		if (ioc3_alloc_skb(&ip->rx_skbs[i], &rxb))
+		if (ioc3_alloc_skb(ip, &ip->rx_skbs[i], &rxb, &d))
 			return -ENOMEM;
 
-		ip->rxr[i] = cpu_to_be64(ioc3_map(rxb, 1));
+		ip->rxr[i] = cpu_to_be64(ioc3_map(d, PCI64_ATTR_BAR));
 	}
 	ip->rx_ci = 0;
 	ip->rx_pi = RX_BUFFS;
@@ -861,13 +915,7 @@ static void ioc3_init(struct net_device *dev)
 	readl(&regs->emcr);
 
 	/* Misc registers  */
-#ifdef CONFIG_SGI_IP27
-	/* Barrier on last store */
-	writel(PCI64_ATTR_BAR >> 32, &regs->erbar);
-#else
-	/* Let PCI API get it right */
-	writel(0, &regs->erbar);
-#endif
+	writel(ERBAR_VAL, &regs->erbar);
 	readl(&regs->etcdc);			/* Clear on read */
 	writel(15, &regs->ercsr);		/* RX low watermark  */
 	writel(0, &regs->ertr);			/* Interrupt immediately */
@@ -883,13 +931,13 @@ static void ioc3_start(struct ioc3_private *ip)
 	unsigned long ring;
 
 	/* Now the rx ring base, consume & produce registers.  */
-	ring = ioc3_map(ip->rxr, 0);
+	ring = ioc3_map(ip->rxr_dma, PCI64_ATTR_PREC);
 	writel(ring >> 32, &regs->erbr_h);
 	writel(ring & 0xffffffff, &regs->erbr_l);
 	writel(ip->rx_ci << 3, &regs->ercir);
 	writel((ip->rx_pi << 3) | ERPIR_ARM, &regs->erpir);
 
-	ring = ioc3_map(ip->txr, 0);
+	ring = ioc3_map(ip->txr_dma, PCI64_ATTR_PREC);
 
 	ip->txqlen = 0;					/* nothing queued  */
 
@@ -1163,6 +1211,7 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	ip = netdev_priv(dev);
 	ip->dev = dev;
+	ip->dma_dev = &pdev->dev;
 
 	dev->irq = pdev->irq;
 
@@ -1189,7 +1238,8 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	ioc3_stop(ip);
 
 	/* Allocate and rx ring.  4kb = 512 entries  */
-	ip->rxr = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
+	ip->rxr = dma_direct_alloc_pages(ip->dma_dev, RX_RING_SIZE,
+					 &ip->rxr_dma, GFP_ATOMIC, 0);
 	if (!ip->rxr) {
 		pr_err("ioc3-eth: rx ring allocation failed\n");
 		err = -ENOMEM;
@@ -1197,7 +1247,9 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	/* Allocate tx rings.  16kb = 128 bufs.  */
-	ip->txr = (struct ioc3_etxd *)__get_free_pages(GFP_KERNEL, 2);
+	ip->txr = dma_direct_alloc_pages(ip->dma_dev, TX_RING_SIZE,
+					 &ip->txr_dma,
+					 GFP_KERNEL | __GFP_ZERO, 0);
 	if (!ip->txr) {
 		pr_err("ioc3-eth: tx ring allocation failed\n");
 		err = -ENOMEM;
@@ -1256,8 +1308,12 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 out_stop:
 	del_timer_sync(&ip->ioc3_timer);
-	kfree(ip->rxr);
-	kfree(ip->txr);
+	if (ip->rxr)
+		dma_direct_free_pages(ip->dma_dev, RX_RING_SIZE, ip->rxr,
+				      ip->rxr_dma, 0);
+	if (ip->txr)
+		dma_direct_free_pages(ip->dma_dev, TX_RING_SIZE, ip->txr,
+				      ip->txr_dma, 0);
 out_res:
 	pci_release_regions(pdev);
 out_free:
@@ -1275,8 +1331,12 @@ static void ioc3_remove_one(struct pci_dev *pdev)
 	struct net_device *dev = pci_get_drvdata(pdev);
 	struct ioc3_private *ip = netdev_priv(dev);
 
-	kfree(ip->rxr);
-	kfree(ip->txr);
+	if (ip->rxr)
+		dma_direct_free_pages(ip->dma_dev, RX_RING_SIZE, ip->rxr,
+				      ip->rxr_dma, 0);
+	if (ip->txr)
+		dma_direct_free_pages(ip->dma_dev, TX_RING_SIZE, ip->txr,
+				      ip->txr_dma, 0);
 
 	unregister_netdev(dev);
 	del_timer_sync(&ip->ioc3_timer);
@@ -1382,18 +1442,32 @@ static netdev_tx_t ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		unsigned long b2 = (data | 0x3fffUL) + 1UL;
 		unsigned long s1 = b2 - data;
 		unsigned long s2 = data + len - b2;
+		dma_addr_t d1, d2;
 
 		desc->cmd    = cpu_to_be32(len | ETXD_INTWHENDONE |
 					   ETXD_B1V | ETXD_B2V | w0);
 		desc->bufcnt = cpu_to_be32((s1 << ETXD_B1CNT_SHIFT) |
 					   (s2 << ETXD_B2CNT_SHIFT));
-		desc->p1     = cpu_to_be64(ioc3_map(skb->data, 1));
-		desc->p2     = cpu_to_be64(ioc3_map((void *)b2, 1));
+		d1 = dma_map_single(ip->dma_dev, skb->data, s1, DMA_TO_DEVICE);
+		if (dma_mapping_error(ip->dma_dev, d1))
+			goto drop_packet;
+		d2 = dma_map_single(ip->dma_dev, (void *)b2, s1, DMA_TO_DEVICE);
+		if (dma_mapping_error(ip->dma_dev, d2)) {
+			dma_unmap_single(ip->dma_dev, d1, len, DMA_TO_DEVICE);
+			goto drop_packet;
+		}
+		desc->p1     = cpu_to_be64(ioc3_map(d1, PCI64_ATTR_PREF));
+		desc->p2     = cpu_to_be64(ioc3_map(d2, PCI64_ATTR_PREF));
 	} else {
+		dma_addr_t d;
+
 		/* Normal sized packet that doesn't cross a page boundary. */
 		desc->cmd = cpu_to_be32(len | ETXD_INTWHENDONE | ETXD_B1V | w0);
 		desc->bufcnt = cpu_to_be32(len << ETXD_B1CNT_SHIFT);
-		desc->p1     = cpu_to_be64(ioc3_map(skb->data, 1));
+		d = dma_map_single(ip->dma_dev, skb->data, len, DMA_TO_DEVICE);
+		if (dma_mapping_error(ip->dma_dev, d))
+			goto drop_packet;
+		desc->p1     = cpu_to_be64(ioc3_map(d, PCI64_ATTR_PREF));
 	}
 
 	mb(); /* make sure all descriptor changes are visible */
@@ -1411,6 +1485,14 @@ static netdev_tx_t ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	spin_unlock_irq(&ip->ioc3_lock);
 
 	return NETDEV_TX_OK;
+
+drop_packet:
+	dev_kfree_skb_any(skb);
+	dev->stats.tx_dropped++;
+
+	spin_unlock_irq(&ip->ioc3_lock);
+
+	return NETDEV_TX_OK;
 }
 
 static void ioc3_timeout(struct net_device *dev)
-- 
2.13.7


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox