Netdev List
 help / color / mirror / Atom feed
* [PATCH perf/core 3/5] perf tools: Move libbpf init in libbpf_init function
From: Jiri Olsa @ 2022-04-22 10:00 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko
  Cc: linux-perf-users, netdev, bpf, Ingo Molnar, Namhyung Kim,
	Alexander Shishkin, Peter Zijlstra, Martin KaFai Lau, Song Liu,
	Yonghong Song, John Fastabend, Ian Rogers
In-Reply-To: <20220422100025.1469207-1-jolsa@kernel.org>

Moving the libbpf init code into single function,
so we have single place doing that.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/perf/util/bpf-loader.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index b72cef1ae959..f8ad581ea247 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -99,16 +99,26 @@ static int bpf_perf_object__add(struct bpf_object *obj)
 	return perf_obj ? 0 : -ENOMEM;
 }
 
+static int libbpf_init(void)
+{
+	if (libbpf_initialized)
+		return 0;
+
+	libbpf_set_print(libbpf_perf_print);
+	libbpf_initialized = true;
+	return 0;
+}
+
 struct bpf_object *
 bpf__prepare_load_buffer(void *obj_buf, size_t obj_buf_sz, const char *name)
 {
 	LIBBPF_OPTS(bpf_object_open_opts, opts, .object_name = name);
 	struct bpf_object *obj;
+	int err;
 
-	if (!libbpf_initialized) {
-		libbpf_set_print(libbpf_perf_print);
-		libbpf_initialized = true;
-	}
+	err = libbpf_init();
+	if (err)
+		return ERR_PTR(err);
 
 	obj = bpf_object__open_mem(obj_buf, obj_buf_sz, &opts);
 	if (IS_ERR_OR_NULL(obj)) {
@@ -135,14 +145,13 @@ struct bpf_object *bpf__prepare_load(const char *filename, bool source)
 {
 	LIBBPF_OPTS(bpf_object_open_opts, opts, .object_name = filename);
 	struct bpf_object *obj;
+	int err;
 
-	if (!libbpf_initialized) {
-		libbpf_set_print(libbpf_perf_print);
-		libbpf_initialized = true;
-	}
+	err = libbpf_init();
+	if (err)
+		return ERR_PTR(err);
 
 	if (source) {
-		int err;
 		void *obj_buf;
 		size_t obj_buf_sz;
 
-- 
2.35.1


^ permalink raw reply related

* [PATCH perf/core 4/5] perf tools: Register perfkprobe libbpf section handler
From: Jiri Olsa @ 2022-04-22 10:00 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko
  Cc: linux-perf-users, netdev, bpf, Ingo Molnar, Namhyung Kim,
	Alexander Shishkin, Peter Zijlstra, Martin KaFai Lau, Song Liu,
	Yonghong Song, John Fastabend, Ian Rogers
In-Reply-To: <20220422100025.1469207-1-jolsa@kernel.org>

Perf is using section name to declare special kprobe arguments,
which no longer works with current libbpf, that either requires
certain form of the section name or allows to register custom
handler.

Adding support for 'perfkprobe/' section name handler to take
care of perf kprobe programs.

The handler servers two purposes:
  - allows perf programs to have special arguments in section name
  - allows perf to use pre-load callback where we can attach init
    code (zeroing all argument registers) to each perf program

The second is essential part of new prologue generation code,
that's coming in following patch.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/perf/util/bpf-loader.c | 50 ++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index f8ad581ea247..92dd8cc18edb 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -86,6 +86,7 @@ bpf_perf_object__next(struct bpf_perf_object *prev)
 	     (perf_obj) = (tmp), (tmp) = bpf_perf_object__next(tmp))
 
 static bool libbpf_initialized;
+static int libbpf_sec_handler;
 
 static int bpf_perf_object__add(struct bpf_object *obj)
 {
@@ -99,12 +100,61 @@ static int bpf_perf_object__add(struct bpf_object *obj)
 	return perf_obj ? 0 : -ENOMEM;
 }
 
+static struct bpf_insn prologue_init_insn[] = {
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_MOV64_IMM(BPF_REG_1, 0),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_MOV64_IMM(BPF_REG_3, 0),
+	BPF_MOV64_IMM(BPF_REG_4, 0),
+	BPF_MOV64_IMM(BPF_REG_5, 0),
+};
+
+#define LIBBPF_SEC_PREFIX "perfkprobe/"
+
+static int libbpf_prog_prepare_load_fn(struct bpf_program *prog,
+				       struct bpf_prog_load_opts *opts __maybe_unused,
+				       long cookie __maybe_unused)
+{
+	size_t init_size_cnt = ARRAY_SIZE(prologue_init_insn);
+	size_t orig_insn_cnt, insn_cnt, init_size, orig_size;
+	const struct bpf_insn *orig_insn;
+	struct bpf_insn *insn;
+
+	/* prepend initialization code to program instructions */
+	orig_insn = bpf_program__insns(prog);
+	orig_insn_cnt = bpf_program__insn_cnt(prog);
+	init_size = init_size_cnt * sizeof(*insn);
+	orig_size = orig_insn_cnt * sizeof(*insn);
+
+	insn_cnt = orig_insn_cnt + init_size_cnt;
+	insn = malloc(insn_cnt * sizeof(*insn));
+	if (!insn)
+		return -ENOMEM;
+
+	memcpy(insn, prologue_init_insn, init_size);
+	memcpy((char *) insn + init_size, orig_insn, orig_size);
+	bpf_program__set_insns(prog, insn, insn_cnt);
+	return 0;
+}
+
 static int libbpf_init(void)
 {
+	LIBBPF_OPTS(libbpf_prog_handler_opts, handler_opts,
+		.prog_prepare_load_fn = libbpf_prog_prepare_load_fn,
+	);
+
 	if (libbpf_initialized)
 		return 0;
 
 	libbpf_set_print(libbpf_perf_print);
+	libbpf_sec_handler = libbpf_register_prog_handler(LIBBPF_SEC_PREFIX,
+							  BPF_PROG_TYPE_KPROBE,
+							  0, &handler_opts);
+	if (libbpf_sec_handler < 0) {
+		pr_debug("bpf: failed to register libbpf section handler: %d\n",
+			 libbpf_sec_handler);
+		return -BPF_LOADER_ERRNO__INTERNAL;
+	}
 	libbpf_initialized = true;
 	return 0;
 }
-- 
2.35.1


^ permalink raw reply related

* [PATCH perf/core 5/5] perf tools: Rework prologue generation code
From: Jiri Olsa @ 2022-04-22 10:00 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko
  Cc: linux-perf-users, netdev, bpf, Ingo Molnar, Namhyung Kim,
	Alexander Shishkin, Peter Zijlstra, Martin KaFai Lau, Song Liu,
	Yonghong Song, John Fastabend, Ian Rogers
In-Reply-To: <20220422100025.1469207-1-jolsa@kernel.org>

Some functions we use for bpf prologue generation are going to be
deprecated. This change reworks current code not to use them.

We need to replace following functions/struct:
   bpf_program__set_prep
   bpf_program__nth_fd
   struct bpf_prog_prep_result

We use bpf_program__set_prep to hook perf callback before program
is loaded and provide new instructions with the prologue.

We workaround this by taking instructions for specific program,
attaching prologue to them and load such new ebpf programs with
prologue using separate bpf_prog_load calls (outside libbpf load
machinery).

Before we can take program instructions, we need libbpf to actually
load it. This way we get the final shape of its instructions (with
all relocations and verifier adjustments).

There's one glitch though.. perf kprobe program already assumes
generated prologue code with proper values in argument registers,
so loading such program directly will fail in the verifier.

That's where 'perfkprobe/' pre-load handler fits in and prepends
the initialization code to the program. Once such program is loaded
we take its instructions, cut off the initialization code and prepend
the prologue.

I know.. sorry ;-)

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 tools/perf/include/bpf/bpf.h                |   2 +-
 tools/perf/tests/bpf-script-example.c       |   2 +-
 tools/perf/tests/bpf-script-test-prologue.c |   2 +-
 tools/perf/util/bpf-loader.c                | 136 +++++++++++++++++---
 4 files changed, 120 insertions(+), 22 deletions(-)

diff --git a/tools/perf/include/bpf/bpf.h b/tools/perf/include/bpf/bpf.h
index b422aeef5339..91869f6fb672 100644
--- a/tools/perf/include/bpf/bpf.h
+++ b/tools/perf/include/bpf/bpf.h
@@ -50,7 +50,7 @@ static void (*bpf_tail_call)(void *ctx, void *map, int index) = (void *)BPF_FUNC
 #define SEC(NAME) __attribute__((section(NAME),  used))
 
 #define probe(function, vars) \
-	SEC(#function "=" #function " " #vars) function
+	SEC("perfkprobe/" #function "=" #function " " #vars) function
 
 #define syscall_enter(name) \
 	SEC("syscalls:sys_enter_" #name) syscall_enter_ ## name
diff --git a/tools/perf/tests/bpf-script-example.c b/tools/perf/tests/bpf-script-example.c
index ab4b98b3165d..56673fa1f30d 100644
--- a/tools/perf/tests/bpf-script-example.c
+++ b/tools/perf/tests/bpf-script-example.c
@@ -32,7 +32,7 @@ struct bpf_map_def SEC("maps") flip_table = {
 	.max_entries = 1,
 };
 
-SEC("func=do_epoll_wait")
+SEC("perfkprobe/func=do_epoll_wait")
 int bpf_func__SyS_epoll_pwait(void *ctx)
 {
 	int ind =0;
diff --git a/tools/perf/tests/bpf-script-test-prologue.c b/tools/perf/tests/bpf-script-test-prologue.c
index bd83d364cf30..00dac5a23938 100644
--- a/tools/perf/tests/bpf-script-test-prologue.c
+++ b/tools/perf/tests/bpf-script-test-prologue.c
@@ -26,7 +26,7 @@
 static void (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) =
 	(void *) 6;
 
-SEC("func=null_lseek file->f_mode offset orig")
+SEC("perfkprobe/func=null_lseek file->f_mode offset orig")
 int bpf_func__null_lseek(void *ctx, int err, unsigned long _f_mode,
 			 unsigned long offset, unsigned long orig)
 {
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 92dd8cc18edb..10151da862c8 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -9,6 +9,7 @@
 #include <linux/bpf.h>
 #include <bpf/libbpf.h>
 #include <bpf/bpf.h>
+#include <linux/filter.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
@@ -49,6 +50,7 @@ struct bpf_prog_priv {
 	struct bpf_insn *insns_buf;
 	int nr_types;
 	int *type_mapping;
+	int *proglogue_fds;
 };
 
 struct bpf_perf_object {
@@ -56,6 +58,11 @@ struct bpf_perf_object {
 	struct bpf_object *obj;
 };
 
+struct bpf_preproc_result {
+	struct bpf_insn *new_insn_ptr;
+	int new_insn_cnt;
+};
+
 static LIST_HEAD(bpf_objects_list);
 static struct hashmap *bpf_program_hash;
 static struct hashmap *bpf_map_hash;
@@ -238,14 +245,31 @@ struct bpf_object *bpf__prepare_load(const char *filename, bool source)
 	return obj;
 }
 
+static void close_prologue_programs(struct bpf_prog_priv *priv)
+{
+	struct perf_probe_event *pev;
+	int i, fd;
+
+	if (!priv->need_prologue)
+		return;
+	pev = &priv->pev;
+	for (i = 0; i < pev->ntevs; i++) {
+		fd = priv->proglogue_fds[i];
+		if (fd != -1)
+			close(fd);
+	}
+}
+
 static void
 clear_prog_priv(const struct bpf_program *prog __maybe_unused,
 		void *_priv)
 {
 	struct bpf_prog_priv *priv = _priv;
 
+	close_prologue_programs(priv);
 	cleanup_perf_probe_events(&priv->pev, 1);
 	zfree(&priv->insns_buf);
+	zfree(&priv->proglogue_fds);
 	zfree(&priv->type_mapping);
 	zfree(&priv->sys_name);
 	zfree(&priv->evt_name);
@@ -480,9 +504,15 @@ static int
 parse_prog_config(const char *config_str, const char **p_main_str,
 		  bool *is_tp, struct perf_probe_event *pev)
 {
+	const char *main_str, *parse_str;
 	int err;
-	const char *main_str = parse_prog_config_kvpair(config_str, pev);
 
+	/* Make sure it's our section with 'perfkprobe/' prefix check. */
+	if (!strstarts(config_str, LIBBPF_SEC_PREFIX))
+		return -EINVAL;
+
+	parse_str = config_str + sizeof(LIBBPF_SEC_PREFIX) - 1;
+	main_str  = parse_prog_config_kvpair(parse_str, pev);
 	if (IS_ERR(main_str))
 		return PTR_ERR(main_str);
 
@@ -608,8 +638,8 @@ static int bpf__prepare_probe(void)
 
 static int
 preproc_gen_prologue(struct bpf_program *prog, int n,
-		     struct bpf_insn *orig_insns, int orig_insns_cnt,
-		     struct bpf_prog_prep_result *res)
+		     const struct bpf_insn *orig_insns, int orig_insns_cnt,
+		     struct bpf_preproc_result *res)
 {
 	struct bpf_prog_priv *priv = program_priv(prog);
 	struct probe_trace_event *tev;
@@ -657,7 +687,6 @@ preproc_gen_prologue(struct bpf_program *prog, int n,
 
 	res->new_insn_ptr = buf;
 	res->new_insn_cnt = prologue_cnt + orig_insns_cnt;
-	res->pfd = NULL;
 	return 0;
 
 errout:
@@ -765,7 +794,7 @@ static int hook_load_preprocessor(struct bpf_program *prog)
 	struct bpf_prog_priv *priv = program_priv(prog);
 	struct perf_probe_event *pev;
 	bool need_prologue = false;
-	int err, i;
+	int i;
 
 	if (IS_ERR_OR_NULL(priv)) {
 		pr_debug("Internal error when hook preprocessor\n");
@@ -803,6 +832,13 @@ static int hook_load_preprocessor(struct bpf_program *prog)
 		return -ENOMEM;
 	}
 
+	priv->proglogue_fds = malloc(sizeof(int) * pev->ntevs);
+	if (!priv->proglogue_fds) {
+		pr_debug("Not enough memory: alloc prologue fds failed\n");
+		return -ENOMEM;
+	}
+	memset(priv->proglogue_fds, -1, sizeof(int) * pev->ntevs);
+
 	priv->type_mapping = malloc(sizeof(int) * pev->ntevs);
 	if (!priv->type_mapping) {
 		pr_debug("Not enough memory: alloc type_mapping failed\n");
@@ -811,13 +847,7 @@ static int hook_load_preprocessor(struct bpf_program *prog)
 	memset(priv->type_mapping, -1,
 	       sizeof(int) * pev->ntevs);
 
-	err = map_prologue(pev, priv->type_mapping, &priv->nr_types);
-	if (err)
-		return err;
-
-	err = bpf_program__set_prep(prog, priv->nr_types,
-				    preproc_gen_prologue);
-	return err;
+	return map_prologue(pev, priv->type_mapping, &priv->nr_types);
 }
 
 int bpf__probe(struct bpf_object *obj)
@@ -924,6 +954,77 @@ int bpf__unprobe(struct bpf_object *obj)
 	return ret;
 }
 
+static int bpf_object__load_prologue(struct bpf_object *obj)
+{
+	int init_cnt = ARRAY_SIZE(prologue_init_insn);
+	const struct bpf_insn *orig_insns;
+	struct bpf_preproc_result res;
+	struct perf_probe_event *pev;
+	struct bpf_program *prog;
+	int orig_insns_cnt;
+
+	bpf_object__for_each_program(prog, obj) {
+		struct bpf_prog_priv *priv = program_priv(prog);
+		int err, i, fd;
+
+		if (IS_ERR_OR_NULL(priv)) {
+			pr_debug("bpf: failed to get private field\n");
+			return -BPF_LOADER_ERRNO__INTERNAL;
+		}
+
+		if (!priv->need_prologue)
+			continue;
+
+		/*
+		 * For each program that needs prologue we do following:
+		 *
+		 * - take its current instructions and use them
+		 *   to generate the new code with prologue
+		 * - load new instructions with bpf_prog_load
+		 *   and keep the fd in proglogue_fds
+		 * - new fd will be used in bpf__foreach_event
+		 *   to connect this program with perf evsel
+		 */
+		orig_insns = bpf_program__insns(prog);
+		orig_insns_cnt = bpf_program__insn_cnt(prog);
+
+		pev = &priv->pev;
+		for (i = 0; i < pev->ntevs; i++) {
+			/*
+			 * Skipping artificall prologue_init_insn instructions
+			 * (init_cnt), so the prologue can be generated instead
+			 * of them.
+			 */
+			err = preproc_gen_prologue(prog, i,
+						   orig_insns + init_cnt,
+						   orig_insns_cnt - init_cnt,
+						   &res);
+			if (err)
+				return err;
+
+			fd = bpf_prog_load(bpf_program__get_type(prog),
+					   bpf_program__name(prog), "GPL",
+					   res.new_insn_ptr,
+					   res.new_insn_cnt, NULL);
+			if (fd < 0) {
+				char bf[128];
+
+				libbpf_strerror(-errno, bf, sizeof(bf));
+				pr_debug("bpf: load objects with prologue failed: err=%d: (%s)\n",
+					 -errno, bf);
+				return -errno;
+			}
+			priv->proglogue_fds[i] = fd;
+		}
+		/*
+		 * We no longer need the original program,
+		 * we can unload it.
+		 */
+		bpf_program__unload(prog);
+	}
+	return 0;
+}
+
 int bpf__load(struct bpf_object *obj)
 {
 	int err;
@@ -935,7 +1036,7 @@ int bpf__load(struct bpf_object *obj)
 		pr_debug("bpf: load objects failed: err=%d: (%s)\n", err, bf);
 		return err;
 	}
-	return 0;
+	return bpf_object__load_prologue(obj);
 }
 
 int bpf__foreach_event(struct bpf_object *obj,
@@ -970,13 +1071,10 @@ int bpf__foreach_event(struct bpf_object *obj,
 		for (i = 0; i < pev->ntevs; i++) {
 			tev = &pev->tevs[i];
 
-			if (priv->need_prologue) {
-				int type = priv->type_mapping[i];
-
-				fd = bpf_program__nth_fd(prog, type);
-			} else {
+			if (priv->need_prologue)
+				fd = priv->proglogue_fds[i];
+			else
 				fd = bpf_program__fd(prog);
-			}
 
 			if (fd < 0) {
 				pr_debug("bpf: failed to get file descriptor\n");
-- 
2.35.1


^ permalink raw reply related

* Re: Accessing XDP packet memory from the end
From: Alexander Lobakin @ 2022-04-22 10:06 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Alexander Lobakin, Larysa Zaremba, bpf, brouer, netdev,
	Andrii Nakryiko, Alexei Starovoitov, Daniel Borkmann,
	Toke Hoiland-Jorgensen, Magnus Karlsson, Maciej Fijalkowski,
	xdp-hints@xdp-project.net
In-Reply-To: <f1417959-4b0d-7c33-4b2b-08989bb86b23@redhat.com>

From: Jesper Dangaard Brouer <jbrouer@redhat.com>
Date: Thu, 21 Apr 2022 18:27:47 +0200

> On 21/04/2022 17.56, Larysa Zaremba wrote:
> > Dear all,
> > Our team has encountered a need of accessing data_meta in a following way:
> > 
> > int xdp_meta_prog(struct xdp_md *ctx)
> > {
> > 	void *data_meta_ptr = (void *)(long)ctx->data_meta;
> > 	void *data_end = (void *)(long)ctx->data_end;
> > 	void *data = (void *)(long)ctx->data;
> > 	u64 data_size = sizeof(u32);
> > 	u32 magic_meta;
> > 	u8 offset;
> > 
> > 	offset = (u8)((s64)data - (s64)data_meta_ptr);
> 
> I'm not sure the verifier can handle this 'offset' calc. As it cannot
> statically know the sized based on this statement. Maybe this is not the
> issue.
> 
> > 	if (offset < data_size) {
> > 		bpf_printk("invalid offset: %ld\n", offset);
> > 		return XDP_DROP;
> > 	}
> > 
> > 	data_meta_ptr += offset;
> > 	data_meta_ptr -= data_size;
> > 
> > 	if (data_meta_ptr + data_size > data) {
> > 		return XDP_DROP;
> > 	}
> > 		
> > 	magic_meta = *((u32 *)data);
> > 	bpf_printk("Magic: %d\n", magic_meta);
> > 	return XDP_PASS;
> > }
> > 
> > Unfortunately, verifier claims this code attempts to access packet with
> > an offset of -2 (a constant part) and negative offset is generally forbidden.
> 
> Are you forgetting to mention:
>   - Have you modified the NIC driver to adjust data_meta pointer and 
> provide info in this area?

Exactly. Previously, @data_meta == @data prior to running BPF
program in 100% cases. Now, the driver can provide arbitrary
metadata and set @data_meta to be @data - 32, data - 48 or so.

> 
> p.s. this is exactly what I'm also working towards[1], so I'll be happy
> to collaborate.  I'm missing the driver code, as link[1] is focused on
> decoding BTF data_meta area in userspace for AF_XDP.

Yeah, we're almost about to post a first RFC to LKML. This issue is
the last one, the rest just needs to be rebased to fix some minors
and polish the code.
It will contain the kernel core part and the driver part (only ice
for now). Then we could e.g. fuse it with your changes (we weren't
touching AF_XDP part) etc.
But for now, until an RFC is posted, you could take a look at the
code in my GH[0] if you're wish :) The second half of the ice code
is not committed yet tho.

> 
> [1] 
> https://github.com/xdp-project/bpf-examples/tree/master/AF_XDP-interaction
> 
> > For now we have 2 solutions, one is using bpf_xdp_adjust_meta(),
> > which is pretty good, but not ideal for the hot path.
> > The second one is the patch at the end.
> > 
> 
> Are you saying, verifier cannot handle that driver changed data_meta 
> pointer and provided info there (without calling bpf_xdp_adjust_meta)?

Correct. I suspect the verifier just assumes that @data_meta always
equals @data when executing BPF prog.
Let's assume:

	offset = data - data_meta; // 64 bytes
	data_meta += offset; // equals to data now
	/* Let's say xdp_meta_generic is 48 bytes long, then */
	data_meta -= sizeof(struct xdp_meta_generic);
	/* data_meta is now 16 bytes past the original data_meta,
	 * or data - 48.
	 */
	bpf_printk("magic: 0x%04x\n",
		   ((struct xdp_meta_generic)data_meta)->magic);

So in fact, this code is absolutely correct, it doesn't go past the
bounds in either direction, but the verifier claims it goes out of
bounds to the left by 48 bytes (not counting the offsetof).
OTOH,

	data_meta = (void *)ctx->data_meta;
	bpf_printk("magic: 0x%04x\n",
		   ((struct xdp_meta_generic)data_meta)->magic);

works with no issues. The verifier still thinks @data_meta == @data,
but this code effectively accesses the metadata, not the frame
itself.

> 
> 
> > Do you see any other way of accessing memory from the end of data_meta/data?
> > What do you think about both suggested solutions?
> > 
> > Best regards,
> > Larysa Zaremba
> > 
> > ---
> > 
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -3576,8 +3576,11 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
> >   	}
> >   
> >   	err = reg->range < 0 ? -EINVAL :
> > -	      __check_mem_access(env, regno, off, size, reg->range,
> > -				 zero_size_allowed);
> > +	      __check_mem_access(env, regno, off + reg->smin_value, size,
> > +				 reg->range + reg->smin_value, zero_size_allowed);
> > +	err = err ? :
> > +	      __check_mem_access(env, regno, off + reg->umax_value, size,
> > +				 reg->range + reg->umax_value, zero_size_allowed);
> >   	if (err) {
> >   		verbose(env, "R%d offset is outside of the packet\n", regno);
> >   		return err;

[0] https://github.com/alobakin/linux/commits/xdp_hints

Thanks,
Al

^ permalink raw reply

* Re: [PATCH net-next] nfp: support 802.1ad VLAN assingment to VF
From: patchwork-bot+netdevbpf @ 2022-04-22 10:10 UTC (permalink / raw)
  To: Simon Horman; +Cc: davem, kuba, netdev, oss-drivers
In-Reply-To: <20220419124443.271047-1-simon.horman@corigine.com>

Hello:

This patch was applied to netdev/net-next.git (master)
by David S. Miller <davem@davemloft.net>:

On Tue, 19 Apr 2022 14:44:43 +0200 you wrote:
> From: Baowen Zheng <baowen.zheng@corigine.com>
> 
> The NFP driver already supports assignment of 802.1Q VLANs to VFs
> 
> e.g.
>  # ip link set $DEV vf $VF_NUM vlan $VLAN_ID [proto 802.1Q]
> 
> [...]

Here is the summary with links:
  - [net-next] nfp: support 802.1ad VLAN assingment to VF
    https://git.kernel.org/netdev/net-next/c/59359597b010

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH net-next] Revert "rtnetlink: return EINVAL when request cannot succeed"
From: patchwork-bot+netdevbpf @ 2022-04-22 10:10 UTC (permalink / raw)
  To: Florent Fourcot; +Cc: netdev, stephen, brian.baboch
In-Reply-To: <20220419125151.15589-1-florent.fourcot@wifirst.fr>

Hello:

This patch was applied to netdev/net-next.git (master)
by David S. Miller <davem@davemloft.net>:

On Tue, 19 Apr 2022 14:51:51 +0200 you wrote:
> This reverts commit b6177d3240a4
> 
> ip-link command is testing kernel capability by sending a RTM_NEWLINK
> request, without any argument. It accepts everything in reply, except
> EOPNOTSUPP and EINVAL (functions iplink_have_newlink / accept_msg)
> 
> So we must keep compatiblity here, invalid empty message should not
> return EINVAL
> 
> [...]

Here is the summary with links:
  - [net-next] Revert "rtnetlink: return EINVAL when request cannot succeed"
    https://git.kernel.org/netdev/net-next/c/6f37c9f9dfbf

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* [PATCH net-next 0/8] DSA selftests
From: Vladimir Oltean @ 2022-04-22 10:14 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver, Tobias Waldekranz,
	Mattias Forsblad, Roopa Prabhu, Nikolay Aleksandrov, Jiri Pirko,
	Ido Schimmel, Joachim Wiberg, Marek Behún, Ansuel Smith,
	DENG Qingfang, Kurt Kanzenbach

When working on complex new features or reworks it becomes increasingly
difficult to ensure there aren't regressions being introduced, and
therefore it would be nice if we could go over the functionality we
already have and write some tests for it.

Verbally I know from Tobias Waldekranz that he has been working on some
selftests for DSA, yet I have never seen them, so here I am adding some
tests I have written which have been useful for me. The list is by no
means complete (it only covers elementary functionality), but it's still
good to have as a starting point. I also borrowed some refactoring
changes from Joachim Wiberg that he submitted for his "net: bridge:
forwarding of unknown IPv4/IPv6/MAC BUM traffic" series, but not the
entirety of his selftests. I now think that his selftests have some
overlap with bridge_vlan_unaware.sh and bridge_vlan_aware.sh and they
should be more tightly integrated with each other - yet I didn't do that
either :). Another issue I had with his selftests was that they jumped
straight ahead to configure brport flags on br0 (a radical new idea
still at RFC status) while we have bigger problems, and we don't have
nearly enough coverage for the *existing* functionality.

One idea introduced here which I haven't seen before is the symlinking
of relevant forwarding selftests to the selftests/drivers/net/<my-driver>/
folder, plus a forwarding.config file. I think there's some value in
having things structured this way, since the forwarding dir has so many
selftests that aren't relevant to DSA that it is a bit difficult to find
the ones that are.

While searching for applications that I could use for multicast testing
(not my domain of interest/knowledge really), I found Joachim Wiberg's
mtools, mcjoin and omping, and I tried them all with various degrees of
success. In particular, I was going to use mcjoin, but I faced some
issues getting IPv6 multicast traffic to work in a VRF, and I bothered
David Ahern about it here:
https://lore.kernel.org/netdev/97eaffb8-2125-834e-641f-c99c097b6ee2@gmail.com/t/
It seems that the problem is that this application should use
SO_BINDTODEVICE, yet it doesn't.

So I ended up patching the bare-bones mtools (msend, mreceive) forked by
Joachim from the University of Virginia's Multimedia Networks Group to
include IPv6 support, and to use SO_BINDTODEVICE. This is what I'm using
now for IPv6.

Note that mausezahn doesn't appear to do a particularly good job of
supporting IPv6 really, and I needed a program to emit the actual
IP_ADD_MEMBERSHIP calls, for dev_mc_add(), so I could test RX filtering.
Crafting the IGMP/MLD reports by hand doesn't really do the trick.
While extremely bare-bones, the mreceive application now seems to do
what I need it to.

Feedback appreciated, it is very likely that I could have done things in
a better way.

Joachim Wiberg (2):
  selftests: forwarding: add TCPDUMP_EXTRA_FLAGS to lib.sh
  selftests: forwarding: multiple instances in tcpdump helper

Vladimir Oltean (6):
  selftests: forwarding: add option to run tests with stable MAC
    addresses
  selftests: forwarding: add helpers for IP multicast group joins/leaves
  selftests: forwarding: add helper for retrieving IPv6 link-local
    address of interface
  selftests: forwarding: add a no_forwarding.sh test
  selftests: forwarding: add a test for local_termination.sh
  selftests: drivers: dsa: add a subset of forwarding selftests

 .../drivers/net/dsa/bridge_locked_port.sh     |   1 +
 .../selftests/drivers/net/dsa/bridge_mdb.sh   |   1 +
 .../selftests/drivers/net/dsa/bridge_mld.sh   |   1 +
 .../drivers/net/dsa/bridge_vlan_aware.sh      |   1 +
 .../drivers/net/dsa/bridge_vlan_mcast.sh      |   1 +
 .../drivers/net/dsa/bridge_vlan_unaware.sh    |   1 +
 .../drivers/net/dsa/forwarding.config         |   2 +
 .../testing/selftests/drivers/net/dsa/lib.sh  |   1 +
 .../drivers/net/dsa/local_termination.sh      |   1 +
 .../drivers/net/dsa/no_forwarding.sh          |   1 +
 .../drivers/net/ocelot/tc_flower_chains.sh    |  24 +-
 tools/testing/selftests/net/forwarding/lib.sh | 112 ++++++-
 .../net/forwarding/local_termination.sh       | 299 ++++++++++++++++++
 .../selftests/net/forwarding/no_forwarding.sh | 261 +++++++++++++++
 14 files changed, 687 insertions(+), 20 deletions(-)
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_locked_port.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_mdb.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_mld.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_vlan_aware.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_vlan_mcast.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_vlan_unaware.sh
 create mode 100644 tools/testing/selftests/drivers/net/dsa/forwarding.config
 create mode 120000 tools/testing/selftests/drivers/net/dsa/lib.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/local_termination.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/no_forwarding.sh
 mode change 100644 => 100755 tools/testing/selftests/net/forwarding/lib.sh
 create mode 100755 tools/testing/selftests/net/forwarding/local_termination.sh
 create mode 100755 tools/testing/selftests/net/forwarding/no_forwarding.sh

-- 
2.25.1


^ permalink raw reply

* [PATCH net-next 1/8] selftests: forwarding: add option to run tests with stable MAC addresses
From: Vladimir Oltean @ 2022-04-22 10:14 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver, Tobias Waldekranz,
	Mattias Forsblad, Roopa Prabhu, Nikolay Aleksandrov, Jiri Pirko,
	Ido Schimmel, Joachim Wiberg, Marek Behún, Ansuel Smith,
	DENG Qingfang, Kurt Kanzenbach
In-Reply-To: <20220422101504.3729309-1-vladimir.oltean@nxp.com>

By default, DSA switch ports inherit their MAC address from the DSA
master.

This works well for practical situations, but some selftests like
bridge_vlan_unaware.sh loop back 2 standalone DSA ports with 2 bridged
DSA ports, and require the bridge to forward packets between the
standalone ports.

Due to the bridge seeing that the MAC DA it needs to forward is present
as a local FDB entry (it coincides with the MAC address of the bridge
ports), the test packets are not forwarded, but terminated locally on
br0. In turn, this makes the ping and ping6 tests fail.

Address this by introducing an option to have stable MAC addresses.
When mac_addr_prepare is called, the current addresses of the netifs are
saved and replaced with 00:01:02:03:04:${netif number}. Then when
mac_addr_restore is called at the end of the test, the original MAC
addresses are restored. This ensures that the MAC addresses are unique,
which makes the test pass even for DSA ports.

The usage model is for the behavior to be opt-in via STABLE_MAC_ADDRS,
which DSA should set to true, all others behave as before. By hooking
the calls to mac_addr_prepare and mac_addr_restore within the forwarding
lib itself, we do not need to patch each individual selftest, the only
requirement is that pre_cleanup is called.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 tools/testing/selftests/net/forwarding/lib.sh | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 664b9ecaf228..e3b3cdef3170 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -27,6 +27,7 @@ INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600}
 LOW_AGEING_TIME=${LOW_AGEING_TIME:=1000}
 REQUIRE_JQ=${REQUIRE_JQ:=yes}
 REQUIRE_MZ=${REQUIRE_MZ:=yes}
+STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no}
 
 relative_path="${BASH_SOURCE%/*}"
 if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then
@@ -214,10 +215,41 @@ create_netif()
 	esac
 }
 
+declare -A MAC_ADDR_ORIG
+mac_addr_prepare()
+{
+	local new_addr=
+	local dev=
+
+	for ((i = 1; i <= NUM_NETIFS; ++i)); do
+		dev=${NETIFS[p$i]}
+		new_addr=$(printf "00:01:02:03:04:%02x" $i)
+
+		MAC_ADDR_ORIG["$dev"]=$(ip -j link show dev $dev | jq -e '.[].address')
+		# Strip quotes
+		MAC_ADDR_ORIG["$dev"]=${MAC_ADDR_ORIG["$dev"]//\"/}
+		ip link set dev $dev address $new_addr
+	done
+}
+
+mac_addr_restore()
+{
+	local dev=
+
+	for ((i = 1; i <= NUM_NETIFS; ++i)); do
+		dev=${NETIFS[p$i]}
+		ip link set dev $dev address ${MAC_ADDR_ORIG["$dev"]}
+	done
+}
+
 if [[ "$NETIF_CREATE" = "yes" ]]; then
 	create_netif
 fi
 
+if [[ "$STABLE_MAC_ADDRS" = "yes" ]]; then
+	mac_addr_prepare
+fi
+
 for ((i = 1; i <= NUM_NETIFS; ++i)); do
 	ip link show dev ${NETIFS[p$i]} &> /dev/null
 	if [[ $? -ne 0 ]]; then
@@ -503,6 +535,10 @@ pre_cleanup()
 		echo "Pausing before cleanup, hit any key to continue"
 		read
 	fi
+
+	if [[ "$STABLE_MAC_ADDRS" = "yes" ]]; then
+		mac_addr_restore
+	fi
 }
 
 vrf_prepare()
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next 2/8] selftests: forwarding: add TCPDUMP_EXTRA_FLAGS to lib.sh
From: Vladimir Oltean @ 2022-04-22 10:14 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver, Tobias Waldekranz,
	Mattias Forsblad, Roopa Prabhu, Nikolay Aleksandrov, Jiri Pirko,
	Ido Schimmel, Joachim Wiberg, Marek Behún, Ansuel Smith,
	DENG Qingfang, Kurt Kanzenbach
In-Reply-To: <20220422101504.3729309-1-vladimir.oltean@nxp.com>

From: Joachim Wiberg <troglobit@gmail.com>

For some use-cases we may want to change the tcpdump flags used in
tcpdump_start().  For instance, observing interfaces without the PROMISC
flag, e.g. to see what's really being forwarded to the bridge interface.

Signed-off-by: Joachim Wiberg <troglobit@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 tools/testing/selftests/net/forwarding/lib.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 tools/testing/selftests/net/forwarding/lib.sh

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
old mode 100644
new mode 100755
index e3b3cdef3170..de10451d7671
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -28,6 +28,7 @@ LOW_AGEING_TIME=${LOW_AGEING_TIME:=1000}
 REQUIRE_JQ=${REQUIRE_JQ:=yes}
 REQUIRE_MZ=${REQUIRE_MZ:=yes}
 STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no}
+TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=}
 
 relative_path="${BASH_SOURCE%/*}"
 if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then
@@ -1405,7 +1406,7 @@ tcpdump_start()
 		capuser="-Z $SUDO_USER"
 	fi
 
-	$ns_cmd tcpdump -e -n -Q in -i $if_name \
+	$ns_cmd tcpdump $TCPDUMP_EXTRA_FLAGS -e -n -Q in -i $if_name \
 		-s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
 	cappid=$!
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next 3/8] selftests: forwarding: multiple instances in tcpdump helper
From: Vladimir Oltean @ 2022-04-22 10:14 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver, Tobias Waldekranz,
	Mattias Forsblad, Roopa Prabhu, Nikolay Aleksandrov, Jiri Pirko,
	Ido Schimmel, Joachim Wiberg, Marek Behún, Ansuel Smith,
	DENG Qingfang, Kurt Kanzenbach
In-Reply-To: <20220422101504.3729309-1-vladimir.oltean@nxp.com>

From: Joachim Wiberg <troglobit@gmail.com>

Extend tcpdump_start() & C:o to handle multiple instances.  Useful when
observing bridge operation, e.g., unicast learning/flooding, and any
case of multicast distribution (to these ports but not that one ...).

This means the interface argument is now a mandatory argument to all
tcpdump_*() functions, hence the changes to the ocelot flower test.

Signed-off-by: Joachim Wiberg <troglobit@gmail.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 .../drivers/net/ocelot/tc_flower_chains.sh    | 24 ++++++++---------
 tools/testing/selftests/net/forwarding/lib.sh | 26 ++++++++++++++-----
 2 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
index eaf8a04a7ca5..7e684e27a682 100755
--- a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
+++ b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
@@ -215,15 +215,15 @@ test_vlan_pop()
 
 	sleep 1
 
-	tcpdump_stop
+	tcpdump_stop $eth2
 
-	if tcpdump_show | grep -q "$eth3_mac > $eth2_mac, ethertype IPv4"; then
+	if tcpdump_show $eth2 | grep -q "$eth3_mac > $eth2_mac, ethertype IPv4"; then
 		echo "OK"
 	else
 		echo "FAIL"
 	fi
 
-	tcpdump_cleanup
+	tcpdump_cleanup $eth2
 }
 
 test_vlan_push()
@@ -236,15 +236,15 @@ test_vlan_push()
 
 	sleep 1
 
-	tcpdump_stop
+	tcpdump_stop $eth3.100
 
-	if tcpdump_show | grep -q "$eth2_mac > $eth3_mac"; then
+	if tcpdump_show $eth3.100 | grep -q "$eth2_mac > $eth3_mac"; then
 		echo "OK"
 	else
 		echo "FAIL"
 	fi
 
-	tcpdump_cleanup
+	tcpdump_cleanup $eth3.100
 }
 
 test_vlan_ingress_modify()
@@ -267,15 +267,15 @@ test_vlan_ingress_modify()
 
 	sleep 1
 
-	tcpdump_stop
+	tcpdump_stop $eth2
 
-	if tcpdump_show | grep -q "$eth3_mac > $eth2_mac, .* vlan 300"; then
+	if tcpdump_show $eth2 | grep -q "$eth3_mac > $eth2_mac, .* vlan 300"; then
 		echo "OK"
 	else
 		echo "FAIL"
 	fi
 
-	tcpdump_cleanup
+	tcpdump_cleanup $eth2
 
 	tc filter del dev $eth0 ingress chain $(IS1 2) pref 3
 
@@ -305,15 +305,15 @@ test_vlan_egress_modify()
 
 	sleep 1
 
-	tcpdump_stop
+	tcpdump_stop $eth2
 
-	if tcpdump_show | grep -q "$eth3_mac > $eth2_mac, .* vlan 300"; then
+	if tcpdump_show $eth2 | grep -q "$eth3_mac > $eth2_mac, .* vlan 300"; then
 		echo "OK"
 	else
 		echo "FAIL"
 	fi
 
-	tcpdump_cleanup
+	tcpdump_cleanup $eth2
 
 	tc filter del dev $eth1 egress chain $(ES0) pref 3
 	tc qdisc del dev $eth1 clsact
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index de10451d7671..7eff5ecf7565 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1386,13 +1386,17 @@ stop_traffic()
 	{ kill %% && wait %%; } 2>/dev/null
 }
 
+declare -A cappid
+declare -A capfile
+declare -A capout
+
 tcpdump_start()
 {
 	local if_name=$1; shift
 	local ns=$1; shift
 
-	capfile=$(mktemp)
-	capout=$(mktemp)
+	capfile[$if_name]=$(mktemp)
+	capout[$if_name]=$(mktemp)
 
 	if [ -z $ns ]; then
 		ns_cmd=""
@@ -1407,26 +1411,34 @@ tcpdump_start()
 	fi
 
 	$ns_cmd tcpdump $TCPDUMP_EXTRA_FLAGS -e -n -Q in -i $if_name \
-		-s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
-	cappid=$!
+		-s 65535 -B 32768 $capuser -w ${capfile[$if_name]} \
+		> "${capout[$if_name]}" 2>&1 &
+	cappid[$if_name]=$!
 
 	sleep 1
 }
 
 tcpdump_stop()
 {
-	$ns_cmd kill $cappid
+	local if_name=$1
+	local pid=${cappid[$if_name]}
+
+	$ns_cmd kill "$pid" && wait "$pid"
 	sleep 1
 }
 
 tcpdump_cleanup()
 {
-	rm $capfile $capout
+	local if_name=$1
+
+	rm ${capfile[$if_name]} ${capout[$if_name]}
 }
 
 tcpdump_show()
 {
-	tcpdump -e -n -r $capfile 2>&1
+	local if_name=$1
+
+	tcpdump -e -n -r ${capfile[$if_name]} 2>&1
 }
 
 # return 0 if the packet wasn't seen on host2_if or 1 if it was
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next 4/8] selftests: forwarding: add helpers for IP multicast group joins/leaves
From: Vladimir Oltean @ 2022-04-22 10:15 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver, Tobias Waldekranz,
	Mattias Forsblad, Roopa Prabhu, Nikolay Aleksandrov, Jiri Pirko,
	Ido Schimmel, Joachim Wiberg, Marek Behún, Ansuel Smith,
	DENG Qingfang, Kurt Kanzenbach
In-Reply-To: <20220422101504.3729309-1-vladimir.oltean@nxp.com>

Extend the forwarding library with calls to some small C programs which
join an IP multicast group and send some packets to it. Both IPv4 and
IPv6 groups are supported. Use cases range from testing IGMP/MLD
snooping, to RX filtering, to multicast routing.

Testing multicast traffic using msend/mreceive is intended to be done
using tcpdump.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 tools/testing/selftests/net/forwarding/lib.sh | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 7eff5ecf7565..15fb46b39fe8 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -27,6 +27,7 @@ INTERFACE_TIMEOUT=${INTERFACE_TIMEOUT:=600}
 LOW_AGEING_TIME=${LOW_AGEING_TIME:=1000}
 REQUIRE_JQ=${REQUIRE_JQ:=yes}
 REQUIRE_MZ=${REQUIRE_MZ:=yes}
+REQUIRE_MTOOLS=${REQUIRE_MTOOLS:=no}
 STABLE_MAC_ADDRS=${STABLE_MAC_ADDRS:=no}
 TCPDUMP_EXTRA_FLAGS=${TCPDUMP_EXTRA_FLAGS:=}
 
@@ -161,6 +162,12 @@ fi
 if [[ "$REQUIRE_MZ" = "yes" ]]; then
 	require_command $MZ
 fi
+if [[ "$REQUIRE_MTOOLS" = "yes" ]]; then
+	# https://github.com/vladimiroltean/mtools/
+	# patched for IPv6 support
+	require_command msend
+	require_command mreceive
+fi
 
 if [[ ! -v NUM_NETIFS ]]; then
 	echo "SKIP: importer does not define \"NUM_NETIFS\""
@@ -1548,6 +1555,37 @@ brmcast_check_sg_state()
 	done
 }
 
+mc_join()
+{
+	local if_name=$1
+	local group=$2
+	local vrf_name=$(master_name_get $if_name)
+
+	# We don't care about actual reception, just about joining the
+	# IP multicast group and adding the L2 address to the device's
+	# MAC filtering table
+	ip vrf exec $vrf_name \
+		mreceive -g $group -I $if_name > /dev/null 2>&1 &
+	mreceive_pid=$!
+
+	sleep 1
+}
+
+mc_leave()
+{
+	kill "$mreceive_pid" && wait "$mreceive_pid"
+}
+
+mc_send()
+{
+	local if_name=$1
+	local groups=$2
+	local vrf_name=$(master_name_get $if_name)
+
+	ip vrf exec $vrf_name \
+		msend -g $groups -I $if_name -c 1 > /dev/null 2>&1
+}
+
 start_ip_monitor()
 {
 	local mtype=$1; shift
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next 6/8] selftests: forwarding: add a no_forwarding.sh test
From: Vladimir Oltean @ 2022-04-22 10:15 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver, Tobias Waldekranz,
	Mattias Forsblad, Roopa Prabhu, Nikolay Aleksandrov, Jiri Pirko,
	Ido Schimmel, Joachim Wiberg, Marek Behún, Ansuel Smith,
	DENG Qingfang, Kurt Kanzenbach
In-Reply-To: <20220422101504.3729309-1-vladimir.oltean@nxp.com>

Bombard a standalone switch port with various kinds of traffic to ensure
it is really standalone and doesn't leak packets to other switch ports.
Also check for switch ports in different bridges, and switch ports in a
VLAN-aware bridge but having different pvids. No forwarding should take
place in either case.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 .../selftests/net/forwarding/no_forwarding.sh | 261 ++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/no_forwarding.sh

diff --git a/tools/testing/selftests/net/forwarding/no_forwarding.sh b/tools/testing/selftests/net/forwarding/no_forwarding.sh
new file mode 100755
index 000000000000..af3b398d13f0
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/no_forwarding.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="standalone two_bridges one_bridge_two_pvids"
+NUM_NETIFS=4
+
+source lib.sh
+
+h1=${NETIFS[p1]}
+h2=${NETIFS[p3]}
+swp1=${NETIFS[p2]}
+swp2=${NETIFS[p4]}
+
+H1_IPV4="192.0.2.1"
+H2_IPV4="192.0.2.2"
+H1_IPV6="2001:db8:1::1"
+H2_IPV6="2001:db8:1::2"
+
+IPV4_ALLNODES="224.0.0.1"
+IPV6_ALLNODES="ff02::1"
+MACV4_ALLNODES="01:00:5e:00:00:01"
+MACV6_ALLNODES="33:33:00:00:00:01"
+NON_IP_MC="01:02:03:04:05:06"
+NON_IP_PKT="00:04 48:45:4c:4f"
+BC="ff:ff:ff:ff:ff:ff"
+
+# The full 4K VLAN space is too much to check, so strategically pick some
+# values which should provide reasonable coverage
+vids=(0 1 2 5 10 20 50 100 200 500 1000 1000 2000 4000 4094)
+
+send_non_ip()
+{
+	local if_name=$1
+	local smac=$2
+	local dmac=$3
+
+	$MZ -q $if_name "$dmac $smac $NON_IP_PKT"
+}
+
+send_uc_ipv4()
+{
+	local if_name=$1
+	local dmac=$2
+
+	ip neigh add $H2_IPV4 lladdr $dmac dev $if_name
+	ping_do $if_name $H2_IPV4
+	ip neigh del $H2_IPV4 dev $if_name
+}
+
+send_mc_ipv4()
+{
+	local if_name=$1
+
+	ping_do $if_name $IPV4_ALLNODES "-I $if_name"
+}
+
+send_uc_ipv6()
+{
+	local if_name=$1
+	local dmac=$2
+
+	ip -6 neigh add $H2_IPV6 lladdr $dmac dev $if_name
+	ping6_do $if_name $H2_IPV6
+	ip -6 neigh del $H2_IPV6 dev $if_name
+}
+
+send_mc_ipv6()
+{
+	local if_name=$1
+
+	ping6_do $if_name $IPV6_ALLNODES%$if_name
+}
+
+check_rcv()
+{
+	local if_name=$1
+	local type=$2
+	local pattern=$3
+	local should_fail=1
+
+	RET=0
+
+	tcpdump_show $if_name | grep -q "$pattern"
+
+	check_err_fail "$should_fail" "$?" "reception"
+
+	log_test "$type"
+}
+
+run_test()
+{
+	local test_name="$1"
+	local smac=$(mac_get $h1)
+	local dmac=$(mac_get $h2)
+	local h1_ipv6_lladdr=$(ipv6_lladdr_get $h1)
+	local vid=
+
+	echo "$test_name: Sending packets"
+
+	tcpdump_start $h2
+
+	send_non_ip $h1 $smac $dmac
+	send_non_ip $h1 $smac $NON_IP_MC
+	send_non_ip $h1 $smac $BC
+	send_uc_ipv4 $h1 $dmac
+	send_mc_ipv4 $h1
+	send_uc_ipv6 $h1 $dmac
+	send_mc_ipv6 $h1
+
+	for vid in "${vids[@]}"; do
+		vlan_create $h1 $vid
+		simple_if_init $h1.$vid $H1_IPV4/24 $H1_IPV6/64
+
+		send_non_ip $h1.$vid $smac $dmac
+		send_non_ip $h1.$vid $smac $NON_IP_MC
+		send_non_ip $h1.$vid $smac $BC
+		send_uc_ipv4 $h1.$vid $dmac
+		send_mc_ipv4 $h1.$vid
+		send_uc_ipv6 $h1.$vid $dmac
+		send_mc_ipv6 $h1.$vid
+
+		simple_if_fini $h1.$vid $H1_IPV4/24 $H1_IPV6/64
+		vlan_destroy $h1 $vid
+	done
+
+	sleep 1
+
+	echo "$test_name: Checking which packets were received"
+
+	tcpdump_stop $h2
+
+	check_rcv $h2 "$test_name: Unicast non-IP untagged" \
+		"$smac > $dmac, 802.3, length 4:"
+
+	check_rcv $h2 "$test_name: Multicast non-IP untagged" \
+		"$smac > $NON_IP_MC, 802.3, length 4:"
+
+	check_rcv $h2 "$test_name: Broadcast non-IP untagged" \
+		"$smac > $BC, 802.3, length 4:"
+
+	check_rcv $h2 "$test_name: Unicast IPv4 untagged" \
+		"$smac > $dmac, ethertype IPv4 (0x0800)"
+
+	check_rcv $h2 "$test_name: Multicast IPv4 untagged" \
+		"$smac > $MACV4_ALLNODES, ethertype IPv4 (0x0800).*: $H1_IPV4 > $IPV4_ALLNODES"
+
+	check_rcv $h2 "$test_name: Unicast IPv6 untagged" \
+		"$smac > $dmac, ethertype IPv6 (0x86dd).*8: $H1_IPV6 > $H2_IPV6"
+
+	check_rcv $h2 "$test_name: Multicast IPv6 untagged" \
+		"$smac > $MACV6_ALLNODES, ethertype IPv6 (0x86dd).*: $h1_ipv6_lladdr > $IPV6_ALLNODES"
+
+	for vid in "${vids[@]}"; do
+		check_rcv $h2 "$test_name: Unicast non-IP VID $vid" \
+			"$smac > $dmac, ethertype 802.1Q (0x8100).*vlan $vid,.*length 4"
+
+		check_rcv $h2 "$test_name: Multicast non-IP VID $vid" \
+			"$smac > $NON_IP_MC, ethertype 802.1Q (0x8100).*vlan $vid,.*length 4"
+
+		check_rcv $h2 "$test_name: Broadcast non-IP VID $vid" \
+			"$smac > $BC, ethertype 802.1Q (0x8100).*vlan $vid,.*length 4"
+
+		check_rcv $h2 "$test_name: Unicast IPv4 VID $vid" \
+			"$smac > $dmac, ethertype 802.1Q (0x8100).*vlan $vid,.*ethertype IPv4 (0x0800), $H1_IPV4 > $H2_IPV4"
+
+		check_rcv $h2 "$test_name: Multicast IPv4 VID $vid" \
+			"$smac > $MACV4_ALLNODES, ethertype 802.1Q (0x8100).*vlan $vid,.*ethertype IPv4 (0x0800), $H1_IPV4 > $IPV4_ALLNODES"
+
+		check_rcv $h2 "$test_name: Unicast IPv6 VID $vid" \
+			"$smac > $dmac, ethertype 802.1Q (0x8100).*vlan $vid,.*ethertype IPv6 (0x86dd), $H1_IPV6 > $H2_IPV6"
+
+		check_rcv $h2 "$test_name: Multicast IPv6 VID $vid" \
+			"$smac > $MACV6_ALLNODES, ethertype 802.1Q (0x8100).*vlan $vid,.*ethertype IPv6 (0x86dd), $h1_ipv6_lladdr > $IPV6_ALLNODES"
+	done
+
+	tcpdump_cleanup $h2
+}
+
+standalone()
+{
+	run_test "Standalone switch ports"
+}
+
+two_bridges()
+{
+	ip link add br0 type bridge && ip link set br0 up
+	ip link add br1 type bridge && ip link set br1 up
+	ip link set $swp1 master br0
+	ip link set $swp2 master br1
+
+	run_test "Switch ports in different bridges"
+
+	ip link del br1
+	ip link del br0
+}
+
+one_bridge_two_pvids()
+{
+	ip link add br0 type bridge vlan_filtering 1 vlan_default_pvid 0
+	ip link set br0 up
+	ip link set $swp1 master br0
+	ip link set $swp2 master br0
+
+	bridge vlan add dev $swp1 vid 1 pvid untagged
+	bridge vlan add dev $swp1 vid 2 pvid untagged
+
+	run_test "Switch ports in VLAN-aware bridge with different PVIDs"
+
+	ip link del br0
+}
+
+h1_create()
+{
+	simple_if_init $h1 $H1_IPV4/24 $H1_IPV6/64
+}
+
+h1_destroy()
+{
+	simple_if_fini $h1 $H1_IPV4/24 $H1_IPV6/64
+}
+
+h2_create()
+{
+	simple_if_init $h2 $H2_IPV4/24 $H2_IPV6/64
+}
+
+h2_destroy()
+{
+	simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64
+}
+
+cleanup()
+{
+	pre_cleanup
+
+	h2_destroy
+	h1_destroy
+
+	vrf_cleanup
+}
+
+setup_prepare()
+{
+	vrf_prepare
+
+	h1_create
+	h2_create
+	# we call simple_if_init from the test itself, but setup_wait expects
+	# that we call it from here, and waits until the interfaces are up
+	ip link set dev $swp1 up
+	ip link set dev $swp2 up
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next 5/8] selftests: forwarding: add helper for retrieving IPv6 link-local address of interface
From: Vladimir Oltean @ 2022-04-22 10:15 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver, Tobias Waldekranz,
	Mattias Forsblad, Roopa Prabhu, Nikolay Aleksandrov, Jiri Pirko,
	Ido Schimmel, Joachim Wiberg, Marek Behún, Ansuel Smith,
	DENG Qingfang, Kurt Kanzenbach
In-Reply-To: <20220422101504.3729309-1-vladimir.oltean@nxp.com>

Pinging an IPv6 link-local multicast address selects the link-local
unicast address of the interface as source, and we'd like to monitor for
that in tcpdump.

Add a helper to the forwarding library which retrieves the link-local
IPv6 address of an interface, to make that task easier.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 tools/testing/selftests/net/forwarding/lib.sh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 15fb46b39fe8..5386c826e46a 100755
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -868,6 +868,15 @@ mac_get()
 	ip -j link show dev $if_name | jq -r '.[]["address"]'
 }
 
+ipv6_lladdr_get()
+{
+	local if_name=$1
+
+	ip -j addr show dev $if_name | \
+		jq -r '.[]["addr_info"][] | select(.scope == "link").local' | \
+		head -1
+}
+
 bridge_ageing_time_get()
 {
 	local bridge=$1
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next 7/8] selftests: forwarding: add a test for local_termination.sh
From: Vladimir Oltean @ 2022-04-22 10:15 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver, Tobias Waldekranz,
	Mattias Forsblad, Roopa Prabhu, Nikolay Aleksandrov, Jiri Pirko,
	Ido Schimmel, Joachim Wiberg, Marek Behún, Ansuel Smith,
	DENG Qingfang, Kurt Kanzenbach
In-Reply-To: <20220422101504.3729309-1-vladimir.oltean@nxp.com>

This tests the capability of switch ports to filter out undesired
traffic. Different drivers are expected to have different capabilities
here (so some may fail and some may pass), yet the test still has some
value, for example to check for regressions.

There are 2 kinds of failures, one is when a packet which should have
been accepted isn't (and that should be fixed), and the other "failure"
(as reported by the test) is when a packet could have been filtered out
(for being unnecessary) yet it was received.

The bridge driver fares particularly badly at this test:

TEST: br0: Unicast IPv4 to primary MAC address                      [ OK ]
TEST: br0: Unicast IPv4 to macvlan MAC address                      [ OK ]
TEST: br0: Unicast IPv4 to unknown MAC address                      [FAIL]
        reception succeeded, but should have failed
TEST: br0: Unicast IPv4 to unknown MAC address, promisc             [ OK ]
TEST: br0: Unicast IPv4 to unknown MAC address, allmulti            [FAIL]
        reception succeeded, but should have failed
TEST: br0: Multicast IPv4 to joined group                           [ OK ]
TEST: br0: Multicast IPv4 to unknown group                          [FAIL]
        reception succeeded, but should have failed
TEST: br0: Multicast IPv4 to unknown group, promisc                 [ OK ]
TEST: br0: Multicast IPv4 to unknown group, allmulti                [ OK ]
TEST: br0: Multicast IPv6 to joined group                           [ OK ]
TEST: br0: Multicast IPv6 to unknown group                          [FAIL]
        reception succeeded, but should have failed
TEST: br0: Multicast IPv6 to unknown group, promisc                 [ OK ]
TEST: br0: Multicast IPv6 to unknown group, allmulti                [ OK ]

mainly because it does not implement IFF_UNICAST_FLT. Yet I still think
having the test (with the failures) is useful in case somebody wants to
tackle that problem in the future, to make an easy before-and-after
comparison.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 .../net/forwarding/local_termination.sh       | 299 ++++++++++++++++++
 1 file changed, 299 insertions(+)
 create mode 100755 tools/testing/selftests/net/forwarding/local_termination.sh

diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh
new file mode 100755
index 000000000000..c5b0cbc85b3e
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/local_termination.sh
@@ -0,0 +1,299 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="standalone bridge"
+NUM_NETIFS=2
+PING_COUNT=1
+REQUIRE_MTOOLS=yes
+REQUIRE_MZ=no
+
+source lib.sh
+
+H1_IPV4="192.0.2.1"
+H2_IPV4="192.0.2.2"
+H1_IPV6="2001:db8:1::1"
+H2_IPV6="2001:db8:1::2"
+
+BRIDGE_ADDR="00:00:de:ad:be:ee"
+MACVLAN_ADDR="00:00:de:ad:be:ef"
+UNKNOWN_UC_ADDR1="de:ad:be:ef:ee:03"
+UNKNOWN_UC_ADDR2="de:ad:be:ef:ee:04"
+UNKNOWN_UC_ADDR3="de:ad:be:ef:ee:05"
+JOINED_IPV4_MC_ADDR="225.1.2.3"
+UNKNOWN_IPV4_MC_ADDR1="225.1.2.4"
+UNKNOWN_IPV4_MC_ADDR2="225.1.2.5"
+UNKNOWN_IPV4_MC_ADDR3="225.1.2.6"
+JOINED_IPV6_MC_ADDR="ff2e::0102:0304"
+UNKNOWN_IPV6_MC_ADDR1="ff2e::0102:0305"
+UNKNOWN_IPV6_MC_ADDR2="ff2e::0102:0306"
+UNKNOWN_IPV6_MC_ADDR3="ff2e::0102:0307"
+
+JOINED_MACV4_MC_ADDR="01:00:5e:01:02:03"
+UNKNOWN_MACV4_MC_ADDR1="01:00:5e:01:02:04"
+UNKNOWN_MACV4_MC_ADDR2="01:00:5e:01:02:05"
+UNKNOWN_MACV4_MC_ADDR3="01:00:5e:01:02:06"
+JOINED_MACV6_MC_ADDR="33:33:01:02:03:04"
+UNKNOWN_MACV6_MC_ADDR1="33:33:01:02:03:05"
+UNKNOWN_MACV6_MC_ADDR2="33:33:01:02:03:06"
+UNKNOWN_MACV6_MC_ADDR3="33:33:01:02:03:07"
+
+NON_IP_MC="01:02:03:04:05:06"
+NON_IP_PKT="00:04 48:45:4c:4f"
+BC="ff:ff:ff:ff:ff:ff"
+
+# Disable promisc to ensure we don't receive unknown MAC DA packets
+export TCPDUMP_EXTRA_FLAGS="-pl"
+
+h1=${NETIFS[p1]}
+h2=${NETIFS[p2]}
+
+send_non_ip()
+{
+	local if_name=$1
+	local smac=$2
+	local dmac=$3
+
+	$MZ -q $if_name "$dmac $smac $NON_IP_PKT"
+}
+
+send_uc_ipv4()
+{
+	local if_name=$1
+	local dmac=$2
+
+	ip neigh add $H2_IPV4 lladdr $dmac dev $if_name
+	ping_do $if_name $H2_IPV4
+	ip neigh del $H2_IPV4 dev $if_name
+}
+
+check_rcv()
+{
+	local if_name=$1
+	local type=$2
+	local pattern=$3
+	local should_receive=$4
+	local should_fail=
+
+	[ $should_receive = true ] && should_fail=0 || should_fail=1
+	RET=0
+
+	tcpdump_show $if_name | grep -q "$pattern"
+
+	check_err_fail "$should_fail" "$?" "reception"
+
+	log_test "$if_name: $type"
+}
+
+mc_route_prepare()
+{
+	local if_name=$1
+	local vrf_name=$(master_name_get $if_name)
+
+	ip route add 225.100.1.0/24 dev $if_name vrf $vrf_name
+	ip -6 route add ff2e::/64 dev $if_name vrf $vrf_name
+}
+
+mc_route_destroy()
+{
+	local if_name=$1
+	local vrf_name=$(master_name_get $if_name)
+
+	ip route del 225.100.1.0/24 dev $if_name vrf $vrf_name
+	ip -6 route del ff2e::/64 dev $if_name vrf $vrf_name
+}
+
+run_test()
+{
+	local rcv_if_name=$1
+	local smac=$(mac_get $h1)
+	local rcv_dmac=$(mac_get $rcv_if_name)
+
+	tcpdump_start $rcv_if_name
+
+	mc_route_prepare $h1
+	mc_route_prepare $rcv_if_name
+
+	send_uc_ipv4 $h1 $rcv_dmac
+	send_uc_ipv4 $h1 $MACVLAN_ADDR
+	send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR1
+
+	ip link set dev $rcv_if_name promisc on
+	send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR2
+	mc_send $h1 $UNKNOWN_IPV4_MC_ADDR2
+	mc_send $h1 $UNKNOWN_IPV6_MC_ADDR2
+	ip link set dev $rcv_if_name promisc off
+
+	mc_join $rcv_if_name $JOINED_IPV4_MC_ADDR
+	mc_send $h1 $JOINED_IPV4_MC_ADDR
+	mc_leave
+
+	mc_join $rcv_if_name $JOINED_IPV6_MC_ADDR
+	mc_send $h1 $JOINED_IPV6_MC_ADDR
+	mc_leave
+
+	mc_send $h1 $UNKNOWN_IPV4_MC_ADDR1
+	mc_send $h1 $UNKNOWN_IPV6_MC_ADDR1
+
+	ip link set dev $rcv_if_name allmulticast on
+	send_uc_ipv4 $h1 $UNKNOWN_UC_ADDR3
+	mc_send $h1 $UNKNOWN_IPV4_MC_ADDR3
+	mc_send $h1 $UNKNOWN_IPV6_MC_ADDR3
+	ip link set dev $rcv_if_name allmulticast off
+
+	mc_route_destroy $rcv_if_name
+	mc_route_destroy $h1
+
+	sleep 1
+
+	tcpdump_stop $rcv_if_name
+
+	check_rcv $rcv_if_name "Unicast IPv4 to primary MAC address" \
+		"$smac > $rcv_dmac, ethertype IPv4 (0x0800)" \
+		true
+
+	check_rcv $rcv_if_name "Unicast IPv4 to macvlan MAC address" \
+		"$smac > $MACVLAN_ADDR, ethertype IPv4 (0x0800)" \
+		true
+
+	check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address" \
+		"$smac > $UNKNOWN_UC_ADDR1, ethertype IPv4 (0x0800)" \
+		false
+
+	check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, promisc" \
+		"$smac > $UNKNOWN_UC_ADDR2, ethertype IPv4 (0x0800)" \
+		true
+
+	check_rcv $rcv_if_name "Unicast IPv4 to unknown MAC address, allmulti" \
+		"$smac > $UNKNOWN_UC_ADDR3, ethertype IPv4 (0x0800)" \
+		false
+
+	check_rcv $rcv_if_name "Multicast IPv4 to joined group" \
+		"$smac > $JOINED_MACV4_MC_ADDR, ethertype IPv4 (0x0800)" \
+		true
+
+	check_rcv $rcv_if_name "Multicast IPv4 to unknown group" \
+		"$smac > $UNKNOWN_MACV4_MC_ADDR1, ethertype IPv4 (0x0800)" \
+		false
+
+	check_rcv $rcv_if_name "Multicast IPv4 to unknown group, promisc" \
+		"$smac > $UNKNOWN_MACV4_MC_ADDR2, ethertype IPv4 (0x0800)" \
+		true
+
+	check_rcv $rcv_if_name "Multicast IPv4 to unknown group, allmulti" \
+		"$smac > $UNKNOWN_MACV4_MC_ADDR3, ethertype IPv4 (0x0800)" \
+		true
+
+	check_rcv $rcv_if_name "Multicast IPv6 to joined group" \
+		"$smac > $JOINED_MACV6_MC_ADDR, ethertype IPv6 (0x86dd)" \
+		true
+
+	check_rcv $rcv_if_name "Multicast IPv6 to unknown group" \
+		"$smac > $UNKNOWN_MACV6_MC_ADDR1, ethertype IPv6 (0x86dd)" \
+		false
+
+	check_rcv $rcv_if_name "Multicast IPv6 to unknown group, promisc" \
+		"$smac > $UNKNOWN_MACV6_MC_ADDR2, ethertype IPv6 (0x86dd)" \
+		true
+
+	check_rcv $rcv_if_name "Multicast IPv6 to unknown group, allmulti" \
+		"$smac > $UNKNOWN_MACV6_MC_ADDR3, ethertype IPv6 (0x86dd)" \
+		true
+
+	tcpdump_cleanup $rcv_if_name
+}
+
+h1_create()
+{
+	simple_if_init $h1 $H1_IPV4/24 $H1_IPV6/64
+}
+
+h1_destroy()
+{
+	simple_if_fini $h1 $H1_IPV4/24 $H1_IPV6/64
+}
+
+h2_create()
+{
+	simple_if_init $h2 $H2_IPV4/24 $H2_IPV6/64
+}
+
+h2_destroy()
+{
+	simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64
+}
+
+bridge_create()
+{
+	ip link add br0 type bridge
+	ip link set br0 address $BRIDGE_ADDR
+	ip link set br0 up
+
+	ip link set $h2 master br0
+	ip link set $h2 up
+
+	simple_if_init br0 $H2_IPV4/24 $H2_IPV6/64
+}
+
+bridge_destroy()
+{
+	simple_if_fini br0 $H2_IPV4/24 $H2_IPV6/64
+
+	ip link del br0
+}
+
+standalone()
+{
+	h1_create
+	h2_create
+
+	ip link add link $h2 name macvlan0 type macvlan mode private
+	ip link set macvlan0 address $MACVLAN_ADDR
+	ip link set macvlan0 up
+
+	run_test $h2
+
+	ip link del macvlan0
+
+	h2_destroy
+	h1_destroy
+}
+
+bridge()
+{
+	h1_create
+	bridge_create
+
+	ip link add link br0 name macvlan0 type macvlan mode private
+	ip link set macvlan0 address $MACVLAN_ADDR
+	ip link set macvlan0 up
+
+	run_test br0
+
+	ip link del macvlan0
+
+	bridge_destroy
+	h1_destroy
+}
+
+cleanup()
+{
+	pre_cleanup
+	vrf_cleanup
+}
+
+setup_prepare()
+{
+	vrf_prepare
+	# setup_wait() needs this
+	ip link set $h1 up
+	ip link set $h2 up
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
-- 
2.25.1


^ permalink raw reply related

* [PATCH net-next 8/8] selftests: drivers: dsa: add a subset of forwarding selftests
From: Vladimir Oltean @ 2022-04-22 10:15 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, David S. Miller, Paolo Abeni, Florian Fainelli,
	Andrew Lunn, Vivien Didelot, Vladimir Oltean, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver, Tobias Waldekranz,
	Mattias Forsblad, Roopa Prabhu, Nikolay Aleksandrov, Jiri Pirko,
	Ido Schimmel, Joachim Wiberg, Marek Behún, Ansuel Smith,
	DENG Qingfang, Kurt Kanzenbach
In-Reply-To: <20220422101504.3729309-1-vladimir.oltean@nxp.com>

This adds an initial subset of forwarding selftests which I considered
to be relevant for DSA drivers, along with a forwarding.config that
makes it easier to run them (disables veth pair creation, makes sure MAC
addresses are unique and stable).

The intention is to request driver writers to run these selftests during
review and make sure that the tests pass, or at least that the problems
are known.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
---
 tools/testing/selftests/drivers/net/dsa/bridge_locked_port.sh  | 1 +
 tools/testing/selftests/drivers/net/dsa/bridge_mdb.sh          | 1 +
 tools/testing/selftests/drivers/net/dsa/bridge_mld.sh          | 1 +
 tools/testing/selftests/drivers/net/dsa/bridge_vlan_aware.sh   | 1 +
 tools/testing/selftests/drivers/net/dsa/bridge_vlan_mcast.sh   | 1 +
 tools/testing/selftests/drivers/net/dsa/bridge_vlan_unaware.sh | 1 +
 tools/testing/selftests/drivers/net/dsa/forwarding.config      | 2 ++
 tools/testing/selftests/drivers/net/dsa/lib.sh                 | 1 +
 tools/testing/selftests/drivers/net/dsa/local_termination.sh   | 1 +
 tools/testing/selftests/drivers/net/dsa/no_forwarding.sh       | 1 +
 10 files changed, 11 insertions(+)
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_locked_port.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_mdb.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_mld.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_vlan_aware.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_vlan_mcast.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_vlan_unaware.sh
 create mode 100644 tools/testing/selftests/drivers/net/dsa/forwarding.config
 create mode 120000 tools/testing/selftests/drivers/net/dsa/lib.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/local_termination.sh
 create mode 120000 tools/testing/selftests/drivers/net/dsa/no_forwarding.sh

diff --git a/tools/testing/selftests/drivers/net/dsa/bridge_locked_port.sh b/tools/testing/selftests/drivers/net/dsa/bridge_locked_port.sh
new file mode 120000
index 000000000000..f5eb940c4c7c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/dsa/bridge_locked_port.sh
@@ -0,0 +1 @@
+../../../net/forwarding/bridge_locked_port.sh
\ No newline at end of file
diff --git a/tools/testing/selftests/drivers/net/dsa/bridge_mdb.sh b/tools/testing/selftests/drivers/net/dsa/bridge_mdb.sh
new file mode 120000
index 000000000000..76492da525f7
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/dsa/bridge_mdb.sh
@@ -0,0 +1 @@
+../../../net/forwarding/bridge_mdb.sh
\ No newline at end of file
diff --git a/tools/testing/selftests/drivers/net/dsa/bridge_mld.sh b/tools/testing/selftests/drivers/net/dsa/bridge_mld.sh
new file mode 120000
index 000000000000..81a7e0df0474
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/dsa/bridge_mld.sh
@@ -0,0 +1 @@
+../../../net/forwarding/bridge_mld.sh
\ No newline at end of file
diff --git a/tools/testing/selftests/drivers/net/dsa/bridge_vlan_aware.sh b/tools/testing/selftests/drivers/net/dsa/bridge_vlan_aware.sh
new file mode 120000
index 000000000000..9831ed74376a
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/dsa/bridge_vlan_aware.sh
@@ -0,0 +1 @@
+../../../net/forwarding/bridge_vlan_aware.sh
\ No newline at end of file
diff --git a/tools/testing/selftests/drivers/net/dsa/bridge_vlan_mcast.sh b/tools/testing/selftests/drivers/net/dsa/bridge_vlan_mcast.sh
new file mode 120000
index 000000000000..7f3c3f0bf719
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/dsa/bridge_vlan_mcast.sh
@@ -0,0 +1 @@
+../../../net/forwarding/bridge_vlan_mcast.sh
\ No newline at end of file
diff --git a/tools/testing/selftests/drivers/net/dsa/bridge_vlan_unaware.sh b/tools/testing/selftests/drivers/net/dsa/bridge_vlan_unaware.sh
new file mode 120000
index 000000000000..bf1a57e6bde1
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/dsa/bridge_vlan_unaware.sh
@@ -0,0 +1 @@
+../../../net/forwarding/bridge_vlan_unaware.sh
\ No newline at end of file
diff --git a/tools/testing/selftests/drivers/net/dsa/forwarding.config b/tools/testing/selftests/drivers/net/dsa/forwarding.config
new file mode 100644
index 000000000000..7adc1396fae0
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/dsa/forwarding.config
@@ -0,0 +1,2 @@
+NETIF_CREATE=no
+STABLE_MAC_ADDRS=yes
diff --git a/tools/testing/selftests/drivers/net/dsa/lib.sh b/tools/testing/selftests/drivers/net/dsa/lib.sh
new file mode 120000
index 000000000000..39c96828c5ef
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/dsa/lib.sh
@@ -0,0 +1 @@
+../../../net/forwarding/lib.sh
\ No newline at end of file
diff --git a/tools/testing/selftests/drivers/net/dsa/local_termination.sh b/tools/testing/selftests/drivers/net/dsa/local_termination.sh
new file mode 120000
index 000000000000..c08166f84501
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/dsa/local_termination.sh
@@ -0,0 +1 @@
+../../../net/forwarding/local_termination.sh
\ No newline at end of file
diff --git a/tools/testing/selftests/drivers/net/dsa/no_forwarding.sh b/tools/testing/selftests/drivers/net/dsa/no_forwarding.sh
new file mode 120000
index 000000000000..b9757466bc97
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/dsa/no_forwarding.sh
@@ -0,0 +1 @@
+../../../net/forwarding/no_forwarding.sh
\ No newline at end of file
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH net-next 0/8] DSA selftests
From: Vladimir Oltean @ 2022-04-22 10:22 UTC (permalink / raw)
  To: Vladimir Oltean
  Cc: netdev, Jakub Kicinski, David S. Miller, Paolo Abeni,
	Florian Fainelli, Andrew Lunn, Vivien Didelot, Claudiu Manoil,
	Alexandre Belloni, UNGLinuxDriver, Tobias Waldekranz,
	Mattias Forsblad, Roopa Prabhu, Nikolay Aleksandrov, Jiri Pirko,
	Ido Schimmel, Joachim Wiberg, Marek Behún, Ansuel Smith,
	DENG Qingfang, Kurt Kanzenbach
In-Reply-To: <20220422101504.3729309-1-vladimir.oltean@nxp.com>

On Fri, Apr 22, 2022 at 01:14:56PM +0300, Vladimir Oltean wrote:
> When working on complex new features or reworks it becomes increasingly
> difficult to ensure there aren't regressions being introduced, and
> therefore it would be nice if we could go over the functionality we
> already have and write some tests for it.
> 
> Verbally I know from Tobias Waldekranz that he has been working on some
> selftests for DSA, yet I have never seen them, so here I am adding some
> tests I have written which have been useful for me. The list is by no
> means complete (it only covers elementary functionality), but it's still
> good to have as a starting point. I also borrowed some refactoring
> changes from Joachim Wiberg that he submitted for his "net: bridge:
> forwarding of unknown IPv4/IPv6/MAC BUM traffic" series, but not the
> entirety of his selftests. I now think that his selftests have some
> overlap with bridge_vlan_unaware.sh and bridge_vlan_aware.sh and they
> should be more tightly integrated with each other - yet I didn't do that
> either :). Another issue I had with his selftests was that they jumped
> straight ahead to configure brport flags on br0 (a radical new idea
> still at RFC status) while we have bigger problems, and we don't have
> nearly enough coverage for the *existing* functionality.
> 
> One idea introduced here which I haven't seen before is the symlinking
> of relevant forwarding selftests to the selftests/drivers/net/<my-driver>/
> folder, plus a forwarding.config file. I think there's some value in
> having things structured this way, since the forwarding dir has so many
> selftests that aren't relevant to DSA that it is a bit difficult to find
> the ones that are.
> 
> While searching for applications that I could use for multicast testing
> (not my domain of interest/knowledge really), I found Joachim Wiberg's
> mtools, mcjoin and omping, and I tried them all with various degrees of
> success. In particular, I was going to use mcjoin, but I faced some
> issues getting IPv6 multicast traffic to work in a VRF, and I bothered
> David Ahern about it here:
> https://lore.kernel.org/netdev/97eaffb8-2125-834e-641f-c99c097b6ee2@gmail.com/t/
> It seems that the problem is that this application should use
> SO_BINDTODEVICE, yet it doesn't.
> 
> So I ended up patching the bare-bones mtools (msend, mreceive) forked by
> Joachim from the University of Virginia's Multimedia Networks Group to
> include IPv6 support, and to use SO_BINDTODEVICE. This is what I'm using
> now for IPv6.
> 
> Note that mausezahn doesn't appear to do a particularly good job of
> supporting IPv6 really, and I needed a program to emit the actual
> IP_ADD_MEMBERSHIP calls, for dev_mc_add(), so I could test RX filtering.
> Crafting the IGMP/MLD reports by hand doesn't really do the trick.
> While extremely bare-bones, the mreceive application now seems to do
> what I need it to.
> 
> Feedback appreciated, it is very likely that I could have done things in
> a better way.
> 
> Joachim Wiberg (2):
>   selftests: forwarding: add TCPDUMP_EXTRA_FLAGS to lib.sh
>   selftests: forwarding: multiple instances in tcpdump helper
> 
> Vladimir Oltean (6):
>   selftests: forwarding: add option to run tests with stable MAC
>     addresses
>   selftests: forwarding: add helpers for IP multicast group joins/leaves
>   selftests: forwarding: add helper for retrieving IPv6 link-local
>     address of interface
>   selftests: forwarding: add a no_forwarding.sh test
>   selftests: forwarding: add a test for local_termination.sh
>   selftests: drivers: dsa: add a subset of forwarding selftests
> 
>  .../drivers/net/dsa/bridge_locked_port.sh     |   1 +
>  .../selftests/drivers/net/dsa/bridge_mdb.sh   |   1 +
>  .../selftests/drivers/net/dsa/bridge_mld.sh   |   1 +
>  .../drivers/net/dsa/bridge_vlan_aware.sh      |   1 +
>  .../drivers/net/dsa/bridge_vlan_mcast.sh      |   1 +
>  .../drivers/net/dsa/bridge_vlan_unaware.sh    |   1 +
>  .../drivers/net/dsa/forwarding.config         |   2 +
>  .../testing/selftests/drivers/net/dsa/lib.sh  |   1 +
>  .../drivers/net/dsa/local_termination.sh      |   1 +
>  .../drivers/net/dsa/no_forwarding.sh          |   1 +
>  .../drivers/net/ocelot/tc_flower_chains.sh    |  24 +-
>  tools/testing/selftests/net/forwarding/lib.sh | 112 ++++++-
>  .../net/forwarding/local_termination.sh       | 299 ++++++++++++++++++
>  .../selftests/net/forwarding/no_forwarding.sh | 261 +++++++++++++++
>  14 files changed, 687 insertions(+), 20 deletions(-)
>  create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_locked_port.sh
>  create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_mdb.sh
>  create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_mld.sh
>  create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_vlan_aware.sh
>  create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_vlan_mcast.sh
>  create mode 120000 tools/testing/selftests/drivers/net/dsa/bridge_vlan_unaware.sh
>  create mode 100644 tools/testing/selftests/drivers/net/dsa/forwarding.config
>  create mode 120000 tools/testing/selftests/drivers/net/dsa/lib.sh
>  create mode 120000 tools/testing/selftests/drivers/net/dsa/local_termination.sh
>  create mode 120000 tools/testing/selftests/drivers/net/dsa/no_forwarding.sh
>  mode change 100644 => 100755 tools/testing/selftests/net/forwarding/lib.sh
>  create mode 100755 tools/testing/selftests/net/forwarding/local_termination.sh
>  create mode 100755 tools/testing/selftests/net/forwarding/no_forwarding.sh
> 
> -- 
> 2.25.1
> 

Sorry again, now I've really fixed all the places from which I could
have possibly copy-pasted Nikolay's dead NVIDIA email address.
It shouldn't happen again next time.

For those who consider replying, if you don't forget, maybe you can also
replace Nikolay's address in Cc: here to avoid getting a bounce-back
email from NVIDIA.

Sorry!

^ permalink raw reply

* Re: [PATCH net-next] Bonding: add per port priority support
From: Hangbin Liu @ 2022-04-22 10:23 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Jonathan Toppins, netdev, Veaceslav Falico, Andy Gospodarek,
	David S . Miller, Jakub Kicinski, David Ahern,
	Nikolay Aleksandrov, Eric Dumazet, Paolo Abeni
In-Reply-To: <Yl07fecwg6cIWF8w@Laptop-X1>

On Mon, Apr 18, 2022 at 06:20:52PM +0800, Hangbin Liu wrote:
> > 	Agreed, on both the comment and in regards to using the extant
> > bonding options management stuff.
> > 
> > >Also, in the Documentation it is mentioned that this parameter is only
> > >used in modes active-backup and balance-alb/tlb. Do we need to send an
> > >error message back preventing the modification of this value when not in
> > >these modes?
> > 
> > 	Using the option management stuff would get this for free.
> 
> Hi Jav, Jon,
> 
> I remembered the reason why I didn't use bond default option management.
> 
> It's because the bonding options management only take bond and values. We
> need to create an extra string to save the slave name and option values.
> Then in bond option setting function we extract the info from the string
> and do setting again, like the bond_option_queue_id_set().
> 
> I think this is too heavy for just an int value setting for slave.
> As we only support netlink for new options. There is no need to handle
> string setting via sysfs. For mode checking, we do just do like:
> 
> if (!bond_uses_primary(bond))
> 	return -EACCES;
> 
> So why bother the bonding options management? What do you think?
> Do you have a easier way to get the slave name in options management?
> If yes, I'm happy to use the default option management.

Hi Jan,

Any comments?

Thanks
Hangbin

^ permalink raw reply

* RE: [EXT] Re: [PATCH V2] octeontx2-pf: Add support for adaptive interrupt coalescing
From: Suman Ghosh @ 2022-04-22 10:29 UTC (permalink / raw)
  To: Paolo Abeni, davem@davemloft.net, kuba@kernel.org,
	Sunil Kovvuri Goutham, netdev@vger.kernel.org
In-Reply-To: <151a8301018e91ae34f9e33141b398cbe68de45f.camel@redhat.com>

Hi Paolo,

>-----Original Message-----
>From: Paolo Abeni <pabeni@redhat.com>
>Sent: Thursday, April 21, 2022 6:57 PM
>To: Suman Ghosh <sumang@marvell.com>; davem@davemloft.net; kuba@kernel.org;
>Sunil Kovvuri Goutham <sgoutham@marvell.com>; netdev@vger.kernel.org
>Subject: Re: [PATCH V2] octeontx2-pf: Add support for adaptive
>interrupt coalescing
>
>On Mon, 2022-04-18 at 16:32 +0530, Suman Ghosh wrote:
>> Added support for adaptive IRQ coalescing. It uses net_dim algorithm
>> to find the suitable delay/IRQ count based on the current packet rate.
>>
>> Signed-off-by: Suman Ghosh <sumang@marvell.com>
>> Reviewed-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
>> ---
>> Changes since V1
>> - No change, resubmitting because V1 did not get picked up in patchworks
>>   for some reason.
>>
>>  .../net/ethernet/marvell/octeontx2/Kconfig    |  1 +
>>  .../marvell/octeontx2/nic/otx2_common.c       |  5 ---
>>  .../marvell/octeontx2/nic/otx2_common.h       | 10 +++++
>>  .../marvell/octeontx2/nic/otx2_ethtool.c      | 43 ++++++++++++++++---
>>  .../ethernet/marvell/octeontx2/nic/otx2_pf.c  | 22 ++++++++++
>>  .../marvell/octeontx2/nic/otx2_txrx.c         | 28 ++++++++++++
>>  .../marvell/octeontx2/nic/otx2_txrx.h         |  1 +
>>  7 files changed, 99 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/marvell/octeontx2/Kconfig
>> b/drivers/net/ethernet/marvell/octeontx2/Kconfig
>> index 8560f7e463d3..a544733152d8 100644
>> --- a/drivers/net/ethernet/marvell/octeontx2/Kconfig
>> +++ b/drivers/net/ethernet/marvell/octeontx2/Kconfig
>> @@ -30,6 +30,7 @@ config OCTEONTX2_PF
>>  	tristate "Marvell OcteonTX2 NIC Physical Function driver"
>>  	select OCTEONTX2_MBOX
>>  	select NET_DEVLINK
>> +	select DIMLIB
>>  	depends on PCI
>>  	help
>>  	  This driver supports Marvell's OcteonTX2 Resource Virtualization
>> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
>> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
>> index 033fd79d35b0..c28850d815c2 100644
>> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
>> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
>> @@ -103,11 +103,6 @@ void otx2_get_dev_stats(struct otx2_nic *pfvf)  {
>>  	struct otx2_dev_stats *dev_stats = &pfvf->hw.dev_stats;
>>
>> -#define OTX2_GET_RX_STATS(reg) \
>> -	 otx2_read64(pfvf, NIX_LF_RX_STATX(reg))
>> -#define OTX2_GET_TX_STATS(reg) \
>> -	 otx2_read64(pfvf, NIX_LF_TX_STATX(reg))
>> -
>>  	dev_stats->rx_bytes = OTX2_GET_RX_STATS(RX_OCTS);
>>  	dev_stats->rx_drops = OTX2_GET_RX_STATS(RX_DROP);
>>  	dev_stats->rx_bcast_frames = OTX2_GET_RX_STATS(RX_BCAST); diff --git
>> a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
>> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
>> index d9f4b085b2a4..6abf5c28921f 100644
>> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
>> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
>> @@ -16,6 +16,7 @@
>>  #include <net/pkt_cls.h>
>>  #include <net/devlink.h>
>>  #include <linux/time64.h>
>> +#include <linux/dim.h>
>>
>>  #include <mbox.h>
>>  #include <npc.h>
>> @@ -54,6 +55,11 @@ enum arua_mapped_qtypes {
>>  /* Send skid of 2000 packets required for CQ size of 4K CQEs. */
>>  #define SEND_CQ_SKID	2000
>>
>> +#define OTX2_GET_RX_STATS(reg) \
>> +	 otx2_read64(pfvf, NIX_LF_RX_STATX(reg)) #define
>> +OTX2_GET_TX_STATS(reg) \
>> +	 otx2_read64(pfvf, NIX_LF_TX_STATX(reg))
>> +
>>  struct otx2_lmt_info {
>>  	u64 lmt_addr;
>>  	u16 lmt_id;
>> @@ -363,6 +369,7 @@ struct otx2_nic {
>>  #define OTX2_FLAG_TC_MATCHALL_INGRESS_ENABLED	BIT_ULL(13)
>>  #define OTX2_FLAG_DMACFLTR_SUPPORT		BIT_ULL(14)
>>  #define OTX2_FLAG_PTP_ONESTEP_SYNC		BIT_ULL(15)
>> +#define OTX2_FLAG_ADPTV_INT_COAL_ENABLED	BIT_ULL(16)
>>  	u64			flags;
>>  	u64			*cq_op_addr;
>>
>> @@ -442,6 +449,9 @@ struct otx2_nic {
>>  #endif
>>  	/* qos */
>>  	struct otx2_qos		qos;
>> +
>> +	/* napi event count. It is needed for adaptive irq coalescing */
>> +	u32 napi_events;
>>  };
>>
>>  static inline bool is_otx2_lbkvf(struct pci_dev *pdev) diff --git
>> a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
>> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
>> index 72d0b02da3cc..8ed282991f05 100644
>> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
>> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c
>> @@ -477,6 +477,14 @@ static int otx2_get_coalesce(struct net_device
>*netdev,
>>  	cmd->rx_max_coalesced_frames = hw->cq_ecount_wait;
>>  	cmd->tx_coalesce_usecs = hw->cq_time_wait;
>>  	cmd->tx_max_coalesced_frames = hw->cq_ecount_wait;
>> +	if ((pfvf->flags & OTX2_FLAG_ADPTV_INT_COAL_ENABLED) ==
>> +		OTX2_FLAG_ADPTV_INT_COAL_ENABLED) {
>> +		cmd->use_adaptive_rx_coalesce = 1;
>> +		cmd->use_adaptive_tx_coalesce = 1;
>> +	} else {
>> +		cmd->use_adaptive_rx_coalesce = 0;
>> +		cmd->use_adaptive_tx_coalesce = 0;
>> +	}
>>
>>  	return 0;
>>  }
>> @@ -486,10 +494,10 @@ static int otx2_set_coalesce(struct net_device
>> *netdev,  {
>>  	struct otx2_nic *pfvf = netdev_priv(netdev);
>>  	struct otx2_hw *hw = &pfvf->hw;
>> +	u8 priv_coalesce_status;
>>  	int qidx;
>>
>> -	if (ec->use_adaptive_rx_coalesce || ec->use_adaptive_tx_coalesce ||
>> -	    ec->rx_coalesce_usecs_irq || ec->rx_max_coalesced_frames_irq ||
>> +	if (ec->rx_coalesce_usecs_irq || ec->rx_max_coalesced_frames_irq ||
>>  	    ec->tx_coalesce_usecs_irq || ec->tx_max_coalesced_frames_irq ||
>>  	    ec->stats_block_coalesce_usecs || ec->pkt_rate_low ||
>>  	    ec->rx_coalesce_usecs_low || ec->rx_max_coalesced_frames_low ||
>> @@ -502,6 +510,18 @@ static int otx2_set_coalesce(struct net_device
>*netdev,
>>  	if (!ec->rx_max_coalesced_frames || !ec->tx_max_coalesced_frames)
>>  		return 0;
>>
>> +	/* Check and update coalesce status */
>> +	if ((pfvf->flags & OTX2_FLAG_ADPTV_INT_COAL_ENABLED) ==
>> +	    OTX2_FLAG_ADPTV_INT_COAL_ENABLED) {
>> +		priv_coalesce_status = 1;
>> +		if (!ec->use_adaptive_rx_coalesce || !ec-
>>use_adaptive_tx_coalesce)
>> +			pfvf->flags &= ~OTX2_FLAG_ADPTV_INT_COAL_ENABLED;
>> +	} else {
>> +		priv_coalesce_status = 0;
>> +		if (ec->use_adaptive_rx_coalesce || ec->use_adaptive_tx_coalesce)
>> +			pfvf->flags |= OTX2_FLAG_ADPTV_INT_COAL_ENABLED;
>> +	}
>> +
>>  	/* 'cq_time_wait' is 8bit and is in multiple of 100ns,
>>  	 * so clamp the user given value to the range of 1 to 25usec.
>>  	 */
>> @@ -521,13 +541,13 @@ static int otx2_set_coalesce(struct net_device
>*netdev,
>>  		hw->cq_time_wait = min_t(u8, ec->rx_coalesce_usecs,
>>  					 ec->tx_coalesce_usecs);
>>
>> -	/* Max ecount_wait supported is 16bit,
>> -	 * so clamp the user given value to the range of 1 to 64k.
>> +	/* Max packet budget per napi is 64,
>> +	 * so clamp the user given value to the range of 1 to 64.
>>  	 */
>>  	ec->rx_max_coalesced_frames = clamp_t(u32, ec-
>>rx_max_coalesced_frames,
>> -					      1, U16_MAX);
>> +					      1, NAPI_POLL_WEIGHT);
>>  	ec->tx_max_coalesced_frames = clamp_t(u32, ec-
>>tx_max_coalesced_frames,
>> -					      1, U16_MAX);
>> +					      1, NAPI_POLL_WEIGHT);
>>
>>  	/* Rx and Tx are mapped to same CQ, check which one
>>  	 * is changed, if both then choose the min.
>> @@ -540,6 +560,17 @@ static int otx2_set_coalesce(struct net_device
>*netdev,
>>  		hw->cq_ecount_wait = min_t(u16, ec->rx_max_coalesced_frames,
>>  					   ec->tx_max_coalesced_frames);
>>
>> +	/* Reset 'cq_time_wait' and 'cq_ecount_wait' to
>> +	 * default values if coalesce status changed from
>> +	 * 'on' to 'off'.
>> +	 */
>> +	if (priv_coalesce_status &&
>> +	    ((pfvf->flags & OTX2_FLAG_ADPTV_INT_COAL_ENABLED) !=
>> +	    OTX2_FLAG_ADPTV_INT_COAL_ENABLED)) {
>> +		hw->cq_time_wait = CQ_TIMER_THRESH_DEFAULT;
>> +		hw->cq_ecount_wait = CQ_CQE_THRESH_DEFAULT;
>> +	}
>> +
>>  	if (netif_running(netdev)) {
>>  		for (qidx = 0; qidx < pfvf->hw.cint_cnt; qidx++)
>>  			otx2_config_irq_coalescing(pfvf, qidx); diff --git
>> a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
>> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
>> index f18c9a9a50d0..94aaafbeb365 100644
>> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
>> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
>> @@ -1279,6 +1279,7 @@ static irqreturn_t otx2_cq_intr_handler(int irq,
>void *cq_irq)
>>  	otx2_write64(pf, NIX_LF_CINTX_ENA_W1C(qidx), BIT_ULL(0));
>>
>>  	/* Schedule NAPI */
>> +	pf->napi_events++;
>>  	napi_schedule_irqoff(&cq_poll->napi);
>>
>>  	return IRQ_HANDLED;
>> @@ -1292,6 +1293,7 @@ static void otx2_disable_napi(struct otx2_nic
>> *pf)
>>
>>  	for (qidx = 0; qidx < pf->hw.cint_cnt; qidx++) {
>>  		cq_poll = &qset->napi[qidx];
>> +		cancel_work_sync(&cq_poll->dim.work);
>>  		napi_disable(&cq_poll->napi);
>>  		netif_napi_del(&cq_poll->napi);
>>  	}
>> @@ -1538,6 +1540,24 @@ static void otx2_free_hw_resources(struct otx2_nic
>*pf)
>>  	mutex_unlock(&mbox->lock);
>>  }
>>
>> +static void otx2_dim_work(struct work_struct *w) {
>> +	struct dim_cq_moder cur_moder;
>> +	struct otx2_cq_poll *cq_poll;
>> +	struct otx2_nic *pfvf;
>> +	struct dim *dim;
>> +
>> +	dim = container_of(w, struct dim, work);
>> +	cur_moder = net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
>> +	cq_poll = container_of(dim, struct otx2_cq_poll, dim);
>> +	pfvf = (struct otx2_nic *)cq_poll->dev;
>> +	pfvf->hw.cq_time_wait = (cur_moder.usec > CQ_TIMER_THRESH_MAX) ?
>> +				CQ_TIMER_THRESH_MAX : cur_moder.usec;
>> +	pfvf->hw.cq_ecount_wait = (cur_moder.pkts > NAPI_POLL_WEIGHT) ?
>> +				NAPI_POLL_WEIGHT : cur_moder.pkts;
>> +	dim->state = DIM_START_MEASURE;
>> +}
>> +
>>  int otx2_open(struct net_device *netdev)  {
>>  	struct otx2_nic *pf = netdev_priv(netdev); @@ -1611,6 +1631,8 @@ int
>> otx2_open(struct net_device *netdev)
>>  					  CINT_INVALID_CQ;
>>
>>  		cq_poll->dev = (void *)pf;
>> +		cq_poll->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_CQE;
>> +		INIT_WORK(&cq_poll->dim.work, otx2_dim_work);
>>  		netif_napi_add(netdev, &cq_poll->napi,
>>  			       otx2_napi_handler, NAPI_POLL_WEIGHT);
>>  		napi_enable(&cq_poll->napi);
>> diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
>> b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
>> index 459b94b99ddb..927dd12b4f4e 100644
>> --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
>> +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
>> @@ -512,6 +512,22 @@ static int otx2_tx_napi_handler(struct otx2_nic
>*pfvf,
>>  	return 0;
>>  }
>>
>> +static void otx2_adjust_adaptive_coalese(struct otx2_nic *pfvf,
>> +struct otx2_cq_poll *cq_poll) {
>> +	struct dim_sample dim_sample;
>> +	u64 rx_frames, rx_bytes;
>> +
>> +	rx_frames = OTX2_GET_RX_STATS(RX_BCAST) + OTX2_GET_RX_STATS(RX_MCAST)
>+
>> +			OTX2_GET_RX_STATS(RX_UCAST);
>> +	rx_bytes = OTX2_GET_RX_STATS(RX_OCTS);
>> +	dim_update_sample(pfvf->napi_events,
>> +			  rx_frames,
>> +			  rx_bytes,
>> +			  &dim_sample);
>> +
>> +	net_dim(&cq_poll->dim, dim_sample);
>> +}
>> +
>>  int otx2_napi_handler(struct napi_struct *napi, int budget)  {
>>  	struct otx2_cq_queue *rx_cq = NULL;
>> @@ -549,6 +565,18 @@ int otx2_napi_handler(struct napi_struct *napi, int
>budget)
>>  		if (pfvf->flags & OTX2_FLAG_INTF_DOWN)
>>  			return workdone;
>>
>> +		/* Check for adaptive interrupt coalesce */
>> +		if (workdone != 0 &&
>> +		    ((pfvf->flags & OTX2_FLAG_ADPTV_INT_COAL_ENABLED) ==
>> +		    OTX2_FLAG_ADPTV_INT_COAL_ENABLED)) {
>> +			/* Adjust irq coalese using net_dim */
>> +			otx2_adjust_adaptive_coalese(pfvf, cq_poll);
>> +
>> +			/* Update irq coalescing */
>> +			for (i = 0; i < pfvf->hw.cint_cnt; i++)
>> +				otx2_config_irq_coalescing(pfvf, i);
>> +		}
>> +
>
>Why are you updating the IRQ coalescing parameters for every sample?
>You probably should to that in void otx2_dim_work(), when dim tells it's
>the right time to update them.

Otx2_dim_work() is called from dim context. All the existing users of net_dim
is not doing register read/writes from dim context(example: drivers/net/ethernet/amazon/ena/ena_netdev.c).
I guess this is to ensure individual user's register read/write should not impact the performance of
the dim algorithm.

Regards,
Suman

>Thanks,
>
>Paolo


^ permalink raw reply

* Re: [PATCH] NFC: nfcmrvl: fix error check return value of irq_of_parse_and_map()
From: Krzysztof Kozlowski @ 2022-04-22 10:30 UTC (permalink / raw)
  To: cgel.zte, davem
  Cc: kuba, lv.ruyi, yashsri421, sameo, cuissard, netdev, linux-kernel,
	Zeal Robot
In-Reply-To: <20220422084605.2775542-1-lv.ruyi@zte.com.cn>

On 22/04/2022 10:46, cgel.zte@gmail.com wrote:
> From: Lv Ruyi <lv.ruyi@zte.com.cn>
> 
> The irq_of_parse_and_map() function returns 0 on failure, and does not
> return an negative value.
> 
> Fixes: 	b5b3e23e4cac ("NFC: nfcmrvl: add i2c driver")

Some unneeded indentation above.

Except that:

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>

> Reported-by: Zeal Robot <zealci@zte.com.cn>
> Signed-off-by: Lv Ruyi <lv.ruyi@zte.com.cn>




Best regards,
Krzysztof

^ permalink raw reply

* [PATCH] nfc: nfcmrvl: spi: Fix irq_of_parse_and_map() return value
From: Krzysztof Kozlowski @ 2022-04-22 10:47 UTC (permalink / raw)
  To: Krzysztof Kozlowski, Vincent Cuissard, Samuel Ortiz, linux-nfc,
	netdev, linux-kernel

The irq_of_parse_and_map() returns 0 on failure, not a negative ERRNO.

Fixes: caf6e49bf6d0 ("NFC: nfcmrvl: add spi driver")
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>

---

This is another issue to https://lore.kernel.org/all/20220422084605.2775542-1-lv.ruyi@zte.com.cn/
---
 drivers/nfc/nfcmrvl/spi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nfc/nfcmrvl/spi.c b/drivers/nfc/nfcmrvl/spi.c
index a38e2fcdfd39..01f0a08a381c 100644
--- a/drivers/nfc/nfcmrvl/spi.c
+++ b/drivers/nfc/nfcmrvl/spi.c
@@ -115,7 +115,7 @@ static int nfcmrvl_spi_parse_dt(struct device_node *node,
 	}
 
 	ret = irq_of_parse_and_map(node, 0);
-	if (ret < 0) {
+	if (!ret) {
 		pr_err("Unable to get irq, error: %d\n", ret);
 		return ret;
 	}
-- 
2.32.0


^ permalink raw reply related

* [PATCH] staging: qlge: Fix line wrapping
From: Lungash @ 2022-04-22  8:51 UTC (permalink / raw)
  To: Manish Chopra, GR-Linux-NIC-Dev, Coiby Xu, Greg Kroah-Hartman,
	netdev, linux-staging, linux-kernel, outreachy

This patch fixes line wrapping following kernel coding style.

Task on TODO list

* fix weird line wrapping (all over, ex. the ql_set_routing_reg() calls in
  qlge_set_multicast_list()).

Signed-off-by: Lungash <denzlungash@gmail.com>
---
 drivers/staging/qlge/qlge_main.c | 235 ++++++++++++++-----------------
 1 file changed, 107 insertions(+), 128 deletions(-)

diff --git a/drivers/staging/qlge/qlge_main.c b/drivers/staging/qlge/qlge_main.c
index 113a3efd12e9..309db00e0b22 100644
--- a/drivers/staging/qlge/qlge_main.c
+++ b/drivers/staging/qlge/qlge_main.c
@@ -499,77 +499,57 @@ static int qlge_set_routing_reg(struct qlge_adapter *qdev, u32 index, u32 mask,
 
 	switch (mask) {
 	case RT_IDX_CAM_HIT:
-		{
-			value = RT_IDX_DST_CAM_Q |	/* dest */
-			    RT_IDX_TYPE_NICQ |	/* type */
-			    (RT_IDX_CAM_HIT_SLOT << RT_IDX_IDX_SHIFT);/* index */
-			break;
-		}
+		value = RT_IDX_DST_CAM_Q |	/* dest */
+			RT_IDX_TYPE_NICQ |	/* type */
+			(RT_IDX_CAM_HIT_SLOT << RT_IDX_IDX_SHIFT);/* index */
+		break;
 	case RT_IDX_VALID:	/* Promiscuous Mode frames. */
-		{
-			value = RT_IDX_DST_DFLT_Q |	/* dest */
-			    RT_IDX_TYPE_NICQ |	/* type */
-			    (RT_IDX_PROMISCUOUS_SLOT << RT_IDX_IDX_SHIFT);/* index */
-			break;
-		}
+		value = RT_IDX_DST_DFLT_Q |	/* dest */
+			RT_IDX_TYPE_NICQ |	/* type */
+			(RT_IDX_PROMISCUOUS_SLOT << RT_IDX_IDX_SHIFT);/* index */
+		break;
 	case RT_IDX_ERR:	/* Pass up MAC,IP,TCP/UDP error frames. */
-		{
-			value = RT_IDX_DST_DFLT_Q |	/* dest */
-			    RT_IDX_TYPE_NICQ |	/* type */
-			    (RT_IDX_ALL_ERR_SLOT << RT_IDX_IDX_SHIFT);/* index */
-			break;
-		}
+		value = RT_IDX_DST_DFLT_Q |	/* dest */
+			RT_IDX_TYPE_NICQ |	/* type */
+			(RT_IDX_ALL_ERR_SLOT << RT_IDX_IDX_SHIFT);/* index */
+		break;
 	case RT_IDX_IP_CSUM_ERR: /* Pass up IP CSUM error frames. */
-		{
-			value = RT_IDX_DST_DFLT_Q | /* dest */
-				RT_IDX_TYPE_NICQ | /* type */
-				(RT_IDX_IP_CSUM_ERR_SLOT <<
-				RT_IDX_IDX_SHIFT); /* index */
-			break;
-		}
+		value = RT_IDX_DST_DFLT_Q | /* dest */
+			RT_IDX_TYPE_NICQ | /* type */
+			(RT_IDX_IP_CSUM_ERR_SLOT <<
+			RT_IDX_IDX_SHIFT); /* index */
+		break;
 	case RT_IDX_TU_CSUM_ERR: /* Pass up TCP/UDP CSUM error frames. */
-		{
-			value = RT_IDX_DST_DFLT_Q | /* dest */
-				RT_IDX_TYPE_NICQ | /* type */
-				(RT_IDX_TCP_UDP_CSUM_ERR_SLOT <<
-				RT_IDX_IDX_SHIFT); /* index */
-			break;
-		}
+		value = RT_IDX_DST_DFLT_Q | /* dest */
+			RT_IDX_TYPE_NICQ | /* type */
+			(RT_IDX_TCP_UDP_CSUM_ERR_SLOT <<
+			RT_IDX_IDX_SHIFT); /* index */
+		break;
 	case RT_IDX_BCAST:	/* Pass up Broadcast frames to default Q. */
-		{
-			value = RT_IDX_DST_DFLT_Q |	/* dest */
-			    RT_IDX_TYPE_NICQ |	/* type */
-			    (RT_IDX_BCAST_SLOT << RT_IDX_IDX_SHIFT);/* index */
-			break;
-		}
+		value = RT_IDX_DST_DFLT_Q |	/* dest */
+			RT_IDX_TYPE_NICQ |	/* type */
+			(RT_IDX_BCAST_SLOT << RT_IDX_IDX_SHIFT);/* index */
+		break;
 	case RT_IDX_MCAST:	/* Pass up All Multicast frames. */
-		{
-			value = RT_IDX_DST_DFLT_Q |	/* dest */
-			    RT_IDX_TYPE_NICQ |	/* type */
-			    (RT_IDX_ALLMULTI_SLOT << RT_IDX_IDX_SHIFT);/* index */
-			break;
-		}
+		value = RT_IDX_DST_DFLT_Q |	/* dest */
+			RT_IDX_TYPE_NICQ |	/* type */
+			(RT_IDX_ALLMULTI_SLOT << RT_IDX_IDX_SHIFT);/* index */
+		break;
 	case RT_IDX_MCAST_MATCH:	/* Pass up matched Multicast frames. */
-		{
-			value = RT_IDX_DST_DFLT_Q |	/* dest */
-			    RT_IDX_TYPE_NICQ |	/* type */
-			    (RT_IDX_MCAST_MATCH_SLOT << RT_IDX_IDX_SHIFT);/* index */
-			break;
-		}
+		value = RT_IDX_DST_DFLT_Q |	/* dest */
+			RT_IDX_TYPE_NICQ |	/* type */
+			(RT_IDX_MCAST_MATCH_SLOT << RT_IDX_IDX_SHIFT);/* index */
+		break;
 	case RT_IDX_RSS_MATCH:	/* Pass up matched RSS frames. */
-		{
-			value = RT_IDX_DST_RSS |	/* dest */
-			    RT_IDX_TYPE_NICQ |	/* type */
-			    (RT_IDX_RSS_MATCH_SLOT << RT_IDX_IDX_SHIFT);/* index */
-			break;
-		}
+		value = RT_IDX_DST_RSS |	/* dest */
+			RT_IDX_TYPE_NICQ |	/* type */
+			(RT_IDX_RSS_MATCH_SLOT << RT_IDX_IDX_SHIFT);/* index */
+		break;
 	case 0:		/* Clear the E-bit on an entry. */
-		{
-			value = RT_IDX_DST_DFLT_Q |	/* dest */
-			    RT_IDX_TYPE_NICQ |	/* type */
-			    (index << RT_IDX_IDX_SHIFT);/* index */
-			break;
-		}
+		value = RT_IDX_DST_DFLT_Q |	/* dest */
+			RT_IDX_TYPE_NICQ |	/* type */
+			(index << RT_IDX_IDX_SHIFT);/* index */
+		break;
 	default:
 		netif_err(qdev, ifup, qdev->ndev,
 			  "Mask type %d not yet supported.\n", mask);
@@ -599,14 +579,16 @@ static void qlge_disable_interrupts(struct qlge_adapter *qdev)
 	qlge_write32(qdev, INTR_EN, (INTR_EN_EI << 16));
 }
 
-static void qlge_enable_completion_interrupt(struct qlge_adapter *qdev, u32 intr)
+static void qlge_enable_completion_interrupt(struct qlge_adapter *qdev,
+					     u32 intr)
 {
 	struct intr_context *ctx = &qdev->intr_context[intr];
 
 	qlge_write32(qdev, INTR_EN, ctx->intr_en_mask);
 }
 
-static void qlge_disable_completion_interrupt(struct qlge_adapter *qdev, u32 intr)
+static void qlge_disable_completion_interrupt(struct qlge_adapter *qdev,
+					      u32 intr)
 {
 	struct intr_context *ctx = &qdev->intr_context[intr];
 
@@ -621,7 +603,8 @@ static void qlge_enable_all_completion_interrupts(struct qlge_adapter *qdev)
 		qlge_enable_completion_interrupt(qdev, i);
 }
 
-static int qlge_validate_flash(struct qlge_adapter *qdev, u32 size, const char *str)
+static int qlge_validate_flash(struct qlge_adapter *qdev, u32 size,
+			       const char *str)
 {
 	int status, i;
 	u16 csum = 0;
@@ -643,7 +626,8 @@ static int qlge_validate_flash(struct qlge_adapter *qdev, u32 size, const char *
 	return csum;
 }
 
-static int qlge_read_flash_word(struct qlge_adapter *qdev, int offset, __le32 *data)
+static int qlge_read_flash_word(struct qlge_adapter *qdev, int offset,
+				__le32 *data)
 {
 	int status = 0;
 	/* wait for reg to come ready */
@@ -696,10 +680,8 @@ static int qlge_get_8000_flash_params(struct qlge_adapter *qdev)
 		}
 	}
 
-	status = qlge_validate_flash(qdev,
-				     sizeof(struct flash_params_8000) /
-				   sizeof(u16),
-				   "8000");
+	status = qlge_validate_flash(qdev, sizeof(struct flash_params_8000) /
+				     sizeof(u16), "8000");
 	if (status) {
 		netif_err(qdev, ifup, qdev->ndev, "Invalid flash.\n");
 		status = -EINVAL;
@@ -757,10 +739,8 @@ static int qlge_get_8012_flash_params(struct qlge_adapter *qdev)
 		}
 	}
 
-	status = qlge_validate_flash(qdev,
-				     sizeof(struct flash_params_8012) /
-				       sizeof(u16),
-				     "8012");
+	status = qlge_validate_flash(qdev, sizeof(struct flash_params_8012) /
+				     sizeof(u16), "8012");
 	if (status) {
 		netif_err(qdev, ifup, qdev->ndev, "Invalid flash.\n");
 		status = -EINVAL;
@@ -928,8 +908,8 @@ static int qlge_8012_port_initialize(struct qlge_adapter *qdev)
 		goto end;
 
 	/* Turn on jumbo. */
-	status =
-	    qlge_write_xgmac_reg(qdev, MAC_TX_PARAMS, MAC_TX_PARAMS_JUMBO | (0x2580 << 16));
+	status = qlge_write_xgmac_reg(qdev, MAC_TX_PARAMS,
+				      MAC_TX_PARAMS_JUMBO | (0x2580 << 16));
 	if (status)
 		goto end;
 	status =
@@ -1520,10 +1500,9 @@ static void qlge_process_mac_rx_page(struct qlge_adapter *qdev,
 		} else if ((ib_mac_rsp->flags2 & IB_MAC_IOCB_RSP_U) &&
 			   (ib_mac_rsp->flags3 & IB_MAC_IOCB_RSP_V4)) {
 			/* Unfragmented ipv4 UDP frame. */
-			struct iphdr *iph =
-				(struct iphdr *)((u8 *)addr + hlen);
-			if (!(iph->frag_off &
-			      htons(IP_MF | IP_OFFSET))) {
+			struct iphdr *iph = (struct iphdr *)((u8 *)addr + hlen);
+
+			if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
 				skb->ip_summed = CHECKSUM_UNNECESSARY;
 				netif_printk(qdev, rx_status, KERN_DEBUG,
 					     qdev->ndev,
@@ -1763,7 +1742,8 @@ static struct sk_buff *qlge_build_rx_skb(struct qlge_adapter *qdev,
 			lbq_desc = qlge_get_curr_lchunk(qdev, rx_ring);
 			skb = netdev_alloc_skb(qdev->ndev, length);
 			if (!skb) {
-				netif_printk(qdev, probe, KERN_DEBUG, qdev->ndev,
+				netif_printk(qdev, probe, KERN_DEBUG,
+					     qdev->ndev,
 					     "No skb available, drop the packet.\n");
 				return NULL;
 			}
@@ -1835,8 +1815,8 @@ static struct sk_buff *qlge_build_rx_skb(struct qlge_adapter *qdev,
 			length -= size;
 			i++;
 		} while (length > 0);
-		qlge_update_mac_hdr_len(qdev, ib_mac_rsp, lbq_desc->p.pg_chunk.va,
-					&hlen);
+		qlge_update_mac_hdr_len(qdev, ib_mac_rsp,
+					lbq_desc->p.pg_chunk.va, &hlen);
 		__pskb_pull_tail(skb, hlen);
 	}
 	return skb;
@@ -2447,7 +2427,8 @@ static irqreturn_t qlge_isr(int irq, void *dev_id)
 	return work_done ? IRQ_HANDLED : IRQ_NONE;
 }
 
-static int qlge_tso(struct sk_buff *skb, struct qlge_ob_mac_tso_iocb_req *mac_iocb_ptr)
+static int qlge_tso(struct sk_buff *skb,
+		    struct qlge_ob_mac_tso_iocb_req *mac_iocb_ptr)
 {
 	if (skb_is_gso(skb)) {
 		int err;
@@ -2683,9 +2664,10 @@ static void qlge_free_tx_resources(struct qlge_adapter *qdev,
 static int qlge_alloc_tx_resources(struct qlge_adapter *qdev,
 				   struct tx_ring *tx_ring)
 {
-	tx_ring->wq_base =
-		dma_alloc_coherent(&qdev->pdev->dev, tx_ring->wq_size,
-				   &tx_ring->wq_base_dma, GFP_ATOMIC);
+	tx_ring->wq_base = dma_alloc_coherent(&qdev->pdev->dev,
+					      tx_ring->wq_size,
+					      &tx_ring->wq_base_dma,
+					      GFP_ATOMIC);
 
 	if (!tx_ring->wq_base ||
 	    tx_ring->wq_base_dma & WQ_ADDR_ALIGN)
@@ -2707,15 +2689,15 @@ static int qlge_alloc_tx_resources(struct qlge_adapter *qdev,
 	return -ENOMEM;
 }
 
-static void qlge_free_lbq_buffers(struct qlge_adapter *qdev, struct rx_ring *rx_ring)
+static void qlge_free_lbq_buffers(struct qlge_adapter *qdev,
+				  struct rx_ring *rx_ring)
 {
 	struct qlge_bq *lbq = &rx_ring->lbq;
 	unsigned int last_offset;
 
 	last_offset = qlge_lbq_block_size(qdev) - qdev->lbq_buf_size;
 	while (lbq->next_to_clean != lbq->next_to_use) {
-		struct qlge_bq_desc *lbq_desc =
-			&lbq->queue[lbq->next_to_clean];
+		struct qlge_bq_desc *lbq_desc = &lbq->queue[lbq->next_to_clean];
 
 		if (lbq_desc->p.pg_chunk.offset == last_offset)
 			dma_unmap_page(&qdev->pdev->dev, lbq_desc->dma_addr,
@@ -2734,7 +2716,8 @@ static void qlge_free_lbq_buffers(struct qlge_adapter *qdev, struct rx_ring *rx_
 	}
 }
 
-static void qlge_free_sbq_buffers(struct qlge_adapter *qdev, struct rx_ring *rx_ring)
+static void qlge_free_sbq_buffers(struct qlge_adapter *qdev,
+				  struct rx_ring *rx_ring)
 {
 	int i;
 
@@ -2854,9 +2837,10 @@ static int qlge_alloc_rx_resources(struct qlge_adapter *qdev,
 	/*
 	 * Allocate the completion queue for this rx_ring.
 	 */
-	rx_ring->cq_base =
-		dma_alloc_coherent(&qdev->pdev->dev, rx_ring->cq_size,
-				   &rx_ring->cq_base_dma, GFP_ATOMIC);
+	rx_ring->cq_base = dma_alloc_coherent(&qdev->pdev->dev,
+					      rx_ring->cq_size,
+					      &rx_ring->cq_base_dma,
+					      GFP_ATOMIC);
 
 	if (!rx_ring->cq_base) {
 		netif_err(qdev, ifup, qdev->ndev, "rx_ring alloc failed.\n");
@@ -2952,8 +2936,8 @@ static int qlge_start_rx_ring(struct qlge_adapter *qdev, struct rx_ring *rx_ring
 		(rx_ring->cq_id * RX_RING_SHADOW_SPACE);
 	u64 shadow_reg_dma = qdev->rx_ring_shadow_reg_dma +
 		(rx_ring->cq_id * RX_RING_SHADOW_SPACE);
-	void __iomem *doorbell_area =
-		qdev->doorbell_area + (DB_PAGE_SIZE * (128 + rx_ring->cq_id));
+	void __iomem *doorbell_area = qdev->doorbell_area +
+		(DB_PAGE_SIZE * (128 + rx_ring->cq_id));
 	int err = 0;
 	u64 tmp;
 	__le64 *base_indirect_ptr;
@@ -3061,8 +3045,8 @@ static int qlge_start_rx_ring(struct qlge_adapter *qdev, struct rx_ring *rx_ring
 static int qlge_start_tx_ring(struct qlge_adapter *qdev, struct tx_ring *tx_ring)
 {
 	struct wqicb *wqicb = (struct wqicb *)tx_ring;
-	void __iomem *doorbell_area =
-		qdev->doorbell_area + (DB_PAGE_SIZE * tx_ring->wq_id);
+	void __iomem *doorbell_area = qdev->doorbell_area +
+		(DB_PAGE_SIZE * tx_ring->wq_id);
 	void *shadow_reg = qdev->tx_ring_shadow_reg_area +
 		(tx_ring->wq_id * sizeof(u64));
 	u64 shadow_reg_dma = qdev->tx_ring_shadow_reg_dma +
@@ -3268,16 +3252,13 @@ static void qlge_resolve_queues_to_irqs(struct qlge_adapter *qdev)
 			 * We set up each vectors enable/disable/read bits so
 			 * there's no bit/mask calculations in the critical path.
 			 */
-			intr_context->intr_en_mask =
-				INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK |
+			intr_context->intr_en_mask = INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK |
 				INTR_EN_TYPE_ENABLE | INTR_EN_IHD_MASK | INTR_EN_IHD
 				| i;
-			intr_context->intr_dis_mask =
-				INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK |
+			intr_context->intr_dis_mask = INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK |
 				INTR_EN_TYPE_DISABLE | INTR_EN_IHD_MASK |
 				INTR_EN_IHD | i;
-			intr_context->intr_read_mask =
-				INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK |
+			intr_context->intr_read_mask = INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK |
 				INTR_EN_TYPE_READ | INTR_EN_IHD_MASK | INTR_EN_IHD |
 				i;
 			if (i == 0) {
@@ -3309,10 +3290,9 @@ static void qlge_resolve_queues_to_irqs(struct qlge_adapter *qdev)
 		 * We set up each vectors enable/disable/read bits so
 		 * there's no bit/mask calculations in the critical path.
 		 */
-		intr_context->intr_en_mask =
-			INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | INTR_EN_TYPE_ENABLE;
-		intr_context->intr_dis_mask =
-			INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK |
+		intr_context->intr_en_mask = INTR_EN_TYPE_MASK |
+			INTR_EN_INTR_MASK | INTR_EN_TYPE_ENABLE;
+		intr_context->intr_dis_mask = INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK |
 			INTR_EN_TYPE_DISABLE;
 		if (test_bit(QL_LEGACY_ENABLED, &qdev->flags)) {
 			/* Experience shows that when using INTx interrupts,
@@ -3324,8 +3304,8 @@ static void qlge_resolve_queues_to_irqs(struct qlge_adapter *qdev)
 				INTR_EN_EI;
 			intr_context->intr_dis_mask |= INTR_EN_EI << 16;
 		}
-		intr_context->intr_read_mask =
-			INTR_EN_TYPE_MASK | INTR_EN_INTR_MASK | INTR_EN_TYPE_READ;
+		intr_context->intr_read_mask = INTR_EN_TYPE_MASK |
+			INTR_EN_INTR_MASK | INTR_EN_TYPE_READ;
 		/*
 		 * Single interrupt means one handler for all rings.
 		 */
@@ -3434,8 +3414,7 @@ static int qlge_start_rss(struct qlge_adapter *qdev)
 	memset((void *)ricb, 0, sizeof(*ricb));
 
 	ricb->base_cq = RSS_L4K;
-	ricb->flags =
-		(RSS_L6K | RSS_LI | RSS_LB | RSS_LM | RSS_RT4 | RSS_RT6);
+	ricb->flags = (RSS_L6K | RSS_LI | RSS_LB | RSS_LM | RSS_RT4 | RSS_RT6);
 	ricb->mask = cpu_to_le16((u16)(0x3ff));
 
 	/*
@@ -4127,8 +4106,8 @@ static void qlge_set_multicast_list(struct net_device *ndev)
 	 */
 	if (ndev->flags & IFF_PROMISC) {
 		if (!test_bit(QL_PROMISCUOUS, &qdev->flags)) {
-			if (qlge_set_routing_reg
-			    (qdev, RT_IDX_PROMISCUOUS_SLOT, RT_IDX_VALID, 1)) {
+			if (qlge_set_routing_reg(qdev, RT_IDX_PROMISCUOUS_SLOT,
+						 RT_IDX_VALID, 1)) {
 				netif_err(qdev, hw, qdev->ndev,
 					  "Failed to set promiscuous mode.\n");
 			} else {
@@ -4137,8 +4116,8 @@ static void qlge_set_multicast_list(struct net_device *ndev)
 		}
 	} else {
 		if (test_bit(QL_PROMISCUOUS, &qdev->flags)) {
-			if (qlge_set_routing_reg
-			    (qdev, RT_IDX_PROMISCUOUS_SLOT, RT_IDX_VALID, 0)) {
+			if (qlge_set_routing_reg(qdev, RT_IDX_PROMISCUOUS_SLOT,
+						 RT_IDX_VALID, 0)) {
 				netif_err(qdev, hw, qdev->ndev,
 					  "Failed to clear promiscuous mode.\n");
 			} else {
@@ -4154,8 +4133,8 @@ static void qlge_set_multicast_list(struct net_device *ndev)
 	if ((ndev->flags & IFF_ALLMULTI) ||
 	    (netdev_mc_count(ndev) > MAX_MULTICAST_ENTRIES)) {
 		if (!test_bit(QL_ALLMULTI, &qdev->flags)) {
-			if (qlge_set_routing_reg
-			    (qdev, RT_IDX_ALLMULTI_SLOT, RT_IDX_MCAST, 1)) {
+			if (qlge_set_routing_reg(qdev, RT_IDX_ALLMULTI_SLOT,
+						 RT_IDX_MCAST, 1)) {
 				netif_err(qdev, hw, qdev->ndev,
 					  "Failed to set all-multi mode.\n");
 			} else {
@@ -4164,8 +4143,8 @@ static void qlge_set_multicast_list(struct net_device *ndev)
 		}
 	} else {
 		if (test_bit(QL_ALLMULTI, &qdev->flags)) {
-			if (qlge_set_routing_reg
-			    (qdev, RT_IDX_ALLMULTI_SLOT, RT_IDX_MCAST, 0)) {
+			if (qlge_set_routing_reg(qdev, RT_IDX_ALLMULTI_SLOT,
+						 RT_IDX_MCAST, 0)) {
 				netif_err(qdev, hw, qdev->ndev,
 					  "Failed to clear all-multi mode.\n");
 			} else {
@@ -4190,8 +4169,8 @@ static void qlge_set_multicast_list(struct net_device *ndev)
 			i++;
 		}
 		qlge_sem_unlock(qdev, SEM_MAC_ADDR_MASK);
-		if (qlge_set_routing_reg
-		    (qdev, RT_IDX_MCAST_MATCH_SLOT, RT_IDX_MCAST_MATCH, 1)) {
+		if (qlge_set_routing_reg(qdev, RT_IDX_MCAST_MATCH_SLOT,
+					 RT_IDX_MCAST_MATCH, 1)) {
 			netif_err(qdev, hw, qdev->ndev,
 				  "Failed to set multicast match mode.\n");
 		} else {
@@ -4235,8 +4214,8 @@ static void qlge_tx_timeout(struct net_device *ndev, unsigned int txqueue)
 
 static void qlge_asic_reset_work(struct work_struct *work)
 {
-	struct qlge_adapter *qdev =
-		container_of(work, struct qlge_adapter, asic_reset_work.work);
+	struct qlge_adapter *qdev = container_of(work, struct qlge_adapter,
+						 asic_reset_work.work);
 	int status;
 
 	rtnl_lock();
@@ -4407,8 +4386,8 @@ static int qlge_init_device(struct pci_dev *pdev, struct qlge_adapter *qdev,
 	/* Set PCIe reset type for EEH to fundamental. */
 	pdev->needs_freset = 1;
 	pci_save_state(pdev);
-	qdev->reg_base =
-		ioremap(pci_resource_start(pdev, 1), pci_resource_len(pdev, 1));
+	qdev->reg_base = ioremap(pci_resource_start(pdev, 1),
+				 pci_resource_len(pdev, 1));
 	if (!qdev->reg_base) {
 		dev_err(&pdev->dev, "Register mapping failed.\n");
 		err = -ENOMEM;
@@ -4416,8 +4395,8 @@ static int qlge_init_device(struct pci_dev *pdev, struct qlge_adapter *qdev,
 	}
 
 	qdev->doorbell_area_size = pci_resource_len(pdev, 3);
-	qdev->doorbell_area =
-		ioremap(pci_resource_start(pdev, 3), pci_resource_len(pdev, 3));
+	qdev->doorbell_area = ioremap(pci_resource_start(pdev, 3),
+				      pci_resource_len(pdev, 3));
 	if (!qdev->doorbell_area) {
 		dev_err(&pdev->dev, "Doorbell register mapping failed.\n");
 		err = -ENOMEM;
-- 
2.35.1



^ permalink raw reply related

* Re: [PATCH net-next 1/3] ipv4: Don't reset ->flowi4_scope in ip_rt_fix_tos().
From: Guillaume Nault @ 2022-04-22 10:53 UTC (permalink / raw)
  To: David Ahern
  Cc: David Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Hideaki YOSHIFUJI, dccp
In-Reply-To: <0d4e27ee-385c-fd5d-bd31-51e9e2382667@kernel.org>

On Thu, Apr 21, 2022 at 08:30:52PM -0600, David Ahern wrote:
> On 4/20/22 5:21 PM, Guillaume Nault wrote:
> > All callers already initialise ->flowi4_scope with RT_SCOPE_UNIVERSE,
> > either by manual field assignment, memset(0) of the whole structure or
> > implicit structure initialisation of on-stack variables
> > (RT_SCOPE_UNIVERSE actually equals 0).
> > 
> > Therefore, we don't need to always initialise ->flowi4_scope in
> > ip_rt_fix_tos(). We only need to reduce the scope to RT_SCOPE_LINK when
> > the special RTO_ONLINK flag is present in the tos.
> > 
> > This will allow some code simplification, like removing
> > ip_rt_fix_tos(). Also, the long term idea is to remove RTO_ONLINK
> > entirely by properly initialising ->flowi4_scope, instead of
> > overloading ->flowi4_tos with a special flag. Eventually, this will
> > allow to convert ->flowi4_tos to dscp_t.
> > 
> > Signed-off-by: Guillaume Nault <gnault@redhat.com>
> > ---
> > It's important for the correctness of this patch that all callers
> > initialise ->flowi4_scope to 0 (in one way or another). Auditing all of
> > them is long, although each case is pretty trivial.
> > 
> > If it helps, I can send a patch series that converts implicit
> > initialisation of ->flowi4_scope with an explicit assignment to
> > RT_SCOPE_UNIVERSE. This would also have the advantage of making it
> > clear to future readers that ->flowi4_scope _has_ to be initialised. I
> > haven't sent such patch series to not overwhelm reviewers with trivial
> > and not technically-required changes (there are 40+ places to modify,
> > scattered over 30+ different files). But if anyone prefers explicit
> > initialisation everywhere, then just let me know and I'll send such
> > patches.
> 
> There are a handful of places that open code the initialization of the
> flow struct. I *think* I found all of them in 40867d74c374.

By open code, do you mean "doesn't use flowi4_init_output() nor
ip_tunnel_init_flow()"? If so, I think there are many more.

To be sure we're on the same page, here's a small part of the diff for
my "explicit flowi4_scope initialisation" patch series:

 drivers/infiniband/core/addr.c                          | 1 +
 drivers/infiniband/sw/rxe/rxe_net.c                     | 1 +
 drivers/net/amt.c                                       | 3 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c            | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c     | 3 +++
 drivers/net/ethernet/netronome/nfp/flower/action.c      | 1 +
 drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c | 7 +++++--
 drivers/net/geneve.c                                    | 9 +++++++--
 drivers/net/gtp.c                                       | 1 +
 drivers/net/ipvlan/ipvlan_core.c                        | 1 +
 drivers/net/vrf.c                                       | 1 +
 drivers/net/vxlan/vxlan_core.c                          | 1 +
 drivers/net/wireguard/socket.c                          | 1 +
 include/net/ip_tunnels.h                                | 1 +
 include/net/route.h                                     | 2 ++
 net/core/filter.c                                       | 1 +
 net/core/lwt_bpf.c                                      | 1 +
 net/dccp/ipv4.c                                         | 1 +
 net/ipv4/icmp.c                                         | 3 +++
 net/ipv4/netfilter.c                                    | 1 +
 net/ipv4/netfilter/nf_reject_ipv4.c                     | 1 +
 net/ipv4/route.c                                        | 1 +
 net/ipv4/xfrm4_policy.c                                 | 1 +
 net/netfilter/ipvs/ip_vs_xmit.c                         | 1 +
 net/netfilter/nf_conntrack_h323_main.c                  | 3 +++
 net/netfilter/nf_conntrack_sip.c                        | 1 +
 net/netfilter/nft_flow_offload.c                        | 1 +
 net/netfilter/nft_rt.c                                  | 1 +
 net/netfilter/xt_TCPMSS.c                               | 2 ++
 net/sctp/protocol.c                                     | 1 +
 net/smc/smc_ib.c                                        | 1 +
 net/tipc/udp_media.c                                    | 1 +
 net/xfrm/xfrm_policy.c                                  | 1 +
 33 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index f253295795f0..5b6e0003eead 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -399,6 +399,7 @@ static int addr4_resolve(struct sockaddr *src_sock,
        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = dst_ip;
        fl4.saddr = src_ip;
+       fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_oif = addr->bound_dev_if;
        rt = ip_route_output_key(addr->net, &fl4);
        ret = PTR_ERR_OR_ZERO(rt);
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index c53f4529f098..cf6dc89a8785 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -31,6 +31,7 @@ static struct dst_entry *rxe_find_route4(struct net_device *ndev,
        fl.flowi4_oif = ndev->ifindex;
        memcpy(&fl.saddr, saddr, sizeof(*saddr));
        memcpy(&fl.daddr, daddr, sizeof(*daddr));
+       fl.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl.flowi4_proto = IPPROTO_UDP;
 
        rt = ip_route_output_key(&init_net, &fl);
diff --git a/drivers/net/amt.c b/drivers/net/amt.c
index 10455c9b9da0..3e957ff64715 100644
--- a/drivers/net/amt.c
+++ b/drivers/net/amt.c
@@ -990,6 +990,7 @@ static bool amt_send_membership_update(struct amt_dev *amt,
        fl4.daddr              = amt->remote_ip;
        fl4.saddr              = amt->local_ip;
        fl4.flowi4_tos         = AMT_TOS;
+       fl4.flowi4_scope       = RT_SCOPE_UNIVERSE;
        fl4.flowi4_proto       = IPPROTO_UDP;
        rt = ip_route_output_key(amt->net, &fl4);
        if (IS_ERR(rt)) {
@@ -1048,6 +1049,7 @@ static void amt_send_multicast_data(struct amt_dev *amt,
        fl4.flowi4_oif         = amt->stream_dev->ifindex;
        fl4.daddr              = tunnel->ip4;
        fl4.saddr              = amt->local_ip;
+       fl4.flowi4_scope       = RT_SCOPE_UNIVERSE;
        fl4.flowi4_proto       = IPPROTO_UDP;
        rt = ip_route_output_key(amt->net, &fl4);
        if (IS_ERR(rt)) {
@@ -1103,6 +1105,7 @@ static bool amt_send_membership_query(struct amt_dev *amt,
        fl4.daddr              = tunnel->ip4;
        fl4.saddr              = amt->local_ip;
        fl4.flowi4_tos         = AMT_TOS;
+       fl4.flowi4_scope       = RT_SCOPE_UNIVERSE;
        fl4.flowi4_proto       = IPPROTO_UDP;
        rt = ip_route_output_key(amt->net, &fl4);
        if (IS_ERR(rt)) {
...

> > ---
> >  net/ipv4/route.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> > index e839d424b861..d8f82c0ac132 100644
> > --- a/net/ipv4/route.c
> > +++ b/net/ipv4/route.c
> > @@ -503,8 +503,8 @@ static void ip_rt_fix_tos(struct flowi4 *fl4)
> >  	__u8 tos = RT_FL_TOS(fl4);
> >  
> >  	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
> > -	fl4->flowi4_scope = tos & RTO_ONLINK ?
> > -			    RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
> > +	if (tos & RTO_ONLINK)
> > +		fl4->flowi4_scope = RT_SCOPE_LINK;
> >  }
> >  
> >  static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
> 
> Reviewed-by: David Ahern <dsahern@kernel.org>
> 


^ permalink raw reply related

* Re: [PATCH net-next 0/3] ipv4: First steps toward removing RTO_ONLINK
From: Guillaume Nault @ 2022-04-22 11:02 UTC (permalink / raw)
  To: David Ahern
  Cc: David Miller, Jakub Kicinski, Paolo Abeni, netdev,
	Hideaki YOSHIFUJI, dccp
In-Reply-To: <2ee8fb0d-aeb4-5010-bc8c-16cbd6e88eff@kernel.org>

On Thu, Apr 21, 2022 at 09:10:21PM -0600, David Ahern wrote:
> On 4/20/22 5:21 PM, Guillaume Nault wrote:
> > RTO_ONLINK is a flag that allows to reduce the scope of route lookups.
> > It's stored in a normally unused bit of the ->flowi4_tos field, in
> > struct flowi4. However it has several problems:
> > 
> >  * This bit is also used by ECN. Although ECN bits are supposed to be
> >    cleared before doing a route lookup, it happened that some code
> >    paths didn't properly sanitise their ->flowi4_tos. So this mechanism
> >    is fragile and we had bugs in the past where ECN bits slipped in and
> >    could end up being erroneously interpreted as RTO_ONLINK.
> > 
> >  * A dscp_t type was recently introduced to ensure ECN bits are cleared
> >    during route lookups. ->flowi4_tos is the most important structure
> >    field to convert, but RTO_ONLINK prevents such conversion, as dscp_t
> >    mandates that ECN bits (where RTO_ONLINK is stored) be zero.
> > 
> > Therefore we need to stop using RTO_ONLINK altogether. Fortunately
> > RTO_ONLINK isn't a necessity. Instead of passing a flag in ->flowi4_tos
> > to tell the route lookup function to restrict the scope, we can simply
> > initialise the scope correctly.
> > 
> 
> I believe the set looks ok. I think the fib test coverage in selftests
> could use more tests to cover tos.

Yes, this is on my todo list. I also plan to review existing tests that
cover route lookups with link scope, and extend them if necessary.

Thanks for the review.


^ permalink raw reply

* Re: [PATCH] cgroup: don't queue css_release_work if one already pending
From: Michal Koutný @ 2022-04-22 11:05 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Tadeusz Struk, cgroups, Zefan Li, Johannes Weiner,
	Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, netdev, bpf, stable, linux-kernel,
	syzbot+e42ae441c3b10acf9e9d
In-Reply-To: <YmHwOAdGY2Lwl+M3@slm.duckdns.org>

On Thu, Apr 21, 2022 at 02:00:56PM -1000, Tejun Heo <tj@kernel.org> wrote:
> If this is the case, we need to hold an extra reference to be put by the
> css_killed_work_fn(), right?

I looked into it a bit more lately and found that there already is such
a fuse in kill_css() [1].

At the same type syzbots stack trace demonstrates the fuse is
ineffective

> css_release+0xae/0xc0 kernel/cgroup/cgroup.c:5146                    (**)
> percpu_ref_put_many include/linux/percpu-refcount.h:322 [inline]
> percpu_ref_put include/linux/percpu-refcount.h:338 [inline]
> percpu_ref_call_confirm_rcu lib/percpu-refcount.c:162 [inline]        (*)
> percpu_ref_switch_to_atomic_rcu+0x5a2/0x5b0 lib/percpu-refcount.c:199
> rcu_do_batch+0x4f8/0xbc0 kernel/rcu/tree.c:2485
> rcu_core+0x59b/0xe30 kernel/rcu/tree.c:2722
> rcu_core_si+0x9/0x10 kernel/rcu/tree.c:2735
> __do_softirq+0x27e/0x596 kernel/softirq.c:305

(*) this calls css_killed_ref_fn confirm_switch
(**) zero references after confirmed kill?

So, I was also looking at the possible race with css_free_rwork_fn()
(from failed css_create()) but that would likely emit a warning from
__percpu_ref_exit().

So, I still think there's something fishy (so far possible only via
artificial ENOMEM injection) that needs an explanation...

Michal

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/cgroup/cgroup.c#n5608


^ permalink raw reply

* Re: [PATCH bpf-next v2 4/6] bpf, arm64: Impelment bpf_arch_text_poke() for arm64
From: Jakub Sitnicki @ 2022-04-22 10:54 UTC (permalink / raw)
  To: Xu Kuohai
  Cc: bpf, linux-arm-kernel, linux-kernel, netdev, linux-kselftest,
	Catalin Marinas, Will Deacon, Steven Rostedt, Ingo Molnar,
	Daniel Borkmann, Alexei Starovoitov, Zi Shen Lim, Andrii Nakryiko,
	Martin KaFai Lau, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, David S . Miller, Hideaki YOSHIFUJI, David Ahern,
	Thomas Gleixner, Borislav Petkov, Dave Hansen, x86, hpa,
	Shuah Khan, Mark Rutland, Ard Biesheuvel, Pasha Tatashin,
	Peter Collingbourne, Daniel Kiss, Sudeep Holla, Steven Price,
	Marc Zyngier, Mark Brown, Kumar Kartikeya Dwivedi,
	Delyan Kratunov, kernel-team
In-Reply-To: <20220414162220.1985095-5-xukuohai@huawei.com>

Hi Xu,

Thanks for working on this.

We are also looking forward to using fentry hooks on arm64.
In particular, attaching to entry/exit into/from XDP progs.

On Thu, Apr 14, 2022 at 12:22 PM -04, Xu Kuohai wrote:
> Impelment bpf_arch_text_poke() for arm64, so bpf trampoline code can use
> it to replace nop with jump, or replace jump with nop.
>
> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> Acked-by: Song Liu <songliubraving@fb.com>
> ---
>  arch/arm64/net/bpf_jit_comp.c | 52 +++++++++++++++++++++++++++++++++++
>  1 file changed, 52 insertions(+)
>
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 8ab4035dea27..1a1c3ea75ee2 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -9,6 +9,7 @@
>  
>  #include <linux/bitfield.h>
>  #include <linux/bpf.h>
> +#include <linux/memory.h>
>  #include <linux/filter.h>
>  #include <linux/printk.h>
>  #include <linux/slab.h>
> @@ -18,6 +19,7 @@
>  #include <asm/cacheflush.h>
>  #include <asm/debug-monitors.h>
>  #include <asm/insn.h>
> +#include <asm/patching.h>
>  #include <asm/set_memory.h>
>  
>  #include "bpf_jit.h"
> @@ -1529,3 +1531,53 @@ void bpf_jit_free_exec(void *addr)
>  {
>  	return vfree(addr);
>  }
> +
> +static int gen_branch_or_nop(enum aarch64_insn_branch_type type, void *ip,
> +			     void *addr, u32 *insn)
> +{
> +	if (!addr)
> +		*insn = aarch64_insn_gen_nop();
> +	else
> +		*insn = aarch64_insn_gen_branch_imm((unsigned long)ip,
> +						    (unsigned long)addr,
> +						    type);
> +
> +	return *insn != AARCH64_BREAK_FAULT ? 0 : -EFAULT;
> +}
> +
> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
> +		       void *old_addr, void *new_addr)
> +{
> +	int ret;
> +	u32 old_insn;
> +	u32 new_insn;
> +	u32 replaced;
> +	enum aarch64_insn_branch_type branch_type;
> +
> +	if (poke_type == BPF_MOD_CALL)
> +		branch_type = AARCH64_INSN_BRANCH_LINK;

This path, bpf_arch_text_poke(<ip>, BPF_MOD_CALL, ...), is what we hit
when attaching a BPF program entry. It is exercised by selftest #232
xdp_bpf2bpf.

However, with this patchset alone it will not work because we don't
emit, yet, the ftrace patch (MOV X9, LR; NOP) as a part of BPF prog
prologue, like ftrace_init_nop() does. So patching attempt will fail.

I think that is what you mentioned to in your reply to Hou [1]

So my question is - is support for attaching to BPF progs in scope for
this patchset?

If no, then perhaps it would be better for now to fail early with
something like -EOPNOTSUPP when poke_type is BPF_MOD_CALL, rather then
attempt to patch the code.

If you plan to enable it as a part of this patchset, then I've given it
a quick try, and it seems that not a lot is needed get fentry to BPF
attachment to work.

I'm including the diff for my quick and dirty attempt below. With that
patch on top, the xdp_bpf2bpf tests pass:

#232 xdp_bpf2bpf:OK

[1] https://lore.kernel.org/bpf/d8c4f1fb-a020-9457-44e2-dc63982a9213@huawei.com/

> +	else
> +		branch_type = AARCH64_INSN_BRANCH_NOLINK;
> +
> +	if (gen_branch_or_nop(branch_type, ip, old_addr, &old_insn) < 0)
> +		return -EFAULT;
> +
> +	if (gen_branch_or_nop(branch_type, ip, new_addr, &new_insn) < 0)
> +		return -EFAULT;
> +
> +	mutex_lock(&text_mutex);
> +	if (aarch64_insn_read(ip, &replaced)) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	if (replaced != old_insn) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	ret =  aarch64_insn_patch_text_nosync((void *)ip, new_insn);
> +out:
> +	mutex_unlock(&text_mutex);

The body of this critical section is identical as ftrace_modify_code().
Perhaps we could export it and reuse?

> +	return ret;
> +}

---
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 5f6bd755050f..94d8251500ab 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -240,9 +240,9 @@ static bool is_lsi_offset(int offset, int scale)
 /* Tail call offset to jump into */
 #if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) || \
 	IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)
-#define PROLOGUE_OFFSET 9
+#define PROLOGUE_OFFSET 11
 #else
-#define PROLOGUE_OFFSET 8
+#define PROLOGUE_OFFSET 10
 #endif
 
 static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
@@ -281,6 +281,10 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
 	 *
 	 */
 
+	/* Set up ftrace patch (initially in disabled state) */
+	emit(A64_MOV(1, A64_R(9), A64_LR), ctx);
+	emit(A64_NOP, ctx);
+
 	/* Sign lr */
 	if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL))
 		emit(A64_PACIASP, ctx);
@@ -1888,10 +1892,16 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
 	u32 replaced;
 	enum aarch64_insn_branch_type branch_type;
 
-	if (poke_type == BPF_MOD_CALL)
+	if (poke_type == BPF_MOD_CALL) {
 		branch_type = AARCH64_INSN_BRANCH_LINK;
-	else
+		/*
+		 * Adjust addr to point at the BL in the callsite.
+		 * See ftrace_init_nop() for the callsite sequence.
+		 */
+		ip = (void *)((unsigned long)ip + AARCH64_INSN_SIZE);
+	} else {
 		branch_type = AARCH64_INSN_BRANCH_NOLINK;
+	}
 
 	if (gen_branch_or_nop(branch_type, ip, old_addr, &old_insn) < 0)
 		return -EFAULT;

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox