Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCHv6 iproute2-next 3/5] lib: add libbpf support
From: Hangbin Liu @ 2020-11-23 13:11 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern
  Cc: Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	David Miller, netdev, bpf, Jiri Benc,
	Toke Høiland-Jørgensen, Jesper Dangaard Brouer,
	Alexei Starovoitov, Hangbin Liu
In-Reply-To: <20201123131201.4108483-1-haliu@redhat.com>

This patch converts iproute2 to use libbpf for loading and attaching
BPF programs when it is available, which is started by Toke's
implementation[1]. With libbpf iproute2 could correctly process BTF
information and support the new-style BTF-defined maps, while keeping
compatibility with the old internal map definition syntax.

The old iproute2 bpf code is kept and will be used if no suitable libbpf
is available. When using libbpf, wrapper code in bpf_legacy.c ensures that
iproute2 will still understand the old map definition format, including
populating map-in-map and tail call maps before load.

In bpf_libbpf.c, we init iproute2 ctx and elf info first to check the
legacy bytes. When handling the legacy maps, for map-in-maps, we create
them manually and re-use the fd as they are associated with id/inner_id.
For pin maps, we only set the pin path and let libbp load to handle it.
For tail calls, we find it first and update the element after prog load.

Other maps/progs will be loaded by libbpf directly.

[1] https://lore.kernel.org/bpf/20190820114706.18546-1-toke@redhat.com/

Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Hangbin Liu <haliu@redhat.com>
---
v6: also make bpf_libbpf.c build depend on HAVE_ELF

v5: no update

v4:
Move ipvrf code to patch 02
Move HAVE_LIBBPF inside HAVE_ELF definition as libbpf depends on elf.

v3:
Add a new function get_bpf_program__section_name() to choose whether
use bpf_program__title() or not.

v2:
Remove self defined IS_ERR_OR_NULL and use libbpf_get_error() instead.
Add ipvrf with libbpf support.
---
 include/bpf_util.h |  17 +++
 lib/Makefile       |   6 +
 lib/bpf_legacy.c   | 178 +++++++++++++++++++++++
 lib/bpf_libbpf.c   | 348 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 549 insertions(+)
 create mode 100644 lib/bpf_libbpf.c

diff --git a/include/bpf_util.h b/include/bpf_util.h
index 3235c34e..53acc410 100644
--- a/include/bpf_util.h
+++ b/include/bpf_util.h
@@ -291,6 +291,16 @@ int bpf_dump_prog_info(FILE *f, uint32_t id);
 int bpf_send_map_fds(const char *path, const char *obj);
 int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux,
 		     unsigned int entries);
+#ifdef HAVE_LIBBPF
+int iproute2_bpf_elf_ctx_init(struct bpf_cfg_in *cfg);
+int iproute2_bpf_fetch_ancillary(void);
+int iproute2_get_root_path(char *root_path, size_t len);
+bool iproute2_is_pin_map(const char *libbpf_map_name, char *pathname);
+bool iproute2_is_map_in_map(const char *libbpf_map_name, struct bpf_elf_map *imap,
+			    struct bpf_elf_map *omap, char *omap_name);
+int iproute2_find_map_name_by_id(unsigned int map_id, char *name);
+int iproute2_load_libbpf(struct bpf_cfg_in *cfg);
+#endif /* HAVE_LIBBPF */
 #else
 static inline int bpf_send_map_fds(const char *path, const char *obj)
 {
@@ -303,6 +313,13 @@ static inline int bpf_recv_map_fds(const char *path, int *fds,
 {
 	return -1;
 }
+#ifdef HAVE_LIBBPF
+static inline int iproute2_load_libbpf(struct bpf_cfg_in *cfg)
+{
+	fprintf(stderr, "No ELF library support compiled in.\n");
+	return -1;
+}
+#endif /* HAVE_LIBBPF */
 #endif /* HAVE_ELF */
 
 const char *get_libbpf_version(void);
diff --git a/lib/Makefile b/lib/Makefile
index 7c8a197c..e37585c6 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -7,6 +7,12 @@ UTILOBJ = utils.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \
 	inet_proto.o namespace.o json_writer.o json_print.o \
 	names.o color.o bpf_legacy.o bpf_glue.o exec.o fs.o cg_map.o
 
+ifeq ($(HAVE_ELF),y)
+ifeq ($(HAVE_LIBBPF),y)
+UTILOBJ += bpf_libbpf.o
+endif
+endif
+
 NLOBJ=libgenl.o libnetlink.o mnl_utils.o
 
 all: libnetlink.a libutil.a
diff --git a/lib/bpf_legacy.c b/lib/bpf_legacy.c
index 4246fb76..bc869c3f 100644
--- a/lib/bpf_legacy.c
+++ b/lib/bpf_legacy.c
@@ -940,6 +940,9 @@ static int bpf_do_parse(struct bpf_cfg_in *cfg, const bool *opt_tbl)
 static int bpf_do_load(struct bpf_cfg_in *cfg)
 {
 	if (cfg->mode == EBPF_OBJECT) {
+#ifdef HAVE_LIBBPF
+		return iproute2_load_libbpf(cfg);
+#endif
 		cfg->prog_fd = bpf_obj_open(cfg->object, cfg->type,
 					    cfg->section, cfg->ifindex,
 					    cfg->verbose);
@@ -3155,4 +3158,179 @@ int bpf_recv_map_fds(const char *path, int *fds, struct bpf_map_aux *aux,
 	close(fd);
 	return ret;
 }
+
+#ifdef HAVE_LIBBPF
+/* The following functions are wrapper functions for libbpf code to be
+ * compatible with the legacy format. So all the functions have prefix
+ * with iproute2_
+ */
+int iproute2_bpf_elf_ctx_init(struct bpf_cfg_in *cfg)
+{
+	struct bpf_elf_ctx *ctx = &__ctx;
+
+	return bpf_elf_ctx_init(ctx, cfg->object, cfg->type, cfg->ifindex, cfg->verbose);
+}
+
+int iproute2_bpf_fetch_ancillary(void)
+{
+	struct bpf_elf_ctx *ctx = &__ctx;
+	struct bpf_elf_sec_data data;
+	int i, ret = 0;
+
+	for (i = 1; i < ctx->elf_hdr.e_shnum; i++) {
+		ret = bpf_fill_section_data(ctx, i, &data);
+		if (ret < 0)
+			continue;
+
+		if (data.sec_hdr.sh_type == SHT_PROGBITS &&
+		    !strcmp(data.sec_name, ELF_SECTION_MAPS))
+			ret = bpf_fetch_maps_begin(ctx, i, &data);
+		else if (data.sec_hdr.sh_type == SHT_SYMTAB &&
+			 !strcmp(data.sec_name, ".symtab"))
+			ret = bpf_fetch_symtab(ctx, i, &data);
+		else if (data.sec_hdr.sh_type == SHT_STRTAB &&
+			 !strcmp(data.sec_name, ".strtab"))
+			ret = bpf_fetch_strtab(ctx, i, &data);
+		if (ret < 0) {
+			fprintf(stderr, "Error parsing section %d! Perhaps check with readelf -a?\n",
+				i);
+			return ret;
+		}
+	}
+
+	if (bpf_has_map_data(ctx)) {
+		ret = bpf_fetch_maps_end(ctx);
+		if (ret < 0) {
+			fprintf(stderr, "Error fixing up map structure, incompatible struct bpf_elf_map used?\n");
+			return ret;
+		}
+	}
+
+	return ret;
+}
+
+int iproute2_get_root_path(char *root_path, size_t len)
+{
+	struct bpf_elf_ctx *ctx = &__ctx;
+	int ret = 0;
+
+	snprintf(root_path, len, "%s/%s",
+		 bpf_get_work_dir(ctx->type), BPF_DIR_GLOBALS);
+
+	ret = mkdir(root_path, S_IRWXU);
+	if (ret && errno != EEXIST) {
+		fprintf(stderr, "mkdir %s failed: %s\n", root_path, strerror(errno));
+		return ret;
+	}
+
+	return 0;
+}
+
+bool iproute2_is_pin_map(const char *libbpf_map_name, char *pathname)
+{
+	struct bpf_elf_ctx *ctx = &__ctx;
+	const char *map_name, *tmp;
+	unsigned int pinning;
+	int i, ret = 0;
+
+	for (i = 0; i < ctx->map_num; i++) {
+		if (ctx->maps[i].pinning == PIN_OBJECT_NS &&
+		    ctx->noafalg) {
+			fprintf(stderr, "Missing kernel AF_ALG support for PIN_OBJECT_NS!\n");
+			return false;
+		}
+
+		map_name = bpf_map_fetch_name(ctx, i);
+		if (!map_name) {
+			return false;
+		}
+
+		if (strcmp(libbpf_map_name, map_name))
+			continue;
+
+		pinning = ctx->maps[i].pinning;
+
+		if (bpf_no_pinning(ctx, pinning) || !bpf_get_work_dir(ctx->type))
+			return false;
+
+		if (pinning == PIN_OBJECT_NS)
+			ret = bpf_make_obj_path(ctx);
+		else if ((tmp = bpf_custom_pinning(ctx, pinning)))
+			ret = bpf_make_custom_path(ctx, tmp);
+		if (ret < 0)
+			return false;
+
+		bpf_make_pathname(pathname, PATH_MAX, map_name, ctx, pinning);
+
+		return true;
+	}
+
+	return false;
+}
+
+bool iproute2_is_map_in_map(const char *libbpf_map_name, struct bpf_elf_map *imap,
+			    struct bpf_elf_map *omap, char *omap_name)
+{
+	struct bpf_elf_ctx *ctx = &__ctx;
+	const char *inner_map_name, *outer_map_name;
+	int i, j;
+
+	for (i = 0; i < ctx->map_num; i++) {
+		inner_map_name = bpf_map_fetch_name(ctx, i);
+		if (!inner_map_name) {
+			return false;
+		}
+
+		if (strcmp(libbpf_map_name, inner_map_name))
+			continue;
+
+		if (!ctx->maps[i].id ||
+		    ctx->maps[i].inner_id ||
+		    ctx->maps[i].inner_idx == -1)
+			continue;
+
+		*imap = ctx->maps[i];
+
+		for (j = 0; j < ctx->map_num; j++) {
+			if (!bpf_is_map_in_map_type(&ctx->maps[j]))
+				continue;
+			if (ctx->maps[j].inner_id != ctx->maps[i].id)
+				continue;
+
+			*omap = ctx->maps[j];
+			outer_map_name = bpf_map_fetch_name(ctx, j);
+			memcpy(omap_name, outer_map_name, strlen(outer_map_name) + 1);
+
+			return true;
+		}
+	}
+
+	return false;
+}
+
+int iproute2_find_map_name_by_id(unsigned int map_id, char *name)
+{
+	struct bpf_elf_ctx *ctx = &__ctx;
+	const char *map_name;
+	int i, idx = -1;
+
+	for (i = 0; i < ctx->map_num; i++) {
+		if (ctx->maps[i].id == map_id &&
+		    ctx->maps[i].type == BPF_MAP_TYPE_PROG_ARRAY) {
+			idx = i;
+			break;
+		}
+	}
+
+	if (idx < 0)
+		return -1;
+
+	map_name = bpf_map_fetch_name(ctx, idx);
+	if (!map_name)
+		return -1;
+
+	memcpy(name, map_name, strlen(map_name) + 1);
+	return 0;
+}
+#endif /* HAVE_LIBBPF */
 #endif /* HAVE_ELF */
diff --git a/lib/bpf_libbpf.c b/lib/bpf_libbpf.c
new file mode 100644
index 00000000..d05737a4
--- /dev/null
+++ b/lib/bpf_libbpf.c
@@ -0,0 +1,348 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * bpf_libbpf.c		BPF code relay on libbpf
+ * Authors:		Hangbin Liu <haliu@redhat.com>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <libelf.h>
+#include <gelf.h>
+
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+
+#include "bpf_util.h"
+
+static int verbose_print(enum libbpf_print_level level, const char *format, va_list args)
+{
+	return vfprintf(stderr, format, args);
+}
+
+static int silent_print(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level > LIBBPF_WARN)
+		return 0;
+
+	/* Skip warning from bpf_object__init_user_maps() for legacy maps */
+	if (strstr(format, "has unrecognized, non-zero options"))
+		return 0;
+
+	return vfprintf(stderr, format, args);
+}
+
+static const char *get_bpf_program__section_name(const struct bpf_program *prog)
+{
+#ifdef HAVE_LIBBPF_SECTION_NAME
+	return bpf_program__section_name(prog);
+#else
+	return bpf_program__title(prog, false);
+#endif
+}
+
+static int create_map(const char *name, struct bpf_elf_map *map,
+		      __u32 ifindex, int inner_fd)
+{
+	struct bpf_create_map_attr map_attr = {};
+
+	map_attr.name = name;
+	map_attr.map_type = map->type;
+	map_attr.map_flags = map->flags;
+	map_attr.key_size = map->size_key;
+	map_attr.value_size = map->size_value;
+	map_attr.max_entries = map->max_elem;
+	map_attr.map_ifindex = ifindex;
+	map_attr.inner_map_fd = inner_fd;
+
+	return bpf_create_map_xattr(&map_attr);
+}
+
+static int create_map_in_map(struct bpf_object *obj, struct bpf_map *map,
+			     struct bpf_elf_map *elf_map, int inner_fd,
+			     bool *reuse_pin_map)
+{
+	char pathname[PATH_MAX];
+	const char *map_name;
+	bool pin_map = false;
+	int map_fd, ret = 0;
+
+	map_name = bpf_map__name(map);
+
+	if (iproute2_is_pin_map(map_name, pathname)) {
+		pin_map = true;
+
+		/* Check if there already has a pinned map */
+		map_fd = bpf_obj_get(pathname);
+		if (map_fd > 0) {
+			if (reuse_pin_map)
+				*reuse_pin_map = true;
+			close(map_fd);
+			return bpf_map__set_pin_path(map, pathname);
+		}
+	}
+
+	map_fd = create_map(map_name, elf_map, bpf_map__ifindex(map), inner_fd);
+	if (map_fd < 0) {
+		fprintf(stderr, "create map %s failed\n", map_name);
+		return map_fd;
+	}
+
+	ret = bpf_map__reuse_fd(map, map_fd);
+	if (ret < 0) {
+		fprintf(stderr, "map %s reuse fd failed\n", map_name);
+		goto err_out;
+	}
+
+	if (pin_map) {
+		ret = bpf_map__set_pin_path(map, pathname);
+		if (ret < 0)
+			goto err_out;
+	}
+
+	return 0;
+err_out:
+	close(map_fd);
+	return ret;
+}
+
+static int
+handle_legacy_map_in_map(struct bpf_object *obj, struct bpf_map *inner_map,
+			 const char *inner_map_name)
+{
+	int inner_fd, outer_fd, inner_idx, ret = 0;
+	struct bpf_elf_map imap, omap;
+	struct bpf_map *outer_map;
+	/* What's the size limit of map name? */
+	char outer_map_name[128];
+	bool reuse_pin_map = false;
+
+	/* Deal with map-in-map */
+	if (iproute2_is_map_in_map(inner_map_name, &imap, &omap, outer_map_name)) {
+		ret = create_map_in_map(obj, inner_map, &imap, -1, NULL);
+		if (ret < 0)
+			return ret;
+
+		inner_fd = bpf_map__fd(inner_map);
+		outer_map = bpf_object__find_map_by_name(obj, outer_map_name);
+		ret = create_map_in_map(obj, outer_map, &omap, inner_fd, &reuse_pin_map);
+		if (ret < 0)
+			return ret;
+
+		if (!reuse_pin_map) {
+			inner_idx = imap.inner_idx;
+			outer_fd = bpf_map__fd(outer_map);
+			ret = bpf_map_update_elem(outer_fd, &inner_idx, &inner_fd, 0);
+			if (ret < 0)
+				fprintf(stderr, "Cannot update inner_idx into outer_map\n");
+		}
+	}
+
+	return ret;
+}
+
+static int find_legacy_tail_calls(struct bpf_program *prog, struct bpf_object *obj)
+{
+	unsigned int map_id, key_id;
+	const char *sec_name;
+	struct bpf_map *map;
+	char map_name[128];
+	int ret;
+
+	/* Handle iproute2 tail call */
+	sec_name = get_bpf_program__section_name(prog);
+	ret = sscanf(sec_name, "%i/%i", &map_id, &key_id);
+	if (ret != 2)
+		return -1;
+
+	ret = iproute2_find_map_name_by_id(map_id, map_name);
+	if (ret < 0) {
+		fprintf(stderr, "unable to find map id %u for tail call\n", map_id);
+		return ret;
+	}
+
+	map = bpf_object__find_map_by_name(obj, map_name);
+	if (!map)
+		return -1;
+
+	/* Save the map here for later updating */
+	bpf_program__set_priv(prog, map, NULL);
+
+	return 0;
+}
+
+static int update_legacy_tail_call_maps(struct bpf_object *obj)
+{
+	int prog_fd, map_fd, ret = 0;
+	unsigned int map_id, key_id;
+	struct bpf_program *prog;
+	const char *sec_name;
+	struct bpf_map *map;
+
+	bpf_object__for_each_program(prog, obj) {
+		map = bpf_program__priv(prog);
+		if (!map)
+			continue;
+
+		prog_fd = bpf_program__fd(prog);
+		if (prog_fd < 0)
+			continue;
+
+		sec_name = get_bpf_program__section_name(prog);
+		ret = sscanf(sec_name, "%i/%i", &map_id, &key_id);
+		if (ret != 2)
+			continue;
+
+		map_fd = bpf_map__fd(map);
+		ret = bpf_map_update_elem(map_fd, &key_id, &prog_fd, 0);
+		if (ret < 0) {
+			fprintf(stderr, "Cannot update map key for tail call!\n");
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int handle_legacy_maps(struct bpf_object *obj)
+{
+	char pathname[PATH_MAX];
+	struct bpf_map *map;
+	const char *map_name;
+	int map_fd, ret = 0;
+
+	bpf_object__for_each_map(map, obj) {
+		map_name = bpf_map__name(map);
+
+		ret = handle_legacy_map_in_map(obj, map, map_name);
+		if (ret)
+			return ret;
+
+		/* If it is a iproute2 legacy pin maps, just set pin path
+		 * and let bpf_object__load() to deal with the map creation.
+		 * We need to ignore map-in-maps which have pinned maps manually
+		 */
+		map_fd = bpf_map__fd(map);
+		if (map_fd < 0 && iproute2_is_pin_map(map_name, pathname)) {
+			ret = bpf_map__set_pin_path(map, pathname);
+			if (ret) {
+				fprintf(stderr, "map '%s': couldn't set pin path.\n", map_name);
+				break;
+			}
+		}
+
+	}
+
+	return ret;
+}
+
+static int load_bpf_object(struct bpf_cfg_in *cfg)
+{
+	struct bpf_program *p, *prog = NULL;
+	struct bpf_object *obj;
+	char root_path[PATH_MAX];
+	struct bpf_map *map;
+	int prog_fd, ret = 0;
+
+	ret = iproute2_get_root_path(root_path, PATH_MAX);
+	if (ret)
+		return ret;
+
+	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts,
+			.relaxed_maps = true,
+			.pin_root_path = root_path,
+	);
+
+	obj = bpf_object__open_file(cfg->object, &open_opts);
+	if (libbpf_get_error(obj)) {
+		fprintf(stderr, "ERROR: opening BPF object file failed\n");
+		return -ENOENT;
+	}
+
+	bpf_object__for_each_program(p, obj) {
+		/* Only load the programs that will either be subsequently
+		 * attached or inserted into a tail call map */
+		if (find_legacy_tail_calls(p, obj) < 0 && cfg->section &&
+		    strcmp(get_bpf_program__section_name(p), cfg->section)) {
+			ret = bpf_program__set_autoload(p, false);
+			if (ret)
+				return -EINVAL;
+			continue;
+		}
+
+		bpf_program__set_type(p, cfg->type);
+		bpf_program__set_ifindex(p, cfg->ifindex);
+		if (!prog)
+			prog = p;
+	}
+
+	bpf_object__for_each_map(map, obj) {
+		if (!bpf_map__is_offload_neutral(map))
+			bpf_map__set_ifindex(map, cfg->ifindex);
+	}
+
+	if (!prog) {
+		fprintf(stderr, "object file doesn't contain sec %s\n", cfg->section);
+		return -ENOENT;
+	}
+
+	/* Handle iproute2 legacy pin maps and map-in-maps */
+	ret = handle_legacy_maps(obj);
+	if (ret)
+		goto unload_obj;
+
+	ret = bpf_object__load(obj);
+	if (ret)
+		goto unload_obj;
+
+	ret = update_legacy_tail_call_maps(obj);
+	if (ret)
+		goto unload_obj;
+
+	prog_fd = fcntl(bpf_program__fd(prog), F_DUPFD_CLOEXEC, 1);
+	if (prog_fd < 0)
+		ret = -errno;
+	else
+		cfg->prog_fd = prog_fd;
+
+unload_obj:
+	/* Close obj as we don't need it */
+	bpf_object__close(obj);
+	return ret;
+}
+
+/* Load ebpf and return prog fd */
+int iproute2_load_libbpf(struct bpf_cfg_in *cfg)
+{
+	int ret = 0;
+
+	if (cfg->verbose)
+		libbpf_set_print(verbose_print);
+	else
+		libbpf_set_print(silent_print);
+
+	ret = iproute2_bpf_elf_ctx_init(cfg);
+	if (ret < 0) {
+		fprintf(stderr, "Cannot initialize ELF context!\n");
+		return ret;
+	}
+
+	ret = iproute2_bpf_fetch_ancillary();
+	if (ret < 0) {
+		fprintf(stderr, "Error fetching ELF ancillary data!\n");
+		return ret;
+	}
+
+	ret = load_bpf_object(cfg);
+	if (ret)
+		return ret;
+
+	return cfg->prog_fd;
+}
-- 
2.25.4


^ permalink raw reply related

* [PATCHv6 iproute2-next 5/5] examples/bpf: add bpf examples with BTF defined maps
From: Hangbin Liu @ 2020-11-23 13:12 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern
  Cc: Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	David Miller, netdev, bpf, Jiri Benc,
	Toke Høiland-Jørgensen, Jesper Dangaard Brouer,
	Alexei Starovoitov, Hangbin Liu
In-Reply-To: <20201123131201.4108483-1-haliu@redhat.com>

Users should try use the new BTF defined maps instead of struct
bpf_elf_map defined maps. The tail call examples are not added yet
as libbpf doesn't currently support declaratively populating tail call
maps.

Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Hangbin Liu <haliu@redhat.com>
---
 examples/bpf/README           |  6 ++++
 examples/bpf/bpf_graft.c      | 66 +++++++++++++++++++++++++++++++++++
 examples/bpf/bpf_map_in_map.c | 55 +++++++++++++++++++++++++++++
 examples/bpf/bpf_shared.c     | 53 ++++++++++++++++++++++++++++
 include/bpf_api.h             | 13 +++++++
 5 files changed, 193 insertions(+)
 create mode 100644 examples/bpf/bpf_graft.c
 create mode 100644 examples/bpf/bpf_map_in_map.c
 create mode 100644 examples/bpf/bpf_shared.c

diff --git a/examples/bpf/README b/examples/bpf/README
index 732bcc83..b7261191 100644
--- a/examples/bpf/README
+++ b/examples/bpf/README
@@ -1,6 +1,12 @@
 eBPF toy code examples (running in kernel) to familiarize yourself
 with syntax and features:
 
+- BTF defined map examples
+ - bpf_graft.c		-> Demo on altering runtime behaviour
+ - bpf_shared.c 	-> Ingress/egress map sharing example
+ - bpf_map_in_map.c	-> Using map in map example
+
+- legacy struct bpf_elf_map defined map examples
  - legacy/bpf_shared.c		-> Ingress/egress map sharing example
  - legacy/bpf_tailcall.c	-> Using tail call chains
  - legacy/bpf_cyclic.c		-> Simple cycle as tail calls
diff --git a/examples/bpf/bpf_graft.c b/examples/bpf/bpf_graft.c
new file mode 100644
index 00000000..8066dcce
--- /dev/null
+++ b/examples/bpf/bpf_graft.c
@@ -0,0 +1,66 @@
+#include "../../include/bpf_api.h"
+
+/* This example demonstrates how classifier run-time behaviour
+ * can be altered with tail calls. We start out with an empty
+ * jmp_tc array, then add section aaa to the array slot 0, and
+ * later on atomically replace it with section bbb. Note that
+ * as shown in other examples, the tc loader can prepopulate
+ * tail called sections, here we start out with an empty one
+ * on purpose to show it can also be done this way.
+ *
+ * tc filter add dev foo parent ffff: bpf obj graft.o
+ * tc exec bpf dbg
+ *   [...]
+ *   Socket Thread-20229 [001] ..s. 138993.003923: : fallthrough
+ *   <idle>-0            [001] ..s. 138993.202265: : fallthrough
+ *   Socket Thread-20229 [001] ..s. 138994.004149: : fallthrough
+ *   [...]
+ *
+ * tc exec bpf graft m:globals/jmp_tc key 0 obj graft.o sec aaa
+ * tc exec bpf dbg
+ *   [...]
+ *   Socket Thread-19818 [002] ..s. 139012.053587: : aaa
+ *   <idle>-0            [002] ..s. 139012.172359: : aaa
+ *   Socket Thread-19818 [001] ..s. 139012.173556: : aaa
+ *   [...]
+ *
+ * tc exec bpf graft m:globals/jmp_tc key 0 obj graft.o sec bbb
+ * tc exec bpf dbg
+ *   [...]
+ *   Socket Thread-19818 [002] ..s. 139022.102967: : bbb
+ *   <idle>-0            [002] ..s. 139022.155640: : bbb
+ *   Socket Thread-19818 [001] ..s. 139022.156730: : bbb
+ *   [...]
+ */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+	__uint(key_size, sizeof(uint32_t));
+	__uint(value_size, sizeof(uint32_t));
+	__uint(max_entries, 1);
+	__uint(pinning, LIBBPF_PIN_BY_NAME);
+} jmp_tc __section(".maps");
+
+__section("aaa")
+int cls_aaa(struct __sk_buff *skb)
+{
+	printt("aaa\n");
+	return TC_H_MAKE(1, 42);
+}
+
+__section("bbb")
+int cls_bbb(struct __sk_buff *skb)
+{
+	printt("bbb\n");
+	return TC_H_MAKE(1, 43);
+}
+
+__section_cls_entry
+int cls_entry(struct __sk_buff *skb)
+{
+	tail_call(skb, &jmp_tc, 0);
+	printt("fallthrough\n");
+	return BPF_H_DEFAULT;
+}
+
+BPF_LICENSE("GPL");
diff --git a/examples/bpf/bpf_map_in_map.c b/examples/bpf/bpf_map_in_map.c
new file mode 100644
index 00000000..39c86268
--- /dev/null
+++ b/examples/bpf/bpf_map_in_map.c
@@ -0,0 +1,55 @@
+#include "../../include/bpf_api.h"
+
+struct inner_map {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(key_size, sizeof(uint32_t));
+	__uint(value_size, sizeof(uint32_t));
+	__uint(max_entries, 1);
+} map_inner __section(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+	__uint(key_size, sizeof(uint32_t));
+	__uint(value_size, sizeof(uint32_t));
+	__uint(max_entries, 1);
+	__uint(pinning, LIBBPF_PIN_BY_NAME);
+	__array(values, struct inner_map);
+} map_outer __section(".maps") = {
+	.values = {
+		[0] = &map_inner,
+	},
+};
+
+__section("egress")
+int emain(struct __sk_buff *skb)
+{
+	struct bpf_elf_map *map_inner;
+	int key = 0, *val;
+
+	map_inner = map_lookup_elem(&map_outer, &key);
+	if (map_inner) {
+		val = map_lookup_elem(map_inner, &key);
+		if (val)
+			lock_xadd(val, 1);
+	}
+
+	return BPF_H_DEFAULT;
+}
+
+__section("ingress")
+int imain(struct __sk_buff *skb)
+{
+	struct bpf_elf_map *map_inner;
+	int key = 0, *val;
+
+	map_inner = map_lookup_elem(&map_outer, &key);
+	if (map_inner) {
+		val = map_lookup_elem(map_inner, &key);
+		if (val)
+			printt("map val: %d\n", *val);
+	}
+
+	return BPF_H_DEFAULT;
+}
+
+BPF_LICENSE("GPL");
diff --git a/examples/bpf/bpf_shared.c b/examples/bpf/bpf_shared.c
new file mode 100644
index 00000000..99a332f4
--- /dev/null
+++ b/examples/bpf/bpf_shared.c
@@ -0,0 +1,53 @@
+#include "../../include/bpf_api.h"
+
+/* Minimal, stand-alone toy map pinning example:
+ *
+ * clang -target bpf -O2 [...] -o bpf_shared.o -c bpf_shared.c
+ * tc filter add dev foo parent 1: bpf obj bpf_shared.o sec egress
+ * tc filter add dev foo parent ffff: bpf obj bpf_shared.o sec ingress
+ *
+ * Both classifier will share the very same map instance in this example,
+ * so map content can be accessed from ingress *and* egress side!
+ *
+ * This example has a pinning of PIN_OBJECT_NS, so it's private and
+ * thus shared among various program sections within the object.
+ *
+ * A setting of PIN_GLOBAL_NS would place it into a global namespace,
+ * so that it can be shared among different object files. A setting
+ * of PIN_NONE (= 0) means no sharing, so each tc invocation a new map
+ * instance is being created.
+ */
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(key_size, sizeof(uint32_t));
+	__uint(value_size, sizeof(uint32_t));
+	__uint(max_entries, 1);
+	__uint(pinning, LIBBPF_PIN_BY_NAME);	/* or LIBBPF_PIN_NONE */
+} map_sh __section(".maps");
+
+__section("egress")
+int emain(struct __sk_buff *skb)
+{
+	int key = 0, *val;
+
+	val = map_lookup_elem(&map_sh, &key);
+	if (val)
+		lock_xadd(val, 1);
+
+	return BPF_H_DEFAULT;
+}
+
+__section("ingress")
+int imain(struct __sk_buff *skb)
+{
+	int key = 0, *val;
+
+	val = map_lookup_elem(&map_sh, &key);
+	if (val)
+		printt("map val: %d\n", *val);
+
+	return BPF_H_DEFAULT;
+}
+
+BPF_LICENSE("GPL");
diff --git a/include/bpf_api.h b/include/bpf_api.h
index 89d3488d..82c47089 100644
--- a/include/bpf_api.h
+++ b/include/bpf_api.h
@@ -19,6 +19,19 @@
 
 #include "bpf_elf.h"
 
+/** libbpf pin type. */
+enum libbpf_pin_type {
+	LIBBPF_PIN_NONE,
+	/* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */
+	LIBBPF_PIN_BY_NAME,
+};
+
+/** Type helper macros. */
+
+#define __uint(name, val) int (*name)[val]
+#define __type(name, val) typeof(val) *name
+#define __array(name, val) typeof(val) *name[]
+
 /** Misc macros. */
 
 #ifndef __stringify
-- 
2.25.4


^ permalink raw reply related

* [PATCHv6 iproute2-next 2/5] lib: make ipvrf able to use libbpf and fix function name conflicts
From: Hangbin Liu @ 2020-11-23 13:11 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern
  Cc: Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	David Miller, netdev, bpf, Jiri Benc,
	Toke Høiland-Jørgensen, Jesper Dangaard Brouer,
	Alexei Starovoitov, Hangbin Liu
In-Reply-To: <20201123131201.4108483-1-haliu@redhat.com>

There are directly calls in libbpf for bpf program load/attach.
So we could just use two wrapper functions for ipvrf and convert
them with libbpf support.

Function bpf_prog_load() is removed as it's conflict with libbpf
function name.

bpf.c is moved to bpf_legacy.c for later main libbpf support in
iproute2.

Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Hangbin Liu <haliu@redhat.com>
---
v6: bpf_glue.c is created in previous patch. So I changed the commit
    description.
v5: Fix bpf_prog_load_dev typo.
v4: Add new file bpf_glue.c
v2-v3: no update
---
 include/bpf_util.h          | 10 +++++++---
 ip/ipvrf.c                  |  6 +++---
 lib/Makefile                |  2 +-
 lib/bpf_glue.c              | 23 +++++++++++++++++++++++
 lib/{bpf.c => bpf_legacy.c} | 15 +++------------
 5 files changed, 37 insertions(+), 19 deletions(-)
 rename lib/{bpf.c => bpf_legacy.c} (99%)

diff --git a/include/bpf_util.h b/include/bpf_util.h
index dee5bb02..3235c34e 100644
--- a/include/bpf_util.h
+++ b/include/bpf_util.h
@@ -274,12 +274,16 @@ int bpf_trace_pipe(void);
 
 void bpf_print_ops(struct rtattr *bpf_ops, __u16 len);
 
-int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
-		  size_t size_insns, const char *license, char *log,
-		  size_t size_log);
+int bpf_prog_load_dev(enum bpf_prog_type type, const struct bpf_insn *insns,
+		      size_t size_insns, const char *license, __u32 ifindex,
+		      char *log, size_t size_log);
+int bpf_program_load(enum bpf_prog_type type, const struct bpf_insn *insns,
+		     size_t size_insns, const char *license, char *log,
+		     size_t size_log);
 
 int bpf_prog_attach_fd(int prog_fd, int target_fd, enum bpf_attach_type type);
 int bpf_prog_detach_fd(int target_fd, enum bpf_attach_type type);
+int bpf_program_attach(int prog_fd, int target_fd, enum bpf_attach_type type);
 
 int bpf_dump_prog_info(FILE *f, uint32_t id);
 
diff --git a/ip/ipvrf.c b/ip/ipvrf.c
index 28dd8e25..42779e5c 100644
--- a/ip/ipvrf.c
+++ b/ip/ipvrf.c
@@ -256,8 +256,8 @@ static int prog_load(int idx)
 		BPF_EXIT_INSN(),
 	};
 
-	return bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, prog, sizeof(prog),
-			     "GPL", bpf_log_buf, sizeof(bpf_log_buf));
+	return bpf_program_load(BPF_PROG_TYPE_CGROUP_SOCK, prog, sizeof(prog),
+			        "GPL", bpf_log_buf, sizeof(bpf_log_buf));
 }
 
 static int vrf_configure_cgroup(const char *path, int ifindex)
@@ -288,7 +288,7 @@ static int vrf_configure_cgroup(const char *path, int ifindex)
 		goto out;
 	}
 
-	if (bpf_prog_attach_fd(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE)) {
+	if (bpf_program_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE)) {
 		fprintf(stderr, "Failed to attach prog to cgroup: '%s'\n",
 			strerror(errno));
 		goto out;
diff --git a/lib/Makefile b/lib/Makefile
index a02775a5..7c8a197c 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -5,7 +5,7 @@ CFLAGS += -fPIC
 
 UTILOBJ = utils.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \
 	inet_proto.o namespace.o json_writer.o json_print.o \
-	names.o color.o bpf.o bpf_glue.o exec.o fs.o cg_map.o
+	names.o color.o bpf_legacy.o bpf_glue.o exec.o fs.o cg_map.o
 
 NLOBJ=libgenl.o libnetlink.o mnl_utils.o
 
diff --git a/lib/bpf_glue.c b/lib/bpf_glue.c
index 67c41c22..fa609bfe 100644
--- a/lib/bpf_glue.c
+++ b/lib/bpf_glue.c
@@ -5,6 +5,29 @@
  *
  */
 #include "bpf_util.h"
+#ifdef HAVE_LIBBPF
+#include <bpf/bpf.h>
+#endif
+
+int bpf_program_load(enum bpf_prog_type type, const struct bpf_insn *insns,
+		     size_t size_insns, const char *license, char *log,
+		     size_t size_log)
+{
+#ifdef HAVE_LIBBPF
+	return bpf_load_program(type, insns, size_insns, license, 0, log, size_log);
+#else
+	return bpf_prog_load_dev(type, insns, size_insns, license, 0, log, size_log);
+#endif
+}
+
+int bpf_program_attach(int prog_fd, int target_fd, enum bpf_attach_type type)
+{
+#ifdef HAVE_LIBBPF
+	return bpf_prog_attach(prog_fd, target_fd, type, 0);
+#else
+	return bpf_prog_attach_fd(prog_fd, target_fd, type);
+#endif
+}
 
 #ifdef HAVE_LIBBPF
 static const char *_libbpf_compile_version = LIBBPF_VERSION;
diff --git a/lib/bpf.c b/lib/bpf_legacy.c
similarity index 99%
rename from lib/bpf.c
rename to lib/bpf_legacy.c
index c7d45077..4246fb76 100644
--- a/lib/bpf.c
+++ b/lib/bpf_legacy.c
@@ -1087,10 +1087,9 @@ int bpf_prog_detach_fd(int target_fd, enum bpf_attach_type type)
 	return bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
 }
 
-static int bpf_prog_load_dev(enum bpf_prog_type type,
-			     const struct bpf_insn *insns, size_t size_insns,
-			     const char *license, __u32 ifindex,
-			     char *log, size_t size_log)
+int bpf_prog_load_dev(enum bpf_prog_type type, const struct bpf_insn *insns,
+		      size_t size_insns, const char *license, __u32 ifindex,
+		      char *log, size_t size_log)
 {
 	union bpf_attr attr = {};
 
@@ -1109,14 +1108,6 @@ static int bpf_prog_load_dev(enum bpf_prog_type type,
 	return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 }
 
-int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
-		  size_t size_insns, const char *license, char *log,
-		  size_t size_log)
-{
-	return bpf_prog_load_dev(type, insns, size_insns, license, 0,
-				 log, size_log);
-}
-
 #ifdef HAVE_ELF
 struct bpf_elf_prog {
 	enum bpf_prog_type	type;
-- 
2.25.4


^ permalink raw reply related

* [PATCHv6 iproute2-next 1/5] iproute2: add check_libbpf() and get_libbpf_version()
From: Hangbin Liu @ 2020-11-23 13:11 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern
  Cc: Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	David Miller, netdev, bpf, Jiri Benc,
	Toke Høiland-Jørgensen, Jesper Dangaard Brouer,
	Alexei Starovoitov, Hangbin Liu
In-Reply-To: <20201123131201.4108483-1-haliu@redhat.com>

This patch aim to add basic checking functions for later iproute2
libbpf support.

First we add check_libbpf() in configure to see if we have bpf library
support. By default the system libbpf will be used, but static linking
against a custom libbpf version can be achieved by passing libbpf DESTDIR
to variable LIBBPF_DIR for configure.

Another variable LIBBPF_FORCE is used to control whether to build iproute2
with libbpf. If set to on, then force to build with libbpf and exit if
not available. If set to off, then force to not build with libbpf.

When dynamically linking against libbpf, we can't be sure that the
version we discovered at compile time is actually the one we are
using at runtime. This can lead to hard-to-debug errors. So we add
a new file lib/bpf_glue.c and a helper function get_libbpf_version()
to get correct libbpf version at runtime.

Signed-off-by: Hangbin Liu <haliu@redhat.com>
---

v6:
1) Add a new helper get_libbpf_version() to get runtime libbpf version
  based on Toke's xdp-tools patch. The libbpf version will be printed
  when exec ip -V or tc -V.

v5:
1) Fix LIBBPF_DIR type and description, use libbpf DESTDIR as LIBBPF_DIR
   dest.

v4:
1) Remove duplicate LIBBPF_CFLAGS
2) Remove un-needed -L since using static libbpf.a
3) Fix == not supported in dash
4) Extend LIBBPF_FORCE to support on/off, when set to on, stop building when
   there is no libbpf support. If set to off, discard libbpf check.
5) Print libbpf version after checking

v3:
Check function bpf_program__section_name() separately and only use it
on higher libbpf version.

v2:
No update
---
 configure          | 113 +++++++++++++++++++++++++++++++++++++++++++++
 include/bpf_util.h |   3 ++
 ip/ip.c            |  10 +++-
 lib/Makefile       |   2 +-
 lib/bpf_glue.c     |  63 +++++++++++++++++++++++++
 tc/tc.c            |  10 +++-
 6 files changed, 196 insertions(+), 5 deletions(-)
 create mode 100644 lib/bpf_glue.c

diff --git a/configure b/configure
index 307912aa..2c363d3b 100755
--- a/configure
+++ b/configure
@@ -2,6 +2,11 @@
 # SPDX-License-Identifier: GPL-2.0
 # This is not an autoconf generated configure
 #
+# Influential LIBBPF environment variables:
+#   LIBBPF_FORCE={on,off}   on: require link against libbpf;
+#                           off: disable libbpf probing
+#   LIBBPF_DIR              Path to libbpf DESTDIR to use
+
 INCLUDE=${1:-"$PWD/include"}
 
 # Output file which is input to Makefile
@@ -240,6 +245,111 @@ check_elf()
     fi
 }
 
+have_libbpf_basic()
+{
+    cat >$TMPDIR/libbpf_test.c <<EOF
+#include <bpf/libbpf.h>
+int main(int argc, char **argv) {
+    bpf_program__set_autoload(NULL, false);
+    bpf_map__ifindex(NULL);
+    bpf_map__set_pin_path(NULL, NULL);
+    bpf_object__open_file(NULL, NULL);
+    return 0;
+}
+EOF
+
+    $CC -o $TMPDIR/libbpf_test $TMPDIR/libbpf_test.c $LIBBPF_CFLAGS $LIBBPF_LDLIBS >/dev/null 2>&1
+    local ret=$?
+
+    rm -f $TMPDIR/libbpf_test.c $TMPDIR/libbpf_test
+    return $ret
+}
+
+have_libbpf_sec_name()
+{
+    cat >$TMPDIR/libbpf_sec_test.c <<EOF
+#include <bpf/libbpf.h>
+int main(int argc, char **argv) {
+    void *ptr;
+    bpf_program__section_name(NULL);
+    return 0;
+}
+EOF
+
+    $CC -o $TMPDIR/libbpf_sec_test $TMPDIR/libbpf_sec_test.c $LIBBPF_CFLAGS $LIBBPF_LDLIBS >/dev/null 2>&1
+    local ret=$?
+
+    rm -f $TMPDIR/libbpf_sec_test.c $TMPDIR/libbpf_sec_test
+    return $ret
+}
+
+check_force_libbpf_on()
+{
+    # if set LIBBPF_FORCE=on but no libbpf support, just exist the config
+    # process to make sure we don't build without libbpf.
+    if [ "$LIBBPF_FORCE" = on ]; then
+        echo "	LIBBPF_FORCE=on set, but couldn't find a usable libbpf"
+        exit 1
+    fi
+}
+
+check_libbpf()
+{
+    # if set LIBBPF_FORCE=off, disable libbpf entirely
+    if [ "$LIBBPF_FORCE" = off ]; then
+        echo "no"
+        return
+    fi
+
+    if ! ${PKG_CONFIG} libbpf --exists && [ -z "$LIBBPF_DIR" ] ; then
+        echo "no"
+        check_force_libbpf_on
+        return
+    fi
+
+    if [ $(uname -m) = x86_64 ]; then
+        local LIBBPF_LIBDIR="${LIBBPF_DIR}/usr/lib64"
+    else
+        local LIBBPF_LIBDIR="${LIBBPF_DIR}/usr/lib"
+    fi
+
+    if [ -n "$LIBBPF_DIR" ]; then
+        LIBBPF_CFLAGS="-I${LIBBPF_DIR}/usr/include"
+        LIBBPF_LDLIBS="${LIBBPF_LIBDIR}/libbpf.a -lz -lelf"
+        LIBBPF_VERSION=$(PKG_CONFIG_LIBDIR=${LIBBPF_LIBDIR}/pkgconfig ${PKG_CONFIG} libbpf --modversion)
+    else
+        LIBBPF_CFLAGS=$(${PKG_CONFIG} libbpf --cflags)
+        LIBBPF_LDLIBS=$(${PKG_CONFIG} libbpf --libs)
+        LIBBPF_VERSION=$(${PKG_CONFIG} libbpf --modversion)
+    fi
+
+    if ! have_libbpf_basic; then
+        echo "no"
+        echo "	libbpf version $LIBBPF_VERSION is too low, please update it to at least 0.1.0"
+        check_force_libbpf_on
+        return
+    else
+        echo "HAVE_LIBBPF:=y" >> $CONFIG
+        echo 'CFLAGS += -DHAVE_LIBBPF ' $LIBBPF_CFLAGS >> $CONFIG
+        echo "CFLAGS += -DLIBBPF_VERSION=\\\"$LIBBPF_VERSION\\\"" >> $CONFIG
+        echo 'LDLIBS += ' $LIBBPF_LDLIBS >> $CONFIG
+
+        if [ -z "$LIBBPF_DIR" ]; then
+            echo "CFLAGS += -DLIBBPF_DYNAMIC" >> $CONFIG
+        fi
+    fi
+
+    # bpf_program__title() is deprecated since libbpf 0.2.0, use
+    # bpf_program__section_name() instead if we support
+    if have_libbpf_sec_name; then
+        echo "HAVE_LIBBPF_SECTION_NAME:=y" >> $CONFIG
+        echo 'CFLAGS += -DHAVE_LIBBPF_SECTION_NAME ' >> $CONFIG
+    fi
+
+    echo "yes"
+    echo "	libbpf version $LIBBPF_VERSION"
+}
+
 check_selinux()
 # SELinux is a compile time option in the ss utility
 {
@@ -385,6 +495,9 @@ check_setns
 echo -n "SELinux support: "
 check_selinux
 
+echo -n "libbpf support: "
+check_libbpf
+
 echo -n "ELF support: "
 check_elf
 
diff --git a/include/bpf_util.h b/include/bpf_util.h
index 63db07ca..dee5bb02 100644
--- a/include/bpf_util.h
+++ b/include/bpf_util.h
@@ -300,4 +300,7 @@ static inline int bpf_recv_map_fds(const char *path, int *fds,
 	return -1;
 }
 #endif /* HAVE_ELF */
+
+const char *get_libbpf_version(void);
+
 #endif /* __BPF_UTIL__ */
diff --git a/ip/ip.c b/ip/ip.c
index 5e31957f..466dbb52 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -24,6 +24,7 @@
 #include "namespace.h"
 #include "color.h"
 #include "rt_names.h"
+#include "bpf_util.h"
 
 int preferred_family = AF_UNSPEC;
 int human_readable;
@@ -147,8 +148,9 @@ static int batch(const char *name)
 
 int main(int argc, char **argv)
 {
-	char *basename;
+	const char *libbpf_version;
 	char *batch_file = NULL;
+	char *basename;
 	int color = 0;
 
 	/* to run vrf exec without root, capabilities might be set, drop them
@@ -229,7 +231,11 @@ int main(int argc, char **argv)
 			++timestamp;
 			++timestamp_short;
 		} else if (matches(opt, "-Version") == 0) {
-			printf("ip utility, iproute2-%s\n", version);
+			printf("ip utility, iproute2-%s", version);
+			libbpf_version = get_libbpf_version();
+			if (libbpf_version)
+				printf(", libbpf %s", libbpf_version);
+			printf("\n");
 			exit(0);
 		} else if (matches(opt, "-force") == 0) {
 			++force;
diff --git a/lib/Makefile b/lib/Makefile
index 13f4ee15..a02775a5 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -5,7 +5,7 @@ CFLAGS += -fPIC
 
 UTILOBJ = utils.o rt_names.o ll_map.o ll_types.o ll_proto.o ll_addr.o \
 	inet_proto.o namespace.o json_writer.o json_print.o \
-	names.o color.o bpf.o exec.o fs.o cg_map.o
+	names.o color.o bpf.o bpf_glue.o exec.o fs.o cg_map.o
 
 NLOBJ=libgenl.o libnetlink.o mnl_utils.o
 
diff --git a/lib/bpf_glue.c b/lib/bpf_glue.c
new file mode 100644
index 00000000..67c41c22
--- /dev/null
+++ b/lib/bpf_glue.c
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * bpf_glue.c:	BPF code to call both legacy and libbpf code
+ * Authors:	Hangbin Liu <haliu@redhat.com>
+ *
+ */
+#include "bpf_util.h"
+
+#ifdef HAVE_LIBBPF
+static const char *_libbpf_compile_version = LIBBPF_VERSION;
+static char _libbpf_version[10] = {};
+
+const char *get_libbpf_version(void)
+{
+	/* Start by copying compile-time version into buffer so we have a
+	 * fallback value in case we are dynamically linked, or can't find a
+	 * version in /proc/self/maps below.
+	 */
+	strncpy(_libbpf_version, _libbpf_compile_version,
+		sizeof(_libbpf_version)-1);
+#ifdef LIBBPF_DYNAMIC
+	char buf[PATH_MAX], *s;
+	bool found = false;
+	FILE *fp;
+
+	/* When dynamically linking against libbpf, we can't be sure that the
+	 * version we discovered at compile time is actually the one we are
+	 * using at runtime. This can lead to hard-to-debug errors, so we try to
+	 * discover the correct version at runtime.
+	 *
+	 * The simple solution to this would be if libbpf itself exported a
+	 * version in its API. But since it doesn't, we work around this by
+	 * parsing the mappings of the binary at runtime, looking for the full
+	 * filename of libbpf.so and using that.
+	 */
+	fp = fopen("/proc/self/maps", "r");
+	if (fp == NULL)
+		goto out;
+
+	while ((s = fgets(buf, sizeof(buf), fp)) != NULL) {
+		if ((s = strstr(buf, "libbpf.so.")) != NULL) {
+			strncpy(_libbpf_version, s+10, sizeof(_libbpf_version)-1);
+			strtok(_libbpf_version, "\n");
+			found = true;
+			break;
+		}
+	}
+
+	fclose(fp);
+out:
+	if (!found)
+		fprintf(stderr, "Couldn't find runtime libbpf version - falling back to compile-time value!\n");
+#endif /* LIBBPF_DYNAMIC */
+
+	_libbpf_version[sizeof(_libbpf_version)-1] = '\0';
+	return _libbpf_version;
+}
+#else
+const char *get_libbpf_version(void)
+{
+	return NULL;
+}
+#endif /* HAVE_LIBBPF */
diff --git a/tc/tc.c b/tc/tc.c
index af9b21da..7557b977 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -30,6 +30,7 @@
 #include "tc_common.h"
 #include "namespace.h"
 #include "rt_names.h"
+#include "bpf_util.h"
 
 int show_stats;
 int show_details;
@@ -259,8 +260,9 @@ static int batch(const char *name)
 
 int main(int argc, char **argv)
 {
-	int ret;
+	const char *libbpf_version;
 	char *batch_file = NULL;
+	int ret;
 
 	while (argc > 1) {
 		if (argv[1][0] != '-')
@@ -277,7 +279,11 @@ int main(int argc, char **argv)
 		} else if (matches(argv[1], "-graph") == 0) {
 			show_graph = 1;
 		} else if (matches(argv[1], "-Version") == 0) {
-			printf("tc utility, iproute2-%s\n", version);
+			printf("tc utility, iproute2-%s", version);
+			libbpf_version = get_libbpf_version();
+			if (libbpf_version)
+				printf(", libbpf %s", libbpf_version);
+			printf("\n");
 			return 0;
 		} else if (matches(argv[1], "-iec") == 0) {
 			++use_iec;
-- 
2.25.4


^ permalink raw reply related

* [PATCHv6 iproute2-next 0/5] iproute2: add libbpf support
From: Hangbin Liu @ 2020-11-23 13:11 UTC (permalink / raw)
  To: Stephen Hemminger, David Ahern
  Cc: Daniel Borkmann, Martin KaFai Lau, Song Liu, Yonghong Song,
	David Miller, netdev, bpf, Jiri Benc,
	Toke Høiland-Jørgensen, Jesper Dangaard Brouer,
	Alexei Starovoitov, Hangbin Liu
In-Reply-To: <20201116065305.1010651-1-haliu@redhat.com>

This series converts iproute2 to use libbpf for loading and attaching
BPF programs when it is available. This means that iproute2 will
correctly process BTF information and support the new-style BTF-defined
maps, while keeping compatibility with the old internal map definition
syntax.

This is achieved by checking for libbpf at './configure' time, and using
it if available. By default the system libbpf will be used, but static
linking against a custom libbpf version can be achieved by passing
LIBBPF_DIR to configure. LIBBPF_FORCE can be set to on to force configure
abort if no suitable libbpf is found (useful for automatic packaging
that wants to enforce the dependency), or set off to disable libbpf check
and build iproute2 with legacy bpf.

The old iproute2 bpf code is kept and will be used if no suitable libbpf
is available. When using libbpf, wrapper code ensures that iproute2 will
still understand the old map definition format, including populating
map-in-map and tail call maps before load.

The examples in bpf/examples are kept, and a separate set of examples
are added with BTF-based map definitions for those examples where this
is possible (libbpf doesn't currently support declaratively populating
tail call maps).

At last, Thanks a lot for Toke's help on this patch set.

v6:
a) print runtime libbpf version in ip -V and tc -V

v5:
a) Fix LIBBPF_DIR typo and description, use libbpf DESTDIR as LIBBPF_DIR
   dest.
b) Fix bpf_prog_load_dev typo.
c) rebase to latest iproute2-next.

v4:
a) Make variable LIBBPF_FORCE able to control whether build iproute2
   with libbpf or not.
b) Add new file bpf_glue.c to for libbpf/legacy mixed bpf calls.
c) Fix some build issues and shell compatibility error.

v3:
a) Update configure to Check function bpf_program__section_name() separately
b) Add a new function get_bpf_program__section_name() to choose whether to
use bpf_program__title() or not.
c) Test build the patch on Fedora 33 with libbpf-0.1.0-1.fc33 and
   libbpf-devel-0.1.0-1.fc33

v2:
a) Remove self defined IS_ERR_OR_NULL and use libbpf_get_error() instead.
b) Add ipvrf with libbpf support.


Here are the test results with patched iproute2:
== Show libbpf version
# ip -V
ip utility, iproute2-5.9.0, libbpf 0.1.0
# tc -V
tc utility, iproute2-5.9.0, libbpf 0.1.0

== setup env
# clang -O2 -Wall -g -target bpf -c bpf_graft.c -o btf_graft.o
# clang -O2 -Wall -g -target bpf -c bpf_map_in_map.c -o btf_map_in_map.o
# clang -O2 -Wall -g -target bpf -c bpf_shared.c -o btf_shared.o
# clang -O2 -Wall -g -target bpf -c legacy/bpf_cyclic.c -o bpf_cyclic.o
# clang -O2 -Wall -g -target bpf -c legacy/bpf_graft.c -o bpf_graft.o
# clang -O2 -Wall -g -target bpf -c legacy/bpf_map_in_map.c -o bpf_map_in_map.o
# clang -O2 -Wall -g -target bpf -c legacy/bpf_shared.c -o bpf_shared.o
# clang -O2 -Wall -g -target bpf -c legacy/bpf_tailcall.c -o bpf_tailcall.o
# rm -rf /sys/fs/bpf/xdp/globals
# /root/iproute2/ip/ip link add type veth
# /root/iproute2/ip/ip link set veth0 up
# /root/iproute2/ip/ip link set veth1 up


== Load objs
# /root/iproute2/ip/ip link set veth0 xdp obj bpf_graft.o sec aaa
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 4 tag 3056d2382e53f27c jited
# ls /sys/fs/bpf/xdp/globals
jmp_tc
# bpftool map show
1: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
# bpftool prog show
4: xdp  name cls_aaa  tag 3056d2382e53f27c  gpl
        loaded_at 2020-10-22T08:04:21-0400  uid 0
        xlated 80B  jited 71B  memlock 4096B
        btf_id 5
# /root/iproute2/ip/ip link set veth0 xdp off
# /root/iproute2/ip/ip link set veth0 xdp obj bpf_map_in_map.o sec ingress
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 8 tag 4420e72b2a601ed7 jited
# ls /sys/fs/bpf/xdp/globals
jmp_tc  map_inner  map_outer
# bpftool map show
1: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
2: array  name map_inner  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
3: array_of_maps  name map_outer  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
# bpftool prog show
8: xdp  name imain  tag 4420e72b2a601ed7  gpl
        loaded_at 2020-10-22T08:04:23-0400  uid 0
        xlated 336B  jited 193B  memlock 4096B  map_ids 3
        btf_id 10
# /root/iproute2/ip/ip link set veth0 xdp off
# /root/iproute2/ip/ip link set veth0 xdp obj bpf_shared.o sec ingress
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 12 tag 9cbab549c3af3eab jited
# ls /sys/fs/bpf/xdp/7a1422e90cd81478f97bc33fbd7782bcb3b868ef /sys/fs/bpf/xdp/globals
/sys/fs/bpf/xdp/7a1422e90cd81478f97bc33fbd7782bcb3b868ef:
map_sh

/sys/fs/bpf/xdp/globals:
jmp_tc  map_inner  map_outer
# bpftool map show
1: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
2: array  name map_inner  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
3: array_of_maps  name map_outer  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
4: array  name map_sh  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
# bpftool prog show
12: xdp  name imain  tag 9cbab549c3af3eab  gpl
        loaded_at 2020-10-22T08:04:25-0400  uid 0
        xlated 224B  jited 139B  memlock 4096B  map_ids 4
        btf_id 15
# /root/iproute2/ip/ip link set veth0 xdp off


== Load objs again to make sure maps could be reused
# /root/iproute2/ip/ip link set veth0 xdp obj bpf_graft.o sec aaa
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 16 tag 3056d2382e53f27c jited
# ls /sys/fs/bpf/xdp/7a1422e90cd81478f97bc33fbd7782bcb3b868ef /sys/fs/bpf/xdp/globals
/sys/fs/bpf/xdp/7a1422e90cd81478f97bc33fbd7782bcb3b868ef:
map_sh

/sys/fs/bpf/xdp/globals:
jmp_tc  map_inner  map_outer
# bpftool map show
1: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
2: array  name map_inner  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
3: array_of_maps  name map_outer  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
4: array  name map_sh  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
# bpftool prog show
16: xdp  name cls_aaa  tag 3056d2382e53f27c  gpl
        loaded_at 2020-10-22T08:04:27-0400  uid 0
        xlated 80B  jited 71B  memlock 4096B
        btf_id 20
# /root/iproute2/ip/ip link set veth0 xdp off
# /root/iproute2/ip/ip link set veth0 xdp obj bpf_map_in_map.o sec ingress
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 20 tag 4420e72b2a601ed7 jited
# ls /sys/fs/bpf/xdp/7a1422e90cd81478f97bc33fbd7782bcb3b868ef /sys/fs/bpf/xdp/globals
/sys/fs/bpf/xdp/7a1422e90cd81478f97bc33fbd7782bcb3b868ef:
map_sh

/sys/fs/bpf/xdp/globals:
jmp_tc  map_inner  map_outer
# bpftool map show                                                                                                                                                                   [236/4518]
1: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
2: array  name map_inner  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
3: array_of_maps  name map_outer  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
4: array  name map_sh  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
# bpftool prog show
20: xdp  name imain  tag 4420e72b2a601ed7  gpl
        loaded_at 2020-10-22T08:04:29-0400  uid 0
        xlated 336B  jited 193B  memlock 4096B  map_ids 3
        btf_id 25
# /root/iproute2/ip/ip link set veth0 xdp off
# /root/iproute2/ip/ip link set veth0 xdp obj bpf_shared.o sec ingress
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 24 tag 9cbab549c3af3eab jited
# ls /sys/fs/bpf/xdp/7a1422e90cd81478f97bc33fbd7782bcb3b868ef /sys/fs/bpf/xdp/globals
/sys/fs/bpf/xdp/7a1422e90cd81478f97bc33fbd7782bcb3b868ef:
map_sh

/sys/fs/bpf/xdp/globals:
jmp_tc  map_inner  map_outer
# bpftool map show
1: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
2: array  name map_inner  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
3: array_of_maps  name map_outer  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
4: array  name map_sh  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
# bpftool prog show
24: xdp  name imain  tag 9cbab549c3af3eab  gpl
        loaded_at 2020-10-22T08:04:31-0400  uid 0
        xlated 224B  jited 139B  memlock 4096B  map_ids 4
        btf_id 30
# /root/iproute2/ip/ip link set veth0 xdp off
# rm -rf /sys/fs/bpf/xdp/7a1422e90cd81478f97bc33fbd7782bcb3b868ef /sys/fs/bpf/xdp/globals

== Testing if we can load new-style objects (using xdp-filter as an example)
# /root/iproute2/ip/ip link set veth0 xdp obj /usr/lib64/bpf/xdpfilt_alw_all.o sec xdp_filter
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 28 tag e29eeda1489a6520 jited
# ls /sys/fs/bpf/xdp/globals
filter_ethernet  filter_ipv4  filter_ipv6  filter_ports  xdp_stats_map
# bpftool map show
5: percpu_array  name xdp_stats_map  flags 0x0
        key 4B  value 16B  max_entries 5  memlock 4096B
        btf_id 35
6: percpu_array  name filter_ports  flags 0x0
        key 4B  value 8B  max_entries 65536  memlock 1576960B
        btf_id 35
7: percpu_hash  name filter_ipv4  flags 0x0
        key 4B  value 8B  max_entries 10000  memlock 1064960B
        btf_id 35
8: percpu_hash  name filter_ipv6  flags 0x0
        key 16B  value 8B  max_entries 10000  memlock 1142784B
        btf_id 35
9: percpu_hash  name filter_ethernet  flags 0x0
        key 6B  value 8B  max_entries 10000  memlock 1064960B
        btf_id 35
# bpftool prog show
28: xdp  name xdpfilt_alw_all  tag e29eeda1489a6520  gpl
        loaded_at 2020-10-22T08:04:33-0400  uid 0
        xlated 2408B  jited 1405B  memlock 4096B  map_ids 9,5,7,8,6
        btf_id 35
# /root/iproute2/ip/ip link set veth0 xdp off
# /root/iproute2/ip/ip link set veth0 xdp obj /usr/lib64/bpf/xdpfilt_alw_ip.o sec xdp_filter
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 32 tag 2f2b9dbfb786a5a2 jited
# ls /sys/fs/bpf/xdp/globals
filter_ethernet  filter_ipv4  filter_ipv6  filter_ports  xdp_stats_map
# bpftool map show
5: percpu_array  name xdp_stats_map  flags 0x0
        key 4B  value 16B  max_entries 5  memlock 4096B
        btf_id 35
6: percpu_array  name filter_ports  flags 0x0
        key 4B  value 8B  max_entries 65536  memlock 1576960B
        btf_id 35
7: percpu_hash  name filter_ipv4  flags 0x0
        key 4B  value 8B  max_entries 10000  memlock 1064960B
        btf_id 35
8: percpu_hash  name filter_ipv6  flags 0x0
        key 16B  value 8B  max_entries 10000  memlock 1142784B
        btf_id 35
9: percpu_hash  name filter_ethernet  flags 0x0
        key 6B  value 8B  max_entries 10000  memlock 1064960B
        btf_id 35
# bpftool prog show
32: xdp  name xdpfilt_alw_ip  tag 2f2b9dbfb786a5a2  gpl
        loaded_at 2020-10-22T08:04:35-0400  uid 0
        xlated 1336B  jited 778B  memlock 4096B  map_ids 7,8,5
        btf_id 40
# /root/iproute2/ip/ip link set veth0 xdp off
# /root/iproute2/ip/ip link set veth0 xdp obj /usr/lib64/bpf/xdpfilt_alw_tcp.o sec xdp_filter
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 36 tag 18c1bb25084030bc jited
# ls /sys/fs/bpf/xdp/globals
filter_ethernet  filter_ipv4  filter_ipv6  filter_ports  xdp_stats_map
# bpftool map show
5: percpu_array  name xdp_stats_map  flags 0x0
        key 4B  value 16B  max_entries 5  memlock 4096B
        btf_id 35
6: percpu_array  name filter_ports  flags 0x0
        key 4B  value 8B  max_entries 65536  memlock 1576960B
        btf_id 35
7: percpu_hash  name filter_ipv4  flags 0x0
        key 4B  value 8B  max_entries 10000  memlock 1064960B
        btf_id 35
8: percpu_hash  name filter_ipv6  flags 0x0
        key 16B  value 8B  max_entries 10000  memlock 1142784B
        btf_id 35
9: percpu_hash  name filter_ethernet  flags 0x0
        key 6B  value 8B  max_entries 10000  memlock 1064960B
        btf_id 35
# bpftool prog show
36: xdp  name xdpfilt_alw_tcp  tag 18c1bb25084030bc  gpl
        loaded_at 2020-10-22T08:04:37-0400  uid 0
        xlated 1128B  jited 690B  memlock 4096B  map_ids 6,5
        btf_id 45
# /root/iproute2/ip/ip link set veth0 xdp off
# rm -rf /sys/fs/bpf/xdp/globals


== Load new btf defined maps
# /root/iproute2/ip/ip link set veth0 xdp obj btf_graft.o sec aaa
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 40 tag 3056d2382e53f27c jited
# ls /sys/fs/bpf/xdp/globals
jmp_tc
# bpftool map show
10: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
# bpftool prog show
40: xdp  name cls_aaa  tag 3056d2382e53f27c  gpl
        loaded_at 2020-10-22T08:04:39-0400  uid 0
        xlated 80B  jited 71B  memlock 4096B
        btf_id 50
# /root/iproute2/ip/ip link set veth0 xdp off
# /root/iproute2/ip/ip link set veth0 xdp obj btf_map_in_map.o sec ingress
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 44 tag 4420e72b2a601ed7 jited
# ls /sys/fs/bpf/xdp/globals
jmp_tc  map_outer
# bpftool map show
10: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
11: array  name map_inner  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
13: array_of_maps  name map_outer  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
# bpftool prog show
44: xdp  name imain  tag 4420e72b2a601ed7  gpl
        loaded_at 2020-10-22T08:04:41-0400  uid 0
        xlated 336B  jited 193B  memlock 4096B  map_ids 13
        btf_id 55
# /root/iproute2/ip/ip link set veth0 xdp off
# /root/iproute2/ip/ip link set veth0 xdp obj btf_shared.o sec ingress
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 xdp qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
    prog/xdp id 48 tag 9cbab549c3af3eab jited
# ls /sys/fs/bpf/xdp/globals
jmp_tc  map_outer  map_sh
# bpftool map show
10: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
11: array  name map_inner  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
13: array_of_maps  name map_outer  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
14: array  name map_sh  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
# bpftool prog show
48: xdp  name imain  tag 9cbab549c3af3eab  gpl
        loaded_at 2020-10-22T08:04:43-0400  uid 0
        xlated 224B  jited 139B  memlock 4096B  map_ids 14
        btf_id 60
# /root/iproute2/ip/ip link set veth0 xdp off
# rm -rf /sys/fs/bpf/xdp/globals


== Test load objs by tc
# /root/iproute2/tc/tc qdisc add dev veth0 ingress
# /root/iproute2/tc/tc filter add dev veth0 ingress bpf da obj bpf_cyclic.o sec 0xabccba/0
# /root/iproute2/tc/tc filter add dev veth0 parent ffff: bpf obj bpf_graft.o
# /root/iproute2/tc/tc filter add dev veth0 ingress bpf da obj bpf_tailcall.o sec 42/0
# /root/iproute2/tc/tc filter add dev veth0 ingress bpf da obj bpf_tailcall.o sec 42/1
# /root/iproute2/tc/tc filter add dev veth0 ingress bpf da obj bpf_tailcall.o sec 43/0
# /root/iproute2/tc/tc filter add dev veth0 ingress bpf da obj bpf_tailcall.o sec classifier
# /root/iproute2/ip/ip link show veth0
5: veth0@veth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000
    link/ether 6a:e6:fa:2b:4e:1f brd ff:ff:ff:ff:ff:ff
# ls /sys/fs/bpf/xdp/37e88cb3b9646b2ea5f99ab31069ad88db06e73d /sys/fs/bpf/xdp/fc68fe3e96378a0cba284ea6acbe17e898d8b11f /sys/fs/bpf/xdp/globals
/sys/fs/bpf/xdp/37e88cb3b9646b2ea5f99ab31069ad88db06e73d:
jmp_tc

/sys/fs/bpf/xdp/fc68fe3e96378a0cba284ea6acbe17e898d8b11f:
jmp_ex  jmp_tc  map_sh

/sys/fs/bpf/xdp/globals:
jmp_tc
# bpftool map show
15: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
        owner_prog_type sched_cls  owner jited
16: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
        owner_prog_type sched_cls  owner jited
17: prog_array  name jmp_ex  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
        owner_prog_type sched_cls  owner jited
18: prog_array  name jmp_tc  flags 0x0
        key 4B  value 4B  max_entries 2  memlock 4096B
        owner_prog_type sched_cls  owner jited
19: array  name map_sh  flags 0x0
        key 4B  value 4B  max_entries 1  memlock 4096B
# bpftool prog show
52: sched_cls  name cls_loop  tag 3e98a40b04099d36  gpl
        loaded_at 2020-10-22T08:04:45-0400  uid 0
        xlated 168B  jited 133B  memlock 4096B  map_ids 15
        btf_id 65
56: sched_cls  name cls_entry  tag 0fbb4d9310a6ee26  gpl
        loaded_at 2020-10-22T08:04:45-0400  uid 0
        xlated 144B  jited 121B  memlock 4096B  map_ids 16
        btf_id 70
60: sched_cls  name cls_case1  tag e06a3bd62293d65d  gpl
        loaded_at 2020-10-22T08:04:45-0400  uid 0
        xlated 328B  jited 216B  memlock 4096B  map_ids 19,17
        btf_id 75
66: sched_cls  name cls_case1  tag e06a3bd62293d65d  gpl
        loaded_at 2020-10-22T08:04:45-0400  uid 0
        xlated 328B  jited 216B  memlock 4096B  map_ids 19,17
        btf_id 80
72: sched_cls  name cls_case1  tag e06a3bd62293d65d  gpl
        loaded_at 2020-10-22T08:04:45-0400  uid 0
        xlated 328B  jited 216B  memlock 4096B  map_ids 19,17
        btf_id 85
78: sched_cls  name cls_case1  tag e06a3bd62293d65d  gpl
        loaded_at 2020-10-22T08:04:45-0400  uid 0
        xlated 328B  jited 216B  memlock 4096B  map_ids 19,17
        btf_id 90
79: sched_cls  name cls_case2  tag ee218ff893dca823  gpl
        loaded_at 2020-10-22T08:04:45-0400  uid 0
        xlated 336B  jited 218B  memlock 4096B  map_ids 19,18
        btf_id 90
80: sched_cls  name cls_exit  tag e78a58140deed387  gpl
        loaded_at 2020-10-22T08:04:45-0400  uid 0
        xlated 288B  jited 177B  memlock 4096B  map_ids 19
        btf_id 90

I also run the following upstream kselftest with patches iproute2 and
all passed.

test_lwt_ip_encap.sh
test_xdp_redirect.sh
test_tc_redirect.sh
test_xdp_meta.sh
test_xdp_veth.sh
test_xdp_vlan.sh


Hangbin Liu (5):
  iproute2: add check_libbpf() and get_libbpf_version()
  lib: make ipvrf able to use libbpf and fix function name conflicts
  lib: add libbpf support
  examples/bpf: move struct bpf_elf_map defined maps to legacy folder
  examples/bpf: add bpf examples with BTF defined maps

 configure                                | 113 ++++++++
 examples/bpf/README                      |  18 +-
 examples/bpf/bpf_graft.c                 |  14 +-
 examples/bpf/bpf_map_in_map.c            |  37 ++-
 examples/bpf/bpf_shared.c                |  14 +-
 examples/bpf/{ => legacy}/bpf_cyclic.c   |   2 +-
 examples/bpf/legacy/bpf_graft.c          |  66 +++++
 examples/bpf/legacy/bpf_map_in_map.c     |  56 ++++
 examples/bpf/legacy/bpf_shared.c         |  53 ++++
 examples/bpf/{ => legacy}/bpf_tailcall.c |   2 +-
 include/bpf_api.h                        |  13 +
 include/bpf_util.h                       |  30 +-
 ip/ip.c                                  |  10 +-
 ip/ipvrf.c                               |   6 +-
 lib/Makefile                             |   8 +-
 lib/bpf_glue.c                           |  86 ++++++
 lib/{bpf.c => bpf_legacy.c}              | 193 ++++++++++++-
 lib/bpf_libbpf.c                         | 348 +++++++++++++++++++++++
 tc/tc.c                                  |  10 +-
 19 files changed, 1017 insertions(+), 62 deletions(-)
 rename examples/bpf/{ => legacy}/bpf_cyclic.c (95%)
 create mode 100644 examples/bpf/legacy/bpf_graft.c
 create mode 100644 examples/bpf/legacy/bpf_map_in_map.c
 create mode 100644 examples/bpf/legacy/bpf_shared.c
 rename examples/bpf/{ => legacy}/bpf_tailcall.c (98%)
 create mode 100644 lib/bpf_glue.c
 rename lib/{bpf.c => bpf_legacy.c} (94%)
 create mode 100644 lib/bpf_libbpf.c

-- 
2.25.4


^ permalink raw reply

* [PATCH bpf] net, xsk: Avoid taking multiple skbuff references
From: Björn Töpel @ 2020-11-23 13:12 UTC (permalink / raw)
  To: ast, daniel, netdev, bpf
  Cc: Björn Töpel, jonathan.lemon, yhs, weqaar.janjua,
	magnus.karlsson, weqaar.a.janjua

From: Björn Töpel <bjorn.topel@intel.com>

Commit 642e450b6b59 ("xsk: Do not discard packet when NETDEV_TX_BUSY")
addressed the problem that packets were discarded from the Tx AF_XDP
ring, when the driver returned NETDEV_TX_BUSY. Part of the fix was
bumping the skbuff reference count, so that the buffer would not be
freed by dev_direct_xmit(). A reference count larger than one means
that the skbuff is "shared", which is not the case.

If the "shared" skbuff is sent to the generic XDP receive path,
netif_receive_generic_xdp(), and pskb_expand_head() is entered the
BUG_ON(skb_shared(skb)) will trigger.

This patch adds a variant to dev_direct_xmit(), __dev_direct_xmit(),
where a user can select the skbuff free policy. This allows AF_XDP to
avoid bumping the reference count, but still keep the NETDEV_TX_BUSY
behavior.

Reported-by: Yonghong Song <yhs@fb.com>
Fixes: 642e450b6b59 ("xsk: Do not discard packet when NETDEV_TX_BUSY")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 include/linux/netdevice.h | 1 +
 net/core/dev.c            | 9 +++++++--
 net/xdp/xsk.c             | 8 +-------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 964b494b0e8d..e7402fca7752 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2815,6 +2815,7 @@ u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
 		       struct net_device *sb_dev);
 int dev_queue_xmit(struct sk_buff *skb);
 int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
+int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id, bool free_on_busy);
 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
 int register_netdevice(struct net_device *dev);
 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
diff --git a/net/core/dev.c b/net/core/dev.c
index 82dc6b48e45f..2af79a4253bb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4180,7 +4180,7 @@ int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
 }
 EXPORT_SYMBOL(dev_queue_xmit_accel);
 
-int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id, bool free_on_busy)
 {
 	struct net_device *dev = skb->dev;
 	struct sk_buff *orig_skb = skb;
@@ -4211,7 +4211,7 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 
 	local_bh_enable();
 
-	if (!dev_xmit_complete(ret))
+	if (free_on_busy && !dev_xmit_complete(ret))
 		kfree_skb(skb);
 
 	return ret;
@@ -4220,6 +4220,11 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
 	kfree_skb_list(skb);
 	return NET_XMIT_DROP;
 }
+
+int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+{
+	return __dev_direct_xmit(skb, queue_id, true);
+}
 EXPORT_SYMBOL(dev_direct_xmit);
 
 /*************************************************************************
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 5a6cdf7b320d..c6ad31b374b7 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -411,11 +411,7 @@ static int xsk_generic_xmit(struct sock *sk)
 		skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
 		skb->destructor = xsk_destruct_skb;
 
-		/* Hinder dev_direct_xmit from freeing the packet and
-		 * therefore completing it in the destructor
-		 */
-		refcount_inc(&skb->users);
-		err = dev_direct_xmit(skb, xs->queue_id);
+		err = __dev_direct_xmit(skb, xs->queue_id, false);
 		if  (err == NETDEV_TX_BUSY) {
 			/* Tell user-space to retry the send */
 			skb->destructor = sock_wfree;
@@ -429,12 +425,10 @@ static int xsk_generic_xmit(struct sock *sk)
 		/* Ignore NET_XMIT_CN as packet might have been sent */
 		if (err == NET_XMIT_DROP) {
 			/* SKB completed but not sent */
-			kfree_skb(skb);
 			err = -EBUSY;
 			goto out;
 		}
 
-		consume_skb(skb);
 		sent_frame = true;
 	}
 

base-commit: 178648916e73e00de83150eb0c90c0d3a977a46a
-- 
2.27.0


^ permalink raw reply related

* Re: [Intel-wired-lan] [PATCH 000/141] Fix fall-through warnings for Clang
From: Gustavo A. R. Silva @ 2020-11-23 13:03 UTC (permalink / raw)
  To: James Bottomley
  Cc: Joe Perches, Kees Cook, Jakub Kicinski, alsa-devel,
	linux-atm-general, reiserfs-devel, linux-iio, linux-wireless,
	linux-fbdev, dri-devel, linux-kernel, Nathan Chancellor,
	linux-ide, dm-devel, keyrings, linux-mtd, GR-everest-linux-l2,
	wcn36xx, samba-technical, linux-i3c, linux1394-devel, linux-afs,
	usb-storage, drbd-dev, devel, linux-cifs, rds-devel,
	Nick Desaulniers, linux-scsi, linux-rdma, oss-drivers, bridge,
	linux-security-module, amd-gfx, linux-stm32, cluster-devel,
	linux-acpi, coreteam, intel-wired-lan, linux-input, Miguel Ojeda,
	tipc-discussion, linux-ext4, linux-media, linux-watchdog, selinux,
	linux-arm-msm, intel-gfx, linux-geode, linux-can, linux-block,
	linux-gpio, op-tee, linux-mediatek, xen-devel, nouveau,
	linux-hams, ceph-devel, virtualization, linux-arm-kernel,
	linux-hwmon, x86, linux-nfs, GR-Linux-NIC-Dev, linux-mm, netdev,
	linux-decnet-user, linux-mmc, linux-renesas-soc, linux-sctp,
	linux-usb, netfilter-devel, linux-crypto, patches,
	linux-integrity, target-devel, linux-hardening, Jonathan Cameron,
	Greg KH
In-Reply-To: <dbd2cb703ed9eefa7dde9281ea26ab0f7acc8afe.camel@HansenPartnership.com>

On Sun, Nov 22, 2020 at 11:53:55AM -0800, James Bottomley wrote:
> On Sun, 2020-11-22 at 11:22 -0800, Joe Perches wrote:
> > On Sun, 2020-11-22 at 11:12 -0800, James Bottomley wrote:
> > > On Sun, 2020-11-22 at 10:25 -0800, Joe Perches wrote:
> > > > On Sun, 2020-11-22 at 10:21 -0800, James Bottomley wrote:
> > > > > Please tell me our reward for all this effort isn't a single
> > > > > missing error print.
> > > > 
> > > > There were quite literally dozens of logical defects found
> > > > by the fallthrough additions.  Very few were logging only.
> > > 
> > > So can you give us the best examples (or indeed all of them if
> > > someone is keeping score)?  hopefully this isn't a US election
> > > situation ...
> > 
> > Gustavo?  Are you running for congress now?
> > 
> > https://lwn.net/Articles/794944/
> 
> That's 21 reported fixes of which about 50% seem to produce no change
> in code behaviour at all, a quarter seem to have no user visible effect
> with the remaining quarter producing unexpected errors on obscure
> configuration parameters, which is why no-one really noticed them
> before.

The really important point here is the number of bugs this has prevented
and will prevent in the future. See an example of this, below:

https://lore.kernel.org/linux-iio/20190813135802.GB27392@kroah.com/

This work is still relevant, even if the total number of issues/bugs
we find in the process is zero (which is not the case).

"The sucky thing about doing hard work to deploy hardening is that the
result is totally invisible by definition (things not happening) [..]"
- Dmitry Vyukov

Thanks
--
Gustavo






^ permalink raw reply

* Re: [PATCH net-next] bridge: mrp: Implement LC mode for MRP
From: Horatiu Vultur @ 2020-11-23 12:31 UTC (permalink / raw)
  To: Nikolay Aleksandrov; +Cc: roopa, davem, kuba, linux-kernel, bridge, netdev
In-Reply-To: <5ffa6f9f-d1f3-adc7-ddb8-e8107ea78da5@nvidia.com>

The 11/23/2020 14:13, Nikolay Aleksandrov wrote:
> EXTERNAL EMAIL: Do not click links or open attachments unless you know the content is safe
> 
> On 23/11/2020 13:14, Horatiu Vultur wrote:
> > Extend MRP to support LC mode(link check) for the interconnect port.
> > This applies only to the interconnect ring.
> >
> > Opposite to RC mode(ring check) the LC mode is using CFM frames to
> > detect when the link goes up or down and based on that the userspace
> > will need to react.
> > One advantage of the LC mode over RC mode is that there will be fewer
> > frames in the normal rings. Because RC mode generates InTest on all
> > ports while LC mode sends CFM frame only on the interconnect port.
> >
> > All 4 nodes part of the interconnect ring needs to have the same mode.
> > And it is not possible to have running LC and RC mode at the same time
> > on a node.
> >
> > Whenever the MIM starts it needs to detect the status of the other 3
> > nodes in the interconnect ring so it would send a frame called
> > InLinkStatus, on which the clients needs to reply with their link
> > status.
> >
> > This patch adds the frame header for the frame InLinkStatus and
> > extends existing rules on how to forward this frame.
> >
> > Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
> > ---
> >  include/uapi/linux/mrp_bridge.h |  7 +++++++
> >  net/bridge/br_mrp.c             | 18 +++++++++++++++---
> >  2 files changed, 22 insertions(+), 3 deletions(-)
> >
> 
> Hi Horatiu,
> The patch looks good overall, just one question below.

Hi Nik,

Thanks for taking time to review the patch.

> 
> > diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h
> > index 6aeb13ef0b1e..450f6941a5a1 100644
> > --- a/include/uapi/linux/mrp_bridge.h
> > +++ b/include/uapi/linux/mrp_bridge.h
> > @@ -61,6 +61,7 @@ enum br_mrp_tlv_header_type {
> >       BR_MRP_TLV_HEADER_IN_TOPO = 0x7,
> >       BR_MRP_TLV_HEADER_IN_LINK_DOWN = 0x8,
> >       BR_MRP_TLV_HEADER_IN_LINK_UP = 0x9,
> > +     BR_MRP_TLV_HEADER_IN_LINK_STATUS = 0xa,
> >       BR_MRP_TLV_HEADER_OPTION = 0x7f,
> >  };
> >
> > @@ -156,4 +157,10 @@ struct br_mrp_in_link_hdr {
> >       __be16 interval;
> >  };
> >
> > +struct br_mrp_in_link_status_hdr {
> > +     __u8 sa[ETH_ALEN];
> > +     __be16 port_role;
> > +     __be16 id;
> > +};
> > +
> 
> I didn't see this struct used anywhere, am I missing anything?

Yes, you are right, the struct is not used any. But I put it there as I
put the other frame types for MRP.

> 
> Cheers,
>  Nik
> 
> >  #endif
> > diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c
> > index bb12fbf9aaf2..cec2c4e4561d 100644
> > --- a/net/bridge/br_mrp.c
> > +++ b/net/bridge/br_mrp.c
> > @@ -858,7 +858,8 @@ static bool br_mrp_in_frame(struct sk_buff *skb)
> >       if (hdr->type == BR_MRP_TLV_HEADER_IN_TEST ||
> >           hdr->type == BR_MRP_TLV_HEADER_IN_TOPO ||
> >           hdr->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN ||
> > -         hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP)
> > +         hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP ||
> > +         hdr->type == BR_MRP_TLV_HEADER_IN_LINK_STATUS)
> >               return true;
> >
> >       return false;
> > @@ -1126,9 +1127,9 @@ static int br_mrp_rcv(struct net_bridge_port *p,
> >                                               goto no_forward;
> >                               }
> >                       } else {
> > -                             /* MIM should forward IntLinkChange and
> > +                             /* MIM should forward IntLinkChange/Status and
> >                                * IntTopoChange between ring ports but MIM
> > -                              * should not forward IntLinkChange and
> > +                              * should not forward IntLinkChange/Status and
> >                                * IntTopoChange if the frame was received at
> >                                * the interconnect port
> >                                */
> > @@ -1155,6 +1156,17 @@ static int br_mrp_rcv(struct net_bridge_port *p,
> >                            in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN))
> >                               goto forward;
> >
> > +                     /* MIC should forward IntLinkStatus frames only to
> > +                      * interconnect port if it was received on a ring port.
> > +                      * If it is received on interconnect port then, it
> > +                      * should be forward on both ring ports
> > +                      */
> > +                     if (br_mrp_is_ring_port(p_port, s_port, p) &&
> > +                         in_type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) {
> > +                             p_dst = NULL;
> > +                             s_dst = NULL;
> > +                     }
> > +
> >                       /* Should forward the InTopo frames only between the
> >                        * ring ports
> >                        */
> >
> 

-- 
/Horatiu

^ permalink raw reply

* Re: [PATCH bpf] xsk: fix incorrect netdev reference count
From: patchwork-bot+netdevbpf @ 2020-11-23 12:30 UTC (permalink / raw)
  To: Marek Majtyka
  Cc: magnus.karlsson, bjorn.topel, ast, daniel, netdev, jonathan.lemon,
	marekx.majtyka, bpf
In-Reply-To: <20201120151443.105903-1-marekx.majtyka@intel.com>

Hello:

This patch was applied to bpf/bpf.git (refs/heads/master):

On Fri, 20 Nov 2020 16:14:43 +0100 you wrote:
> From: Marek Majtyka <marekx.majtyka@intel.com>
> 
> Fix incorrect netdev reference count in xsk_bind operation. Incorrect
> reference count of the device appears when a user calls bind with the
> XDP_ZEROCOPY flag on an interface which does not support zero-copy.
> In such a case, an error is returned but the reference count is not
> decreased. This change fixes the fault, by decreasing the reference count
> in case of such an error.
> 
> [...]

Here is the summary with links:
  - [bpf] xsk: fix incorrect netdev reference count
    https://git.kernel.org/bpf/bpf/c/178648916e73

You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH bpf-next v2 0/5] selftests/bpf: xsk selftests
From: Björn Töpel @ 2020-11-23 12:20 UTC (permalink / raw)
  To: Yonghong Song, Weqaar Janjua, bpf, netdev, daniel, ast,
	magnus.karlsson
  Cc: Weqaar Janjua, shuah, skhan, linux-kselftest, anders.roxell,
	jonathan.lemon
In-Reply-To: <586d63b4-1828-f633-a4ff-88e4e23d164a@fb.com>

On 2020-11-21 01:31, Yonghong Song wrote:
> 
> 
> On 11/20/20 5:00 AM, Weqaar Janjua wrote:
>> This patch set adds AF_XDP selftests based on veth to selftests/bpf.
>>
>> # Topology:
>> # ---------
>> #                 -----------
>> #               _ | Process | _
>> #              /  -----------  \
>> #             /        |        \
>> #            /         |         \
>> #      -----------     |     -----------
>> #      | Thread1 |     |     | Thread2 |
>> #      -----------     |     -----------
>> #           |          |          |
>> #      -----------     |     -----------
>> #      |  xskX   |     |     |  xskY   |
>> #      -----------     |     -----------
>> #           |          |          |
>> #      -----------     |     ----------
>> #      |  vethX  | --------- |  vethY |
>> #      -----------   peer    ----------
>> #           |          |          |
>> #      namespaceX      |     namespaceY
>>
>> These selftests test AF_XDP SKB and Native/DRV modes using veth Virtual
>> Ethernet interfaces.
>>
>> The test program contains two threads, each thread is single socket with
>> a unique UMEM. It validates in-order packet delivery and packet content
>> by sending packets to each other.
>>
>> Prerequisites setup by script test_xsk_prerequisites.sh:
>>
>>     Set up veth interfaces as per the topology shown ^^:
>>     * setup two veth interfaces and one namespace
>>     ** veth<xxxx> in root namespace
>>     ** veth<yyyy> in af_xdp<xxxx> namespace
>>     ** namespace af_xdp<xxxx>
>>     * create a spec file veth.spec that includes this run-time 
>> configuration
>>       that is read by test scripts - filenames prefixed with test_xsk_
>>     *** xxxx and yyyy are randomly generated 4 digit numbers used to 
>> avoid
>>         conflict with any existing interface
>>
>> The following tests are provided:
>>
>> 1. AF_XDP SKB mode
>>     Generic mode XDP is driver independent, used when the driver does
>>     not have support for XDP. Works on any netdevice using sockets and
>>     generic XDP path. XDP hook from netif_receive_skb().
>>     a. nopoll - soft-irq processing
>>     b. poll - using poll() syscall
>>     c. Socket Teardown
>>        Create a Tx and a Rx socket, Tx from one socket, Rx on another.
>>        Destroy both sockets, then repeat multiple times. Only nopoll mode
>>       is used
>>     d. Bi-directional Sockets
>>        Configure sockets as bi-directional tx/rx sockets, sets up fill
>>       and completion rings on each socket, tx/rx in both directions.
>>       Only nopoll mode is used
>>
>> 2. AF_XDP DRV/Native mode
>>     Works on any netdevice with XDP_REDIRECT support, driver dependent.
>>     Processes packets before SKB allocation. Provides better performance
>>     than SKB. Driver hook available just after DMA of buffer descriptor.
>>     a. nopoll
>>     b. poll
>>     c. Socket Teardown
>>     d. Bi-directional Sockets
>>     * Only copy mode is supported because veth does not currently support
>>       zero-copy mode
>>
>> Total tests: 8
>>
>> Flow:
>> * Single process spawns two threads: Tx and Rx
>> * Each of these two threads attach to a veth interface within their
>>    assigned namespaces
>> * Each thread creates one AF_XDP socket connected to a unique umem
>>    for each veth interface
>> * Tx thread transmits 10k packets from veth<xxxx> to veth<yyyy>
>> * Rx thread verifies if all 10k packets were received and delivered
>>    in-order, and have the right content
>>
>> v2 changes:
>> * Move selftests/xsk to selftests/bpf
>> * Remove Makefiles under selftests/xsk, and utilize 
>> selftests/bpf/Makefile
>>
>> Structure of the patch set:
>>
>> Patch 1: This patch adds XSK Selftests framework under selftests/bpf
>> Patch 2: Adds tests: SKB poll and nopoll mode, and mac-ip-udp debug
>> Patch 3: Adds tests: DRV poll and nopoll mode
>> Patch 4: Adds tests: SKB and DRV Socket Teardown
>> Patch 5: Adds tests: SKB and DRV Bi-directional Sockets
> 
> I just want to report that after applying the above 5 patches
> on top of bpf-next commit 450d060e8f75 ("bpftool: Add {i,d}tlb_misses 
> support for bpftool profile"), I hit the following error with below 
> command sequences:
> 
>   $ ./test_xsk_prerequisites.sh
>   $ ./test_xsk_skb_poll.sh
> # Interface found: ve1480
> # Interface found: ve9258
> # NS switched: af_xdp9258
> 1..1
> # Interface [ve9258] vector [Rx]
> # Interface [ve1480] vector [Tx]
> # Sending 10000 packets on interface ve1480
> [  331.741244] ------------[ cut here ]------------
> [  331.741741] kernel BUG at net/core/skbuff.c:1621!
> [  331.742265] invalid opcode: 0000 [#1] PREEMPT SMP PTI
> [  331.742837] CPU: 0 PID: 1883 Comm: xdpxceiver Not tainted 5.10.0-rc3+ 
> #1037
> [  331.743468] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
> BIOS 1.9.3
> -1.el7.centos 04/01/2014
> [  331.744300] RIP: 0010:pskb_expand_head+0x27b/0x310

Ugh, looks like the tests are working. :-P

This is a BUG_ON(skb_shared(skb)) trigger, related to the skbuff 
refcount changes done recently in AF_XDP.

I'll cook a patch! Thanks for the report!


Björn

^ permalink raw reply

* Re: [PATCHv2 1/1] xdp: remove the function xsk_map_inc
From: Magnus Karlsson @ 2020-11-23 12:19 UTC (permalink / raw)
  To: Zhu Yanjun
  Cc: Karlsson, Magnus, Björn Töpel, David S. Miller, netdev
In-Reply-To: <CAD=hENfysbUCNapfFZ6i0tOFo5Ge3QS+iQSt2ySBDb10zFdgwg@mail.gmail.com>

On Mon, Nov 23, 2020 at 1:11 PM Zhu Yanjun <zyjzyj2000@gmail.com> wrote:
>
> On Mon, Nov 23, 2020 at 8:05 PM <zyjzyj2000@gmail.com> wrote:
> >
> > From: Zhu Yanjun <zyjzyj2000@gmail.com>
> >
> > The function xsk_map_inc is a simple wrapper of bpf_map_inc and
> > always returns zero. As such, replacing this function with bpf_map_inc
> > and removing the test code.
> >
> > Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com>
>
>
> > ---
> >  net/xdp/xsk.c    |  1 -
> >  net/xdp/xsk.h    |  1 -
> >  net/xdp/xskmap.c | 13 +------------
> >  3 files changed, 1 insertion(+), 14 deletions(-)
> >
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index cfbec3989a76..c1b8a888591c 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -548,7 +548,6 @@ static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
> >         node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
> >                                         node);
> >         if (node) {
> > -               WARN_ON(xsk_map_inc(node->map));

This should be bpf_map_inc(&node->map->map); Think you forgot to
convert this one.

> >                 map = node->map;
> >                 *map_entry = node->map_entry;
> >         }
> > diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
> > index b9e896cee5bb..0aad25c0e223 100644
> > --- a/net/xdp/xsk.h
> > +++ b/net/xdp/xsk.h
> > @@ -41,7 +41,6 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk)
> >
> >  void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
> >                              struct xdp_sock **map_entry);
> > -int xsk_map_inc(struct xsk_map *map);
> >  void xsk_map_put(struct xsk_map *map);
> >  void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
> >  int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
> > diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
> > index 49da2b8ace8b..6b7e9a72b101 100644
> > --- a/net/xdp/xskmap.c
> > +++ b/net/xdp/xskmap.c
> > @@ -11,12 +11,6 @@
> >
> >  #include "xsk.h"
> >
> > -int xsk_map_inc(struct xsk_map *map)
> > -{
> > -       bpf_map_inc(&map->map);
> > -       return 0;
> > -}
>
> Hi, Magnus
>
> The function xsk_map_inc is replaced with bpf_map_inc.
>
> Zhu Yanjun
>
> > -
> >  void xsk_map_put(struct xsk_map *map)
> >  {
> >         bpf_map_put(&map->map);
> > @@ -26,17 +20,12 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
> >                                                struct xdp_sock **map_entry)
> >  {
> >         struct xsk_map_node *node;
> > -       int err;
> >
> >         node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
> >         if (!node)
> >                 return ERR_PTR(-ENOMEM);
> >
> > -       err = xsk_map_inc(map);
> > -       if (err) {
> > -               kfree(node);
> > -               return ERR_PTR(err);
> > -       }
> > +       bpf_map_inc(&map->map);
> >
> >         node->map = map;
> >         node->map_entry = map_entry;
> > --
> > 2.25.1
> >

^ permalink raw reply

* Re: [PATCH net-next] bridge: mrp: Implement LC mode for MRP
From: Nikolay Aleksandrov @ 2020-11-23 12:13 UTC (permalink / raw)
  To: Horatiu Vultur, roopa, davem, kuba, linux-kernel, bridge, netdev
In-Reply-To: <20201123111401.136952-1-horatiu.vultur@microchip.com>

On 23/11/2020 13:14, Horatiu Vultur wrote:
> Extend MRP to support LC mode(link check) for the interconnect port.
> This applies only to the interconnect ring.
> 
> Opposite to RC mode(ring check) the LC mode is using CFM frames to
> detect when the link goes up or down and based on that the userspace
> will need to react.
> One advantage of the LC mode over RC mode is that there will be fewer
> frames in the normal rings. Because RC mode generates InTest on all
> ports while LC mode sends CFM frame only on the interconnect port.
> 
> All 4 nodes part of the interconnect ring needs to have the same mode.
> And it is not possible to have running LC and RC mode at the same time
> on a node.
> 
> Whenever the MIM starts it needs to detect the status of the other 3
> nodes in the interconnect ring so it would send a frame called
> InLinkStatus, on which the clients needs to reply with their link
> status.
> 
> This patch adds the frame header for the frame InLinkStatus and
> extends existing rules on how to forward this frame.
> 
> Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
> ---
>  include/uapi/linux/mrp_bridge.h |  7 +++++++
>  net/bridge/br_mrp.c             | 18 +++++++++++++++---
>  2 files changed, 22 insertions(+), 3 deletions(-)
> 

Hi Horatiu,
The patch looks good overall, just one question below.

> diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h
> index 6aeb13ef0b1e..450f6941a5a1 100644
> --- a/include/uapi/linux/mrp_bridge.h
> +++ b/include/uapi/linux/mrp_bridge.h
> @@ -61,6 +61,7 @@ enum br_mrp_tlv_header_type {
>  	BR_MRP_TLV_HEADER_IN_TOPO = 0x7,
>  	BR_MRP_TLV_HEADER_IN_LINK_DOWN = 0x8,
>  	BR_MRP_TLV_HEADER_IN_LINK_UP = 0x9,
> +	BR_MRP_TLV_HEADER_IN_LINK_STATUS = 0xa,
>  	BR_MRP_TLV_HEADER_OPTION = 0x7f,
>  };
>  
> @@ -156,4 +157,10 @@ struct br_mrp_in_link_hdr {
>  	__be16 interval;
>  };
>  
> +struct br_mrp_in_link_status_hdr {
> +	__u8 sa[ETH_ALEN];
> +	__be16 port_role;
> +	__be16 id;
> +};
> +

I didn't see this struct used anywhere, am I missing anything?

Cheers,
 Nik

>  #endif
> diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c
> index bb12fbf9aaf2..cec2c4e4561d 100644
> --- a/net/bridge/br_mrp.c
> +++ b/net/bridge/br_mrp.c
> @@ -858,7 +858,8 @@ static bool br_mrp_in_frame(struct sk_buff *skb)
>  	if (hdr->type == BR_MRP_TLV_HEADER_IN_TEST ||
>  	    hdr->type == BR_MRP_TLV_HEADER_IN_TOPO ||
>  	    hdr->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN ||
> -	    hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP)
> +	    hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP ||
> +	    hdr->type == BR_MRP_TLV_HEADER_IN_LINK_STATUS)
>  		return true;
>  
>  	return false;
> @@ -1126,9 +1127,9 @@ static int br_mrp_rcv(struct net_bridge_port *p,
>  						goto no_forward;
>  				}
>  			} else {
> -				/* MIM should forward IntLinkChange and
> +				/* MIM should forward IntLinkChange/Status and
>  				 * IntTopoChange between ring ports but MIM
> -				 * should not forward IntLinkChange and
> +				 * should not forward IntLinkChange/Status and
>  				 * IntTopoChange if the frame was received at
>  				 * the interconnect port
>  				 */
> @@ -1155,6 +1156,17 @@ static int br_mrp_rcv(struct net_bridge_port *p,
>  			     in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN))
>  				goto forward;
>  
> +			/* MIC should forward IntLinkStatus frames only to
> +			 * interconnect port if it was received on a ring port.
> +			 * If it is received on interconnect port then, it
> +			 * should be forward on both ring ports
> +			 */
> +			if (br_mrp_is_ring_port(p_port, s_port, p) &&
> +			    in_type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) {
> +				p_dst = NULL;
> +				s_dst = NULL;
> +			}
> +
>  			/* Should forward the InTopo frames only between the
>  			 * ring ports
>  			 */
> 


^ permalink raw reply

* Re: [PATCHv2 1/1] xdp: remove the function xsk_map_inc
From: Zhu Yanjun @ 2020-11-23 12:10 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, David S. Miller, netdev
In-Reply-To: <20201123120531.724963-1-zyjzyj2000@gmail.com>

On Mon, Nov 23, 2020 at 8:05 PM <zyjzyj2000@gmail.com> wrote:
>
> From: Zhu Yanjun <zyjzyj2000@gmail.com>
>
> The function xsk_map_inc is a simple wrapper of bpf_map_inc and
> always returns zero. As such, replacing this function with bpf_map_inc
> and removing the test code.
>
> Signed-off-by: Zhu Yanjun <zyjzyj2000@gmail.com>


> ---
>  net/xdp/xsk.c    |  1 -
>  net/xdp/xsk.h    |  1 -
>  net/xdp/xskmap.c | 13 +------------
>  3 files changed, 1 insertion(+), 14 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index cfbec3989a76..c1b8a888591c 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -548,7 +548,6 @@ static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
>         node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
>                                         node);
>         if (node) {
> -               WARN_ON(xsk_map_inc(node->map));
>                 map = node->map;
>                 *map_entry = node->map_entry;
>         }
> diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h
> index b9e896cee5bb..0aad25c0e223 100644
> --- a/net/xdp/xsk.h
> +++ b/net/xdp/xsk.h
> @@ -41,7 +41,6 @@ static inline struct xdp_sock *xdp_sk(struct sock *sk)
>
>  void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
>                              struct xdp_sock **map_entry);
> -int xsk_map_inc(struct xsk_map *map);
>  void xsk_map_put(struct xsk_map *map);
>  void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id);
>  int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
> diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
> index 49da2b8ace8b..6b7e9a72b101 100644
> --- a/net/xdp/xskmap.c
> +++ b/net/xdp/xskmap.c
> @@ -11,12 +11,6 @@
>
>  #include "xsk.h"
>
> -int xsk_map_inc(struct xsk_map *map)
> -{
> -       bpf_map_inc(&map->map);
> -       return 0;
> -}

Hi, Magnus

The function xsk_map_inc is replaced with bpf_map_inc.

Zhu Yanjun

> -
>  void xsk_map_put(struct xsk_map *map)
>  {
>         bpf_map_put(&map->map);
> @@ -26,17 +20,12 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
>                                                struct xdp_sock **map_entry)
>  {
>         struct xsk_map_node *node;
> -       int err;
>
>         node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
>         if (!node)
>                 return ERR_PTR(-ENOMEM);
>
> -       err = xsk_map_inc(map);
> -       if (err) {
> -               kfree(node);
> -               return ERR_PTR(err);
> -       }
> +       bpf_map_inc(&map->map);
>
>         node->map = map;
>         node->map_entry = map_entry;
> --
> 2.25.1
>

^ permalink raw reply

* Re: [PATCH v15 6/9] arm64/kvm: Add hypercall service for kvm ptp.
From: Marc Zyngier @ 2020-11-23 11:59 UTC (permalink / raw)
  To: Jianyong Wu
  Cc: justin.he, kvm, netdev, richardcochran, linux-kernel,
	sean.j.christopherson, steven.price, Andre.Przywara, john.stultz,
	yangbo.lu, pbonzini, tglx, nd, will, kvmarm, linux-arm-kernel
In-Reply-To: <d409aa1cb7cfcbf4351e6c5fc34d9c7e@kernel.org>

On 2020-11-23 10:44, Marc Zyngier wrote:
> On 2020-11-11 06:22, Jianyong Wu wrote:
>> ptp_kvm will get this service through SMCC call.
>> The service offers wall time and cycle count of host to guest.
>> The caller must specify whether they want the host cycle count
>> or the difference between host cycle count and cntvoff.
>> 
>> Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
>> ---
>>  arch/arm64/kvm/hypercalls.c | 61 
>> +++++++++++++++++++++++++++++++++++++
>>  include/linux/arm-smccc.h   | 17 +++++++++++
>>  2 files changed, 78 insertions(+)
>> 
>> diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
>> index b9d8607083eb..f7d189563f3d 100644
>> --- a/arch/arm64/kvm/hypercalls.c
>> +++ b/arch/arm64/kvm/hypercalls.c
>> @@ -9,6 +9,51 @@
>>  #include <kvm/arm_hypercalls.h>
>>  #include <kvm/arm_psci.h>
>> 
>> +static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
>> +{
>> +	struct system_time_snapshot systime_snapshot;
>> +	u64 cycles = ~0UL;
>> +	u32 feature;
>> +
>> +	/*
>> +	 * system time and counter value must captured in the same
>> +	 * time to keep consistency and precision.
>> +	 */
>> +	ktime_get_snapshot(&systime_snapshot);
>> +
>> +	// binding ptp_kvm clocksource to arm_arch_counter
>> +	if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER)
>> +		return;
>> +
>> +	val[0] = upper_32_bits(systime_snapshot.real);
>> +	val[1] = lower_32_bits(systime_snapshot.real);
> 
> What is the endianness of these values? I can't see it defined
> anywhere, and this is likely not to work if guest and hypervisor
> don't align.

Scratch that. This is all passed via registers, so the endianness
of the data is irrelevant. Please discard any comment about endianness
I made in this review.

The documentation aspect still requires to be beefed up.

Thanks,

         M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply

* BUG: receive list entry not found for dev vxcan1, id 002, mask C00007FF
From: syzbot @ 2020-11-23 11:58 UTC (permalink / raw)
  To: davem, kuba, linux-can, linux-kernel, mkl, netdev, socketcan,
	syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    c2e7554e Merge tag 'gfs2-v5.10-rc4-fixes' of git://git.ker..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=117f03ba500000
kernel config:  https://syzkaller.appspot.com/x/.config?x=75292221eb79ace2
dashboard link: https://syzkaller.appspot.com/bug?extid=381d06e0c8eaacb8706f
compiler:       gcc (GCC) 10.1.0-syz 20200507

Unfortunately, I don't have any reproducer for this issue yet.

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+381d06e0c8eaacb8706f@syzkaller.appspotmail.com

------------[ cut here ]------------
BUG: receive list entry not found for dev vxcan1, id 002, mask C00007FF
WARNING: CPU: 1 PID: 12946 at net/can/af_can.c:546 can_rx_unregister+0x5a4/0x700 net/can/af_can.c:546
Modules linked in:
CPU: 1 PID: 12946 Comm: syz-executor.1 Not tainted 5.10.0-rc4-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:can_rx_unregister+0x5a4/0x700 net/can/af_can.c:546
Code: 8b 7c 24 78 44 8b 64 24 68 49 c7 c5 20 ac 56 8a e8 01 6c 97 f9 44 89 f9 44 89 e2 4c 89 ee 48 c7 c7 60 ac 56 8a e8 66 af d3 00 <0f> 0b 48 8b 7c 24 28 e8 b0 25 0f 01 e9 54 fb ff ff e8 26 e0 d8 f9
RSP: 0018:ffffc90017e2fb38 EFLAGS: 00010286
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: ffff8880147a8000 RSI: ffffffff8158f3c5 RDI: fffff52002fc5f59
RBP: 0000000000000118 R08: 0000000000000001 R09: ffff8880b9f2011b
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000002
R13: ffff8880254c0000 R14: 1ffff92002fc5f6e R15: 00000000c00007ff
FS:  0000000001ddc940(0000) GS:ffff8880b9f00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000001b2f121000 CR3: 00000000152c0000 CR4: 00000000001506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 isotp_notifier+0x2a7/0x540 net/can/isotp.c:1303
 call_netdevice_notifier net/core/dev.c:1735 [inline]
 call_netdevice_unregister_notifiers+0x156/0x1c0 net/core/dev.c:1763
 call_netdevice_unregister_net_notifiers net/core/dev.c:1791 [inline]
 unregister_netdevice_notifier+0xcd/0x170 net/core/dev.c:1870
 isotp_release+0x136/0x600 net/can/isotp.c:1011
 __sock_release+0xcd/0x280 net/socket.c:596
 sock_close+0x18/0x20 net/socket.c:1277
 __fput+0x285/0x920 fs/file_table.c:281
 task_work_run+0xdd/0x190 kernel/task_work.c:151
 tracehook_notify_resume include/linux/tracehook.h:188 [inline]
 exit_to_user_mode_loop kernel/entry/common.c:164 [inline]
 exit_to_user_mode_prepare+0x17e/0x1a0 kernel/entry/common.c:191
 syscall_exit_to_user_mode+0x38/0x260 kernel/entry/common.c:266
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x417811
Code: 75 14 b8 03 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 a4 1a 00 00 c3 48 83 ec 08 e8 0a fc ff ff 48 89 04 24 b8 03 00 00 00 0f 05 <48> 8b 3c 24 48 89 c2 e8 53 fc ff ff 48 89 d0 48 83 c4 08 48 3d 01
RSP: 002b:000000000169fbf0 EFLAGS: 00000293 ORIG_RAX: 0000000000000003
RAX: 0000000000000000 RBX: 0000000000000004 RCX: 0000000000417811
RDX: 0000000000000000 RSI: 00000000000013b7 RDI: 0000000000000003
RBP: 0000000000000001 R08: 00000000acabb3b7 R09: 00000000acabb3bb
R10: 000000000169fcd0 R11: 0000000000000293 R12: 000000000118c9a0
R13: 000000000118c9a0 R14: 00000000000003e8 R15: 000000000118bf2c


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

^ permalink raw reply

* Re: [PATCH 1/2] bluetooth: hci_event: consolidate error paths in hci_phy_link_complete_evt()
From: Marcel Holtmann @ 2020-11-23 11:58 UTC (permalink / raw)
  To: Sergey Shtylyov
  Cc: Johan Hedberg, linux-bluetooth, David S. Miller, Jakub Kicinski,
	netdev
In-Reply-To: <c7579df5-a69b-d9e7-ccb6-6a7b2fc23d4a@omprussia.ru>

Hi Sergey,

>>> hci_phy_link_complete_evt() has several duplicate error paths -- consolidate
>>> them, using the *goto* statements.
>>> 
>>> Signed-off-by: Sergey Shtylyov <s.shtylyov@omprussia.ru>
>>> 
>>> ---
>>> net/bluetooth/hci_event.c |   16 ++++++----------
>>> 1 file changed, 6 insertions(+), 10 deletions(-)
>> patch has been applied to bluetooth-next tree.
> 
>   What about the 2nd patch?

must have been slipping somehow. Can you please re-send against bluetooth-next.

Regards

Marcel


^ permalink raw reply

* Re: [PATCH net] Bluetooth: Fix potential null pointer dereference in create_le_conn_complete
From: Marcel Holtmann @ 2020-11-23 11:58 UTC (permalink / raw)
  To: Wang Hai
  Cc: David S. Miller, kuba, Johan Hedberg, jpawlowski, linux-bluetooth,
	netdev, linux-kernel
In-Reply-To: <20201113113956.52187-1-wanghai38@huawei.com>

Hi Wang,

> The pointer 'conn' may be null. Before being used by
> hci_connect_le_scan_cleanup(), The pointer 'conn' must be
> checked whether it is null.
> 
> Fixes: 28a667c9c279 ("Bluetooth: advertisement handling in new connect procedure")
> Reported-by: Hulk Robot <hulkci@huawei.com>
> Signed-off-by: Wang Hai <wanghai38@huawei.com>
> ---
> net/bluetooth/hci_conn.c | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)

please send a version that applies cleanly against bluetooth-next tree.

Regards

Marcel


^ permalink raw reply

* Re: [PATCH net-next 10/10] mptcp: refine MPTCP-level ack scheduling
From: Eric Dumazet @ 2020-11-23 11:57 UTC (permalink / raw)
  To: Mat Martineau, netdev; +Cc: Paolo Abeni, kuba, mptcp
In-Reply-To: <20201119194603.103158-11-mathew.j.martineau@linux.intel.com>



On 11/19/20 8:46 PM, Mat Martineau wrote:
> From: Paolo Abeni <pabeni@redhat.com>
> 
> Send timely MPTCP-level ack is somewhat difficult when
> the insertion into the msk receive level is performed
> by the worker.
> 
> It needs TCP-level dup-ack to notify the MPTCP-level
> ack_seq increase, as both the TCP-level ack seq and the
> rcv window are unchanged.
> 
> We can actually avoid processing incoming data with the
> worker, and let the subflow or recevmsg() send ack as needed.
> 
> When recvmsg() moves the skbs inside the msk receive queue,
> the msk space is still unchanged, so tcp_cleanup_rbuf() could
> end-up skipping TCP-level ack generation. Anyway, when
> __mptcp_move_skbs() is invoked, a known amount of bytes is
> going to be consumed soon: we update rcv wnd computation taking
> them in account.
> 
> Additionally we need to explicitly trigger tcp_cleanup_rbuf()
> when recvmsg() consumes a significant amount of the receive buffer.
> 
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
> ---
>  net/mptcp/options.c  |   1 +
>  net/mptcp/protocol.c | 105 +++++++++++++++++++++----------------------
>  net/mptcp/protocol.h |   8 ++++
>  net/mptcp/subflow.c  |   4 +-
>  4 files changed, 61 insertions(+), 57 deletions(-)
> 
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 248e3930c0cb..8a59b3e44599 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -530,6 +530,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
>  		opts->ext_copy.ack64 = 0;
>  	}
>  	opts->ext_copy.use_ack = 1;
> +	WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
>  
>  	/* Add kind/length/subtype/flag overhead if mapping is not populated */
>  	if (dss_size == 0)
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 4ae2c4a30e44..748343f1a968 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -407,16 +407,42 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
>  	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
>  }
>  
> -static void mptcp_send_ack(struct mptcp_sock *msk)
> +static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
> +{
> +	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> +
> +	/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
> +	if (subflow->request_join && !subflow->fully_established)
> +		return false;
> +
> +	/* only send if our side has not closed yet */
> +	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
> +}
> +
> +static void mptcp_send_ack(struct mptcp_sock *msk, bool force)
>  {
>  	struct mptcp_subflow_context *subflow;
> +	struct sock *pick = NULL;
>  
>  	mptcp_for_each_subflow(msk, subflow) {
>  		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
>  
> -		lock_sock(ssk);
> -		tcp_send_ack(ssk);
> -		release_sock(ssk);
> +		if (force) {
> +			lock_sock(ssk);
> +			tcp_send_ack(ssk);
> +			release_sock(ssk);
> +			continue;
> +		}
> +
> +		/* if the hintes ssk is still active, use it */
> +		pick = ssk;
> +		if (ssk == msk->ack_hint)
> +			break;
> +	}
> +	if (!force && pick) {
> +		lock_sock(pick);
> +		tcp_cleanup_rbuf(pick, 1);

Calling tcp_cleanup_rbuf() on a socket that was never established is going to fail
with a divide by 0 (mss being 0)

AFAIK, mptcp_recvmsg() can be called right after a socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP)
call.

Probably, after a lock_sock(), you should double check socket state (same above before calling tcp_send_ack())



> +		release_sock(pick);
>  	}
>  }
>  


....

>  
> +		/* be sure to advertise window change */
> +		old_space = READ_ONCE(msk->old_wspace);
> +		if ((tcp_space(sk) - old_space) >= old_space)
> +			mptcp_send_ack(msk, false);
> +

Yes, if we call recvmsg() right after socket(), we will end up calling tcp_cleanup_rbuf(),
while no byte was ever copied/drained.


^ permalink raw reply

* Re: [PATCH] Bluetooth: sco: Fix crash when using BT_SNDMTU/BT_RCVMTU option
From: Marcel Holtmann @ 2020-11-23 11:53 UTC (permalink / raw)
  To: Wei Yongjun
  Cc: Marcel Holtmann Johan Hedberg Jakub Kicinski Joseph Hwang Alain Michaud Abhishek Pandit-Subedi Pali Rohár,
	linux-bluetooth, netdev, Hulk Robot
In-Reply-To: <20201116132421.94624-1-weiyongjun1@huawei.com>

Hi Wei,

> This commit add the invalid check for connected socket, without it will
> causes the following crash due to sco_pi(sk)->conn being NULL:
> 
> KASAN: null-ptr-deref in range [0x0000000000000050-0x0000000000000057]
> CPU: 3 PID: 4284 Comm: test_sco Not tainted 5.10.0-rc3+ #1
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014
> RIP: 0010:sco_sock_getsockopt+0x45d/0x8e0
> Code: 48 c1 ea 03 80 3c 02 00 0f 85 ca 03 00 00 49 8b 9d f8 04 00 00 48 b8 00
>      00 00 00 00 fc ff df 48 8d 7b 50 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84
>      c0 74 08 3c 03 0f 8e b5 03 00 00 8b 43 50 48 8b 0c
> RSP: 0018:ffff88801bb17d88 EFLAGS: 00010206
> RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff83a4ecdf
> RDX: 000000000000000a RSI: ffffc90002fce000 RDI: 0000000000000050
> RBP: 1ffff11003762fb4 R08: 0000000000000001 R09: ffff88810e1008c0
> R10: ffffffffbd695dcf R11: fffffbfff7ad2bb9 R12: 0000000000000000
> R13: ffff888018ff1000 R14: dffffc0000000000 R15: 000000000000000d
> FS:  00007fb4f76c1700(0000) GS:ffff88811af80000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00005555e3b7a938 CR3: 00000001117be001 CR4: 0000000000770ee0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> PKRU: 55555554
> Call Trace:
> ? sco_skb_put_cmsg+0x80/0x80
> ? sco_skb_put_cmsg+0x80/0x80
> __sys_getsockopt+0x12a/0x220
> ? __ia32_sys_setsockopt+0x150/0x150
> ? syscall_enter_from_user_mode+0x18/0x50
> ? rcu_read_lock_bh_held+0xb0/0xb0
> __x64_sys_getsockopt+0xba/0x150
> ? syscall_enter_from_user_mode+0x1d/0x50
> do_syscall_64+0x33/0x40
> entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Fixes: 0fc1a726f897 ("Bluetooth: sco: new getsockopt options BT_SNDMTU/BT_RCVMTU")
> Reported-by: Hulk Robot <hulkci@huawei.com>
> Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>

patch has been applied to bluetooth-next tree.

Regards

Marcel


^ permalink raw reply

* Re: [PATCH 0/3] Bluetooth: Power down controller when suspending
From: Marcel Holtmann @ 2020-11-23 11:46 UTC (permalink / raw)
  To: Abhishek Pandit-Subedi
  Cc: BlueZ development, chromeos-bluetooth-upstreaming, mcchou,
	danielwinkler, David S. Miller, Johan Hedberg, netdev,
	linux-kernel, Jakub Kicinski
In-Reply-To: <20201118234352.2138694-1-abhishekpandit@chromium.org>

Hi Abhishek,

> This patch series adds support for a quirk that will power down the
> Bluetooth controller when suspending and power it back up when resuming.
> 
> On Marvell SDIO Bluetooth controllers (SD8897 and SD8997), we are seeing
> a large number of suspend failures with the following log messages:
> 
> [ 4764.773873] Bluetooth: hci_cmd_timeout() hci0 command 0x0c14 tx timeout
> [ 4767.777897] Bluetooth: btmrvl_enable_hs() Host sleep enable command failed
> [ 4767.777920] Bluetooth: btmrvl_sdio_suspend() HS not actived, suspend failed!
> [ 4767.777946] dpm_run_callback(): pm_generic_suspend+0x0/0x48 returns -16
> [ 4767.777963] call mmc2:0001:2+ returned -16 after 4882288 usecs
> 
> The daily failure rate with this signature is quite significant and
> users are likely facing this at least once a day (and some unlucky users
> are likely facing it multiple times a day).
> 
> Given the severity, we'd like to power off the controller during suspend
> so the driver doesn't need to take any action (or block in any way) when
> suspending and power on during resume. This will break wake-on-bt for
> users but should improve the reliability of suspend.
> 
> We don't want to force all users of MVL8897 and MVL8997 to encounter
> this behavior if they're not affected (especially users that depend on
> Bluetooth for keyboard/mouse input) so the new behavior is enabled via
> module param. We are limiting this quirk to only Chromebooks (i.e.
> laptop). Chromeboxes will continue to have the old behavior since users
> may depend on BT HID to wake and use the system.

I don’t have a super great feeling with this change.

So historically only hciconfig hci0 up/down was doing a power cycle of the controller and when adding the mgmt interface we moved that to the mgmt interface. In addition we added a special case of power up via hdev->setup. We never had an intention that the kernel otherwise can power up/down the controller as it pleases.

Can we ask Marvell first to investigate why this is fundamentally broken with their hardware? Since what you are proposing is a pretty heavy change that might has side affects. For example the state machine for the mgmt interface has no concept of a power down/up from the kernel. It is all triggered by bluetoothd.

I am careful here since the whole power up/down path is already complicated enough.

Regards

Marcel


^ permalink raw reply

* Re: [PATCHv4 net-next 2/3] octeontx2-af: Add devlink health reporters for NPA
From: Jiri Pirko @ 2020-11-23 11:34 UTC (permalink / raw)
  To: George Cherian
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	kuba@kernel.org, davem@davemloft.net, Sunil Kovvuri Goutham,
	Linu Cherian, Geethasowjanya Akula, masahiroy@kernel.org,
	willemdebruijn.kernel@gmail.com, saeed@kernel.org
In-Reply-To: <BYAPR18MB2679CFF7446F9EDA8A95FA1BC5FC0@BYAPR18MB2679.namprd18.prod.outlook.com>

Mon, Nov 23, 2020 at 11:28:28AM CET, gcherian@marvell.com wrote:
>Hi Jiri,
>
>> -----Original Message-----
>> From: Jiri Pirko <jiri@resnulli.us>
>> Sent: Monday, November 23, 2020 3:52 PM
>> To: George Cherian <gcherian@marvell.com>
>> Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
>> kuba@kernel.org; davem@davemloft.net; Sunil Kovvuri Goutham
>> <sgoutham@marvell.com>; Linu Cherian <lcherian@marvell.com>;
>> Geethasowjanya Akula <gakula@marvell.com>; masahiroy@kernel.org;
>> willemdebruijn.kernel@gmail.com; saeed@kernel.org
>> Subject: Re: [PATCHv4 net-next 2/3] octeontx2-af: Add devlink health
>> reporters for NPA
>> 
>> Mon, Nov 23, 2020 at 03:49:06AM CET, gcherian@marvell.com wrote:
>> >
>> >
>> >> -----Original Message-----
>> >> From: Jiri Pirko <jiri@resnulli.us>
>> >> Sent: Saturday, November 21, 2020 7:44 PM
>> >> To: George Cherian <gcherian@marvell.com>
>> >> Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
>> >> kuba@kernel.org; davem@davemloft.net; Sunil Kovvuri Goutham
>> >> <sgoutham@marvell.com>; Linu Cherian <lcherian@marvell.com>;
>> >> Geethasowjanya Akula <gakula@marvell.com>; masahiroy@kernel.org;
>> >> willemdebruijn.kernel@gmail.com; saeed@kernel.org
>> >> Subject: Re: [PATCHv4 net-next 2/3] octeontx2-af: Add devlink health
>> >> reporters for NPA
>> >>
>> >> Sat, Nov 21, 2020 at 05:02:00AM CET, george.cherian@marvell.com wrote:
>> >> >Add health reporters for RVU NPA block.
>> >> >NPA Health reporters handle following HW event groups
>> >> > - GENERAL events
>> >> > - ERROR events
>> >> > - RAS events
>> >> > - RVU event
>> >> >An event counter per event is maintained in SW.
>> >> >
>> >> >Output:
>> >> > # devlink health
>> >> > pci/0002:01:00.0:
>> >> >   reporter npa
>> >> >     state healthy error 0 recover 0  # devlink  health dump show
>> >> >pci/0002:01:00.0 reporter npa
>> >> > NPA_AF_GENERAL:
>> >> >        Unmap PF Error: 0
>> >> >        Free Disabled for NIX0 RX: 0
>> >> >        Free Disabled for NIX0 TX: 0
>> >> >        Free Disabled for NIX1 RX: 0
>> >> >        Free Disabled for NIX1 TX: 0
>> >>
>> >> This is for 2 ports if I'm not mistaken. Then you need to have this
>> >> reporter per-port. Register ports and have reporter for each.
>> >>
>> >No, these are not port specific reports.
>> >NIX is the Network Interface Controller co-processor block.
>> >There are (max of) 2 such co-processor blocks per SoC.
>> 
>> Ah. I see. In that case, could you please structure the json differently. Don't
>> concatenate the number with the string. Instead of that, please have 2
>> subtrees, one for each NIX.
>> 
>NPA_AF_GENERAL:
>        Unmap PF Error: 0
>        Free Disabled for NIX0 
>		RX: 0
>       		TX: 0
>        Free Disabled for NIX1
>		RX: 0
>        		TX: 0
>
>Something like this?

It should be 2 member array, use devlink_fmsg_arr_pair_nest_start()
NIX {
0: {free disabled TX: 0, free disabled RX: 0,}
1: {free disabled TX: 0, free disabled RX: 0,}
}

something like this.


>
>Regards,
>-George
>> 
>> >
>> >Moreover, this is an NPA (Network Pool/Buffer Allocator co- processor)
>> reporter.
>> >This tells whether a free or alloc operation is skipped due to the
>> >configurations set by other co-processor blocks (NIX,SSO,TIM etc).
>> >
>> >https://urldefense.proofpoint.com/v2/url?u=https-
>> 3A__www.kernel.org_doc
>> >_html_latest_networking_device-
>> 5Fdrivers_ethernet_marvell_octeontx2.htm
>> >l&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=npgTSgHrUSLmXpBZJKVhk0
>> lE_XNvtVDl8
>> >ZA2zBvBqPw&m=FNPm6lB8fRvGYvMqQWer6S9WI6rZIlMmDCqbM8xrnxM
>> &s=B47zBTfDlIdM
>> >xUmK0hmQkuoZnsGZYSzkvbZUloevT0A&e=
>> >> NAK.
>

^ permalink raw reply

* Re: [PATCH net-next v4 2/5] net/lapb: support netdev events
From: Xie He @ 2020-11-23 11:17 UTC (permalink / raw)
  To: Martin Schiller
  Cc: Andrew Hendry, David S. Miller, Jakub Kicinski, Linux X25,
	Linux Kernel Network Developers, LKML
In-Reply-To: <16b7e74e6e221f43420da7836659d7df@dev.tdt.de>

On Mon, Nov 23, 2020 at 2:38 AM Martin Schiller <ms@dev.tdt.de> wrote:
>
> Well, one could argue that we would have to repair these drivers, but I
> don't think that will get us anywhere.

Yeah... One problem I see with the Linux project is the lack of
docs/specs. Often we don't know what is right and what is wrong.

>  From this point of view it will be the best to handle the NETDEV_UP in
> the lapb event handler and establish the link analog to the
> NETDEV_CHANGE event if the carrier is UP.

Thanks! This way we can make sure LAPB would automatically connect in
all situations.

Since we'll have a netif_carrier_ok check in NETDEV_UP handing, it
might make the code look prettier to also have a netif_carrier_ok
check in NETDEV_GOING_DOWN handing (for symmetry). Just a suggestion.
You can do whatever looks good to you :)

Thanks!

^ permalink raw reply

* [arm64] kernel BUG at kernel/seccomp.c:1309!
From: Naresh Kamboju @ 2020-11-23 11:15 UTC (permalink / raw)
  To: open list, Netdev, bpf, lkft-triage, Linux ARM
  Cc: Daniel Borkmann, Kees Cook, Andrii Nakryiko, Song Liu,
	Yonghong Song, Andy Lutomirski, Sumit Semwal, Arnd Bergmann,
	Jann Horn, yifeifz2

While booting arm64 kernel the following kernel BUG noticed on several arm64
devices running linux next 20201123 tag kernel.


$ git log --oneline next-20201120..next-20201123 -- kernel/seccomp.c
5c5c5fa055ea Merge remote-tracking branch 'seccomp/for-next/seccomp'
bce6a8cba7bf Merge branch 'linus'
7ef95e3dbcee Merge branch 'for-linus/seccomp' into for-next/seccomp
fab686eb0307 seccomp: Remove bogus __user annotations
0d8315dddd28 seccomp/cache: Report cache data through /proc/pid/seccomp_cache
8e01b51a31a1 seccomp/cache: Add "emulator" to check if filter is constant allow
f9d480b6ffbe seccomp/cache: Lookup syscall allowlist bitmap for fast path
23d67a54857a seccomp: Migrate to use SYSCALL_WORK flag


Please find these easy steps to reproduce the kernel build and boot.

step to reproduce:
# please install tuxmake
# sudo pip3 install -U tuxmake
# cd linux-next
# tuxmake --runtime docker --target-arch arm --toolchain gcc-9
--kconfig defconfig --kconfig-add
https://builds.tuxbuild.com/1kgWN61pS5M35vjnVfDSvOOPd38/config

# Boot the arm64 on any arm64 devices.
# you will notice the below BUG

crash log details:
-----------------------
[    6.941012] ------------[ cut here ]------------
Found device  /dev/ttyAMA3.
[    6.947587] lima f4080000.gpu: mod rate = 500000000
[    6.955422] kernel BUG at kernel/seccomp.c:1309!
[    6.955430] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
[    6.955437] Modules linked in: cec rfkill wlcore_sdio(+) kirin_drm
dw_drm_dsi lima(+) drm_kms_helper gpu_sched drm fuse
[    6.955481] CPU: 2 PID: 291 Comm: systemd-udevd Not tainted
5.10.0-rc4-next-20201123 #2
[    6.955485] Hardware name: HiKey Development Board (DT)
[    6.955493] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--)
[    6.955510] pc : __secure_computing+0xe0/0xe8
[    6.958171] mmc_host mmc2: Bus speed (slot 0) = 24800000Hz (slot
req 400000Hz, actual 400000HZ div = 31)
[    6.965975] [drm] Initialized lima 1.1.0 20191231 for f4080000.gpu on minor 0
[    6.970176] lr : syscall_trace_enter+0x1cc/0x218
[    6.970181] sp : ffff800012d8be10
[    6.970185] x29: ffff800012d8be10 x28: ffff00000092cb00
[    6.970195] x27: 0000000000000000 x26: 0000000000000000
[    6.970203] x25: 0000000000000000 x24: 0000000000000000
[    6.970210] x23: 0000000060000000 x22: 0000000000000202
[    7.011614] mmc_host mmc2: Bus speed (slot 0) = 24800000Hz (slot
req 25000000Hz, actual 24800000HZ div = 0)
[    7.016457]
[    7.016461] x21: 0000000000000200 x20: ffff00000092cb00
[    7.016470] x19: ffff800012d8bec0 x18: 0000000000000000
[    7.016478] x17: 0000000000000000 x16: 0000000000000000
[    7.016485] x15: 0000000000000000 x14: 0000000000000000
[    7.054116] mmc_host mmc2: Bus speed (slot 0) = 24800000Hz (slot
req 400000Hz, actual 400000HZ div = 31)
[    7.056715]
[    7.103444] mmc_host mmc2: Bus speed (slot 0) = 24800000Hz (slot
req 25000000Hz, actual 24800000HZ div = 0)
[    7.105105] x13: 0000000000000000 x12: 0000000000000000
[    7.125849] x11: 0000000000000000 x10: 0000000000000000
[    7.125858] x9 : ffff80001001bcbc x8 : 0000000000000000
[    7.125865] x7 : 0000000000000000 x6 : 0000000000000000
[    7.125871] x5 : 0000000000000000 x4 : 0000000000000000
[    7.125879] x3 : 0000000000000000 x2 : ffff00000092cb00
[    7.125886] x1 : 0000000000000000 x0 : 0000000000000116
[    7.125896] Call trace:
] Found device /dev/ttyAMA2.
[    7.125908]  __secure_computing+0xe0/0xe8
[    7.125918]  syscall_trace_enter+0x1cc/0x218
[    7.125927]  el0_svc_common.constprop.0+0x19c/0x1b8
[    7.125933]  do_el0_svc+0x2c/0x98
[    7.125940]  el0_sync_handler+0x180/0x188
[    7.125946]  el0_sync+0x174/0x180
[    7.125958] Code: d2800121 97ffd9a9 d2800120 97fbf1a9 (d4210000)
[    7.199584] ---[ end trace 463debbc21f0c7b5 ]---
[    7.204205] note: systemd-udevd[291] exited with preempt_count 1
[    7.210733] ------------[ cut here ]------------
[    7.215451] WARNING: CPU: 2 PID: #
0 at kernel/rcu/tree.c:632 rcu_eqs_enter.isra.0+0x134/0x140
[    7.223927] Modules linked in: cec rfkill wlcore_sdio kirin_drm
dw_drm_dsi lima drm_kms_helper gpu_sched drm fuse
[    7.234295] CPU: 2 PID: 0 Comm: swapper/2 Tainted: G      D
  5.10.0-rc4-next-20201123 #2
[    7.243252] Hardware name: HiKey Development Board (DT)
[    7.248561] pstate: 200003c5 (nzCv DAIF -PAN -UAO -TCO BTYPE=--)
[    7.254638] pc : rcu_eqs_enter.isra.0+0x134/0x140
[    7.259350] lr : rcu_idle_enter+0x18/0x28
[    7.263362] sp : ffff8000128e3e80
[    7.266678] x29: ffff8000128e3e80 x28: 0000000000000000
[    7.272001] x27: 0000000000000000 x26: ffff000001b79080
[    7.277321] x25: 0000000000000000 x24: 00000001adc9b310
[    7.282641] x23: 0000000000000000 x22: ffff000001b79080
[    7.287970] x21: ffff000077b24b00 x20: ffff000001b79098
[    7.287979] x19: ffff800011c7ab40 x18: 0000000000000010
[    7.287986] x17: 0000000000000000 x16: 0000000000000000
[    7.287993] x15: ffff00000092cf98 x14: 0720072007200720
[    7.288001] x13: 0720072007200720 x12: 00000000000003c6
[    7.288008] x11: 071c71c71c71c71c x10: 00000000000003c6
[    7.288016] x9 : ffff800010df267c x8 : 000000000000048c
[    7.288023] x7 : 0000000000000c6f x6 : 0000000000009c3f
[    7.288030] x5 : 00000000ffffffff x4 : 0000000000000015
[    7.288038] x3 : 000000000022b7f0 x2 : 4000000000000002
[    7.288046] x1 : 4000000000000000 x0 : ffff000077b26b40
[    7.288054] Call trace:
[    7.288064]  rcu_eqs_enter.isra.0+0x134/0x140
#
[    7.288069]  rcu_idle_enter+0x18/0x28
[    7.288078]  cpuidle_enter_state+0x34c/0x438
[    7.288084]  cpuidle_enter+0x40/0x58
[    7.288092]  call_cpuidle+0x24/0x50
Reached target Sockets.
[    7.288108]  do_idle+0x228/0x290
[    7.375468]  cpu_startup_entry+0x30/0x78
[    7.379397]  secondary_start_kernel+0x158/0x190
[    7.383930] ---[ end trace 463debbc21f0c7b6 ]---
[     OK      ] Reached target B#

Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>

full test log,
https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20201123/testrun/3478150/suite/linux-log-parser/test/check-kernel-bug-1968549/log
https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20201123/testrun/3478177/suite/linux-log-parser/test/check-kernel-bug-1968583/log
https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20201123/testrun/3478197/suite/linux-log-parser/test/check-kernel-bug-147858/log

metadata:
  git branch: master
  git repo: https://gitlab.com/Linaro/lkft/mirrors/next/linux-next
  git commit: 62918e6fd7b5751c1285c7f8c6cbd27eb6600c02
  git describe: next-20201123
  make_kernelversion: 5.10.0-rc4
  kernel-config: https://builds.tuxbuild.com/1kgWN61pS5M35vjnVfDSvOOPd38/config


-- 
Linaro LKFT
https://lkft.linaro.org

^ permalink raw reply

* [PATCH net-next] bridge: mrp: Implement LC mode for MRP
From: Horatiu Vultur @ 2020-11-23 11:14 UTC (permalink / raw)
  To: nikolay, roopa, davem, kuba, linux-kernel, bridge, netdev; +Cc: Horatiu Vultur

Extend MRP to support LC mode(link check) for the interconnect port.
This applies only to the interconnect ring.

Opposite to RC mode(ring check) the LC mode is using CFM frames to
detect when the link goes up or down and based on that the userspace
will need to react.
One advantage of the LC mode over RC mode is that there will be fewer
frames in the normal rings. Because RC mode generates InTest on all
ports while LC mode sends CFM frame only on the interconnect port.

All 4 nodes part of the interconnect ring needs to have the same mode.
And it is not possible to have running LC and RC mode at the same time
on a node.

Whenever the MIM starts it needs to detect the status of the other 3
nodes in the interconnect ring so it would send a frame called
InLinkStatus, on which the clients needs to reply with their link
status.

This patch adds the frame header for the frame InLinkStatus and
extends existing rules on how to forward this frame.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
 include/uapi/linux/mrp_bridge.h |  7 +++++++
 net/bridge/br_mrp.c             | 18 +++++++++++++++---
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h
index 6aeb13ef0b1e..450f6941a5a1 100644
--- a/include/uapi/linux/mrp_bridge.h
+++ b/include/uapi/linux/mrp_bridge.h
@@ -61,6 +61,7 @@ enum br_mrp_tlv_header_type {
 	BR_MRP_TLV_HEADER_IN_TOPO = 0x7,
 	BR_MRP_TLV_HEADER_IN_LINK_DOWN = 0x8,
 	BR_MRP_TLV_HEADER_IN_LINK_UP = 0x9,
+	BR_MRP_TLV_HEADER_IN_LINK_STATUS = 0xa,
 	BR_MRP_TLV_HEADER_OPTION = 0x7f,
 };
 
@@ -156,4 +157,10 @@ struct br_mrp_in_link_hdr {
 	__be16 interval;
 };
 
+struct br_mrp_in_link_status_hdr {
+	__u8 sa[ETH_ALEN];
+	__be16 port_role;
+	__be16 id;
+};
+
 #endif
diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c
index bb12fbf9aaf2..cec2c4e4561d 100644
--- a/net/bridge/br_mrp.c
+++ b/net/bridge/br_mrp.c
@@ -858,7 +858,8 @@ static bool br_mrp_in_frame(struct sk_buff *skb)
 	if (hdr->type == BR_MRP_TLV_HEADER_IN_TEST ||
 	    hdr->type == BR_MRP_TLV_HEADER_IN_TOPO ||
 	    hdr->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN ||
-	    hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP)
+	    hdr->type == BR_MRP_TLV_HEADER_IN_LINK_UP ||
+	    hdr->type == BR_MRP_TLV_HEADER_IN_LINK_STATUS)
 		return true;
 
 	return false;
@@ -1126,9 +1127,9 @@ static int br_mrp_rcv(struct net_bridge_port *p,
 						goto no_forward;
 				}
 			} else {
-				/* MIM should forward IntLinkChange and
+				/* MIM should forward IntLinkChange/Status and
 				 * IntTopoChange between ring ports but MIM
-				 * should not forward IntLinkChange and
+				 * should not forward IntLinkChange/Status and
 				 * IntTopoChange if the frame was received at
 				 * the interconnect port
 				 */
@@ -1155,6 +1156,17 @@ static int br_mrp_rcv(struct net_bridge_port *p,
 			     in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN))
 				goto forward;
 
+			/* MIC should forward IntLinkStatus frames only to
+			 * interconnect port if it was received on a ring port.
+			 * If it is received on interconnect port then, it
+			 * should be forward on both ring ports
+			 */
+			if (br_mrp_is_ring_port(p_port, s_port, p) &&
+			    in_type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) {
+				p_dst = NULL;
+				s_dst = NULL;
+			}
+
 			/* Should forward the InTopo frames only between the
 			 * ring ports
 			 */
-- 
2.27.0


^ permalink raw reply related

* Re: netconsole deadlock with virtnet
From: Leon Romanovsky @ 2020-11-23 11:08 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Jason Wang, Sergey Senozhatsky, Michael S. Tsirkin, Petr Mladek,
	John Ogness, virtualization, Amit Shah, Itay Aveksis,
	Ran Rozenstein, netdev
In-Reply-To: <20201118091257.2ee6757a@gandalf.local.home>

On Wed, Nov 18, 2020 at 09:12:57AM -0500, Steven Rostedt wrote:
>
> [ Adding netdev as perhaps someone there knows ]
>
> On Wed, 18 Nov 2020 12:09:59 +0800
> Jason Wang <jasowang@redhat.com> wrote:
>
> > > This CPU0 lock(_xmit_ETHER#2) -> hard IRQ -> lock(console_owner) is
> > > basically
> > > 	soft IRQ -> lock(_xmit_ETHER#2) -> hard IRQ -> printk()
> > >
> > > Then CPU1 spins on xmit, which is owned by CPU0, CPU0 spins on
> > > console_owner, which is owned by CPU1?
>
> It still looks to me that the target_list_lock is taken in IRQ, (which can
> be the case because printk calls write_msg() which takes that lock). And
> someplace there's a:
>
> 	lock(target_list_lock)
> 	lock(xmit_lock)
>
> which means you can remove the console lock from this scenario completely,
> and you still have a possible deadlock between target_list_lock and
> xmit_lock.
>
> >
> >
> > If this is true, it looks not a virtio-net specific issue but somewhere
> > else.
> >
> > I think all network driver will synchronize through bh instead of hardirq.
>
> I think the issue is where target_list_lock is held when we take xmit_lock.
> Is there anywhere in netconsole.c that can end up taking xmit_lock while
> holding the target_list_lock? If so, that's the problem. As
> target_list_lock is something that can be taken in IRQ context, which means
> *any* other lock that is taking while holding the target_list_lock must
> also protect against interrupts from happening while it they are held.

I increased printk buffer like Petr suggested and the splat is below.
It doesn't happening on x86, but on ARM65 and ppc64.

 [   10.027975] =====================================================
 [   10.027976] WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
 [   10.027976] 5.10.0-rc4_for_upstream_min_debug_2020_11_22_19_37 #1 Not tainted
 [   10.027977] -----------------------------------------------------
 [   10.027978] modprobe/638 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
 [   10.027979] ffff0000c9f63c98 (_xmit_ETHER#2){+.-.}-{2:2}, at: virtnet_poll_tx+0x84/0x120
 [   10.027982]
 [   10.027982] and this task is already holding:
 [   10.027983] ffff800009007018 (target_list_lock){....}-{2:2}, at: write_msg+0x6c/0x120 [netconsole]
 [   10.027985] which would create a new lock dependency:
 [   10.027985]  (target_list_lock){....}-{2:2} -> (_xmit_ETHER#2){+.-.}-{2:2}
 [   10.027989]
 [   10.027989] but this new dependency connects a HARDIRQ-irq-safe lock:
 [   10.027990]  (console_owner){-...}-{0:0}
 [   10.027991]
 [   10.027992] ... which became HARDIRQ-irq-safe at:
 [   10.027992]   __lock_acquire+0xa78/0x1a94
 [   10.027993]   lock_acquire.part.0+0x170/0x360
 [   10.027993]   lock_acquire+0x68/0x8c
 [   10.027994]   console_unlock+0x1e8/0x6a4
 [   10.027994]   vprintk_emit+0x1c4/0x3c4
 [   10.027995]   vprintk_default+0x40/0x4c
 [   10.027995]   vprintk_func+0x10c/0x220
 [   10.027995]   printk+0x68/0x90
 [   10.027996]   crng_fast_load+0x1bc/0x1c0
 [   10.027997]   add_interrupt_randomness+0x280/0x290
 [   10.027997]   handle_irq_event+0x80/0x120
 [   10.027997]   handle_fasteoi_irq+0xac/0x200
 [   10.027998]   __handle_domain_irq+0x84/0xf0
 [   10.027999]   gic_handle_irq+0xd4/0x320
 [   10.027999]   el1_irq+0xd0/0x180
 [   10.028000]   arch_cpu_idle+0x24/0x44
 [   10.028000]   default_idle_call+0x48/0xa0
 [   10.028001]   do_idle+0x260/0x300
 [   10.028001]   cpu_startup_entry+0x30/0x6c
 [   10.028001]   rest_init+0x1b4/0x288
 [   10.028002]   arch_call_rest_init+0x18/0x24
 [   10.028002]   start_kernel+0x5cc/0x608
 [   10.028003]
 [   10.028003] to a HARDIRQ-irq-unsafe lock:
 [   10.028004]  (_xmit_ETHER#2){+.-.}-{2:2}
 [   10.028005]
 [   10.028006] ... which became HARDIRQ-irq-unsafe at:
 [   10.028006] ...  __lock_acquire+0x8bc/0x1a94
 [   10.028007]   lock_acquire.part.0+0x170/0x360
 [   10.028007]   lock_acquire+0x68/0x8c
 [   10.028008]   _raw_spin_trylock+0x80/0xd0
 [   10.028008]   virtnet_poll+0xac/0x360
 [   10.028009]   net_rx_action+0x1b0/0x4e0
 [   10.028010]   __do_softirq+0x1f4/0x638
 [   10.028010]   do_softirq+0xb8/0xcc
 [   10.028010]   __local_bh_enable_ip+0x18c/0x200
 [   10.028011]   virtnet_napi_enable+0xc0/0xd4
 [   10.028011]   virtnet_open+0x98/0x1c0
 [   10.028012]   __dev_open+0x12c/0x200
 [   10.028013]   __dev_change_flags+0x1a0/0x220
 [   10.028013]   dev_change_flags+0x2c/0x70
 [   10.028014]   do_setlink+0x214/0xe20
 [   10.028014]   __rtnl_newlink+0x514/0x820
 [   10.028015]   rtnl_newlink+0x58/0x84
 [   10.028015]   rtnetlink_rcv_msg+0x184/0x4b4
 [   10.028016]   netlink_rcv_skb+0x60/0x124
 [   10.028016]   rtnetlink_rcv+0x20/0x30
 [   10.028017]   netlink_unicast+0x1b4/0x270
 [   10.028017]   netlink_sendmsg+0x1f0/0x400
 [   10.028018]   sock_sendmsg+0x5c/0x70
 [   10.028018]   ____sys_sendmsg+0x24c/0x280
 [   10.028019]   ___sys_sendmsg+0x88/0xd0
 [   10.028019]   __sys_sendmsg+0x70/0xd0
 [   10.028020]   __arm64_sys_sendmsg+0x2c/0x40
 [   10.028021]   el0_svc_common.constprop.0+0x84/0x200
 [   10.028021]   do_el0_svc+0x2c/0x90
 [   10.028021]   el0_svc+0x18/0x50
 [   10.028022]   el0_sync_handler+0xe0/0x350
 [   10.028023]   el0_sync+0x158/0x180
 [   10.028023]
 [   10.028023] other info that might help us debug this:
 [   10.028024]
 [   10.028024] Chain exists of:
 [   10.028025]   console_owner --> target_list_lock --> _xmit_ETHER#2
 [   10.028028]
 [   10.028028]  Possible interrupt unsafe locking scenario:
 [   10.028029]
 [   10.028029]        CPU0                    CPU1
 [   10.028030]        ----                    ----
 [   10.028030]   lock(_xmit_ETHER#2);
 [   10.028032]                                local_irq_disable();
 [   10.028032]                                lock(console_owner);
 [   10.028034]                                lock(target_list_lock);
 [   10.028035]   <Interrupt>
 [   10.028035]     lock(console_owner);
 [   10.028036]
 [   10.028037]  *** DEADLOCK ***
 [   10.028037]
 [   10.028038] 3 locks held by modprobe/638:
 [   10.028038]  #0: ffff800011e1efe0 (console_lock){+.+.}-{0:0}, at: register_console+0x144/0x2f4
 [   10.028040]  #1: ffff800011e1f108 (console_owner){-...}-{0:0}, at: console_unlock+0x17c/0x6a4
 [   10.028043]  #2: ffff800009007018 (target_list_lock){....}-{2:2}, at: write_msg+0x6c/0x120 [netconsole]
 [   10.028045]
 [   10.028046] the dependencies between HARDIRQ-irq-safe lock and the holding lock:
 [   10.028046]  -> (console_owner){-...}-{0:0} ops: 1574 {
 [   10.028049]     IN-HARDIRQ-W at:
 [   10.028050]                          __lock_acquire+0xa78/0x1a94
 [   10.028050]                          lock_acquire.part.0+0x170/0x360
 [   10.028051]                          lock_acquire+0x68/0x8c
 [   10.028051]                          console_unlock+0x1e8/0x6a4
 [   10.028052]                          vprintk_emit+0x1c4/0x3c4
 [   10.028052]                          vprintk_default+0x40/0x4c
 [   10.028053]                          vprintk_func+0x10c/0x220
 [   10.028054]                          printk+0x68/0x90
 [   10.028054]                          crng_fast_load+0x1bc/0x1c0
 [   10.028055]                          add_interrupt_randomness+0x280/0x290
 [   10.028056]                          handle_irq_event+0x80/0x120
 [   10.028056]                          handle_fasteoi_irq+0xac/0x200
 [   10.028057]                          __handle_domain_irq+0x84/0xf0
 [   10.028057]                          gic_handle_irq+0xd4/0x320
 [   10.028058]                          el1_irq+0xd0/0x180
 [   10.028058]                          arch_cpu_idle+0x24/0x44
 [   10.028059]                          default_idle_call+0x48/0xa0
 [   10.028060]                          do_idle+0x260/0x300
 [   10.028061]                          cpu_startup_entry+0x30/0x6c
 [   10.028061]                          rest_init+0x1b4/0x288
 [   10.028062]                          arch_call_rest_init+0x18/0x24
 [   10.028062]                          start_kernel+0x5cc/0x608
 [   10.028063]     INITIAL USE at:
 [   10.028064]                         __lock_acquire+0x2e0/0x1a94
 [   10.028064]                         lock_acquire.part.0+0x170/0x360
 [   10.028065]                         lock_acquire+0x68/0x8c
 [   10.028066]                         console_unlock+0x1e8/0x6a4
 [   10.028067]                         vprintk_emit+0x1c4/0x3c4
 [   10.028067]                         vprintk_default+0x40/0x4c
 [   10.028068]                         vprintk_func+0x10c/0x220
 [   10.028068]                         printk+0x68/0x90
 [   10.028069]                         start_kernel+0x8c/0x608
 [   10.028069]   }
 [   10.028070]   ... key      at: [<ffff800011e1f108>] console_owner_dep_map+0x0/0x28
 [   10.028071]   ... acquired at:
 [   10.028071]    lock_acquire.part.0+0x170/0x360
 [   10.028072]    lock_acquire+0x68/0x8c
 [   10.028072]    _raw_spin_lock_irqsave+0x88/0x15c
 [   10.028073]    write_msg+0x6c/0x120 [netconsole]
 [   10.028073]    console_unlock+0x3ec/0x6a4
 [   10.028074]    register_console+0x17c/0x2f4
 [   10.028075]    init_netconsole+0x20c/0x1000 [netconsole]
 [   10.028075]    do_one_initcall+0x8c/0x480
 [   10.028076]    do_init_module+0x60/0x270
 [   10.028076]    load_module+0x21f8/0x2734
 [   10.028077]    __do_sys_finit_module+0xbc/0x12c
 [   10.028077]    __arm64_sys_finit_module+0x28/0x34
 [   10.028078]    el0_svc_common.constprop.0+0x84/0x200
 [   10.028078]    do_el0_svc+0x2c/0x90
 [   10.028079]    el0_svc+0x18/0x50
 [   10.028079]    el0_sync_handler+0xe0/0x350
 [   10.028080]    el0_sync+0x158/0x180
 [   10.028080]
 [   10.028081] -> (target_list_lock){....}-{2:2} ops: 34 {
 [   10.028083]    INITIAL USE at:
 [   10.028084]                       __lock_acquire+0x2e0/0x1a94
 [   10.028084]                       lock_acquire.part.0+0x170/0x360
 [   10.028085]                       lock_acquire+0x68/0x8c
 [   10.028085]                       _raw_spin_lock_irqsave+0x88/0x15c
 [   10.028086]                       init_netconsole+0x148/0x1000 [netconsole]
 [   10.028087]                       do_one_initcall+0x8c/0x480
 [   10.028087]                       do_init_module+0x60/0x270
 [   10.028088]                       load_module+0x21f8/0x2734
 [   10.028088]                       __do_sys_finit_module+0xbc/0x12c
 [   10.028089]                       __arm64_sys_finit_module+0x28/0x34
 [   10.028090]                       el0_svc_common.constprop.0+0x84/0x200
 [   10.028090]                       do_el0_svc+0x2c/0x90
 [   10.028091]                       el0_svc+0x18/0x50
 [   10.028092]                       el0_sync_handler+0xe0/0x350
 [   10.028092]                       el0_sync+0x158/0x180
 [   10.028093]  }
 [   10.028093]  ... key      at: [<ffff800009007018>] target_list_lock+0x18/0xfffffffffffff000 [netconsole]
 [   10.028094]  ... acquired at:
 [   10.028094]    __lock_acquire+0x134c/0x1a94
 [   10.028095]    lock_acquire.part.0+0x170/0x360
 [   10.028095]    lock_acquire+0x68/0x8c
 [   10.028096]    _raw_spin_lock+0x64/0x90
 [   10.028096]    virtnet_poll_tx+0x84/0x120
 [   10.028097]    netpoll_poll_dev+0x12c/0x350
 [   10.028097]    netpoll_send_skb+0x39c/0x400
 [   10.028098]    netpoll_send_udp+0x2b8/0x440
 [   10.028098]    write_msg+0xfc/0x120 [netconsole]
 [   10.028099]    console_unlock+0x3ec/0x6a4
 [   10.028100]    register_console+0x17c/0x2f4
 [   10.028100]    init_netconsole+0x20c/0x1000 [netconsole]
 [   10.028101]    do_one_initcall+0x8c/0x480
 [   10.028101]    do_init_module+0x60/0x270
 [   10.028102]    load_module+0x21f8/0x2734
 [   10.028102]    __do_sys_finit_module+0xbc/0x12c
 [   10.028103]    __arm64_sys_finit_module+0x28/0x34
 [   10.028103]    el0_svc_common.constprop.0+0x84/0x200
 [   10.028104]    do_el0_svc+0x2c/0x90
 [   10.028104]    el0_svc+0x18/0x50
 [   10.028105]    el0_sync_handler+0xe0/0x350
 [   10.028105]    el0_sync+0x158/0x180
 [   10.028106]
 [   10.028106]
 [   10.028107] the dependencies between the lock to be acquired
 [   10.028107]  and HARDIRQ-irq-unsafe lock:
 [   10.028108] -> (_xmit_ETHER#2){+.-.}-{2:2} ops: 217 {
 [   10.028110]    HARDIRQ-ON-W at:
 [   10.028111]                        __lock_acquire+0x8bc/0x1a94
 [   10.028111]                        lock_acquire.part.0+0x170/0x360
 [   10.028112]                        lock_acquire+0x68/0x8c
 [   10.028113]                        _raw_spin_trylock+0x80/0xd0
 [   10.028113]                        virtnet_poll+0xac/0x360
 [   10.028114]                        net_rx_action+0x1b0/0x4e0
 [   10.028115]                        __do_softirq+0x1f4/0x638
 [   10.028115]                        do_softirq+0xb8/0xcc
 [   10.028116]                        __local_bh_enable_ip+0x18c/0x200
 [   10.028116]                        virtnet_napi_enable+0xc0/0xd4
 [   10.028117]                        virtnet_open+0x98/0x1c0
 [   10.028118]                        __dev_open+0x12c/0x200
 [   10.028118]                        __dev_change_flags+0x1a0/0x220
 [   10.028119]                        dev_change_flags+0x2c/0x70
 [   10.028119]                        do_setlink+0x214/0xe20
 [   10.028120]                        __rtnl_newlink+0x514/0x820
 [   10.028120]                        rtnl_newlink+0x58/0x84
 [   10.028121]                        rtnetlink_rcv_msg+0x184/0x4b4
 [   10.028122]                        netlink_rcv_skb+0x60/0x124
 [   10.028122]                        rtnetlink_rcv+0x20/0x30
 [   10.028123]                        netlink_unicast+0x1b4/0x270
 [   10.028124]                        netlink_sendmsg+0x1f0/0x400
 [   10.028124]                        sock_sendmsg+0x5c/0x70
 [   10.028125]                        ____sys_sendmsg+0x24c/0x280
 [   10.028125]                        ___sys_sendmsg+0x88/0xd0
 [   10.028126]                        __sys_sendmsg+0x70/0xd0
 [   10.028127]                        __arm64_sys_sendmsg+0x2c/0x40
 [   10.028128]                        el0_svc_common.constprop.0+0x84/0x200
 [   10.028128]                        do_el0_svc+0x2c/0x90
 [   10.028129]                        el0_svc+0x18/0x50
 [   10.028129]                        el0_sync_handler+0xe0/0x350
 [   10.028130]                        el0_sync+0x158/0x180
 [   10.028130]    IN-SOFTIRQ-W at:
 [   10.028131]                        __lock_acquire+0x894/0x1a94
 [   10.028132]                        lock_acquire.part.0+0x170/0x360
 [   10.028132]                        lock_acquire+0x68/0x8c
 [   10.028133]                        _raw_spin_lock+0x64/0x90
 [   10.028134]                        virtnet_poll_tx+0x84/0x120
 [   10.028134]                        net_rx_action+0x1b0/0x4e0
 [   10.028135]                        __do_softirq+0x1f4/0x638
 [   10.028135]                        do_softirq+0xb8/0xcc
 [   10.028136]                        __local_bh_enable_ip+0x18c/0x200
 [   10.028137]                        virtnet_napi_enable+0xc0/0xd4
 [   10.028137]                        virtnet_open+0x14c/0x1c0
 [   10.028138]                        __dev_open+0x12c/0x200
 [   10.028138]                        __dev_change_flags+0x1a0/0x220
 [   10.028139]                        dev_change_flags+0x2c/0x70
 [   10.028140]                        do_setlink+0x214/0xe20
 [   10.028140]                        __rtnl_newlink+0x514/0x820
 [   10.028141]                        rtnl_newlink+0x58/0x84
 [   10.028141]                        rtnetlink_rcv_msg+0x184/0x4b4
 [   10.028142]                        netlink_rcv_skb+0x60/0x124
 [   10.028142]                        rtnetlink_rcv+0x20/0x30
 [   10.028143]                        netlink_unicast+0x1b4/0x270
 [   10.028144]                        netlink_sendmsg+0x1f0/0x400
 [   10.028144]                        sock_sendmsg+0x5c/0x70
 [   10.028145]                        ____sys_sendmsg+0x24c/0x280
 [   10.028146]                        ___sys_sendmsg+0x88/0xd0
 [   10.028146]                        __sys_sendmsg+0x70/0xd0
 [   10.028147]                        __arm64_sys_sendmsg+0x2c/0x40
 [   10.028148]                        el0_svc_common.constprop.0+0x84/0x200
 [   10.028148]                        do_el0_svc+0x2c/0x90
 [   10.028149]                        el0_svc+0x18/0x50
 [   10.028149]                        el0_sync_handler+0xe0/0x350
 [   10.028150]                        el0_sync+0x158/0x180
 [   10.028150]    INITIAL USE at:
 [   10.028151]                       __lock_acquire+0x2e0/0x1a94
 [   10.028152]                       lock_acquire.part.0+0x170/0x360
 [   10.028153]                       lock_acquire+0x68/0x8c
 [   10.028153]                       _raw_spin_trylock+0x80/0xd0
 [   10.028154]                       virtnet_poll+0xac/0x360
 [   10.028154]                       net_rx_action+0x1b0/0x4e0
 [   10.028155]                       __do_softirq+0x1f4/0x638
 [   10.028155]                       do_softirq+0xb8/0xcc
 [   10.028156]                       __local_bh_enable_ip+0x18c/0x200
 [   10.028157]                       virtnet_napi_enable+0xc0/0xd4
 [   10.028157]                       virtnet_open+0x98/0x1c0
 [   10.028158]                       __dev_open+0x12c/0x200
 [   10.028158]                       __dev_change_flags+0x1a0/0x220
 [   10.028159]                       dev_change_flags+0x2c/0x70
 [   10.028159]                       do_setlink+0x214/0xe20
 [   10.028160]                       __rtnl_newlink+0x514/0x820
 [   10.028161]                       rtnl_newlink+0x58/0x84
 [   10.028161]                       rtnetlink_rcv_msg+0x184/0x4b4
 [   10.028162]                       netlink_rcv_skb+0x60/0x124
 [   10.028162]                       rtnetlink_rcv+0x20/0x30
 [   10.028163]                       netlink_unicast+0x1b4/0x270
 [   10.028163]                       netlink_sendmsg+0x1f0/0x400
 [   10.028164]                       sock_sendmsg+0x5c/0x70
 [   10.028165]                       ____sys_sendmsg+0x24c/0x280
 [   10.028165]                       ___sys_sendmsg+0x88/0xd0
 [   10.028166]                       __sys_sendmsg+0x70/0xd0
 [   10.028166]                       __arm64_sys_sendmsg+0x2c/0x40
 [   10.028167]                       el0_svc_common.constprop.0+0x84/0x200
 [   10.028168]                       do_el0_svc+0x2c/0x90
 [   10.028168]                       el0_svc+0x18/0x50
 [   10.028169]                       el0_sync_handler+0xe0/0x350
 [   10.028169]                       el0_sync+0x158/0x180
 [   10.028170]  }
 [   10.028171]  ... key      at: [<ffff80001312aef8>] netdev_xmit_lock_key+0x10/0x390
 [   10.028171]  ... acquired at:
 [   10.028172]    __lock_acquire+0x134c/0x1a94
 [   10.028172]    lock_acquire.part.0+0x170/0x360
 [   10.028173]    lock_acquire+0x68/0x8c
 [   10.028173]    _raw_spin_lock+0x64/0x90
 [   10.028174]    virtnet_poll_tx+0x84/0x120
 [   10.028174]    netpoll_poll_dev+0x12c/0x350
 [   10.028175]    netpoll_send_skb+0x39c/0x400
 [   10.028175]    netpoll_send_udp+0x2b8/0x440
 [   10.028176]    write_msg+0xfc/0x120 [netconsole]
 [   10.028176]    console_unlock+0x3ec/0x6a4
 [   10.028177]    register_console+0x17c/0x2f4
 [   10.028178]    init_netconsole+0x20c/0x1000 [netconsole]
 [   10.028178]    do_one_initcall+0x8c/0x480
 [   10.028179]    do_init_module+0x60/0x270
 [   10.028179]    load_module+0x21f8/0x2734
 [   10.028180]    __do_sys_finit_module+0xbc/0x12c
 [   10.028180]    __arm64_sys_finit_module+0x28/0x34
 [   10.028181]    el0_svc_common.constprop.0+0x84/0x200
 [   10.028181]    do_el0_svc+0x2c/0x90
 [   10.028182]    el0_svc+0x18/0x50
 [   10.028182]    el0_sync_handler+0xe0/0x350
 [   10.028183]    el0_sync+0x158/0x180
 [   10.028183]
 [   10.028183]
 [   10.028184] stack backtrace:
 [   10.028185] CPU: 14 PID: 638 Comm: modprobe Not tainted 5.10.0-rc4_for_upstream_min_debug_2020_11_22_19_37 #1
 [   10.028186] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
 [   10.028186] Call trace:
 [   10.028186]  dump_backtrace+0x0/0x1d0
 [   10.028187]  show_stack+0x20/0x3c
 [   10.028187]  dump_stack+0xec/0x138
 [   10.028188]  check_irq_usage+0x6b8/0x6cc
 [   10.028188]  __lock_acquire+0x134c/0x1a94
 [   10.028189]  lock_acquire.part.0+0x170/0x360
 [   10.028189]  lock_acquire+0x68/0x8c
 [   10.028190]  _raw_spin_lock+0x64/0x90
 [   10.028191]  virtnet_poll_tx+0x84/0x120
 [   10.028191]  netpoll_poll_dev+0x12c/0x350
 [   10.028192]  netpoll_send_skb+0x39c/0x400
 [   10.028192]  netpoll_send_udp+0x2b8/0x440
 [   10.028193]  write_msg+0xfc/0x120 [netconsole]
 [   10.028193]  console_unlock+0x3ec/0x6a4
 [   10.028194]  register_console+0x17c/0x2f4
 [   10.028194]  init_netconsole+0x20c/0x1000 [netconsole]
 [   10.028195]  do_one_initcall+0x8c/0x480
 [   10.028195]  do_init_module+0x60/0x270
 [   10.028196]  load_module+0x21f8/0x2734
 [   10.028197]  __do_sys_finit_module+0xbc/0x12c
 [   10.028197]  __arm64_sys_finit_module+0x28/0x34
 [   10.028198]  el0_svc_common.constprop.0+0x84/0x200
 [   10.028198]  do_el0_svc+0x2c/0x90
 [   10.028199]  el0_svc+0x18/0x50
 [   10.028199]  el0_sync_handler+0xe0/0x350
 [   10.028200]  el0_sync+0x158/0x180
 [   10.073569] random: crng init done
 [   10.073964] printk: console [netcon0] enabled
 [   10.074704] random: 7 urandom warning(s) missed due to ratelimiting
 [   10.075340] netconsole: network logging started

>
> -- Steve

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox