Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH 0/3] ibmvnic: Fix bugs and memory leaks
From: Thomas Falcon @ 2018-05-16 20:59 UTC (permalink / raw)
  To: netdev; +Cc: jallen, nfont
In-Reply-To: <1526503745-14421-1-git-send-email-tlfalcon@linux.vnet.ibm.com>

On 05/16/2018 03:49 PM, Thomas Falcon wrote:
> This is a small patch series fixing up some bugs and memory leaks
> in the ibmvnic driver. The first fix frees up previously allocated
> memory that should be freed in case of an error. The second fixes
> a reset case that was failing due to TX/RX queue IRQ's being
> erroneously disabled without being enabled again. The final patch
> fixes incorrect reallocated of statistics buffers during a device
> reset, resulting in loss of statistics information and a memory leak.
>
> Thomas Falcon (3):
>   ibmvnic: Free coherent DMA memory if FW map failed
>   ibmvnic: Fix non-fatal firmware error reset
>   ibmvnic: Fix statistics buffers memory leak

Sorry, these are meant for the 'net' tree.

Tom

>
>  drivers/net/ethernet/ibm/ibmvnic.c | 28 +++++++++++++++++-----------
>  1 file changed, 17 insertions(+), 11 deletions(-)
>

^ permalink raw reply

* [PATCH bpf-next] libbpf: add ifindex to enable offload support
From: Jakub Kicinski @ 2018-05-16 21:02 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, David Beckett

From: David Beckett <david.beckett@netronome.com>

BPF programs currently can only be offloaded using iproute2. This
patch will allow programs to be offloaded using libbpf calls.

Signed-off-by: David Beckett <david.beckett@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 tools/lib/bpf/bpf.c    |  2 ++
 tools/lib/bpf/bpf.h    |  2 ++
 tools/lib/bpf/libbpf.c | 18 +++++++++++++++---
 tools/lib/bpf/libbpf.h |  1 +
 4 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index a3a8fb2ac697..6a8a00097fd8 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -91,6 +91,7 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
 	attr.btf_fd = create_attr->btf_fd;
 	attr.btf_key_id = create_attr->btf_key_id;
 	attr.btf_value_id = create_attr->btf_value_id;
+	attr.map_ifindex = create_attr->map_ifindex;
 
 	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 }
@@ -201,6 +202,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 	attr.log_size = 0;
 	attr.log_level = 0;
 	attr.kern_version = load_attr->kern_version;
+	attr.prog_ifindex = load_attr->prog_ifindex;
 	memcpy(attr.prog_name, load_attr->name,
 	       min(name_len, BPF_OBJ_NAME_LEN - 1));
 
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index fb3a146d92ff..15bff7728cf1 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -38,6 +38,7 @@ struct bpf_create_map_attr {
 	__u32 btf_fd;
 	__u32 btf_key_id;
 	__u32 btf_value_id;
+	__u32 map_ifindex;
 };
 
 int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
@@ -64,6 +65,7 @@ struct bpf_load_program_attr {
 	size_t insns_cnt;
 	const char *license;
 	__u32 kern_version;
+	__u32 prog_ifindex;
 };
 
 /* Recommend log buffer size */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index df54c4c9e48a..3dbe217bf23e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -178,6 +178,7 @@ struct bpf_program {
 	/* Index in elf obj file, for relocation use. */
 	int idx;
 	char *name;
+	int prog_ifindex;
 	char *section_name;
 	struct bpf_insn *insns;
 	size_t insns_cnt, main_prog_cnt;
@@ -213,6 +214,7 @@ struct bpf_map {
 	int fd;
 	char *name;
 	size_t offset;
+	int map_ifindex;
 	struct bpf_map_def def;
 	uint32_t btf_key_id;
 	uint32_t btf_value_id;
@@ -1091,6 +1093,7 @@ bpf_object__create_maps(struct bpf_object *obj)
 		int *pfd = &map->fd;
 
 		create_attr.name = map->name;
+		create_attr.map_ifindex = map->map_ifindex;
 		create_attr.map_type = def->type;
 		create_attr.map_flags = def->map_flags;
 		create_attr.key_size = def->key_size;
@@ -1273,7 +1276,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 static int
 load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type,
 	     const char *name, struct bpf_insn *insns, int insns_cnt,
-	     char *license, u32 kern_version, int *pfd)
+	     char *license, u32 kern_version, int *pfd, int prog_ifindex)
 {
 	struct bpf_load_program_attr load_attr;
 	char *log_buf;
@@ -1287,6 +1290,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type,
 	load_attr.insns_cnt = insns_cnt;
 	load_attr.license = license;
 	load_attr.kern_version = kern_version;
+	load_attr.prog_ifindex = prog_ifindex;
 
 	if (!load_attr.insns || !load_attr.insns_cnt)
 		return -EINVAL;
@@ -1368,7 +1372,8 @@ bpf_program__load(struct bpf_program *prog,
 		}
 		err = load_program(prog->type, prog->expected_attach_type,
 				   prog->name, prog->insns, prog->insns_cnt,
-				   license, kern_version, &fd);
+				   license, kern_version, &fd,
+				   prog->prog_ifindex);
 		if (!err)
 			prog->instances.fds[0] = fd;
 		goto out;
@@ -1399,7 +1404,8 @@ bpf_program__load(struct bpf_program *prog,
 		err = load_program(prog->type, prog->expected_attach_type,
 				   prog->name, result.new_insn_ptr,
 				   result.new_insn_cnt,
-				   license, kern_version, &fd);
+				   license, kern_version, &fd,
+				   prog->prog_ifindex);
 
 		if (err) {
 			pr_warning("Loading the %dth instance of program '%s' failed\n",
@@ -2188,6 +2194,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 	enum bpf_attach_type expected_attach_type;
 	enum bpf_prog_type prog_type;
 	struct bpf_object *obj;
+	struct bpf_map *map;
 	int section_idx;
 	int err;
 
@@ -2207,6 +2214,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 		 * section name.
 		 */
 		prog_type = attr->prog_type;
+		prog->prog_ifindex = attr->ifindex;
 		expected_attach_type = attr->expected_attach_type;
 		if (prog_type == BPF_PROG_TYPE_UNSPEC) {
 			section_idx = bpf_program__identify_section(prog);
@@ -2227,6 +2235,10 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 			first_prog = prog;
 	}
 
+	bpf_map__for_each(map, obj) {
+		map->map_ifindex = attr->ifindex;
+	}
+
 	if (!first_prog) {
 		pr_warning("object file doesn't contain bpf program\n");
 		bpf_object__close(obj);
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 4574b9563278..cd3fd8d782c7 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -259,6 +259,7 @@ struct bpf_prog_load_attr {
 	const char *file;
 	enum bpf_prog_type prog_type;
 	enum bpf_attach_type expected_attach_type;
+	int ifindex;
 };
 
 int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
-- 
2.17.0

^ permalink raw reply related

* [PATCH v3 0/2] IR decoding using BPF
From: Sean Young @ 2018-05-16 21:04 UTC (permalink / raw)
  To: linux-media, linux-kernel, Alexei Starovoitov,
	Mauro Carvalho Chehab, Daniel Borkmann, netdev, Matthias Reichl,
	Devin Heitmueller, Y Song

The kernel IR decoders (drivers/media/rc/ir-*-decoder.c) support the most
widely used IR protocols, but there are many protocols which are not
supported[1]. For example, the lirc-remotes[2] repo has over 2700 remotes,
many of which are not supported by rc-core. There is a "long tail" of
unsupported IR protocols, for which lircd is need to decode the IR .

IR encoding is done in such a way that some simple circuit can decode it;
therefore, bpf is ideal.

In order to support all these protocols, here we have bpf based IR decoding.
The idea is that user-space can define a decoder in bpf, attach it to
the rc device through the lirc chardev.

Separate work is underway to extend ir-keytable to have an extensive library
of bpf-based decoders, and a much expanded library of rc keymaps.

Another future application would be to compile IRP[3] to a IR BPF program, and
so support virtually every remote without having to write a decoder for each.
It might also be possible to support non-button devices such as analog
directional pads or air conditioning remote controls and decode the target
temperature in bpf, and pass that to an input device.

Thanks,

Sean Young

[1] http://www.hifi-remote.com/wiki/index.php?title=DecodeIR
[2] https://sourceforge.net/p/lirc-remotes/code/ci/master/tree/remotes/
[3] http://www.hifi-remote.com/wiki/index.php?title=IRP_Notation

Changes since v2:
 - Fixed locking issues
 - Improved self-test to cover more cases
 - Rebased on bpf-next again

Changes since v1:
 - Code review comments from Y Song <ys114321@gmail.com> and
   Randy Dunlap <rdunlap@infradead.org>
 - Re-wrote sample bpf to be selftest
 - Renamed RAWIR_DECODER -> RAWIR_EVENT (Kconfig, context, bpf prog type)
 - Rebase on bpf-next
 - Introduced bpf_rawir_event context structure with simpler access checking

Sean Young (2):
  media: rc: introduce BPF_PROG_RAWIR_EVENT
  bpf: add selftest for rawir_event type program

 drivers/media/rc/Kconfig                      |  13 +
 drivers/media/rc/Makefile                     |   1 +
 drivers/media/rc/bpf-rawir-event.c            | 363 ++++++++++++++++++
 drivers/media/rc/lirc_dev.c                   |  24 ++
 drivers/media/rc/rc-core-priv.h               |  24 ++
 drivers/media/rc/rc-ir-raw.c                  |  14 +-
 include/linux/bpf_rcdev.h                     |  30 ++
 include/linux/bpf_types.h                     |   3 +
 include/uapi/linux/bpf.h                      |  55 ++-
 kernel/bpf/syscall.c                          |   7 +
 tools/bpf/bpftool/prog.c                      |   1 +
 tools/include/uapi/linux/bpf.h                |  57 ++-
 tools/lib/bpf/libbpf.c                        |   1 +
 tools/testing/selftests/bpf/Makefile          |   8 +-
 tools/testing/selftests/bpf/bpf_helpers.h     |   6 +
 tools/testing/selftests/bpf/test_rawir.sh     |  37 ++
 .../selftests/bpf/test_rawir_event_kern.c     |  26 ++
 .../selftests/bpf/test_rawir_event_user.c     | 130 +++++++
 18 files changed, 792 insertions(+), 8 deletions(-)
 create mode 100644 drivers/media/rc/bpf-rawir-event.c
 create mode 100644 include/linux/bpf_rcdev.h
 create mode 100755 tools/testing/selftests/bpf/test_rawir.sh
 create mode 100644 tools/testing/selftests/bpf/test_rawir_event_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_rawir_event_user.c

-- 
2.17.0

^ permalink raw reply

* [PATCH v3 1/2] media: rc: introduce BPF_PROG_RAWIR_EVENT
From: Sean Young @ 2018-05-16 21:04 UTC (permalink / raw)
  To: linux-media, linux-kernel, Alexei Starovoitov,
	Mauro Carvalho Chehab, Daniel Borkmann, netdev, Matthias Reichl,
	Devin Heitmueller, Y Song
In-Reply-To: <cover.1526504511.git.sean@mess.org>

Add support for BPF_PROG_RAWIR_EVENT. This type of BPF program can call
rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report
that the last key should be repeated.

The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall;
the target_fd must be the /dev/lircN device.

Signed-off-by: Sean Young <sean@mess.org>
---
 drivers/media/rc/Kconfig           |  13 ++
 drivers/media/rc/Makefile          |   1 +
 drivers/media/rc/bpf-rawir-event.c | 363 +++++++++++++++++++++++++++++
 drivers/media/rc/lirc_dev.c        |  24 ++
 drivers/media/rc/rc-core-priv.h    |  24 ++
 drivers/media/rc/rc-ir-raw.c       |  14 +-
 include/linux/bpf_rcdev.h          |  30 +++
 include/linux/bpf_types.h          |   3 +
 include/uapi/linux/bpf.h           |  55 ++++-
 kernel/bpf/syscall.c               |   7 +
 10 files changed, 531 insertions(+), 3 deletions(-)
 create mode 100644 drivers/media/rc/bpf-rawir-event.c
 create mode 100644 include/linux/bpf_rcdev.h

diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
index eb2c3b6eca7f..2172d65b0213 100644
--- a/drivers/media/rc/Kconfig
+++ b/drivers/media/rc/Kconfig
@@ -25,6 +25,19 @@ config LIRC
 	   passes raw IR to and from userspace, which is needed for
 	   IR transmitting (aka "blasting") and for the lirc daemon.
 
+config BPF_RAWIR_EVENT
+	bool "Support for eBPF programs attached to lirc devices"
+	depends on BPF_SYSCALL
+	depends on RC_CORE=y
+	depends on LIRC
+	help
+	   Allow attaching eBPF programs to a lirc device using the bpf(2)
+	   syscall command BPF_PROG_ATTACH. This is supported for raw IR
+	   receivers.
+
+	   These eBPF programs can be used to decode IR into scancodes, for
+	   IR protocols not supported by the kernel decoders.
+
 menuconfig RC_DECODERS
 	bool "Remote controller decoders"
 	depends on RC_CORE
diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile
index 2e1c87066f6c..74907823bef8 100644
--- a/drivers/media/rc/Makefile
+++ b/drivers/media/rc/Makefile
@@ -5,6 +5,7 @@ obj-y += keymaps/
 obj-$(CONFIG_RC_CORE) += rc-core.o
 rc-core-y := rc-main.o rc-ir-raw.o
 rc-core-$(CONFIG_LIRC) += lirc_dev.o
+rc-core-$(CONFIG_BPF_RAWIR_EVENT) += bpf-rawir-event.o
 obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o
 obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o
 obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o
diff --git a/drivers/media/rc/bpf-rawir-event.c b/drivers/media/rc/bpf-rawir-event.c
new file mode 100644
index 000000000000..7cb48b8d87b5
--- /dev/null
+++ b/drivers/media/rc/bpf-rawir-event.c
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: GPL-2.0
+// bpf-rawir-event.c - handles bpf
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf_rcdev.h>
+#include "rc-core-priv.h"
+
+/*
+ * BPF interface for raw IR
+ */
+const struct bpf_prog_ops rawir_event_prog_ops = {
+};
+
+BPF_CALL_1(bpf_rc_repeat, struct bpf_rawir_event*, event)
+{
+	struct ir_raw_event_ctrl *ctrl;
+
+	ctrl = container_of(event, struct ir_raw_event_ctrl, bpf_rawir_event);
+
+	rc_repeat(ctrl->dev);
+
+	return 0;
+}
+
+static const struct bpf_func_proto rc_repeat_proto = {
+	.func	   = bpf_rc_repeat,
+	.gpl_only  = true, /* rc_repeat is EXPORT_SYMBOL_GPL */
+	.ret_type  = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_4(bpf_rc_keydown, struct bpf_rawir_event*, event, u32, protocol,
+	   u32, scancode, u32, toggle)
+{
+	struct ir_raw_event_ctrl *ctrl;
+
+	ctrl = container_of(event, struct ir_raw_event_ctrl, bpf_rawir_event);
+
+	rc_keydown(ctrl->dev, protocol, scancode, toggle != 0);
+
+	return 0;
+}
+
+static const struct bpf_func_proto rc_keydown_proto = {
+	.func	   = bpf_rc_keydown,
+	.gpl_only  = true, /* rc_keydown is EXPORT_SYMBOL_GPL */
+	.ret_type  = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_ANYTHING,
+	.arg3_type = ARG_ANYTHING,
+	.arg4_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+rawir_event_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_rc_repeat:
+		return &rc_repeat_proto;
+	case BPF_FUNC_rc_keydown:
+		return &rc_keydown_proto;
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_ktime_get_ns:
+		return &bpf_ktime_get_ns_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
+	case BPF_FUNC_get_prandom_u32:
+		return &bpf_get_prandom_u32_proto;
+	case BPF_FUNC_trace_printk:
+		if (capable(CAP_SYS_ADMIN))
+			return bpf_get_trace_printk_proto();
+		/* fall through */
+	default:
+		return NULL;
+	}
+}
+
+static bool rawir_event_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					const struct bpf_prog *prog,
+					struct bpf_insn_access_aux *info)
+{
+	/* struct bpf_rawir_event has two u32 fields */
+	if (type == BPF_WRITE)
+		return false;
+
+	if (size != sizeof(__u32))
+		return false;
+
+	if (!(off == offsetof(struct bpf_rawir_event, duration) ||
+	      off == offsetof(struct bpf_rawir_event, type)))
+		return false;
+
+	return true;
+}
+
+const struct bpf_verifier_ops rawir_event_verifier_ops = {
+	.get_func_proto  = rawir_event_func_proto,
+	.is_valid_access = rawir_event_is_valid_access
+};
+
+#define BPF_MAX_PROGS 64
+
+static int rc_dev_bpf_attach(struct rc_dev *rcdev, struct bpf_prog *prog)
+{
+	struct ir_raw_event_ctrl *raw;
+	struct bpf_prog_list *pl;
+	int ret, size;
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		return ret;
+
+	raw = rcdev->raw;
+	if (!raw) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	size = 0;
+	list_for_each_entry(pl, &raw->progs, node) {
+		if (pl->prog == prog) {
+			ret = -EEXIST;
+			goto out;
+		}
+
+		size++;
+	}
+
+	if (size >= BPF_MAX_PROGS) {
+		ret = -E2BIG;
+		goto out;
+	}
+
+	pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+	if (!pl) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	pl->prog = prog;
+	list_add(&pl->node, &raw->progs);
+out:
+	mutex_unlock(&ir_raw_handler_lock);
+	return ret;
+}
+
+static int rc_dev_bpf_detach(struct rc_dev *rcdev, struct bpf_prog *prog)
+{
+	struct ir_raw_event_ctrl *raw;
+	struct bpf_prog_list *pl, *tmp;
+	int ret;
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW)
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		return ret;
+
+	raw = rcdev->raw;
+	if (!raw) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ret = -ENOENT;
+
+	list_for_each_entry_safe(pl, tmp, &raw->progs, node) {
+		if (pl->prog == prog) {
+			list_del(&pl->node);
+			kfree(pl);
+			bpf_prog_put(prog);
+			ret = 0;
+			goto out;
+		}
+	}
+out:
+	mutex_unlock(&ir_raw_handler_lock);
+	return ret;
+}
+
+void rc_dev_bpf_init(struct rc_dev *rcdev)
+{
+	INIT_LIST_HEAD(&rcdev->raw->progs);
+}
+
+void rc_dev_bpf_run(struct rc_dev *rcdev, struct ir_raw_event ev)
+{
+	struct ir_raw_event_ctrl *raw = rcdev->raw;
+	struct bpf_prog_list *pl;
+
+	if (list_empty(&raw->progs))
+		return;
+
+	if (unlikely(ev.carrier_report)) {
+		raw->bpf_rawir_event.carrier = ev.carrier;
+		raw->bpf_rawir_event.type = BPF_RAWIR_EVENT_CARRIER;
+	} else {
+		raw->bpf_rawir_event.duration = ev.duration;
+
+		if (ev.pulse)
+			raw->bpf_rawir_event.type = BPF_RAWIR_EVENT_PULSE;
+		else if (ev.timeout)
+			raw->bpf_rawir_event.type = BPF_RAWIR_EVENT_TIMEOUT;
+		else if (ev.reset)
+			raw->bpf_rawir_event.type = BPF_RAWIR_EVENT_RESET;
+		else
+			raw->bpf_rawir_event.type = BPF_RAWIR_EVENT_SPACE;
+	}
+
+	list_for_each_entry(pl, &raw->progs, node)
+		BPF_PROG_RUN(pl->prog, &raw->bpf_rawir_event);
+}
+
+void rc_dev_bpf_free(struct rc_dev *rcdev)
+{
+	struct bpf_prog_list *pl, *tmp;
+
+	list_for_each_entry_safe(pl, tmp, &rcdev->raw->progs, node) {
+		list_del(&pl->node);
+		bpf_prog_put(pl->prog);
+		kfree(pl);
+	}
+}
+
+int rc_dev_prog_attach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct rc_dev *rcdev;
+	int ret;
+
+	if (attr->attach_flags)
+		return -EINVAL;
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd,
+				 BPF_PROG_TYPE_RAWIR_EVENT);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	rcdev = rc_dev_get_from_fd(attr->target_fd);
+	if (IS_ERR(rcdev)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(rcdev);
+	}
+
+	ret = rc_dev_bpf_attach(rcdev, prog);
+	if (ret)
+		bpf_prog_put(prog);
+
+	put_device(&rcdev->dev);
+
+	return ret;
+}
+
+int rc_dev_prog_detach(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct rc_dev *rcdev;
+	int ret;
+
+	if (attr->attach_flags)
+		return -EINVAL;
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd,
+				 BPF_PROG_TYPE_RAWIR_EVENT);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	rcdev = rc_dev_get_from_fd(attr->target_fd);
+	if (IS_ERR(rcdev)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(rcdev);
+	}
+
+	ret = rc_dev_bpf_detach(rcdev, prog);
+
+	bpf_prog_put(prog);
+	put_device(&rcdev->dev);
+
+	return ret;
+}
+
+int rc_dev_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	struct ir_raw_event_ctrl *raw;
+	struct bpf_prog_list *pl;
+	struct rc_dev *rcdev;
+	u32 cnt, flags = 0, *ids, i;
+	int ret;
+
+	if (attr->query.query_flags)
+		return -EINVAL;
+
+	rcdev = rc_dev_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(rcdev))
+		return PTR_ERR(rcdev);
+
+	if (rcdev->driver_type != RC_DRIVER_IR_RAW) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = mutex_lock_interruptible(&ir_raw_handler_lock);
+	if (ret)
+		goto out;
+
+	raw = rcdev->raw;
+	if (!raw) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	cnt = 0;
+	list_for_each_entry(pl, &raw->progs, node)
+		cnt++;
+
+	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (attr->query.prog_cnt != 0 && prog_ids && cnt) {
+		if (attr->query.prog_cnt < cnt)
+			cnt = attr->query.prog_cnt;
+
+		ids = kmalloc_array(cnt, sizeof(u32), GFP_KERNEL);
+		if (!ids) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		i = 0;
+		list_for_each_entry(pl, &raw->progs, node) {
+			ids[i++] = pl->prog->aux->id;
+			if (i == cnt)
+				break;
+		}
+
+		ret = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
+	}
+out:
+	mutex_unlock(&ir_raw_handler_lock);
+	put_device(&rcdev->dev);
+
+	return ret;
+}
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index 24e9fbb80e81..193540ded626 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/device.h>
+#include <linux/file.h>
 #include <linux/idr.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
@@ -816,4 +817,27 @@ void __exit lirc_dev_exit(void)
 	unregister_chrdev_region(lirc_base_dev, RC_DEV_MAX);
 }
 
+struct rc_dev *rc_dev_get_from_fd(int fd)
+{
+	struct fd f = fdget(fd);
+	struct lirc_fh *fh;
+	struct rc_dev *dev;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &lirc_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	fh = f.file->private_data;
+	dev = fh->rc;
+
+	get_device(&dev->dev);
+	fdput(f);
+
+	return dev;
+}
+
 MODULE_ALIAS("lirc_dev");
diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h
index e0e6a17460f6..148db73cfa0c 100644
--- a/drivers/media/rc/rc-core-priv.h
+++ b/drivers/media/rc/rc-core-priv.h
@@ -13,6 +13,7 @@
 #define	MAX_IR_EVENT_SIZE	512
 
 #include <linux/slab.h>
+#include <uapi/linux/bpf.h>
 #include <media/rc-core.h>
 
 /**
@@ -57,6 +58,11 @@ struct ir_raw_event_ctrl {
 	/* raw decoder state follows */
 	struct ir_raw_event prev_ev;
 	struct ir_raw_event this_ev;
+
+#ifdef CONFIG_BPF_RAWIR_EVENT
+	struct bpf_rawir_event		bpf_rawir_event;
+	struct list_head		progs;
+#endif
 	struct nec_dec {
 		int state;
 		unsigned count;
@@ -126,6 +132,9 @@ struct ir_raw_event_ctrl {
 	} imon;
 };
 
+/* Mutex for locking raw IR processing and handler change */
+extern struct mutex ir_raw_handler_lock;
+
 /* macros for IR decoders */
 static inline bool geq_margin(unsigned d1, unsigned d2, unsigned margin)
 {
@@ -288,6 +297,7 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev);
 void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc);
 int ir_lirc_register(struct rc_dev *dev);
 void ir_lirc_unregister(struct rc_dev *dev);
+struct rc_dev *rc_dev_get_from_fd(int fd);
 #else
 static inline int lirc_dev_init(void) { return 0; }
 static inline void lirc_dev_exit(void) {}
@@ -299,4 +309,18 @@ static inline int ir_lirc_register(struct rc_dev *dev) { return 0; }
 static inline void ir_lirc_unregister(struct rc_dev *dev) { }
 #endif
 
+/*
+ * bpf interface
+ */
+#ifdef CONFIG_BPF_RAWIR_EVENT
+void rc_dev_bpf_init(struct rc_dev *dev);
+void rc_dev_bpf_free(struct rc_dev *dev);
+void rc_dev_bpf_run(struct rc_dev *dev, struct ir_raw_event ev);
+#else
+static inline void rc_dev_bpf_init(struct rc_dev *dev) { }
+static inline void rc_dev_bpf_free(struct rc_dev *dev) { }
+static inline void rc_dev_bpf_run(struct rc_dev *dev, struct ir_raw_event ev)
+{ }
+#endif
+
 #endif /* _RC_CORE_PRIV */
diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c
index 374f83105a23..e68cdd4fbf5d 100644
--- a/drivers/media/rc/rc-ir-raw.c
+++ b/drivers/media/rc/rc-ir-raw.c
@@ -14,7 +14,7 @@
 static LIST_HEAD(ir_raw_client_list);
 
 /* Used to handle IR raw handler extensions */
-static DEFINE_MUTEX(ir_raw_handler_lock);
+DEFINE_MUTEX(ir_raw_handler_lock);
 static LIST_HEAD(ir_raw_handler_list);
 static atomic64_t available_protocols = ATOMIC64_INIT(0);
 
@@ -32,6 +32,7 @@ static int ir_raw_event_thread(void *data)
 				    handler->protocols || !handler->protocols)
 					handler->decode(raw->dev, ev);
 			ir_lirc_raw_event(raw->dev, ev);
+			rc_dev_bpf_run(raw->dev, ev);
 			raw->prev_ev = ev;
 		}
 		mutex_unlock(&ir_raw_handler_lock);
@@ -572,6 +573,7 @@ int ir_raw_event_prepare(struct rc_dev *dev)
 	spin_lock_init(&dev->raw->edge_spinlock);
 	timer_setup(&dev->raw->edge_handle, ir_raw_edge_handle, 0);
 	INIT_KFIFO(dev->raw->kfifo);
+	rc_dev_bpf_init(dev);
 
 	return 0;
 }
@@ -621,9 +623,17 @@ void ir_raw_event_unregister(struct rc_dev *dev)
 	list_for_each_entry(handler, &ir_raw_handler_list, list)
 		if (handler->raw_unregister)
 			handler->raw_unregister(dev);
-	mutex_unlock(&ir_raw_handler_lock);
+
+	rc_dev_bpf_free(dev);
 
 	ir_raw_event_free(dev);
+
+	/*
+	 * A user can be calling bpf(BPF_PROG_{QUERY|ATTACH|DETACH}), so
+	 * ensure that the raw member is null on unlock; this is how
+	 * "device gone" is checked.
+	 */
+	mutex_unlock(&ir_raw_handler_lock);
 }
 
 /*
diff --git a/include/linux/bpf_rcdev.h b/include/linux/bpf_rcdev.h
new file mode 100644
index 000000000000..17a30f30436a
--- /dev/null
+++ b/include/linux/bpf_rcdev.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BPF_RCDEV_H
+#define _BPF_RCDEV_H
+
+#include <linux/bpf.h>
+#include <uapi/linux/bpf.h>
+
+#ifdef CONFIG_BPF_RAWIR_EVENT
+int rc_dev_prog_attach(const union bpf_attr *attr);
+int rc_dev_prog_detach(const union bpf_attr *attr);
+int rc_dev_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr);
+#else
+static inline int rc_dev_prog_attach(const union bpf_attr *attr)
+{
+	return -EINVAL;
+}
+
+static inline int rc_dev_prog_detach(const union bpf_attr *attr)
+{
+	return -EINVAL;
+}
+
+static inline int rc_dev_prog_query(const union bpf_attr *attr,
+				    union bpf_attr __user *uattr)
+{
+	return -EINVAL;
+}
+#endif
+
+#endif /* _BPF_RCDEV_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index b67f8793de0d..e2b1b12474d4 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -25,6 +25,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #endif
+#ifdef CONFIG_BPF_RAWIR_EVENT
+BPF_PROG_TYPE(BPF_PROG_TYPE_RAWIR_EVENT, rawir_event)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d94d333a8225..243e141e8a5b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -141,6 +141,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_MSG,
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	BPF_PROG_TYPE_RAWIR_EVENT,
 };
 
 enum bpf_attach_type {
@@ -158,6 +159,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_CONNECT,
 	BPF_CGROUP_INET4_POST_BIND,
 	BPF_CGROUP_INET6_POST_BIND,
+	BPF_RAWIR_EVENT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1902,6 +1904,35 @@ union bpf_attr {
  *		egress otherwise). This is the only flag supported for now.
  *	Return
  *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u32 scancode, u32 toggle)
+ *	Description
+ *		Report decoded scancode with toggle value. For use in
+ *		BPF_PROG_TYPE_RAWIR_EVENT, to report a successfully
+ *		decoded scancode. This is will generate a keydown event,
+ *		and a keyup event once the scancode is no longer repeated.
+ *
+ *		*ctx* pointer to bpf_rawir_event, *protocol* is decoded
+ *		protocol (see RC_PROTO_* enum).
+ *
+ *		Some protocols include a toggle bit, in case the button
+ *		was released and pressed again between consecutive scancodes,
+ *		copy this bit into *toggle* if it exists, else set to 0.
+ *
+ *     Return
+ *		Always return 0 (for now)
+ *
+ * int bpf_rc_repeat(void *ctx)
+ *	Description
+ *		Repeat the last decoded scancode; some IR protocols like
+ *		NEC have a special IR message for repeat last button,
+ *		in case user is holding a button down; the scancode is
+ *		not repeated.
+ *
+ *		*ctx* pointer to bpf_rawir_event.
+ *
+ *     Return
+ *		Always return 0 (for now)
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1976,7 +2007,9 @@ union bpf_attr {
 	FN(fib_lookup),			\
 	FN(sock_hash_update),		\
 	FN(msg_redirect_hash),		\
-	FN(sk_redirect_hash),
+	FN(sk_redirect_hash),		\
+	FN(rc_repeat),			\
+	FN(rc_keydown),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2043,6 +2076,26 @@ enum bpf_hdr_start_off {
 	BPF_HDR_START_NET,
 };
 
+/*
+ * user accessible mirror of in-kernel ir_raw_event
+ */
+#define BPF_RAWIR_EVENT_SPACE		0
+#define BPF_RAWIR_EVENT_PULSE		1
+#define BPF_RAWIR_EVENT_TIMEOUT		2
+#define BPF_RAWIR_EVENT_RESET		3
+#define BPF_RAWIR_EVENT_CARRIER		4
+#define BPF_RAWIR_EVENT_DUTY_CYCLE	5
+
+struct bpf_rawir_event {
+	union {
+		__u32	duration;
+		__u32	carrier;
+		__u32	duty_cycle;
+	};
+
+	__u32	type;
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e2aeb5e89f44..75c089f407c8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,6 +11,7 @@
  */
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/bpf_rcdev.h>
 #include <linux/btf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
@@ -1567,6 +1568,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+	case BPF_RAWIR_EVENT:
+		return rc_dev_prog_attach(attr);
 	default:
 		return -EINVAL;
 	}
@@ -1637,6 +1640,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+	case BPF_RAWIR_EVENT:
+		return rc_dev_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}
@@ -1684,6 +1689,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 		break;
+	case BPF_RAWIR_EVENT:
+		return rc_dev_prog_query(attr, uattr);
 	default:
 		return -EINVAL;
 	}
-- 
2.17.0

^ permalink raw reply related

* [PATCH v3 2/2] bpf: add selftest for rawir_event type program
From: Sean Young @ 2018-05-16 21:04 UTC (permalink / raw)
  To: linux-media, linux-kernel, Alexei Starovoitov,
	Mauro Carvalho Chehab, Daniel Borkmann, netdev, Matthias Reichl,
	Devin Heitmueller, Y Song
In-Reply-To: <cover.1526504511.git.sean@mess.org>

This is simple test over rc-loopback.

Signed-off-by: Sean Young <sean@mess.org>
---
 tools/bpf/bpftool/prog.c                      |   1 +
 tools/include/uapi/linux/bpf.h                |  57 +++++++-
 tools/lib/bpf/libbpf.c                        |   1 +
 tools/testing/selftests/bpf/Makefile          |   8 +-
 tools/testing/selftests/bpf/bpf_helpers.h     |   6 +
 tools/testing/selftests/bpf/test_rawir.sh     |  37 +++++
 .../selftests/bpf/test_rawir_event_kern.c     |  26 ++++
 .../selftests/bpf/test_rawir_event_user.c     | 130 ++++++++++++++++++
 8 files changed, 261 insertions(+), 5 deletions(-)
 create mode 100755 tools/testing/selftests/bpf/test_rawir.sh
 create mode 100644 tools/testing/selftests/bpf/test_rawir_event_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_rawir_event_user.c

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 9bdfdf2d3fbe..8889a4ee8577 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -71,6 +71,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_SK_MSG]		= "sk_msg",
 	[BPF_PROG_TYPE_RAW_TRACEPOINT]	= "raw_tracepoint",
 	[BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
+	[BPF_PROG_TYPE_RAWIR_EVENT]	= "rawir_event",
 };
 
 static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1205d86a7a29..243e141e8a5b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -141,6 +141,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_MSG,
 	BPF_PROG_TYPE_RAW_TRACEPOINT,
 	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	BPF_PROG_TYPE_RAWIR_EVENT,
 };
 
 enum bpf_attach_type {
@@ -158,6 +159,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_INET6_CONNECT,
 	BPF_CGROUP_INET4_POST_BIND,
 	BPF_CGROUP_INET6_POST_BIND,
+	BPF_RAWIR_EVENT,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1829,7 +1831,6 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- *
  * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
  *	Description
  *		Do FIB lookup in kernel tables using parameters in *params*.
@@ -1856,6 +1857,7 @@ union bpf_attr {
  *             Egress device index on success, 0 if packet needs to continue
  *             up the stack for further processing or a negative error in case
  *             of failure.
+ *
  * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
  *	Description
  *		Add an entry to, or update a sockhash *map* referencing sockets.
@@ -1902,6 +1904,35 @@ union bpf_attr {
  *		egress otherwise). This is the only flag supported for now.
  *	Return
  *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u32 scancode, u32 toggle)
+ *	Description
+ *		Report decoded scancode with toggle value. For use in
+ *		BPF_PROG_TYPE_RAWIR_EVENT, to report a successfully
+ *		decoded scancode. This is will generate a keydown event,
+ *		and a keyup event once the scancode is no longer repeated.
+ *
+ *		*ctx* pointer to bpf_rawir_event, *protocol* is decoded
+ *		protocol (see RC_PROTO_* enum).
+ *
+ *		Some protocols include a toggle bit, in case the button
+ *		was released and pressed again between consecutive scancodes,
+ *		copy this bit into *toggle* if it exists, else set to 0.
+ *
+ *     Return
+ *		Always return 0 (for now)
+ *
+ * int bpf_rc_repeat(void *ctx)
+ *	Description
+ *		Repeat the last decoded scancode; some IR protocols like
+ *		NEC have a special IR message for repeat last button,
+ *		in case user is holding a button down; the scancode is
+ *		not repeated.
+ *
+ *		*ctx* pointer to bpf_rawir_event.
+ *
+ *     Return
+ *		Always return 0 (for now)
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1976,7 +2007,9 @@ union bpf_attr {
 	FN(fib_lookup),			\
 	FN(sock_hash_update),		\
 	FN(msg_redirect_hash),		\
-	FN(sk_redirect_hash),
+	FN(sk_redirect_hash),		\
+	FN(rc_repeat),			\
+	FN(rc_keydown),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2043,6 +2076,26 @@ enum bpf_hdr_start_off {
 	BPF_HDR_START_NET,
 };
 
+/*
+ * user accessible mirror of in-kernel ir_raw_event
+ */
+#define BPF_RAWIR_EVENT_SPACE		0
+#define BPF_RAWIR_EVENT_PULSE		1
+#define BPF_RAWIR_EVENT_TIMEOUT		2
+#define BPF_RAWIR_EVENT_RESET		3
+#define BPF_RAWIR_EVENT_CARRIER		4
+#define BPF_RAWIR_EVENT_DUTY_CYCLE	5
+
+struct bpf_rawir_event {
+	union {
+		__u32	duration;
+		__u32	carrier;
+		__u32	duty_cycle;
+	};
+
+	__u32	type;
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index df54c4c9e48a..372269e9053d 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1455,6 +1455,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_SK_MSG:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+	case BPF_PROG_TYPE_RAWIR_EVENT:
 		return false;
 	case BPF_PROG_TYPE_UNSPEC:
 	case BPF_PROG_TYPE_KPROBE:
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 1eb0fa2aba92..b84e36d05d34 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,7 +24,7 @@ urandom_read: urandom_read.c
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
-	test_sock test_btf test_sockmap
+	test_sock test_btf test_sockmap test_rawir_event_user
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
@@ -33,7 +33,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
-	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o
+	test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
+	test_rawir_event_kern.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -42,7 +43,8 @@ TEST_PROGS := test_kmod.sh \
 	test_xdp_meta.sh \
 	test_offload.py \
 	test_sock_addr.sh \
-	test_tunnel.sh
+	test_tunnel.sh \
+	test_rawir.sh
 
 # Compile but not part of 'make run_tests'
 TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 8f143dfb3700..26d89b7f9841 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -114,6 +114,12 @@ static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
 static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params,
 			     int plen, __u32 flags) =
 	(void *) BPF_FUNC_fib_lookup;
+static int (*bpf_rc_repeat)(void *ctx) =
+	(void *) BPF_FUNC_rc_repeat;
+static int (*bpf_rc_keydown)(void *ctx, unsigned int protocol,
+			     unsigned int scancode, unsigned int toggle) =
+	(void *) BPF_FUNC_rc_keydown;
+
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/test_rawir.sh b/tools/testing/selftests/bpf/test_rawir.sh
new file mode 100755
index 000000000000..0aa77b043ee1
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_rawir.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test bpf_rawir_event over rc-loopback. Steps for the test:
+#
+# 1. Find the /dev/lircN device for rc-loopback
+# 2. Attach bpf_rawir_event program which decodes some IR.
+# 3. Send some IR to the same IR device; since it is loopback, this will
+#    end up in the bpf program
+# 4. bpf program should decode IR and report keycode
+# 5. We can read keycode from same /dev/lirc device
+
+GREEN='\033[0;92m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+modprobe rc-loopback
+
+for i in /sys/class/rc/rc*
+do
+	if grep -q DRV_NAME=rc-loopback $i/uevent
+	then
+		LIRCDEV=$(grep DEVNAME= $i/lirc*/uevent | sed sQDEVNAME=Q/dev/Q)
+	fi
+done
+
+if [ -n $LIRCDEV ];
+then
+	TYPE=rawir_event
+	./test_rawir_event_user $LIRCDEV
+	ret=$?
+	if [ $ret -ne 0 ]; then
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+	else
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	fi
+fi
diff --git a/tools/testing/selftests/bpf/test_rawir_event_kern.c b/tools/testing/selftests/bpf/test_rawir_event_kern.c
new file mode 100644
index 000000000000..33ba5d30af62
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_rawir_event_kern.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+// test ir decoder
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("rawir_event")
+int bpf_decoder(struct bpf_rawir_event *e)
+{
+	if (e->type == BPF_RAWIR_EVENT_PULSE) {
+		/*
+		 * The lirc interface is microseconds, but here we receive
+		 * nanoseconds.
+		 */
+		int microseconds = e->duration / 1000;
+
+		if (microseconds & 0x10000)
+			bpf_rc_keydown(e, 0x40, microseconds & 0xffff, 0);
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_rawir_event_user.c b/tools/testing/selftests/bpf/test_rawir_event_user.c
new file mode 100644
index 000000000000..c3d7f2c68033
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_rawir_event_user.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0
+// test ir decoder
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+#include <linux/bpf.h>
+#include <linux/lirc.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <poll.h>
+#include <libgen.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+int main(int argc, char **argv)
+{
+	struct bpf_object *obj;
+	int ret, lircfd, progfd, mode;
+	int testir = 0x1dead;
+	u32 prog_ids[10], prog_flags[10], prog_cnt;
+
+	if (argc != 2) {
+		printf("Usage: %s /dev/lircN\n", argv[0]);
+		return 2;
+	}
+
+	ret = bpf_prog_load("test_rawir_event_kern.o",
+			    BPF_PROG_TYPE_RAWIR_EVENT, &obj, &progfd);
+	if (ret) {
+		printf("Failed to load bpf program\n");
+		return 1;
+	}
+
+	lircfd = open(argv[1], O_RDWR | O_NONBLOCK);
+	if (lircfd == -1) {
+		printf("failed to open lirc device %s: %m\n", argv[1]);
+		return 1;
+	}
+
+	/* Let's try detach it before it was ever attached */
+	ret = bpf_prog_detach2(progfd, lircfd, BPF_RAWIR_EVENT);
+	if (ret != -1 || errno != ENOENT) {
+		printf("bpf_prog_detach2 not attached should fail: %m\n");
+		return 1;
+	}
+
+	mode = LIRC_MODE_SCANCODE;
+	if (ioctl(lircfd, LIRC_SET_REC_MODE, &mode)) {
+		printf("failed to set rec mode: %m\n");
+		return 1;
+	}
+
+	prog_cnt = 10;
+	ret = bpf_prog_query(lircfd, BPF_RAWIR_EVENT, 0, prog_flags, prog_ids,
+			     &prog_cnt);
+	if (ret) {
+		printf("Failed to query bpf programs on lirc device: %m\n");
+		return 1;
+	}
+
+	if (prog_cnt != 0) {
+		printf("Expected nothing to be attached\n");
+		return 1;
+	}
+
+	ret = bpf_prog_attach(progfd, lircfd, BPF_RAWIR_EVENT, 0);
+	if (ret) {
+		printf("Failed to attach bpf to lirc device: %m\n");
+		return 1;
+	}
+
+	/* Write raw IR */
+	ret = write(lircfd, &testir, sizeof(testir));
+	if (ret != sizeof(testir)) {
+		printf("Failed to send test IR message: %m\n");
+		return 1;
+	}
+
+	struct pollfd pfd = { .fd = lircfd, .events = POLLIN };
+	struct lirc_scancode lsc;
+
+	poll(&pfd, 1, 100);
+
+	/* Read decoded IR */
+	ret = read(lircfd, &lsc, sizeof(lsc));
+	if (ret != sizeof(lsc)) {
+		printf("Failed to read decoded IR: %m\n");
+		return 1;
+	}
+
+	if (lsc.scancode != 0xdead || lsc.rc_proto != 64) {
+		printf("Incorrect scancode decoded\n");
+		return 1;
+	}
+
+	prog_cnt = 10;
+	ret = bpf_prog_query(lircfd, BPF_RAWIR_EVENT, 0, prog_flags, prog_ids,
+			     &prog_cnt);
+	if (ret) {
+		printf("Failed to query bpf programs on lirc device: %m\n");
+		return 1;
+	}
+
+	if (prog_cnt != 1) {
+		printf("Expected one program to be attached\n");
+		return 1;
+	}
+
+	/* Let's try detaching it now it is actually attached */
+	ret = bpf_prog_detach2(progfd, lircfd, BPF_RAWIR_EVENT);
+	if (ret) {
+		printf("bpf_prog_detach2: returned %m\n");
+		return 1;
+	}
+
+	return 0;
+}
-- 
2.17.0

^ permalink raw reply related

* [PATCH bpf-next] bpf: fix sock hashmap kmalloc warning
From: Yonghong Song @ 2018-05-16 21:06 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team

syzbot reported a kernel warning below:
  WARNING: CPU: 0 PID: 4499 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
  Kernel panic - not syncing: panic_on_warn set ...

  CPU: 0 PID: 4499 Comm: syz-executor050 Not tainted 4.17.0-rc3+ #9
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  Call Trace:
   __dump_stack lib/dump_stack.c:77 [inline]
   dump_stack+0x1b9/0x294 lib/dump_stack.c:113
   panic+0x22f/0x4de kernel/panic.c:184
   __warn.cold.8+0x163/0x1b3 kernel/panic.c:536
   report_bug+0x252/0x2d0 lib/bug.c:186
   fixup_bug arch/x86/kernel/traps.c:178 [inline]
   do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
   do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
   invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
  RIP: 0010:kmalloc_slab+0x56/0x70 mm/slab_common.c:996
  RSP: 0018:ffff8801d907fc58 EFLAGS: 00010246
  RAX: 0000000000000000 RBX: ffff8801aeecb280 RCX: ffffffff8185ebd7
  RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000ffffffe1
  RBP: ffff8801d907fc58 R08: ffff8801adb5e1c0 R09: ffffed0035a84700
  R10: ffffed0035a84700 R11: ffff8801ad423803 R12: ffff8801aeecb280
  R13: 00000000fffffff4 R14: ffff8801ad891a00 R15: 00000000014200c0
   __do_kmalloc mm/slab.c:3713 [inline]
   __kmalloc+0x25/0x760 mm/slab.c:3727
   kmalloc include/linux/slab.h:517 [inline]
   map_get_next_key+0x24a/0x640 kernel/bpf/syscall.c:858
   __do_sys_bpf kernel/bpf/syscall.c:2131 [inline]
   __se_sys_bpf kernel/bpf/syscall.c:2096 [inline]
   __x64_sys_bpf+0x354/0x4f0 kernel/bpf/syscall.c:2096
   do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

The test case is against sock hashmap with a key size 0xffffffe1.
Such a large key size will cause the below code in function
sock_hash_alloc() overflowing and produces a smaller elem_size,
hence map creation will be successful.
    htab->elem_size = sizeof(struct htab_elem) +
                      round_up(htab->map.key_size, 8);

Later, when map_get_next_key is called and kernel tries
to allocate the key unsuccessfully, it will issue
the above warning.

Similar to hashtab, ensure the key size is at most
MAX_BPF_STACK for a successful map creation.

Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Reported-by: syzbot+e4566d29080e7f3460ff@syzkaller.appspotmail.com
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/sockmap.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 56879c9fd3a4..79f5e8988889 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1990,6 +1990,12 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
+	if (attr->key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		return ERR_PTR(-E2BIG);
+
 	err = bpf_tcp_ulp_register();
 	if (err && err != -EEXIST)
 		return ERR_PTR(err);
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH net-next v12 2/7] sch_cake: Add ingress mode
From: Cong Wang @ 2018-05-16 21:09 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Linux Kernel Network Developers, Cake List
In-Reply-To: <152650254608.25701.5749607607862123240.stgit@alrua-kau>

On Wed, May 16, 2018 at 1:29 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
> +       if (tb[TCA_CAKE_AUTORATE]) {
> +               if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
> +                       q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
> +               else
> +                       q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
> +       }
> +
> +       if (tb[TCA_CAKE_INGRESS]) {
> +               if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
> +                       q->rate_flags |= CAKE_FLAG_INGRESS;
> +               else
> +                       q->rate_flags &= ~CAKE_FLAG_INGRESS;
> +       }
> +
>         if (tb[TCA_CAKE_MEMORY])
>                 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
>
> @@ -1559,6 +1628,14 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
>         if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
>                 goto nla_put_failure;
>
> +       if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
> +                       !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
> +               goto nla_put_failure;
> +
> +       if (nla_put_u32(skb, TCA_CAKE_INGRESS,
> +                       !!(q->rate_flags & CAKE_FLAG_INGRESS)))
> +               goto nla_put_failure;
> +

Why do you want to dump each bit of the rate_flags separately rather than
dumping the whole rate_flags as an integer?

^ permalink raw reply

* Re: [iproute2-next v2 1/1] tipc: fixed node and name table listings
From: David Ahern @ 2018-05-16 21:13 UTC (permalink / raw)
  To: Jon Maloy, stephen, netdev
  Cc: mohan.krishna.ghanta.krishnamurthy, tung.q.nguyen, hoang.h.le,
	canh.d.luu, ying.xue, tipc-discussion
In-Reply-To: <1526392461-21682-1-git-send-email-jon.maloy@ericsson.com>

On 5/15/18 7:54 AM, Jon Maloy wrote:
> We make it easier for users to correlate between 128-bit node
> identities and 32-bit node hash number by extending the 'node list'
> command to also show the hash number.
> 
> We also improve the 'nametable show' command to show the node identity
> instead of the node hash number. Since the former potentially is much
> longer than the latter, we make room for it by eliminating the (to the
> user) irrelevant publication key. We also reorder some of the columns so
> that the node id comes last, since this looks nicer and is more logical.
> 
> ---
> v2: Fixed compiler warning as per comment from David Ahern
> 
> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
> ---
>  tipc/misc.c      | 18 ++++++++++++++++++
>  tipc/misc.h      |  1 +
>  tipc/nametable.c | 18 ++++++++++--------
>  tipc/node.c      | 19 ++++++++-----------
>  tipc/peer.c      |  4 ++++
>  5 files changed, 41 insertions(+), 19 deletions(-)
> 
> diff --git a/tipc/misc.c b/tipc/misc.c
> index 16849f1..e8b726f 100644
> --- a/tipc/misc.c
> +++ b/tipc/misc.c
> @@ -13,6 +13,9 @@
>  #include <stdint.h>
>  #include <linux/tipc.h>
>  #include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/socket.h>
> +#include <errno.h>
>  #include "misc.h"
>  
>  #define IN_RANGE(val, low, high) ((val) <= (high) && (val) >= (low))
> @@ -109,3 +112,18 @@ void nodeid2str(uint8_t *id, char *str)
>  	for (i = 31; str[i] == '0'; i--)
>  		str[i] = 0;
>  }
> +
> +void hash2nodestr(uint32_t hash, char *str)
> +{
> +	struct tipc_sioc_nodeid_req nr = {};
> +	int sd;
> +
> +	sd = socket(AF_TIPC, SOCK_RDM, 0);
> +	if (sd < 0) {
> +		fprintf(stderr, "opening TIPC socket: %s\n", strerror(errno));
> +		return;
> +	}
> +	nr.peer = hash;
> +	if (!ioctl(sd, SIOCGETNODEID, &nr))
> +		nodeid2str((uint8_t *)nr.node_id, str);
> +}

you are leaking sd

^ permalink raw reply

* Re: [PATCH net-next v12 1/7] sched: Add Common Applications Kept Enhanced (cake) qdisc
From: Toke Høiland-Jørgensen @ 2018-05-16 21:13 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers, Cake List
In-Reply-To: <CAM_iQpX0mfKJWjDE_2ofKftcr2KgaOkinm70-CdNtNSBj8_cJQ@mail.gmail.com>

Cong Wang <xiyou.wangcong@gmail.com> writes:

> On Wed, May 16, 2018 at 1:29 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>> +
>> +static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
>> +{
>> +       return NULL;
>> +}
>> +
>> +static unsigned long cake_find(struct Qdisc *sch, u32 classid)
>> +{
>> +       return 0;
>> +}
>> +
>> +static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
>> +{
>> +}
>
>
> Thanks for adding the support to other TC filters, it is much better
> now!

You're welcome. Turned out not to be that hard :)

> A quick question: why class_ops->dump_stats is still NULL?
>
> It is supposed to dump the stats of each flow. Is there still any
> difficulty to map it to tc class? I thought you figured it out when
> you added the tcf_classify().

On the classify side, I solved the "multiple sets of queues" problem by
using skb->priority to select the tin (diffserv tier) and the classifier
output to select the queue within that tin. This would not work for
dumping stats; some other way of mapping queues to the linear class
space would be needed. And since we are not actually collecting any
per-flow stats that I could print, I thought it wasn't worth coming up
with a half-baked proposal for this just to add an API hook that no one
in the existing CAKE user base has ever asked for...

-Toke

^ permalink raw reply

* Re: [PATCH bpf-next] bpf: fix sock hashmap kmalloc warning
From: John Fastabend @ 2018-05-16 21:14 UTC (permalink / raw)
  To: Yonghong Song, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180516210626.776403-1-yhs@fb.com>

On 05/16/2018 02:06 PM, Yonghong Song wrote:
> syzbot reported a kernel warning below:
>   WARNING: CPU: 0 PID: 4499 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 mm/slab_common.c:996
>   Kernel panic - not syncing: panic_on_warn set ...
> 
>   CPU: 0 PID: 4499 Comm: syz-executor050 Not tainted 4.17.0-rc3+ #9
>   Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
>   Call Trace:
>    __dump_stack lib/dump_stack.c:77 [inline]
>    dump_stack+0x1b9/0x294 lib/dump_stack.c:113
>    panic+0x22f/0x4de kernel/panic.c:184
>    __warn.cold.8+0x163/0x1b3 kernel/panic.c:536
>    report_bug+0x252/0x2d0 lib/bug.c:186
>    fixup_bug arch/x86/kernel/traps.c:178 [inline]
>    do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
>    do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
>    invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
>   RIP: 0010:kmalloc_slab+0x56/0x70 mm/slab_common.c:996
>   RSP: 0018:ffff8801d907fc58 EFLAGS: 00010246
>   RAX: 0000000000000000 RBX: ffff8801aeecb280 RCX: ffffffff8185ebd7
>   RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000ffffffe1
>   RBP: ffff8801d907fc58 R08: ffff8801adb5e1c0 R09: ffffed0035a84700
>   R10: ffffed0035a84700 R11: ffff8801ad423803 R12: ffff8801aeecb280
>   R13: 00000000fffffff4 R14: ffff8801ad891a00 R15: 00000000014200c0
>    __do_kmalloc mm/slab.c:3713 [inline]
>    __kmalloc+0x25/0x760 mm/slab.c:3727
>    kmalloc include/linux/slab.h:517 [inline]
>    map_get_next_key+0x24a/0x640 kernel/bpf/syscall.c:858
>    __do_sys_bpf kernel/bpf/syscall.c:2131 [inline]
>    __se_sys_bpf kernel/bpf/syscall.c:2096 [inline]
>    __x64_sys_bpf+0x354/0x4f0 kernel/bpf/syscall.c:2096
>    do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
>    entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> The test case is against sock hashmap with a key size 0xffffffe1.
> Such a large key size will cause the below code in function
> sock_hash_alloc() overflowing and produces a smaller elem_size,
> hence map creation will be successful.
>     htab->elem_size = sizeof(struct htab_elem) +
>                       round_up(htab->map.key_size, 8);
> 
> Later, when map_get_next_key is called and kernel tries
> to allocate the key unsuccessfully, it will issue
> the above warning.
> 
> Similar to hashtab, ensure the key size is at most
> MAX_BPF_STACK for a successful map creation.
> 
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Reported-by: syzbot+e4566d29080e7f3460ff@syzkaller.appspotmail.com
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  kernel/bpf/sockmap.c | 6 ++++++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 56879c9fd3a4..79f5e8988889 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -1990,6 +1990,12 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
>  	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
>  		return ERR_PTR(-EINVAL);
>  
> +	if (attr->key_size > MAX_BPF_STACK)
> +		/* eBPF programs initialize keys on stack, so they cannot be
> +		 * larger than max stack size
> +		 */
> +		return ERR_PTR(-E2BIG);
> +
>  	err = bpf_tcp_ulp_register();
>  	if (err && err != -EEXIST)
>  		return ERR_PTR(err);
> 

Thanks!

Acked-by: John Fastabend <john.fastabend@gmail.com>

^ permalink raw reply

* Re: [PATCH net-next v12 4/7] sch_cake: Add NAT awareness to packet classifier
From: Toke Høiland-Jørgensen @ 2018-05-16 21:14 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers, Cake List
In-Reply-To: <CAM_iQpUCB24x7d7oDYkes_ngQ==8CjiwGqCBit-BK5NxzzZODg@mail.gmail.com>

Cong Wang <xiyou.wangcong@gmail.com> writes:

> On Wed, May 16, 2018 at 1:29 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>> When CAKE is deployed on a gateway that also performs NAT (which is a
>> common deployment mode), the host fairness mechanism cannot distinguish
>> internal hosts from each other, and so fails to work correctly.
>>
>> To fix this, we add an optional NAT awareness mode, which will query the
>> kernel conntrack mechanism to obtain the pre-NAT addresses for each packet
>> and use that in the flow and host hashing.
>>
>> When the shaper is enabled and the host is already performing NAT, the cost
>> of this lookup is negligible. However, in unlimited mode with no NAT being
>> performed, there is a significant CPU cost at higher bandwidths. For this
>> reason, the feature is turned off by default.
>>
>> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
>> ---
>>  net/sched/sch_cake.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 73 insertions(+)
>>
>> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
>> index 65439b643c92..e1038a7b6686 100644
>> --- a/net/sched/sch_cake.c
>> +++ b/net/sched/sch_cake.c
>> @@ -71,6 +71,12 @@
>>  #include <net/tcp.h>
>>  #include <net/flow_dissector.h>
>>
>> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
>> +#include <net/netfilter/nf_conntrack_core.h>
>> +#include <net/netfilter/nf_conntrack_zones.h>
>> +#include <net/netfilter/nf_conntrack.h>
>> +#endif
>> +
>>  #define CAKE_SET_WAYS (8)
>>  #define CAKE_MAX_TINS (8)
>>  #define CAKE_QUEUES (1024)
>> @@ -514,6 +520,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
>>         return drop;
>>  }
>>
>> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
>> +
>> +static void cake_update_flowkeys(struct flow_keys *keys,
>> +                                const struct sk_buff *skb)
>> +{
>> +       const struct nf_conntrack_tuple *tuple;
>> +       enum ip_conntrack_info ctinfo;
>> +       struct nf_conn *ct;
>> +       bool rev = false;
>> +
>> +       if (tc_skb_protocol(skb) != htons(ETH_P_IP))
>> +               return;
>> +
>> +       ct = nf_ct_get(skb, &ctinfo);
>> +       if (ct) {
>> +               tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
>> +       } else {
>> +               const struct nf_conntrack_tuple_hash *hash;
>> +               struct nf_conntrack_tuple srctuple;
>> +
>> +               if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
>> +                                      NFPROTO_IPV4, dev_net(skb->dev),
>> +                                      &srctuple))
>> +                       return;
>> +
>> +               hash = nf_conntrack_find_get(dev_net(skb->dev),
>> +                                            &nf_ct_zone_dflt,
>> +                                            &srctuple);
>> +               if (!hash)
>> +                       return;
>> +
>> +               rev = true;
>> +               ct = nf_ct_tuplehash_to_ctrack(hash);
>> +               tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
>> +       }
>> +
>> +       keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip;
>> +       keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip;
>> +
>> +       if (keys->ports.ports) {
>> +               keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all;
>> +               keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all;
>> +       }
>> +       if (rev)
>> +               nf_ct_put(ct);
>> +}
>> +#else
>> +static void cake_update_flowkeys(struct flow_keys *keys,
>> +                                const struct sk_buff *skb)
>> +{
>> +       /* There is nothing we can do here without CONNTRACK */
>> +}
>> +#endif
>> +
>>  /* Cake has several subtle multiple bit settings. In these cases you
>>   *  would be matching triple isolate mode as well.
>>   */
>> @@ -541,6 +601,9 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
>>         skb_flow_dissect_flow_keys(skb, &keys,
>>                                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
>>
>> +       if (flow_mode & CAKE_FLOW_NAT_FLAG)
>> +               cake_update_flowkeys(&keys, skb);
>> +
>>         /* flow_hash_from_keys() sorts the addresses by value, so we have
>>          * to preserve their order in a separate data structure to treat
>>          * src and dst host addresses as independently selectable.
>> @@ -1727,6 +1790,12 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
>>                 q->flow_mode = (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
>>                                 CAKE_FLOW_MASK);
>>
>> +       if (tb[TCA_CAKE_NAT]) {
>> +               q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
>> +               q->flow_mode |= CAKE_FLOW_NAT_FLAG *
>> +                       !!nla_get_u32(tb[TCA_CAKE_NAT]);
>> +       }
>
>
> I think it's better to return -EOPNOTSUPP when CONFIG_NF_CONNTRACK
> is not enabled.

Good point, will fix :)

-Toke

^ permalink raw reply

* Re: [PATCH net-next v12 2/7] sch_cake: Add ingress mode
From: Toke Høiland-Jørgensen @ 2018-05-16 21:16 UTC (permalink / raw)
  To: Cong Wang; +Cc: Linux Kernel Network Developers, Cake List
In-Reply-To: <CAM_iQpVz9bRDkW_cqj-vo6fapXaG6zV-bWM0Sc2wmHuj-vzWqg@mail.gmail.com>

Cong Wang <xiyou.wangcong@gmail.com> writes:

> On Wed, May 16, 2018 at 1:29 PM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>> +       if (tb[TCA_CAKE_AUTORATE]) {
>> +               if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
>> +                       q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
>> +               else
>> +                       q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
>> +       }
>> +
>> +       if (tb[TCA_CAKE_INGRESS]) {
>> +               if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
>> +                       q->rate_flags |= CAKE_FLAG_INGRESS;
>> +               else
>> +                       q->rate_flags &= ~CAKE_FLAG_INGRESS;
>> +       }
>> +
>>         if (tb[TCA_CAKE_MEMORY])
>>                 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
>>
>> @@ -1559,6 +1628,14 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
>>         if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
>>                 goto nla_put_failure;
>>
>> +       if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
>> +                       !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
>> +               goto nla_put_failure;
>> +
>> +       if (nla_put_u32(skb, TCA_CAKE_INGRESS,
>> +                       !!(q->rate_flags & CAKE_FLAG_INGRESS)))
>> +               goto nla_put_failure;
>> +
>
> Why do you want to dump each bit of the rate_flags separately rather than
> dumping the whole rate_flags as an integer?

Well, these were added one at a time, each as a new option. Isn't that
more or less congruent with how netlink attributes are supposed to be
used?

-Toke

^ permalink raw reply

* Re: [PATCH iproute2-next] tc-netem: fix limit description in man page
From: David Ahern @ 2018-05-16 21:17 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner, netdev
In-Reply-To: <adc166a5db72cf5c658cbc68a7d733c0793e31d4.1526431697.git.mleitner@redhat.com>

On 5/15/18 6:49 PM, Marcelo Ricardo Leitner wrote:
> As the kernel code says, limit is actually the amount of packets it can
> hold queued at a time, as per:
> 
> static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
>                          struct sk_buff **to_free)
> {
> 	...
>         if (unlikely(sch->q.qlen >= sch->limit))
>                 return qdisc_drop_all(skb, sch, to_free);
> 
> So lets fix the description of the field in the man page.
> 
> Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
> ---
>  man/man8/tc-netem.8 | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 

applied to iproute2-next. Thanks,

^ permalink raw reply

* Re: [PATCH 00/14] Modify action API for implementing lockless actions
From: Vlad Buslov @ 2018-05-16 21:23 UTC (permalink / raw)
  To: Roman Mashak
  Cc: Jamal Hadi Salim, Linux Kernel Network Developers, David Miller,
	Cong Wang, Jiri Pirko, pablo, kadlec, fw, ast, Daniel Borkmann,
	Eric Dumazet, kliteyn
In-Reply-To: <85efib33kr.fsf@mojatatu.com>


On Wed 16 May 2018 at 17:36, Roman Mashak <mrv@mojatatu.com> wrote:
> Vlad Buslov <vladbu@mellanox.com> writes:
>
>> On Wed 16 May 2018 at 14:38, Roman Mashak <mrv@mojatatu.com> wrote:
>>> On Wed, May 16, 2018 at 2:43 AM, Vlad Buslov <vladbu@mellanox.com> wrote:
>>>>>>>> I'm trying to run tdc, but keep getting following error even on clean
>>>>>>>> branch without my patches:
>>>>>>>
>>>>>>> Vlad, not sure if you saw my email:
>>>>>>> Apply Roman's patch and try again
>>>>>>>
>>>>>>> https://marc.info/?l=linux-netdev&m=152639369112020&w=2
>>>>>>>
>>>>>>> cheers,
>>>>>>> jamal
>>>>>>
>>>>>> With patch applied I get following error:
>>>>>>
>>>>>> Test 7d50: Add skbmod action to set destination mac
>>>>>> exit: 255 0
>>>>>> dst MAC address <11:22:33:44:55:66>
>>>>>> RTNETLINK answers: No such file or directory
>>>>>> We have an error talking to the kernel
>>>>>>
>>>>>
>>>>> You may actually have broken something with your patches in this case.
>>>>
>>>> Results is for net-next without my patches.
>>>
>>> Do you have skbmod compiled in kernel or as a module?
>>
>> Thanks, already figured out that default config has some actions
>> disabled.
>> Have more errors now. Everything related to ife:
>>
>> Test 7682: Create valid ife encode action with mark and pass control
>> exit: 255 0
>> IFE type 0xED3E
>> RTNETLINK answers: No such file or directory
>> We have an error talking to the kernel
>>
>> Test ef47: Create valid ife encode action with mark and pipe control
>> exit: 255 0
>> IFE type 0xED3E
>> RTNETLINK answers: No space left on device
>> We have an error talking to the kernel
>>
>> Test df43: Create valid ife encode action with mark and continue control
>> exit: 255 0
>> IFE type 0xED3E
>> RTNETLINK answers: No space left on device
>> We have an error talking to the kernel
>>
>> Test e4cf: Create valid ife encode action with mark and drop control
>> exit: 255 0
>> IFE type 0xED3E
>> RTNETLINK answers: No space left on device
>> We have an error talking to the kernel
>>
>> Test ccba: Create valid ife encode action with mark and reclassify control
>> exit: 255 0
>> IFE type 0xED3E
>> RTNETLINK answers: No space left on device
>> We have an error talking to the kernel
>>
>> Test a1cf: Create valid ife encode action with mark and jump control
>> exit: 255 0
>> IFE type 0xED3E
>> RTNETLINK answers: No space left on device
>> We have an error talking to the kernel
>>
>> ...
>>
>>
>
> Please make sure you have these in your kernel config:
>
> CONFIG_NET_ACT_IFE=y
> CONFIG_NET_IFE_SKBMARK=m
> CONFIG_NET_IFE_SKBPRIO=m
> CONFIG_NET_IFE_SKBTCINDEX=m
>
> For tdc to run all the tests, it is assumed that all the supported tc
> actions/filters are enabled and compiled.

Enabling these options allowed all ife tests to pass. Thanks!

Error in u32 test still appears however:

Test e9a3: Add u32 with source match

-----> prepare stage *** Could not execute: "$TC qdisc add dev $DEV1 ingress"

-----> prepare stage *** Error message: "Cannot find device "v0p1"

^ permalink raw reply

* Re: [PATCH iproute2-next] tc-netem: fix limit description in man page
From: Stephen Hemminger @ 2018-05-16 21:28 UTC (permalink / raw)
  To: David Ahern; +Cc: Marcelo Ricardo Leitner, netdev
In-Reply-To: <700b8303-28df-d991-6410-2f9d06f2b70d@gmail.com>

On Wed, 16 May 2018 15:17:50 -0600
David Ahern <dsahern@gmail.com> wrote:

> On 5/15/18 6:49 PM, Marcelo Ricardo Leitner wrote:
> > As the kernel code says, limit is actually the amount of packets it can
> > hold queued at a time, as per:
> > 
> > static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> >                          struct sk_buff **to_free)
> > {
> > 	...
> >         if (unlikely(sch->q.qlen >= sch->limit))
> >                 return qdisc_drop_all(skb, sch, to_free);
> > 
> > So lets fix the description of the field in the man page.
> > 
> > Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
> > ---
> >  man/man8/tc-netem.8 | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >   
> 
> applied to iproute2-next. Thanks,
> 

Since it is an error, I will put it in master.

^ permalink raw reply

* Re: [PATCH 00/14] Modify action API for implementing lockless actions
From: Vlad Buslov @ 2018-05-16 21:29 UTC (permalink / raw)
  To: Davide Caratti
  Cc: Roman Mashak, Jamal Hadi Salim, Linux Kernel Network Developers,
	David Miller, Cong Wang, Jiri Pirko, pablo, kadlec, fw, ast,
	Daniel Borkmann, Eric Dumazet, kliteyn
In-Reply-To: <bd6cf2d13402fae390e29dba8919a5a388481b5e.camel@redhat.com>


On Wed 16 May 2018 at 18:10, Davide Caratti <dcaratti@redhat.com> wrote:
> On Wed, 2018-05-16 at 13:36 -0400, Roman Mashak wrote:
>> Vlad Buslov <vladbu@mellanox.com> writes:
>> 
>> > On Wed 16 May 2018 at 14:38, Roman Mashak <mrv@mojatatu.com> wrote:
>> > > On Wed, May 16, 2018 at 2:43 AM, Vlad Buslov <vladbu@mellanox.com> wrote:
>> > > > > > > > I'm trying to run tdc, but keep getting following error even on clean
>> > > > > > > > branch without my patches:
>> > > > > > > 
>> > > > > > > Vlad, not sure if you saw my email:
>> > > > > > > Apply Roman's patch and try again
>> > > > > > > 
>> > > > > > > https://marc.info/?l=linux-netdev&m=152639369112020&w=2
>> > > > > > > 
>> > > > > > > cheers,
>> > > > > > > jamal
>> > > > > > 
>> > > > > > With patch applied I get following error:
>> > > > > > 
>> > > > > > Test 7d50: Add skbmod action to set destination mac
>> > > > > > exit: 255 0
>> > > > > > dst MAC address <11:22:33:44:55:66>
>> > > > > > RTNETLINK answers: No such file or directory
>> > > > > > We have an error talking to the kernel
>> > > > > > 
>> > > > > 
>> > > > > You may actually have broken something with your patches in this case.
>> > > > 
>> > > > Results is for net-next without my patches.
>> > > 
>> > > Do you have skbmod compiled in kernel or as a module?
>> > 
>> > Thanks, already figured out that default config has some actions
>> > disabled.
>> > Have more errors now. Everything related to ife:
>> > 
>> > Test 7682: Create valid ife encode action with mark and pass control
>> > exit: 255 0
>> > IFE type 0xED3E
>> > RTNETLINK answers: No such file or directory
>> > We have an error talking to the kernel
>> > 
>> > Test ef47: Create valid ife encode action with mark and pipe control
>> > exit: 255 0
>> > IFE type 0xED3E
>> > RTNETLINK answers: No space left on device
>> > We have an error talking to the kernel
>> > 
>> > Test df43: Create valid ife encode action with mark and continue control
>> > exit: 255 0
>> > IFE type 0xED3E
>> > RTNETLINK answers: No space left on device
>> > We have an error talking to the kernel
>> > 
>> > Test e4cf: Create valid ife encode action with mark and drop control
>> > exit: 255 0
>> > IFE type 0xED3E
>> > RTNETLINK answers: No space left on device
>> > We have an error talking to the kernel
>> > 
>> > Test ccba: Create valid ife encode action with mark and reclassify control
>> > exit: 255 0
>> > IFE type 0xED3E
>> > RTNETLINK answers: No space left on device
>> > We have an error talking to the kernel
>> > 
>> > Test a1cf: Create valid ife encode action with mark and jump control
>> > exit: 255 0
>> > IFE type 0xED3E
>> > RTNETLINK answers: No space left on device
>> > We have an error talking to the kernel
>> > 
>> > ...
>> > 
>> > 
>> 
>> Please make sure you have these in your kernel config:
>> 
>> CONFIG_NET_ACT_IFE=y
>> CONFIG_NET_IFE_SKBMARK=m
>> CONFIG_NET_IFE_SKBPRIO=m
>> CONFIG_NET_IFE_SKBTCINDEX=m
>> 
>> For tdc to run all the tests, it is assumed that all the supported tc
>> actions/filters are enabled and compiled.
> hello,
>
> looking at ife.json, it seems that we have at least 4 typos in
> 'teardown'. 
>
> It does
>
> $TC actions flush action skbedit
>
> in place of 
>
> $TC actions flush action ife
>
> On my fedora28 (with fedora28 kernel), fixing them made test 7682 return
> 'ok' (and all others in ife category, except ee94, 7ee0 and 0a7d).
>
> regards,

I can confirm that on net-next kernel version that I use, there are also
multiple teardowns of actions type skbedit after actually creating ife
action in file ife.json. However, tests pass when I enabled config
options that Roman suggested:

ok 119 - 7682 # Create valid ife encode action with mark and pass control

^ permalink raw reply

* Re: [PATCH bpf-next v6 1/4] bpf: sockmap, refactor sockmap routines to work with hashmap
From: John Fastabend @ 2018-05-16 21:46 UTC (permalink / raw)
  To: Daniel Borkmann, ast; +Cc: netdev, davem
In-Reply-To: <954e3b7f-9001-e528-c6ab-f8f69c84cfd8@iogearbox.net>

On 05/15/2018 12:19 PM, Daniel Borkmann wrote:
> On 05/14/2018 07:00 PM, John Fastabend wrote:
> [...]


[...]

> 
> As you say in the comment above the function wrt locking notes that the
> __sock_map_ctx_update_elem() can be called concurrently.
> 
>   All operations operate on sock_map using cmpxchg and xchg operations to ensure we
>   do not get stale references. Any reads into the map must be done with READ_ONCE()
>   because of this.
> 
> You initially use the READ_ONCE() on the verdict/parse/tx_msg, but later on when
> grabbing the reference you use again progs->bpf_verdict/bpf_parse/bpf_tx_msg which
> would potentially refetch it, but if updates would happen concurrently e.g. to the
> three progs, they could be NULL in the mean-time, no? bpf_prog_inc_not_zero() would
> then crash. Why are not the ones used that you fetched previously via READ_ONCE()
> for taking the ref?

Nice catch. We should use the reference fetched by READ_ONCE in all cases.

> 
> The second question I had is that verdict/parse/tx_msg are updated independently
> from each other and each could be NULL or non-NULL. What if, say, parse is NULL
> and verdict as well as tx_msg is non-NULL and the bpf_prog_inc_not_zero() on the
> tx_msg prog fails. Doesn't this cause a use-after-free since a ref on verdict wasn't
> taken earlier but the bpf_prog_put() will cause accidental misbalance/free of the
> progs?

Also good catch. I'll send patches for both now. Thanks.

> 
> It would probably help to clarify the locking comment a bit more if indeed the
> above should be okay as is.
> 
> Thanks,
> Daniel
> 

^ permalink raw reply

* [bpf PATCH 1/2] bpf: sockmap update rollback on error can incorrectly dec prog refcnt
From: John Fastabend @ 2018-05-16 21:46 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev

If the user were to only attach one of the parse or verdict programs
then it is possible a subsequent sockmap update could incorrectly
decrement the refcnt on the program. This happens because in the
rollback logic, after an error, we have to decrement the program
reference count when its been incremented. However, we only increment
the program reference count if the user has both a verdict and a
parse program. The reason for this is because, at least at the
moment, both are required for any one to be meaningful. The problem
fixed here is in the rollback path we decrement the program refcnt
even if only one existing. But we never incremented the refcnt in
the first place creating an imbalance.

This patch fixes the error path to handle this case.

Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 kernel/bpf/sockmap.c |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 098eca5..f03aaa8 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1717,10 +1717,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	if (tx_msg) {
 		tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
 		if (IS_ERR(tx_msg)) {
-			if (verdict)
-				bpf_prog_put(verdict);
-			if (parse)
+			if (parse && verdict) {
 				bpf_prog_put(parse);
+				bpf_prog_put(verdict);
+			}
 			return PTR_ERR(tx_msg);
 		}
 	}
@@ -1805,10 +1805,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 out_free:
 	smap_release_sock(psock, sock);
 out_progs:
-	if (verdict)
-		bpf_prog_put(verdict);
-	if (parse)
+	if (parse && verdict) {
 		bpf_prog_put(parse);
+		bpf_prog_put(verdict);
+	}
 	if (tx_msg)
 		bpf_prog_put(tx_msg);
 	write_unlock_bh(&sock->sk_callback_lock);

^ permalink raw reply related

* [bpf PATCH 2/2] bpf: parse and verdict prog attach may race with bpf map update
From: John Fastabend @ 2018-05-16 21:46 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev
In-Reply-To: <20180516214651.6664.62408.stgit@john-Precision-Tower-5810>

In the sockmap design BPF programs (SK_SKB_STREAM_PARSER and
SK_SKB_STREAM_VERDICT) are attached to the sockmap map type and when
a sock is added to the map the programs are used by the socket.
However, sockmap updates from both userspace and BPF programs can
happen concurrently with the attach and detach of these programs.

To resolve this we use the bpf_prog_inc_not_zero and a READ_ONCE()
primitive to ensure the program pointer is not refeched and
possibly NULL'd before the refcnt increment. This happens inside
a RCU critical section so although the pointer reference in the map
object may be NULL (by a concurrent detach operation) the reference
from READ_ONCE will not be free'd until after grace period. This
ensures the object returned by READ_ONCE() is valid through the
RCU criticl section and safe to use as long as we "know" it may
be free'd shortly.

Daniel spotted a case in the sock update API where instead of using
the READ_ONCE() program reference we used the pointer from the
original map, stab->bpf_{verdict|parse}. The problem with this is
the logic checks the object returned from the READ_ONCE() is not
NULL and then tries to reference the object again but using the
above map pointer, which may have already been NULL'd by a parallel
detach operation. If this happened bpf_porg_inc_not_zero could
dereference a NULL pointer.

Fix this by using variable returned by READ_ONCE() that is checked
for NULL.

Fixes: 2f857d04601a ("bpf: sockmap, remove STRPARSER map_flags and add multi-map support")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 kernel/bpf/sockmap.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index f03aaa8..583c1eb 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1703,11 +1703,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		 * we increment the refcnt. If this is the case abort with an
 		 * error.
 		 */
-		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+		verdict = bpf_prog_inc_not_zero(verdict);
 		if (IS_ERR(verdict))
 			return PTR_ERR(verdict);
 
-		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+		parse = bpf_prog_inc_not_zero(parse);
 		if (IS_ERR(parse)) {
 			bpf_prog_put(verdict);
 			return PTR_ERR(parse);

^ permalink raw reply related

* Re: [PATCH 00/14] Modify action API for implementing lockless actions
From: Jiri Pirko @ 2018-05-16 21:51 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: Roman Mashak, Jamal Hadi Salim, Linux Kernel Network Developers,
	David Miller, Cong Wang, pablo, kadlec, fw, ast, Daniel Borkmann,
	Eric Dumazet, kliteyn
In-Reply-To: <vbfpo1vpa5e.fsf@reg-r-vrt-018-180.mtr.labs.mlnx>

Wed, May 16, 2018 at 11:23:41PM CEST, vladbu@mellanox.com wrote:
>
>On Wed 16 May 2018 at 17:36, Roman Mashak <mrv@mojatatu.com> wrote:
>> Vlad Buslov <vladbu@mellanox.com> writes:
>>
>>> On Wed 16 May 2018 at 14:38, Roman Mashak <mrv@mojatatu.com> wrote:
>>>> On Wed, May 16, 2018 at 2:43 AM, Vlad Buslov <vladbu@mellanox.com> wrote:
>>>>>>>>> I'm trying to run tdc, but keep getting following error even on clean
>>>>>>>>> branch without my patches:
>>>>>>>>
>>>>>>>> Vlad, not sure if you saw my email:
>>>>>>>> Apply Roman's patch and try again
>>>>>>>>
>>>>>>>> https://marc.info/?l=linux-netdev&m=152639369112020&w=2
>>>>>>>>
>>>>>>>> cheers,
>>>>>>>> jamal
>>>>>>>
>>>>>>> With patch applied I get following error:
>>>>>>>
>>>>>>> Test 7d50: Add skbmod action to set destination mac
>>>>>>> exit: 255 0
>>>>>>> dst MAC address <11:22:33:44:55:66>
>>>>>>> RTNETLINK answers: No such file or directory
>>>>>>> We have an error talking to the kernel
>>>>>>>
>>>>>>
>>>>>> You may actually have broken something with your patches in this case.
>>>>>
>>>>> Results is for net-next without my patches.
>>>>
>>>> Do you have skbmod compiled in kernel or as a module?
>>>
>>> Thanks, already figured out that default config has some actions
>>> disabled.
>>> Have more errors now. Everything related to ife:
>>>
>>> Test 7682: Create valid ife encode action with mark and pass control
>>> exit: 255 0
>>> IFE type 0xED3E
>>> RTNETLINK answers: No such file or directory
>>> We have an error talking to the kernel
>>>
>>> Test ef47: Create valid ife encode action with mark and pipe control
>>> exit: 255 0
>>> IFE type 0xED3E
>>> RTNETLINK answers: No space left on device
>>> We have an error talking to the kernel
>>>
>>> Test df43: Create valid ife encode action with mark and continue control
>>> exit: 255 0
>>> IFE type 0xED3E
>>> RTNETLINK answers: No space left on device
>>> We have an error talking to the kernel
>>>
>>> Test e4cf: Create valid ife encode action with mark and drop control
>>> exit: 255 0
>>> IFE type 0xED3E
>>> RTNETLINK answers: No space left on device
>>> We have an error talking to the kernel
>>>
>>> Test ccba: Create valid ife encode action with mark and reclassify control
>>> exit: 255 0
>>> IFE type 0xED3E
>>> RTNETLINK answers: No space left on device
>>> We have an error talking to the kernel
>>>
>>> Test a1cf: Create valid ife encode action with mark and jump control
>>> exit: 255 0
>>> IFE type 0xED3E
>>> RTNETLINK answers: No space left on device
>>> We have an error talking to the kernel
>>>
>>> ...
>>>
>>>
>>
>> Please make sure you have these in your kernel config:
>>
>> CONFIG_NET_ACT_IFE=y
>> CONFIG_NET_IFE_SKBMARK=m
>> CONFIG_NET_IFE_SKBPRIO=m
>> CONFIG_NET_IFE_SKBTCINDEX=m

Roman, could you please add this to some file? Something similar to:
tools/testing/selftests/net/forwarding/config

Thanks!

>>
>> For tdc to run all the tests, it is assumed that all the supported tc
>> actions/filters are enabled and compiled.
>
>Enabling these options allowed all ife tests to pass. Thanks!
>
>Error in u32 test still appears however:
>
>Test e9a3: Add u32 with source match
>
>-----> prepare stage *** Could not execute: "$TC qdisc add dev $DEV1 ingress"
>
>-----> prepare stage *** Error message: "Cannot find device "v0p1"

^ permalink raw reply

* Re: [PATCH bpf-next v5 3/6] bpf: Add IPv6 Segment Routing helpers
From: Mathieu Xhonneux @ 2018-05-16 21:59 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: netdev, David Lebrun, Alexei Starovoitov
In-Reply-To: <ae7d9849-1124-2d96-1304-4693a1827729@iogearbox.net>

 2018-05-14 23:40 GMT+01:00 Daniel Borkmann <daniel@iogearbox.net>:
> On 05/12/2018 07:25 PM, Mathieu Xhonneux wrote:
> [...]
>> +BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
>> +        const void *, from, u32, len)
>> +{
>> +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
>> +     struct seg6_bpf_srh_state *srh_state =
>> +             this_cpu_ptr(&seg6_bpf_srh_states);
>> +     void *srh_tlvs, *srh_end, *ptr;
>> +     struct ipv6_sr_hdr *srh;
>> +     int srhoff = 0;
>> +
>> +     if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
>> +             return -EINVAL;
>> +
>> +     srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
>> +     srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
>> +     srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
>
> Do we need to check that this cannot go out of bounds wrt skb data?
input_action_bpf_end (which calls the BPF program) already verifies
using get_srh() that the whole SRH is accessible and is not out of
bounds.
The seg6 helpers (e.g. bpf_lwt_seg6_adjust_srh) then modify
srh_state->hdrlen following the evolution of the SRH in size. I don't
think that a check on srh_end is needed here as the SRH is already
verified once, and srh_state->hdrlen is then updated to keep this
bound correct.

>> +     ptr = skb->data + offset;
>> +     if (ptr >= srh_tlvs && ptr + len <= srh_end)
>> +             srh_state->valid = 0;
>> +     else if (ptr < (void *)&srh->flags ||
>> +              ptr + len > (void *)&srh->segments)
>> +             return -EFAULT;
>> +
>> +     if (unlikely(bpf_try_make_writable(skb, offset + len)))
>> +             return -EFAULT;
>> +
>> +     memcpy(ptr, from, len);
>
> You have a use after free here. bpf_try_make_writable() is potentially changing
> underlying skb->data (e.g. see pskb_expand_head()). Therefore memcpy()'ing into
> cached ptr is invalid.
>
OK.

>> +     if (len > 0) {
>> +             ret = skb_cow_head(skb, len);
>> +             if (unlikely(ret < 0))
>> +                     return ret;
>> +
>> +             ret = bpf_skb_net_hdr_push(skb, offset, len);
>> +     } else {
>> +             ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
>> +     }
>> +     if (unlikely(ret < 0))
>> +             return ret;
>
> And here as well. You changed underlying pointers via skb_cow_head(), but in
> the error path you leave the cached pointers that now point to already freed
> buffer. Thus, you'd now be able to access the new skb data out of bounds since
> cb->data_end is still the old one due to missing bpf_compute_data_pointers(skb).
> Please fix and audit your whole series carefully against these types of subtle
> bugs.

Right.
I went through the whole series again. I found a similar mistake in
bpf_push_seg6_encap, and added also a bpf_compute_data_pointers(skb)
there. I didn't find anything else, so I hope that we're covered here
(bpf_lwt_seg6_store_bytes, bpf_lwt_seg6_adjust_srh and
bpf_push_seg6_encap are the only functions modifying the packet in
this series).

Thanks. I'll submit a v6 ASAP.

^ permalink raw reply

* Re: [PATCH bpf-next 2/7] bpf: introduce bpf subcommand BPF_PERF_EVENT_QUERY
From: Yonghong Song @ 2018-05-16 21:59 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180516112734.GE12217@hirez.programming.kicks-ass.net>



On 5/16/18 4:27 AM, Peter Zijlstra wrote:
> On Tue, May 15, 2018 at 04:45:16PM -0700, Yonghong Song wrote:
>> Currently, suppose a userspace application has loaded a bpf program
>> and attached it to a tracepoint/kprobe/uprobe, and a bpf
>> introspection tool, e.g., bpftool, wants to show which bpf program
>> is attached to which tracepoint/kprobe/uprobe. Such attachment
>> information will be really useful to understand the overall bpf
>> deployment in the system.
>>
>> There is a name field (16 bytes) for each program, which could
>> be used to encode the attachment point. There are some drawbacks
>> for this approaches. First, bpftool user (e.g., an admin) may not
>> really understand the association between the name and the
>> attachment point. Second, if one program is attached to multiple
>> places, encoding a proper name which can imply all these
>> attachments becomes difficult.
>>
>> This patch introduces a new bpf subcommand BPF_PERF_EVENT_QUERY.
>> Given a pid and fd, if the <pid, fd> is associated with a
>> tracepoint/kprobe/uprobea perf event, BPF_PERF_EVENT_QUERY will return
>>     . prog_id
>>     . tracepoint name, or
>>     . k[ret]probe funcname + offset or kernel addr, or
>>     . u[ret]probe filename + offset
>> to the userspace.
>> The user can use "bpftool prog" to find more information about
>> bpf program itself with prog_id.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/linux/trace_events.h |  15 ++++++
>>   include/uapi/linux/bpf.h     |  25 ++++++++++
>>   kernel/bpf/syscall.c         | 113 +++++++++++++++++++++++++++++++++++++++++++
>>   kernel/trace/bpf_trace.c     |  53 ++++++++++++++++++++
>>   kernel/trace/trace_kprobe.c  |  29 +++++++++++
>>   kernel/trace/trace_uprobe.c  |  22 +++++++++
>>   6 files changed, 257 insertions(+)
> 
> Why is the command called *_PERF_EVENT_* ? Are there not a lot of !perf
> places to attach BPF proglets?

Just gave a complete picture, the below are major places to attach
BPF programs:
    . perf based (through perf ioctl)
    . raw tracepoint based (through bpf interface)

    . netlink interface for tc, xdp, tunneling
    . setsockopt for socket filters
    . cgroup based (bpf attachment subcommand)
      mostly networking and io devices
    . some other networking socket related (sk_skb stream/parser/verdict,
      sk_msg verdict) through bpf attachment subcommand.

Currently, for cgroup based attachment, we have BPF_PROG_QUERY with 
input cgroup file descriptor. For other networking based queries, we
may need to enumerate tc filters, networking devices, open sockets, etc.
to get the attachment information.

So to have one BPF_QUERY command line may be too complex to
cover all cases.

But you are right that BPF_PERF_EVENT_QUERY name is too narrow since
it should be used for other (pid, fd) based queries as well (e.g., 
socket, or other potential uses in the future).

How about the subcommand name BPF_TASK_FD_QUERY and make 
bpf_attr.task_fd_query extensible?

Thanks!

^ permalink raw reply

* [PATCH] net: ethernet: ti: cpsw: disable mq feature for "AM33xx ES1.0" devices
From: Ivan Khoronzhuk @ 2018-05-16 22:21 UTC (permalink / raw)
  To: grygorii.strashko, davem
  Cc: netdev, linux-kernel, linux-omap, Ivan Khoronzhuk

The early versions of am33xx devices, related to ES1.0 SoC revision
have errata limiting mq support. That's the same errata as
commit 7da1160002f1 ("drivers: net: cpsw: add am335x errata workarround for
interrutps")

AM33xx Errata [1] Advisory 1.0.9
http://www.ti.com/lit/er/sprz360f/sprz360f.pdf

After additional investigation were found that drivers w/a is
propagated on all AM33xx SoCs and on DM814x. But the errata exists
only for ES1.0 of AM33xx family, limiting mq support for revisions
after ES1.0. So, disable mq support only for related SoCs and use
separate polls for revisions allowing mq.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---

Based on net-next/master

 drivers/net/ethernet/ti/cpsw.c | 109 ++++++++++++++++++---------------
 1 file changed, 60 insertions(+), 49 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 28d893b93d30..a7285dddfd29 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -36,6 +36,7 @@
 #include <linux/of_device.h>
 #include <linux/if_vlan.h>
 #include <linux/kmemleak.h>
+#include <linux/sys_soc.h>
 
 #include <linux/pinctrl/consumer.h>
 
@@ -957,7 +958,7 @@ static irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
+static int cpsw_tx_mq_poll(struct napi_struct *napi_tx, int budget)
 {
 	u32			ch_map;
 	int			num_tx, cur_budget, ch;
@@ -984,7 +985,21 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
 	if (num_tx < budget) {
 		napi_complete(napi_tx);
 		writel(0xff, &cpsw->wr_regs->tx_en);
-		if (cpsw->quirk_irq && cpsw->tx_irq_disabled) {
+	}
+
+	return num_tx;
+}
+
+static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
+{
+	struct cpsw_common *cpsw = napi_to_cpsw(napi_tx);
+	int num_tx;
+
+	num_tx = cpdma_chan_process(cpsw->txv[0].ch, budget);
+	if (num_tx < budget) {
+		napi_complete(napi_tx);
+		writel(0xff, &cpsw->wr_regs->tx_en);
+		if (cpsw->tx_irq_disabled) {
 			cpsw->tx_irq_disabled = false;
 			enable_irq(cpsw->irqs_table[1]);
 		}
@@ -993,7 +1008,7 @@ static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
 	return num_tx;
 }
 
-static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
+static int cpsw_rx_mq_poll(struct napi_struct *napi_rx, int budget)
 {
 	u32			ch_map;
 	int			num_rx, cur_budget, ch;
@@ -1020,7 +1035,21 @@ static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
 	if (num_rx < budget) {
 		napi_complete_done(napi_rx, num_rx);
 		writel(0xff, &cpsw->wr_regs->rx_en);
-		if (cpsw->quirk_irq && cpsw->rx_irq_disabled) {
+	}
+
+	return num_rx;
+}
+
+static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
+{
+	struct cpsw_common *cpsw = napi_to_cpsw(napi_rx);
+	int num_rx;
+
+	num_rx = cpdma_chan_process(cpsw->rxv[0].ch, budget);
+	if (num_rx < budget) {
+		napi_complete_done(napi_rx, num_rx);
+		writel(0xff, &cpsw->wr_regs->rx_en);
+		if (cpsw->rx_irq_disabled) {
 			cpsw->rx_irq_disabled = false;
 			enable_irq(cpsw->irqs_table[0]);
 		}
@@ -2364,9 +2393,9 @@ static void cpsw_get_channels(struct net_device *ndev,
 {
 	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
 
+	ch->max_rx = cpsw->quirk_irq ? 1 : CPSW_MAX_QUEUES;
+	ch->max_tx = cpsw->quirk_irq ? 1 : CPSW_MAX_QUEUES;
 	ch->max_combined = 0;
-	ch->max_rx = CPSW_MAX_QUEUES;
-	ch->max_tx = CPSW_MAX_QUEUES;
 	ch->max_other = 0;
 	ch->other_count = 0;
 	ch->rx_count = cpsw->rx_ch_num;
@@ -2377,6 +2406,11 @@ static void cpsw_get_channels(struct net_device *ndev,
 static int cpsw_check_ch_settings(struct cpsw_common *cpsw,
 				  struct ethtool_channels *ch)
 {
+	if (cpsw->quirk_irq) {
+		dev_err(cpsw->dev, "Maximum one tx/rx queue is allowed");
+		return -EOPNOTSUPP;
+	}
+
 	if (ch->combined_count)
 		return -EINVAL;
 
@@ -2917,44 +2951,20 @@ static int cpsw_probe_dual_emac(struct cpsw_priv *priv)
 	return ret;
 }
 
-#define CPSW_QUIRK_IRQ		BIT(0)
-
-static const struct platform_device_id cpsw_devtype[] = {
-	{
-		/* keep it for existing comaptibles */
-		.name = "cpsw",
-		.driver_data = CPSW_QUIRK_IRQ,
-	}, {
-		.name = "am335x-cpsw",
-		.driver_data = CPSW_QUIRK_IRQ,
-	}, {
-		.name = "am4372-cpsw",
-		.driver_data = 0,
-	}, {
-		.name = "dra7-cpsw",
-		.driver_data = 0,
-	}, {
-		/* sentinel */
-	}
-};
-MODULE_DEVICE_TABLE(platform, cpsw_devtype);
-
-enum ti_cpsw_type {
-	CPSW = 0,
-	AM335X_CPSW,
-	AM4372_CPSW,
-	DRA7_CPSW,
-};
-
 static const struct of_device_id cpsw_of_mtable[] = {
-	{ .compatible = "ti,cpsw", .data = &cpsw_devtype[CPSW], },
-	{ .compatible = "ti,am335x-cpsw", .data = &cpsw_devtype[AM335X_CPSW], },
-	{ .compatible = "ti,am4372-cpsw", .data = &cpsw_devtype[AM4372_CPSW], },
-	{ .compatible = "ti,dra7-cpsw", .data = &cpsw_devtype[DRA7_CPSW], },
+	{ .compatible = "ti,cpsw"},
+	{ .compatible = "ti,am335x-cpsw"},
+	{ .compatible = "ti,am4372-cpsw"},
+	{ .compatible = "ti,dra7-cpsw"},
 	{ /* sentinel */ },
 };
 MODULE_DEVICE_TABLE(of, cpsw_of_mtable);
 
+static const struct soc_device_attribute cpsw_soc_devices[] = {
+	{ .family = "AM33xx", .revision = "ES1.0"},
+	{ /* sentinel */ }
+};
+
 static int cpsw_probe(struct platform_device *pdev)
 {
 	struct clk			*clk;
@@ -2966,9 +2976,9 @@ static int cpsw_probe(struct platform_device *pdev)
 	void __iomem			*ss_regs;
 	void __iomem			*cpts_regs;
 	struct resource			*res, *ss_res;
-	const struct of_device_id	*of_id;
 	struct gpio_descs		*mode;
 	u32 slave_offset, sliver_offset, slave_size;
+	const struct soc_device_attribute *soc;
 	struct cpsw_common		*cpsw;
 	int ret = 0, i;
 	int irq;
@@ -3141,6 +3151,10 @@ static int cpsw_probe(struct platform_device *pdev)
 		goto clean_dt_ret;
 	}
 
+	soc = soc_device_match(cpsw_soc_devices);
+	if (soc)
+		cpsw->quirk_irq = 1;
+
 	cpsw->txv[0].ch = cpdma_chan_create(cpsw->dma, 0, cpsw_tx_handler, 0);
 	if (IS_ERR(cpsw->txv[0].ch)) {
 		dev_err(priv->dev, "error initializing tx dma channel\n");
@@ -3180,19 +3194,16 @@ static int cpsw_probe(struct platform_device *pdev)
 		goto clean_dma_ret;
 	}
 
-	of_id = of_match_device(cpsw_of_mtable, &pdev->dev);
-	if (of_id) {
-		pdev->id_entry = of_id->data;
-		if (pdev->id_entry->driver_data)
-			cpsw->quirk_irq = true;
-	}
-
 	ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_RX;
 
 	ndev->netdev_ops = &cpsw_netdev_ops;
 	ndev->ethtool_ops = &cpsw_ethtool_ops;
-	netif_napi_add(ndev, &cpsw->napi_rx, cpsw_rx_poll, CPSW_POLL_WEIGHT);
-	netif_tx_napi_add(ndev, &cpsw->napi_tx, cpsw_tx_poll, CPSW_POLL_WEIGHT);
+	netif_napi_add(ndev, &cpsw->napi_rx,
+		       cpsw->quirk_irq ? cpsw_rx_poll : cpsw_rx_mq_poll,
+		       CPSW_POLL_WEIGHT);
+	netif_tx_napi_add(ndev, &cpsw->napi_tx,
+			  cpsw->quirk_irq ? cpsw_tx_poll : cpsw_tx_mq_poll,
+			  CPSW_POLL_WEIGHT);
 	cpsw_split_res(ndev);
 
 	/* register the network device */
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH net-next] erspan: set bso bit based on mirrored packet's len
From: Tobin C. Harding @ 2018-05-16 22:24 UTC (permalink / raw)
  To: William Tu; +Cc: Linux Kernel Network Developers
In-Reply-To: <CALDO+SaDVJtagChiQdvue6tY-N8G=jAT40GcD7U9kpa9qcVvQA@mail.gmail.com>

On Wed, May 16, 2018 at 07:05:34AM -0700, William Tu wrote:
> On Mon, May 14, 2018 at 10:33 PM, Tobin C. Harding <tobin@apporbit.com> wrote:
> > On Mon, May 14, 2018 at 04:54:36PM -0700, William Tu wrote:
> >> Before the patch, the erspan BSO bit (Bad/Short/Oversized) is not
> >> handled.  BSO has 4 possible values:
> >>   00 --> Good frame with no error, or unknown integrity
> >>   11 --> Payload is a Bad Frame with CRC or Alignment Error
> >>   01 --> Payload is a Short Frame
> >>   10 --> Payload is an Oversized Frame
> >>
> >> Based the short/oversized definitions in RFC1757, the patch sets
> >> the bso bit based on the mirrored packet's size.
> >>
> >> Reported-by: Xiaoyan Jin <xiaoyanj@vmware.com>
> >> Signed-off-by: William Tu <u9012063@gmail.com>
> >> ---
> >>  include/net/erspan.h | 25 +++++++++++++++++++++++++
> >>  1 file changed, 25 insertions(+)
> >>
> >> diff --git a/include/net/erspan.h b/include/net/erspan.h
> >> index d044aa60cc76..5eb95f78ad45 100644
> >> --- a/include/net/erspan.h
> >> +++ b/include/net/erspan.h
> >> @@ -219,6 +219,30 @@ static inline __be32 erspan_get_timestamp(void)
> >>       return htonl((u32)h_usecs);
> >>  }
> >>
> >> +/* ERSPAN BSO (Bad/Short/Oversized)
> >> + *   00b --> Good frame with no error, or unknown integrity
> >> + *   01b --> Payload is a Short Frame
> >> + *   10b --> Payload is an Oversized Frame
> >> + *   11b --> Payload is a Bad Frame with CRC or Alignment Error
> >> + */
> >> +enum erspan_bso {
> >> +     BSO_NOERROR,
> >> +     BSO_SHORT,
> >> +     BSO_OVERSIZED,
> >> +     BSO_BAD,
> >> +};
> >
> > If we are relying on the values perhaps this would be clearer
> >
> >         BSO_NOERROR     = 0x00,
> >         BSO_SHORT       = 0x01,
> >         BSO_OVERSIZED   = 0x02,
> >         BSO_BAD         = 0x03,
> >
> 
> Yes, thanks. I will change in v2.
> 
> >> +
> >> +static inline u8 erspan_detect_bso(struct sk_buff *skb)
> >> +{
> >> +     if (skb->len < ETH_ZLEN)
> >> +             return BSO_SHORT;
> >> +
> >> +     if (skb->len > ETH_FRAME_LEN)
> >> +             return BSO_OVERSIZED;
> >> +
> >> +     return BSO_NOERROR;
> >> +}
> >
> > Without having much contextual knowledge around this patch; should we be
> > doing some check on CRC or alignment (at some stage)?  Having BSO_BAD
> > seems to imply so?
> >
> 
> The definition of BSO_BAD:
> etherStatsCRCAlignErrors OBJECT-TYPE
>               SYNTAX Counter
>               ACCESS read-only
>               STATUS mandatory
>               DESCRIPTION
>                   "The total number of packets received that
>                   had a length (excluding framing bits, but
>                   including FCS octets) of between 64 and 1518
>                   octets, inclusive, but but had either a bad
>                   Frame Check Sequence (FCS) with an integral
>                   number of octets (FCS Error) or a bad FCS with
>                   a non-integral number of octets (Alignment Error)."
>
> But I don't know how to check CRC error at this code point.
> Isn't it done by the NIC hardware?

I'll just start with; I don't know anything about ERSPAN

	"ERSPAN is a Cisco proprietary feature and is available only to
	Catalyst 6500, 7600, Nexus, and ASR 1000 platforms to date. The
	ASR 1000 supports ERSPAN source (monitoring) only on Fast
	Ethernet, Gigabit Ethernet, and port-channel interfaces."

https://supportforums.cisco.com/t5/network-infrastructure-documents/understanding-span-rspan-and-erspan/ta-p/3144951

I dug around a bit and none of the files that currently import erspan.h
actually use the 'bso' field

$ grep bso $(git grep -l 'erspan\.h')
include/net/erspan.h:	u8 bso = 0; /* Bad/Short/Oversized */
include/net/erspan.h:	ershdr->en = bso;
net/ipv4/ip_gre.c:	   ICMP in the real Internet is absolutely infeasible.
net/ipv4/ip_gre.c:	 * ICMP in the real Internet is absolutely infeasible.


Normally, AFAICT, the FCS does not get passed to the operating system
since its a link layer mechanism.  If ERSPAN is passing the FCS when it
mirrors frames (does it mirror frames or packets, I don't know?) then
surely ERSPAN should provide a function to return the BSO value.

So IMHO this patch seems like a just pretense and not really doing
anything.

Hope this helps,
Tobin.

^ permalink raw reply

* Re: kernel BUG at lib/string.c:LINE! (4)
From: Julian Anastasov @ 2018-05-16 22:43 UTC (permalink / raw)
  To: syzbot
  Cc: coreteam, davem, fw, horms, kadlec, linux-kernel, lvs-devel,
	netdev, netfilter-devel, pablo, syzkaller-bugs, wensong
In-Reply-To: <0000000000006764b4056c5476d9@google.com>


	Hello,

On Wed, 16 May 2018, syzbot wrote:

> Hello,
> 
> syzbot found the following crash on:
> 
> HEAD commit:    0b7d9978406f Merge branch 'Microsemi-Ocelot-Ethernet-switc..
> git tree:       net-next
> console output: https://syzkaller.appspot.com/x/log.txt?x=16e91017800000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=b632d8e2c2ab2c1
> dashboard link: https://syzkaller.appspot.com/bug?extid=aac887f77319868646df
> compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
> syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=1665d637800000
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10517107800000
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+aac887f77319868646df@syzkaller.appspotmail.com
> 
> IPVS: Unknown mcast interface: veth1_to???a????????????
> IPVS: Unknown mcast interface: veth1_to???a????????????
> IPVS: Unknown mcast interface: veth1_to???a????????????
> detected buffer overflow in strlen
> ------------[ cut here ]------------
> kernel BUG at lib/string.c:1052!
> invalid opcode: 0000 [#1] SMP KASAN
> Dumping ftrace buffer:
>   (ftrace buffer empty)
> Modules linked in:
> CPU: 1 PID: 373 Comm: syz-executor936 Not tainted 4.17.0-rc4+ #45
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google
> 01/01/2011
> RIP: 0010:fortify_panic+0x13/0x20 lib/string.c:1051
> RSP: 0018:ffff8801c976f800 EFLAGS: 00010282
> RAX: 0000000000000022 RBX: 0000000000000040 RCX: 0000000000000000
> RDX: 0000000000000022 RSI: ffffffff8160f6f1 RDI: ffffed00392edef6
> RBP: ffff8801c976f800 R08: ffff8801cf4c62c0 R09: ffffed003b5e4fb0
> R10: ffffed003b5e4fb0 R11: ffff8801daf27d87 R12: ffff8801c976fa20
> R13: ffff8801c976fae4 R14: ffff8801c976fae0 R15: 000000000000048b
> FS:  00007fd99f75e700(0000) GS:ffff8801daf00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00000000200001c0 CR3: 00000001d6843000 CR4: 00000000001406e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
> strlen include/linux/string.h:270 [inline]
> strlcpy include/linux/string.h:293 [inline]
> do_ip_vs_set_ctl+0x31c/0x1d00 net/netfilter/ipvs/ip_vs_ctl.c:2388
> nf_sockopt net/netfilter/nf_sockopt.c:106 [inline]
> nf_setsockopt+0x7d/0xd0 net/netfilter/nf_sockopt.c:115
> ip_setsockopt+0xd8/0xf0 net/ipv4/ip_sockglue.c:1253
> udp_setsockopt+0x62/0xa0 net/ipv4/udp.c:2487
> ipv6_setsockopt+0x149/0x170 net/ipv6/ipv6_sockglue.c:917
> tcp_setsockopt+0x93/0xe0 net/ipv4/tcp.c:3057
> sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3046
> __sys_setsockopt+0x1bd/0x390 net/socket.c:1903
> __do_sys_setsockopt net/socket.c:1914 [inline]
> __se_sys_setsockopt net/socket.c:1911 [inline]
> __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911
> do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
> entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x447369
> RSP: 002b:00007fd99f75dda8 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
> RAX: ffffffffffffffda RBX: 00000000006e39e4 RCX: 0000000000447369
> RDX: 000000000000048b RSI: 0000000000000000 RDI: 0000000000000003
> RBP: 0000000000000000 R08: 0000000000000018 R09: 0000000000000000
> R10: 00000000200001c0 R11: 0000000000000246 R12: 00000000006e39e0
> R13: 75a1ff93f0896195 R14: 6f745f3168746576 R15: 0000000000000001
> Code: 08 5b 41 5c 41 5d 41 5e 41 5f 5d c3 0f 0b 48 89 df e8 d2 8f 48 fa eb de
> 55 48 89 fe 48 c7 c7 60 65 64 88 48 89 e5 e8 91 dd f3 f9 <0f> 0b 90 90 90 90
> 90 90 90 90 90 90 90 55 48 89 e5 41 57 41 56
> RIP: fortify_panic+0x13/0x20 lib/string.c:1051 RSP: ffff8801c976f800
> ---[ end trace 624046f2d9af7702 ]---

	Just to let you know that I tested a patch with
the syzbot, will do more tests before submitting...

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox