Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 3/3] mlxsw: spectrum: Skip firmware version check based on devlink parameter
From: Ido Schimmel @ 2018-11-06 20:05 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: davem@davemloft.net, Jiri Pirko, Shalom Toledo, Moshe Shemesh,
	dsahern@gmail.com, jakub.kicinski@netronome.com, andrew@lunn.ch,
	f.fainelli@gmail.com, mlxsw, Ido Schimmel
In-Reply-To: <20181106200314.29918-1-idosch@mellanox.com>

From: Shalom Toledo <shalomt@mellanox.com>

Based on fw_version_check devlink parameter, skip firmware version check in
order to run the device with different firmware version than required by
the driver for testing and/or debugging purposes.

Signed-off-by: Shalom Toledo <shalomt@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c    | 13 ++++++
 drivers/net/ethernet/mellanox/mlxsw/core.h    |  2 +
 .../net/ethernet/mellanox/mlxsw/spectrum.c    | 45 +++++++++++++++++++
 3 files changed, 60 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index b2e1b83525db..281aeb1c2386 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1036,6 +1036,12 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 			goto err_devlink_register;
 	}
 
+	if (mlxsw_driver->params_register && !reload) {
+		err = mlxsw_driver->params_register(mlxsw_core);
+		if (err)
+			goto err_register_params;
+	}
+
 	err = mlxsw_hwmon_init(mlxsw_core, mlxsw_bus_info, &mlxsw_core->hwmon);
 	if (err)
 		goto err_hwmon_init;
@@ -1058,6 +1064,9 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 err_thermal_init:
 	mlxsw_hwmon_fini(mlxsw_core->hwmon);
 err_hwmon_init:
+	if (mlxsw_driver->params_unregister && !reload)
+		mlxsw_driver->params_unregister(mlxsw_core);
+err_register_params:
 	if (!reload)
 		devlink_unregister(devlink);
 err_devlink_register:
@@ -1121,6 +1130,8 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core,
 		mlxsw_core->driver->fini(mlxsw_core);
 	mlxsw_thermal_fini(mlxsw_core->thermal);
 	mlxsw_hwmon_fini(mlxsw_core->hwmon);
+	if (mlxsw_core->driver->params_unregister && !reload)
+		mlxsw_core->driver->params_unregister(mlxsw_core);
 	if (!reload)
 		devlink_unregister(devlink);
 	mlxsw_emad_fini(mlxsw_core);
@@ -1133,6 +1144,8 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core,
 	return;
 
 reload_fail_deinit:
+	if (mlxsw_core->driver->params_unregister)
+		mlxsw_core->driver->params_unregister(mlxsw_core);
 	devlink_unregister(devlink);
 	devlink_resources_unregister(devlink, NULL);
 	devlink_free(devlink);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index c35be477856f..d811be8989b0 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -282,6 +282,8 @@ struct mlxsw_driver {
 			     const struct mlxsw_config_profile *profile,
 			     u64 *p_single_size, u64 *p_double_size,
 			     u64 *p_linear_size);
+	int (*params_register)(struct mlxsw_core *mlxsw_core);
+	void (*params_unregister)(struct mlxsw_core *mlxsw_core);
 	u8 txhdr_len;
 	const struct mlxsw_config_profile *profile;
 	bool res_query_enabled;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 9bec940330a4..ab1ca8c1d6df 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -318,6 +318,7 @@ static int mlxsw_sp_fw_rev_validate(struct mlxsw_sp *mlxsw_sp)
 	const struct mlxsw_fw_rev *rev = &mlxsw_sp->bus_info->fw_rev;
 	const struct mlxsw_fw_rev *req_rev = mlxsw_sp->req_rev;
 	const char *fw_filename = mlxsw_sp->fw_filename;
+	union devlink_param_value value;
 	const struct firmware *firmware;
 	int err;
 
@@ -325,6 +326,15 @@ static int mlxsw_sp_fw_rev_validate(struct mlxsw_sp *mlxsw_sp)
 	if (!req_rev || !fw_filename)
 		return 0;
 
+	/* Don't check if devlink fw_version_check param is false */
+	err = devlink_param_driverinit_value_get(priv_to_devlink(mlxsw_sp->core),
+						 DEVLINK_PARAM_GENERIC_ID_FW_VERSION_CHECK,
+						 &value);
+	if (err)
+		return err;
+	if (!value.vbool)
+		return 0;
+
 	/* Validate driver & FW are compatible */
 	if (rev->major != req_rev->major) {
 		WARN(1, "Mismatch in major FW version [%d:%d] is never expected; Please contact support\n",
@@ -4171,6 +4181,37 @@ static int mlxsw_sp_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
 	return 0;
 }
 
+static const struct devlink_param mlxsw_sp_devlink_params[] = {
+	DEVLINK_PARAM_GENERIC(FW_VERSION_CHECK,
+			      BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+			      NULL, NULL, NULL),
+};
+
+static int mlxsw_sp_params_register(struct mlxsw_core *mlxsw_core)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_core);
+	union devlink_param_value value;
+	int err;
+
+	err = devlink_params_register(devlink, mlxsw_sp_devlink_params,
+				      ARRAY_SIZE(mlxsw_sp_devlink_params));
+	if (err)
+		return err;
+
+	value.vbool = true;
+	devlink_param_driverinit_value_set(devlink,
+					   DEVLINK_PARAM_GENERIC_ID_FW_VERSION_CHECK,
+					   value);
+	return 0;
+}
+
+static void mlxsw_sp_params_unregister(struct mlxsw_core *mlxsw_core)
+{
+	devlink_params_unregister(priv_to_devlink(mlxsw_core),
+				  mlxsw_sp_devlink_params,
+				  ARRAY_SIZE(mlxsw_sp_devlink_params));
+}
+
 static struct mlxsw_driver mlxsw_sp1_driver = {
 	.kind				= mlxsw_sp1_driver_name,
 	.priv_size			= sizeof(struct mlxsw_sp),
@@ -4192,6 +4233,8 @@ static struct mlxsw_driver mlxsw_sp1_driver = {
 	.txhdr_construct		= mlxsw_sp_txhdr_construct,
 	.resources_register		= mlxsw_sp1_resources_register,
 	.kvd_sizes_get			= mlxsw_sp_kvd_sizes_get,
+	.params_register		= mlxsw_sp_params_register,
+	.params_unregister		= mlxsw_sp_params_unregister,
 	.txhdr_len			= MLXSW_TXHDR_LEN,
 	.profile			= &mlxsw_sp1_config_profile,
 	.res_query_enabled		= true,
@@ -4217,6 +4260,8 @@ static struct mlxsw_driver mlxsw_sp2_driver = {
 	.sb_occ_tc_port_bind_get	= mlxsw_sp_sb_occ_tc_port_bind_get,
 	.txhdr_construct		= mlxsw_sp_txhdr_construct,
 	.resources_register		= mlxsw_sp2_resources_register,
+	.params_register		= mlxsw_sp_params_register,
+	.params_unregister		= mlxsw_sp_params_unregister,
 	.txhdr_len			= MLXSW_TXHDR_LEN,
 	.profile			= &mlxsw_sp2_config_profile,
 	.res_query_enabled		= true,
-- 
2.19.1

^ permalink raw reply related

* bpf-next is OPEN
From: Daniel Borkmann @ 2018-11-06 19:39 UTC (permalink / raw)
  To: netdev; +Cc: ast

Merge window is over so bpf-next is open again!

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH net-next 1/3] devlink: Add fw_version_check generic parameter
From: Jakub Kicinski @ 2018-11-06 20:19 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: netdev@vger.kernel.org, davem@davemloft.net, Jiri Pirko,
	Shalom Toledo, Moshe Shemesh, dsahern@gmail.com, andrew@lunn.ch,
	f.fainelli@gmail.com, mlxsw
In-Reply-To: <20181106200314.29918-2-idosch@mellanox.com>

On Tue, 6 Nov 2018 20:05:00 +0000, Ido Schimmel wrote:
> From: Shalom Toledo <shalomt@mellanox.com>
> 
> Many drivers checking the device's firmware version during the
> initialization flow and flashing a compatible version if the current
> version is not.
> 
> fw_version_check gives the ability to skip this check which allows to run
> the device with a different firmware version than required by the driver
> for testing and/or debugging purposes.
> 
> Signed-off-by: Shalom Toledo <shalomt@mellanox.com>
> Reviewed-by: Jiri Pirko <jiri@mellanox.com>
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>

The documentation is missing, so it's hard to comment on the definition
of the parameter...  We have a FW loading policy for NFP, too, so it'd
be good to see if we can find a common ground.

^ permalink raw reply

* [PATCH bpf-next 1/2] bpf: add perf-event notificaton support for sock_ops
From: Sowmini Varadhan @ 2018-11-06 20:28 UTC (permalink / raw)
  To: sowmini.varadhan, daniel, netdev, davem
In-Reply-To: <cover.1541535172.git.sowmini.varadhan@oracle.com>

This patch allows eBPF programs that use sock_ops to send
perf-based event notifications using bpf_perf_event_output()

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 net/core/filter.c |   19 +++++++++++++++++++
 1 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index e521c5e..23464a3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4048,6 +4048,23 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 	return ret;
 }
 
+BPF_CALL_5(bpf_sock_opts_event_output, struct bpf_sock_ops *, skops,
+	   struct bpf_map *, map, u64, flags, void *, data, u64, size)
+{
+	return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
+}
+
+static const struct bpf_func_proto bpf_sock_ops_event_output_proto =  {
+	.func		= bpf_sock_opts_event_output,
+	.gpl_only       = true,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_ANYTHING,
+	.arg4_type      = ARG_PTR_TO_MEM,
+	.arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+};
+
 static const struct bpf_func_proto bpf_setsockopt_proto = {
 	.func		= bpf_setsockopt,
 	.gpl_only	= false,
@@ -5226,6 +5243,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
+	case BPF_FUNC_perf_event_output:
+		return &bpf_sock_ops_event_output_proto;
 	case BPF_FUNC_setsockopt:
 		return &bpf_setsockopt_proto;
 	case BPF_FUNC_getsockopt:
-- 
1.7.1

^ permalink raw reply related

* [PATCH bpf-next 0/2] TCP-BPF event notification support
From: Sowmini Varadhan @ 2018-11-06 20:28 UTC (permalink / raw)
  To: sowmini.varadhan, daniel, netdev, davem

This patchset uses eBPF perf-event based notification mechanism to solve
the problem described in 
   https://marc.info/?l=linux-netdev&m=154022219423571&w=2.
Thanks to Daniel Borkmann for feedback/input.

The problem statement is
  We would like to monitor some subset of TCP sockets in user-space,
  (the monitoring application would define 4-tuples it wants to monitor)
  using TCP_INFO stats to analyze reported problems. The idea is to
  use those stats to see where the bottlenecks are likely to be ("is it
  application-limited?" or "is there evidence of BufferBloat in the
  path?" etc)

  Today we can do this by periodically polling for tcp_info, but this
  could be made more efficient if the kernel would asynchronously
  notify the application via tcp_info when some "interesting"
  thresholds (e.g., "RTT variance > X", or "total_retrans > Y" etc)
  are reached. And to make this effective, it is better if
  we could apply the threshold check *before* constructing the
  tcp_info netlink notification, so that we don't waste resources
  constructing notifications that will be discarded by the filter.

This patchset solves the problem by adding perf-event based notification
support for sock_ops (Patch1). The eBPF kernel module can thus 
be designed to apply any desired filters to the bpf_sock_ops and
trigger a perf-event notification based on the verdict from the filter.
The uspace component can use these perf-event notifications to either
read any state managed by the eBPF kernel module, or issue a TCP_INFO 
netlink call if desired.

Patch 2 provides a simple example that shows how to use this infra
(and also provides a test case for it)

Sowmini Varadhan (2):
  bpf: add perf-event notificaton support for sock_ops
  selftests/bpf: add a test case for sock_ops perf-event notification

 net/core/filter.c                                 |   19 ++
 tools/testing/selftests/bpf/Makefile              |    4 +-
 tools/testing/selftests/bpf/perf-sys.h            |   74 ++++++++
 tools/testing/selftests/bpf/test_tcpnotify.h      |   19 ++
 tools/testing/selftests/bpf/test_tcpnotify_kern.c |   95 +++++++++++
 tools/testing/selftests/bpf/test_tcpnotify_user.c |  186 +++++++++++++++++++++
 6 files changed, 396 insertions(+), 1 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/perf-sys.h
 create mode 100644 tools/testing/selftests/bpf/test_tcpnotify.h
 create mode 100644 tools/testing/selftests/bpf/test_tcpnotify_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_tcpnotify_user.c

^ permalink raw reply

* [PATCH bpf-next 2/2] selftests/bpf: add a test case for sock_ops perf-event notification
From: Sowmini Varadhan @ 2018-11-06 20:28 UTC (permalink / raw)
  To: sowmini.varadhan, daniel, netdev, davem
In-Reply-To: <cover.1541535172.git.sowmini.varadhan@oracle.com>

This patch provides a tcp_bpf based eBPF sample. The test
- ncat(1) as the TCP client program to connect() to a port
  with the intention of triggerring SYN retransmissions: we
  first install an iptables DROP rule to make sure ncat SYNs are
  resent (instead of aborting instantly after a TCP RST)
- has a bpf kernel module that sends a perf-event notification for
  each TCP retransmit, and also tracks the number of such notifications
  sent in the global_map
The test passes when the number of event notifications intercepted
in user-space matches the value in the global_map.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 tools/testing/selftests/bpf/Makefile              |    4 +-
 tools/testing/selftests/bpf/perf-sys.h            |   74 ++++++++
 tools/testing/selftests/bpf/test_tcpnotify.h      |   19 ++
 tools/testing/selftests/bpf/test_tcpnotify_kern.c |   95 +++++++++++
 tools/testing/selftests/bpf/test_tcpnotify_user.c |  186 +++++++++++++++++++++
 5 files changed, 377 insertions(+), 1 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/perf-sys.h
 create mode 100644 tools/testing/selftests/bpf/test_tcpnotify.h
 create mode 100644 tools/testing/selftests/bpf/test_tcpnotify_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_tcpnotify_user.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index e39dfb4..6c94048 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -24,12 +24,13 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
 	test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \
 	test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \
-	test_netcnt
+	test_netcnt test_tcpnotify_user
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
+	test_tcpnotify_kern.o \
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
 	test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
@@ -74,6 +75,7 @@ $(OUTPUT)/test_sock_addr: cgroup_helpers.c
 $(OUTPUT)/test_socket_cookie: cgroup_helpers.c
 $(OUTPUT)/test_sockmap: cgroup_helpers.c
 $(OUTPUT)/test_tcpbpf_user: cgroup_helpers.c
+$(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c
 $(OUTPUT)/test_progs: trace_helpers.c
 $(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c
 $(OUTPUT)/test_cgroup_storage: cgroup_helpers.c
diff --git a/tools/testing/selftests/bpf/perf-sys.h b/tools/testing/selftests/bpf/perf-sys.h
new file mode 100644
index 0000000..3eb7a39
--- /dev/null
+++ b/tools/testing/selftests/bpf/perf-sys.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PERF_SYS_H
+#define _PERF_SYS_H
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/perf_event.h>
+#include <asm/barrier.h>
+
+#ifdef __powerpc__
+#define CPUINFO_PROC	{"cpu"}
+#endif
+
+#ifdef __s390__
+#define CPUINFO_PROC	{"vendor_id"}
+#endif
+
+#ifdef __sh__
+#define CPUINFO_PROC	{"cpu type"}
+#endif
+
+#ifdef __hppa__
+#define CPUINFO_PROC	{"cpu"}
+#endif
+
+#ifdef __sparc__
+#define CPUINFO_PROC	{"cpu"}
+#endif
+
+#ifdef __alpha__
+#define CPUINFO_PROC	{"cpu model"}
+#endif
+
+#ifdef __arm__
+#define CPUINFO_PROC	{"model name", "Processor"}
+#endif
+
+#ifdef __mips__
+#define CPUINFO_PROC	{"cpu model"}
+#endif
+
+#ifdef __arc__
+#define CPUINFO_PROC	{"Processor"}
+#endif
+
+#ifdef __xtensa__
+#define CPUINFO_PROC	{"core ID"}
+#endif
+
+#ifndef CPUINFO_PROC
+#define CPUINFO_PROC	{ "model name", }
+#endif
+
+static inline int
+sys_perf_event_open(struct perf_event_attr *attr,
+		      pid_t pid, int cpu, int group_fd,
+		      unsigned long flags)
+{
+	int fd;
+
+	fd = syscall(__NR_perf_event_open, attr, pid, cpu,
+		     group_fd, flags);
+
+#ifdef HAVE_ATTR_TEST
+	if (unlikely(test_attr__enabled))
+		test_attr__open(attr, pid, cpu, fd, group_fd, flags);
+#endif
+	return fd;
+}
+
+#endif /* _PERF_SYS_H */
diff --git a/tools/testing/selftests/bpf/test_tcpnotify.h b/tools/testing/selftests/bpf/test_tcpnotify.h
new file mode 100644
index 0000000..8b6cea0
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpnotify.h
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _TEST_TCPBPF_H
+#define _TEST_TCPBPF_H
+
+struct tcpnotify_globals {
+	__u32 total_retrans;
+	__u32 ncalls;
+};
+
+struct tcp_notifier {
+	__u8    type;
+	__u8    subtype;
+	__u8    source;
+	__u8    hash;
+};
+
+#define	TESTPORT	12877
+#endif
diff --git a/tools/testing/selftests/bpf/test_tcpnotify_kern.c b/tools/testing/selftests/bpf/test_tcpnotify_kern.c
new file mode 100644
index 0000000..edbca20
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpnotify_kern.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <netinet/in.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+#include "test_tcpnotify.h"
+
+struct bpf_map_def SEC("maps") global_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct tcpnotify_globals),
+	.max_entries = 4,
+};
+
+struct bpf_map_def SEC("maps") perf_event_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(__u32),
+	.max_entries = 2,
+};
+
+int _version SEC("version") = 1;
+
+SEC("sockops")
+int bpf_testcb(struct bpf_sock_ops *skops)
+{
+	int rv = -1;
+	int op;
+
+	op = (int) skops->op;
+
+	if (bpf_ntohl(skops->remote_port) != TESTPORT) {
+		skops->reply = -1;
+		return 0;
+	}
+
+	switch (op) {
+	case BPF_SOCK_OPS_TIMEOUT_INIT:
+	case BPF_SOCK_OPS_RWND_INIT:
+	case BPF_SOCK_OPS_NEEDS_ECN:
+	case BPF_SOCK_OPS_BASE_RTT:
+	case BPF_SOCK_OPS_RTO_CB:
+		rv = 1;
+		break;
+
+	case BPF_SOCK_OPS_TCP_CONNECT_CB:
+	case BPF_SOCK_OPS_TCP_LISTEN_CB:
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		bpf_sock_ops_cb_flags_set(skops, (BPF_SOCK_OPS_RETRANS_CB_FLAG|
+					  BPF_SOCK_OPS_RTO_CB_FLAG));
+		rv = 1;
+		break;
+	case BPF_SOCK_OPS_RETRANS_CB: {
+			__u32 key = 0;
+			struct tcpnotify_globals g, *gp;
+			struct tcp_notifier msg = {
+				.type = 0xde,
+				.subtype = 0xad,
+				.source = 0xbe,
+				.hash = 0xef,
+			};
+
+			rv = 1;
+
+			/* Update results */
+			gp = bpf_map_lookup_elem(&global_map, &key);
+			if (!gp)
+				break;
+			g = *gp;
+			g.total_retrans = skops->total_retrans;
+			g.ncalls++;
+			bpf_map_update_elem(&global_map, &key, &g,
+					    BPF_ANY);
+			bpf_perf_event_output(skops, &perf_event_map,
+					      BPF_F_CURRENT_CPU,
+					      &msg, sizeof(msg));
+		}
+		break;
+	default:
+		rv = -1;
+	}
+	skops->reply = rv;
+	return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c
new file mode 100644
index 0000000..8f88cb9
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <asm/types.h>
+#include <errno.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <sys/socket.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <sys/ioctl.h>
+#include <linux/rtnetlink.h>
+#include <signal.h>
+#include <linux/perf_event.h>
+#include "perf-sys.h"
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#include "test_tcpnotify.h"
+#include "trace_helpers.h"
+
+#define SOCKET_BUFFER_SIZE (getpagesize() < 8192L ? getpagesize() : 8192L)
+
+pthread_t tid;
+int rx_callbacks;
+
+static int dummyfn(void *data, int size)
+{
+	struct tcp_notifier *t = data;
+
+	if (t->type != 0xde || t->subtype != 0xad ||
+	    t->source != 0xbe || t->hash != 0xef)
+		return 1;
+	rx_callbacks++;
+	return 0;
+}
+
+void tcp_notifier_poller(int fd)
+{
+	while (1)
+		perf_event_poller(fd, dummyfn);
+}
+
+static void *poller_thread(void *arg)
+{
+	int fd = *(int *)arg;
+
+	tcp_notifier_poller(fd);
+	return arg;
+}
+
+int verify_result(const struct tcpnotify_globals *result)
+{
+	return (result->ncalls > 0 && result->ncalls == rx_callbacks ? 0 : 1);
+}
+
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *map;
+
+	map = bpf_object__find_map_by_name(obj, name);
+	if (!map) {
+		printf("%s:FAIL:map '%s' not found\n", test, name);
+		return -1;
+	}
+	return bpf_map__fd(map);
+}
+
+static int setup_bpf_perf_event(int mapfd)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+	};
+	int key = 0;
+	int pmu_fd;
+
+	pmu_fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
+	if (pmu_fd < 0)
+		return pmu_fd;
+	bpf_map_update_elem(mapfd, &key, &pmu_fd, BPF_ANY);
+
+	ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	return pmu_fd;
+}
+
+int main(int argc, char **argv)
+{
+	const char *file = "test_tcpnotify_kern.o";
+	int prog_fd, map_fd, perf_event_fd;
+	struct tcpnotify_globals g = {0};
+	const char *cg_path = "/foo";
+	int error = EXIT_FAILURE;
+	struct bpf_object *obj;
+	int cg_fd = -1;
+	__u32 key = 0;
+	int rv;
+	char test_script[80];
+	int pmu_fd;
+	cpu_set_t cpuset;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+
+	if (setup_cgroup_environment())
+		goto err;
+
+	cg_fd = create_and_get_cgroup(cg_path);
+	if (!cg_fd)
+		goto err;
+
+	if (join_cgroup(cg_path))
+		goto err;
+
+	if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) {
+		printf("FAILED: load_bpf_file failed for: %s\n", file);
+		goto err;
+	}
+
+	rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (rv) {
+		printf("FAILED: bpf_prog_attach: %d (%s)\n",
+		       error, strerror(errno));
+		goto err;
+	}
+
+	perf_event_fd = bpf_find_map(__func__, obj, "perf_event_map");
+	if (perf_event_fd < 0)
+		goto err;
+
+	map_fd = bpf_find_map(__func__, obj, "global_map");
+	if (map_fd < 0)
+		goto err;
+
+	pmu_fd = setup_bpf_perf_event(perf_event_fd);
+	if (pmu_fd < 0 || perf_event_mmap(pmu_fd) < 0)
+		goto err;
+
+	pthread_create(&tid, NULL, poller_thread, (void *)&pmu_fd);
+
+	sprintf(test_script,
+		"/usr/sbin/iptables -A INPUT -p tcp --dport %d -j DROP",
+		TESTPORT);
+	system(test_script);
+
+	sprintf(test_script,
+		"/usr/bin/nc 127.0.0.1 %d < /etc/passwd > /dev/null 2>&1 ",
+		TESTPORT);
+	system(test_script);
+
+	sprintf(test_script,
+		"/usr/sbin/iptables -D INPUT -p tcp --dport %d -j DROP",
+		TESTPORT);
+	system(test_script);
+
+	rv = bpf_map_lookup_elem(map_fd, &key, &g);
+	if (rv != 0) {
+		printf("FAILED: bpf_map_lookup_elem returns %d\n", rv);
+		goto err;
+	}
+
+	sleep(10);
+
+	if (verify_result(&g)) {
+		printf("FAILED: Wrong stats Expected %d calls, got %d\n",
+			g.ncalls, rx_callbacks);
+		goto err;
+	}
+
+	printf("PASSED!\n");
+	error = 0;
+err:
+	bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+	close(cg_fd);
+	cleanup_cgroup_environment();
+	return error;
+}
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH net-next 1/3] devlink: Add fw_version_check generic parameter
From: Ido Schimmel @ 2018-11-06 20:37 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Ido Schimmel, netdev@vger.kernel.org, davem@davemloft.net,
	Jiri Pirko, Shalom Toledo, Moshe Shemesh, dsahern@gmail.com,
	andrew@lunn.ch, f.fainelli@gmail.com, mlxsw
In-Reply-To: <20181106121913.036b8c4d@cakuba.netronome.com>

On Tue, Nov 06, 2018 at 12:19:13PM -0800, Jakub Kicinski wrote:
> On Tue, 6 Nov 2018 20:05:00 +0000, Ido Schimmel wrote:
> > From: Shalom Toledo <shalomt@mellanox.com>
> > 
> > Many drivers checking the device's firmware version during the
> > initialization flow and flashing a compatible version if the current
> > version is not.
> > 
> > fw_version_check gives the ability to skip this check which allows to run
> > the device with a different firmware version than required by the driver
> > for testing and/or debugging purposes.
> > 
> > Signed-off-by: Shalom Toledo <shalomt@mellanox.com>
> > Reviewed-by: Jiri Pirko <jiri@mellanox.com>
> > Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> 
> The documentation is missing, so it's hard to comment on the definition
> of the parameter...  

I assume you mean Documentation/networking/devlink-params.txt ?

> We have a FW loading policy for NFP, too, so it'd be good to see if we
> can find a common ground.

If the parameter is set, then device runs with whatever firmware version
was last flashed (via ethtool, for example). Otherwise, the driver will
flash a version according to its policy. In mlxsw, it is a specific
version.

Will that work for you?

^ permalink raw reply

* [PATCH net-next 0/3] net: More extack messages
From: David Ahern @ 2018-11-06 20:51 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Add more extack messages for several link create errors (e.g., invalid
number of queues, unknown link kind) and invalid metrics argument.

David Ahern (3):
  net: Add extack argument to rtnl_create_link
  net: Add extack argument to ip_fib_metrics_init
  rtnetlink: Add more extack messages to rtnl_newlink

 drivers/net/can/vxcan.c  |  2 +-
 drivers/net/geneve.c     |  2 +-
 drivers/net/veth.c       |  2 +-
 drivers/net/vxlan.c      |  2 +-
 include/net/ip.h         |  3 ++-
 include/net/rtnetlink.h  |  3 ++-
 net/core/rtnetlink.c     | 24 ++++++++++++++++--------
 net/ipv4/fib_semantics.c |  2 +-
 net/ipv4/ip_gre.c        |  2 +-
 net/ipv4/metrics.c       | 26 +++++++++++++++++++-------
 net/ipv6/route.c         |  5 +++--
 11 files changed, 48 insertions(+), 25 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH net-next 3/3] rtnetlink: Add more extack messages to rtnl_newlink
From: David Ahern @ 2018-11-06 20:51 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern
In-Reply-To: <20181106205116.7718-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add extack arg to the nla_parse_nested calls in rtnl_newlink, and
add messages for unknown device type and link network namespace id.
In particular, it improves the failure message when the wrong link
type is used. From
    $ ip li add bond1 type bonding
    RTNETLINK answers: Operation not supported
to
    $ ip li add bond1 type bonding
    Error: Unknown device type.

(The module name is bonding but the link type is bond.)

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/core/rtnetlink.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f787b7640d49..86f2d9cbdae3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3054,7 +3054,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 			if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
 				err = nla_parse_nested(attr, ops->maxtype,
 						       linkinfo[IFLA_INFO_DATA],
-						       ops->policy, NULL);
+						       ops->policy, extack);
 				if (err < 0)
 					return err;
 				data = attr;
@@ -3076,7 +3076,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 						       m_ops->slave_maxtype,
 						       linkinfo[IFLA_INFO_SLAVE_DATA],
 						       m_ops->slave_policy,
-						       NULL);
+						       extack);
 				if (err < 0)
 					return err;
 				slave_data = slave_attr;
@@ -3140,6 +3140,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 					goto replay;
 			}
 #endif
+			NL_SET_ERR_MSG(extack, "Unknown device type");
 			return -EOPNOTSUPP;
 		}
 
@@ -3160,6 +3161,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 			link_net = get_net_ns_by_id(dest_net, id);
 			if (!link_net) {
+				NL_SET_ERR_MSG(extack, "Unknown network namespace id");
 				err =  -EINVAL;
 				goto out;
 			}
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/3] net: Add extack argument to rtnl_create_link
From: David Ahern @ 2018-11-06 20:51 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern
In-Reply-To: <20181106205116.7718-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add extack arg to rtnl_create_link and add messages for invalid
number of Tx or Rx queues.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 drivers/net/can/vxcan.c |  2 +-
 drivers/net/geneve.c    |  2 +-
 drivers/net/veth.c      |  2 +-
 drivers/net/vxlan.c     |  2 +-
 include/net/rtnetlink.h |  3 ++-
 net/core/rtnetlink.c    | 18 ++++++++++++------
 net/ipv4/ip_gre.c       |  2 +-
 7 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c
index ed6828821fbd..80af658e530d 100644
--- a/drivers/net/can/vxcan.c
+++ b/drivers/net/can/vxcan.c
@@ -207,7 +207,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev,
 		return PTR_ERR(peer_net);
 
 	peer = rtnl_create_link(peer_net, ifname, name_assign_type,
-				&vxcan_link_ops, tbp);
+				&vxcan_link_ops, tbp, extack);
 	if (IS_ERR(peer)) {
 		put_net(peer_net);
 		return PTR_ERR(peer);
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index a0cd1c41cf5f..fbfc13d81f66 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1666,7 +1666,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 
 	memset(tb, 0, sizeof(tb));
 	dev = rtnl_create_link(net, name, name_assign_type,
-			       &geneve_link_ops, tb);
+			       &geneve_link_ops, tb, NULL);
 	if (IS_ERR(dev))
 		return dev;
 
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 890fa5b905e2..f412ea1cef18 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1253,7 +1253,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
 		return PTR_ERR(net);
 
 	peer = rtnl_create_link(net, ifname, name_assign_type,
-				&veth_link_ops, tbp);
+				&veth_link_ops, tbp, extack);
 	if (IS_ERR(peer)) {
 		put_net(net);
 		return PTR_ERR(peer);
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 297cdeaef479..ae969f806d56 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3749,7 +3749,7 @@ struct net_device *vxlan_dev_create(struct net *net, const char *name,
 	memset(&tb, 0, sizeof(tb));
 
 	dev = rtnl_create_link(net, name, name_assign_type,
-			       &vxlan_link_ops, tb);
+			       &vxlan_link_ops, tb, NULL);
 	if (IS_ERR(dev))
 		return dev;
 
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index cf26e5aacac4..e2091bb2b3a8 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -159,7 +159,8 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]);
 struct net_device *rtnl_create_link(struct net *net, const char *ifname,
 				    unsigned char name_assign_type,
 				    const struct rtnl_link_ops *ops,
-				    struct nlattr *tb[]);
+				    struct nlattr *tb[],
+				    struct netlink_ext_ack *extack);
 int rtnl_delete_link(struct net_device *dev);
 int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33d9227a8b80..f787b7640d49 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2885,9 +2885,11 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
 }
 EXPORT_SYMBOL(rtnl_configure_link);
 
-struct net_device *rtnl_create_link(struct net *net,
-	const char *ifname, unsigned char name_assign_type,
-	const struct rtnl_link_ops *ops, struct nlattr *tb[])
+struct net_device *rtnl_create_link(struct net *net, const char *ifname,
+				    unsigned char name_assign_type,
+				    const struct rtnl_link_ops *ops,
+				    struct nlattr *tb[],
+				    struct netlink_ext_ack *extack)
 {
 	struct net_device *dev;
 	unsigned int num_tx_queues = 1;
@@ -2903,11 +2905,15 @@ struct net_device *rtnl_create_link(struct net *net,
 	else if (ops->get_num_rx_queues)
 		num_rx_queues = ops->get_num_rx_queues();
 
-	if (num_tx_queues < 1 || num_tx_queues > 4096)
+	if (num_tx_queues < 1 || num_tx_queues > 4096) {
+		NL_SET_ERR_MSG(extack, "Invalid number of transmit queues");
 		return ERR_PTR(-EINVAL);
+	}
 
-	if (num_rx_queues < 1 || num_rx_queues > 4096)
+	if (num_rx_queues < 1 || num_rx_queues > 4096) {
+		NL_SET_ERR_MSG(extack, "Invalid number of receive queues");
 		return ERR_PTR(-EINVAL);
+	}
 
 	dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
 			       ops->setup, num_tx_queues, num_rx_queues);
@@ -3163,7 +3169,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 		}
 
 		dev = rtnl_create_link(link_net ? : dest_net, ifname,
-				       name_assign_type, ops, tb);
+				       name_assign_type, ops, tb, extack);
 		if (IS_ERR(dev)) {
 			err = PTR_ERR(dev);
 			goto out;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 38befe829caf..2c67af644e64 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1601,7 +1601,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
 	memset(&tb, 0, sizeof(tb));
 
 	dev = rtnl_create_link(net, name, name_assign_type,
-			       &ipgre_tap_ops, tb);
+			       &ipgre_tap_ops, tb, NULL);
 	if (IS_ERR(dev))
 		return dev;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 2/3] net: Add extack argument to ip_fib_metrics_init
From: David Ahern @ 2018-11-06 20:51 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern
In-Reply-To: <20181106205116.7718-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add extack argument to ip_fib_metrics_init and add messages for invalid
metrics.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip.h         |  3 ++-
 net/ipv4/fib_semantics.c |  2 +-
 net/ipv4/metrics.c       | 26 +++++++++++++++++++-------
 net/ipv6/route.c         |  5 +++--
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 72593e171d14..462182f78236 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -421,7 +421,8 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 }
 
 struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
-					int fc_mx_len);
+					int fc_mx_len,
+					struct netlink_ext_ack *extack);
 static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics)
 {
 	if (fib_metrics != &dst_default_metrics &&
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b5c3937ca6ec..5022bc63863a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1076,7 +1076,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 	if (!fi)
 		goto failure;
 	fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
-					      cfg->fc_mx_len);
+					      cfg->fc_mx_len, extack);
 	if (unlikely(IS_ERR(fi->fib_metrics))) {
 		err = PTR_ERR(fi->fib_metrics);
 		kfree(fi);
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 6d218f5a2e71..ca9a5fefdefa 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -6,7 +6,8 @@
 #include <net/tcp.h>
 
 static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
-			      int fc_mx_len, u32 *metrics)
+			      int fc_mx_len, u32 *metrics,
+			      struct netlink_ext_ack *extack)
 {
 	bool ecn_ca = false;
 	struct nlattr *nla;
@@ -21,19 +22,26 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
 
 		if (!type)
 			continue;
-		if (type > RTAX_MAX)
+		if (type > RTAX_MAX) {
+			NL_SET_ERR_MSG(extack, "Invalid metric type");
 			return -EINVAL;
+		}
 
 		if (type == RTAX_CC_ALGO) {
 			char tmp[TCP_CA_NAME_MAX];
 
 			nla_strlcpy(tmp, nla, sizeof(tmp));
 			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
-			if (val == TCP_CA_UNSPEC)
+			if (val == TCP_CA_UNSPEC) {
+				NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
 				return -EINVAL;
+			}
 		} else {
-			if (nla_len(nla) != sizeof(u32))
+			if (nla_len(nla) != sizeof(u32)) {
+				NL_SET_ERR_MSG_ATTR(extack, nla,
+						    "Invalid attribute in metrics");
 				return -EINVAL;
+			}
 			val = nla_get_u32(nla);
 		}
 		if (type == RTAX_ADVMSS && val > 65535 - 40)
@@ -42,8 +50,10 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
 			val = 65535 - 15;
 		if (type == RTAX_HOPLIMIT && val > 255)
 			val = 255;
-		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) {
+			NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute");
 			return -EINVAL;
+		}
 		metrics[type - 1] = val;
 	}
 
@@ -54,7 +64,8 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
 }
 
 struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
-					int fc_mx_len)
+					int fc_mx_len,
+					struct netlink_ext_ack *extack)
 {
 	struct dst_metrics *fib_metrics;
 	int err;
@@ -66,7 +77,8 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
 	if (unlikely(!fib_metrics))
 		return ERR_PTR(-ENOMEM);
 
-	err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics);
+	err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics,
+				 extack);
 	if (!err) {
 		refcount_set(&fib_metrics->refcnt, 1);
 	} else {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2a7423c39456..b2447b7c7303 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2975,7 +2975,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 	if (!rt)
 		goto out;
 
-	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len);
+	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
+					       extack);
 	if (IS_ERR(rt->fib6_metrics)) {
 		err = PTR_ERR(rt->fib6_metrics);
 		/* Do not leave garbage there. */
@@ -3708,7 +3709,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
 	if (!f6i)
 		return ERR_PTR(-ENOMEM);
 
-	f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0);
+	f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
 	f6i->dst_nocount = true;
 	f6i->dst_host = true;
 	f6i->fib6_protocol = RTPROT_KERNEL;
-- 
2.11.0

^ permalink raw reply related

* [RFC perf,bpf 0/5] reveal invisible bpf programs
From: Song Liu @ 2018-11-06 20:52 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: kernel-team, Song Liu, ast, daniel, peterz, acme

This is to follow up Alexei's early effort to show bpf programs

   https://www.spinics.net/lists/netdev/msg524232.html

In this version, PERF_RECORD_BPF_EVENT is introduced to send real time BPF
load/unload events to user space. In user space, perf-record is modified
to listen to these events (through a dedicated ring buffer) and generate
detailed information about the program (struct bpf_prog_info_event). Then,
perf-report translates these events into proper symbols.

With this set, perf-report will show bpf program as:

   18.49%     0.16%  test  [kernel.vmlinux]    [k] ksys_write
   18.01%     0.47%  test  [kernel.vmlinux]    [k] vfs_write
   17.02%     0.40%  test  bpf_prog            [k] bpf_prog_F
   16.97%     0.10%  test  [kernel.vmlinux]    [k] __vfs_write
   16.86%     0.12%  test  [kernel.vmlinux]    [k] comm_write
   16.67%     0.39%  test  [kernel.vmlinux]    [k] bpf_probe_read

Note that, the program name is still work in progress, it will be cleaner
with function types in BTF.

Please share your comments on this.

Thanks,
Song

Song Liu (5):
  perf, bpf: Introduce PERF_RECORD_BPF_EVENT
  perf: sync tools/include/uapi/linux/perf_event.h
  perf util: basic handling of PERF_RECORD_BPF_EVENT
  perf util: introduce bpf_prog_info_event
  perf util: generate bpf_prog_info_event for short living bpf programs

 include/linux/perf_event.h            |   5 +
 include/uapi/linux/perf_event.h       |  27 ++-
 kernel/bpf/syscall.c                  |   4 +
 kernel/events/core.c                  |  82 +++++++-
 tools/include/uapi/linux/perf_event.h |  27 ++-
 tools/perf/builtin-record.c           |  55 +++++
 tools/perf/builtin-report.c           |   2 +
 tools/perf/util/Build                 |   2 +
 tools/perf/util/bpf-info.c            | 287 ++++++++++++++++++++++++++
 tools/perf/util/bpf-info.h            |  29 +++
 tools/perf/util/event.c               |  20 ++
 tools/perf/util/event.h               |  29 +++
 tools/perf/util/evlist.c              |  42 +++-
 tools/perf/util/evlist.h              |   4 +
 tools/perf/util/evsel.c               |   9 +
 tools/perf/util/evsel.h               |   3 +
 tools/perf/util/machine.c             |  10 +
 tools/perf/util/machine.h             |   2 +
 tools/perf/util/session.c             |   8 +
 tools/perf/util/tool.h                |   7 +-
 20 files changed, 646 insertions(+), 8 deletions(-)
 create mode 100644 tools/perf/util/bpf-info.c
 create mode 100644 tools/perf/util/bpf-info.h

^ permalink raw reply

* [RFC perf,bpf 3/5] perf util: basic handling of PERF_RECORD_BPF_EVENT
From: Song Liu @ 2018-11-06 20:52 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: kernel-team, Song Liu, ast, daniel, peterz, acme
In-Reply-To: <20181106205246.567448-1-songliubraving@fb.com>

This patch adds basic handling of PERF_RECORD_BPF_EVENT in perf util.
Following patches add more logic that leverages these events.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 tools/perf/util/event.c   | 19 +++++++++++++++++++
 tools/perf/util/event.h   | 15 +++++++++++++++
 tools/perf/util/evsel.c   |  1 +
 tools/perf/util/machine.c | 10 ++++++++++
 tools/perf/util/machine.h |  2 ++
 tools/perf/util/session.c |  4 ++++
 tools/perf/util/tool.h    |  4 +++-
 7 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 0cd42150f712..601432afbfb2 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -43,6 +43,7 @@ static const char *perf_event__names[] = {
 	[PERF_RECORD_SWITCH]			= "SWITCH",
 	[PERF_RECORD_SWITCH_CPU_WIDE]		= "SWITCH_CPU_WIDE",
 	[PERF_RECORD_NAMESPACES]		= "NAMESPACES",
+	[PERF_RECORD_BPF_EVENT]			= "BPF_EVENT",
 	[PERF_RECORD_HEADER_ATTR]		= "ATTR",
 	[PERF_RECORD_HEADER_EVENT_TYPE]		= "EVENT_TYPE",
 	[PERF_RECORD_HEADER_TRACING_DATA]	= "TRACING_DATA",
@@ -1333,6 +1334,14 @@ int perf_event__process_switch(struct perf_tool *tool __maybe_unused,
 	return machine__process_switch_event(machine, event);
 }
 
+int perf_event__process_bpf_event(struct perf_tool *tool __maybe_unused,
+				  union perf_event *event,
+				  struct perf_sample *sample __maybe_unused,
+				  struct machine *machine)
+{
+	return machine__process_bpf_event(machine, event);
+}
+
 size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp)
 {
 	return fprintf(fp, " %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %c %s\n",
@@ -1465,6 +1474,13 @@ static size_t perf_event__fprintf_lost(union perf_event *event, FILE *fp)
 	return fprintf(fp, " lost %" PRIu64 "\n", event->lost.lost);
 }
 
+size_t perf_event__fprintf_bpf_event(union perf_event *event, FILE *fp)
+{
+	return fprintf(fp, " bpf event with type %u, flags %u, id %u\n",
+		       event->bpf_event.type, event->bpf_event.flags,
+		       event->bpf_event.id);
+}
+
 size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 {
 	size_t ret = fprintf(fp, "PERF_RECORD_%s",
@@ -1500,6 +1516,9 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 	case PERF_RECORD_LOST:
 		ret += perf_event__fprintf_lost(event, fp);
 		break;
+	case PERF_RECORD_BPF_EVENT:
+		ret += perf_event__fprintf_bpf_event(event, fp);
+		break;
 	default:
 		ret += fprintf(fp, "\n");
 	}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index bfa60bcafbde..13a0c64dd0ed 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -84,6 +84,15 @@ struct throttle_event {
 	u64 stream_id;
 };
 
+
+/* Record different types of bpf events, see enum perf_bpf_event_type */
+struct bpf_event {
+	struct perf_event_header header;
+	u16 type;
+	u16 flags;
+	u32 id;
+};
+
 #define PERF_SAMPLE_MASK				\
 	(PERF_SAMPLE_IP | PERF_SAMPLE_TID |		\
 	 PERF_SAMPLE_TIME | PERF_SAMPLE_ADDR |		\
@@ -651,6 +660,7 @@ union perf_event {
 	struct stat_round_event		stat_round;
 	struct time_conv_event		time_conv;
 	struct feature_event		feat;
+	struct bpf_event		bpf_event;
 };
 
 void perf_event__print_totals(void);
@@ -750,6 +760,10 @@ int perf_event__process_exit(struct perf_tool *tool,
 			     union perf_event *event,
 			     struct perf_sample *sample,
 			     struct machine *machine);
+int perf_event__process_bpf_event(struct perf_tool *tool,
+				  union perf_event *event,
+				  struct perf_sample *sample,
+				  struct machine *machine);
 int perf_tool__process_synth_event(struct perf_tool *tool,
 				   union perf_event *event,
 				   struct machine *machine,
@@ -814,6 +828,7 @@ size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_bpf_event(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf(union perf_event *event, FILE *fp);
 
 int kallsyms__get_function_start(const char *kallsyms_filename,
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index cb7f01059940..af9d539e4b6a 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1635,6 +1635,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 	PRINT_ATTRf(context_switch, p_unsigned);
 	PRINT_ATTRf(write_backward, p_unsigned);
 	PRINT_ATTRf(namespaces, p_unsigned);
+	PRINT_ATTRf(bpf_event, p_unsigned);
 
 	PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned);
 	PRINT_ATTRf(bp_type, p_unsigned);
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 111ae858cbcb..74c295f6fdc4 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -681,6 +681,14 @@ int machine__process_switch_event(struct machine *machine __maybe_unused,
 	return 0;
 }
 
+int machine__process_bpf_event(struct machine *machine __maybe_unused,
+			       union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_bpf_event(event, stdout);
+	return 0;
+}
+
 static void dso__adjust_kmod_long_name(struct dso *dso, const char *filename)
 {
 	const char *dup_filename;
@@ -1795,6 +1803,8 @@ int machine__process_event(struct machine *machine, union perf_event *event,
 	case PERF_RECORD_SWITCH:
 	case PERF_RECORD_SWITCH_CPU_WIDE:
 		ret = machine__process_switch_event(machine, event); break;
+	case PERF_RECORD_BPF_EVENT:
+		ret = machine__process_bpf_event(machine, event); break;
 	default:
 		ret = -1;
 		break;
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index d856b85862e2..0ed237d6fd0a 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -127,6 +127,8 @@ int machine__process_mmap_event(struct machine *machine, union perf_event *event
 				struct perf_sample *sample);
 int machine__process_mmap2_event(struct machine *machine, union perf_event *event,
 				 struct perf_sample *sample);
+int machine__process_bpf_event(struct machine *machine,
+			       union perf_event *event);
 int machine__process_event(struct machine *machine, union perf_event *event,
 				struct perf_sample *sample);
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7d2c8ce6cfad..dffe5120d2d3 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -371,6 +371,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 		tool->itrace_start = perf_event__process_itrace_start;
 	if (tool->context_switch == NULL)
 		tool->context_switch = perf_event__process_switch;
+	if (tool->bpf_event == NULL)
+		tool->bpf_event = perf_event__process_bpf_event;
 	if (tool->read == NULL)
 		tool->read = process_event_sample_stub;
 	if (tool->throttle == NULL)
@@ -1300,6 +1302,8 @@ static int machines__deliver_event(struct machines *machines,
 	case PERF_RECORD_SWITCH:
 	case PERF_RECORD_SWITCH_CPU_WIDE:
 		return tool->context_switch(tool, event, sample, machine);
+	case PERF_RECORD_BPF_EVENT:
+		return tool->bpf_event(tool, event, sample, machine);
 	default:
 		++evlist->stats.nr_unknown_events;
 		return -1;
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 56e4ca54020a..69ae898ca024 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -53,7 +53,9 @@ struct perf_tool {
 			itrace_start,
 			context_switch,
 			throttle,
-			unthrottle;
+			unthrottle,
+			bpf_event;
+
 	event_attr_op	attr;
 	event_attr_op	event_update;
 	event_op2	tracing_data;
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 0/5] net: dsa: bcm_sf2: Store rules in lists
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli

Hi all,

This patch series changes the bcm-sf2 driver to keep a copy of the
inserted rules as opposed to using the HW as a storage area for a number
of reasons:

- this helps us with doing duplicate rule detection in a faster way, it
  would have required a full rule read before

- this helps with Pablo's on-going work to convert ethtool_rx_flow_spec
  to a more generic flow rule structure by having fewer code paths to
  convert to the new structure/helpers

- we need to cache copies to restore them during drive resumption,
  because depending on the low power mode the system has entered, the
  switch may have lost all of its context

Florian Fainelli (5):
  net: dsa: bcm_sf2: Keep copy of inserted rules
  net: dsa: bcm_sf2: Split rule handling from HW operation
  net: dsa: bcm_sf2: Restore CFP rules during system resume
  net: dsa: bcm_sf2: Get rid of unmarshalling functions
  net: systemport: Restore Broadcom tag match filters upon resume

 drivers/net/dsa/bcm_sf2.c                  |   6 +
 drivers/net/dsa/bcm_sf2.h                  |   3 +
 drivers/net/dsa/bcm_sf2_cfp.c              | 497 ++++++++-------------
 drivers/net/ethernet/broadcom/bcmsysport.c |  12 +
 drivers/net/ethernet/broadcom/bcmsysport.h |   1 +
 5 files changed, 204 insertions(+), 315 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [PATCH net-next 1/5] net: dsa: bcm_sf2: Keep copy of inserted rules
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli
In-Reply-To: <20181106205841.14308-1-f.fainelli@gmail.com>

We tried hard to use the hardware as a storage area, which made things
needlessly complex in that we had to both marshall and unmarshall the
ethtool_rx_flow_spec into what the CFP hardware understands but it did
not require any driver level allocations, so that was nice.

Keep a copy of the ethtool_rx_flow_spec rule we want to insert, and also
make sure we don't have a duplicate rule already. This greatly speeds up
the deletion time since we only need to clear the slice's valid bit and
not perform a full read.

This is a preparatory step for being able to restore rules upon system
resumption where the hardware loses its context partially or entirely.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/bcm_sf2.c     |   2 +
 drivers/net/dsa/bcm_sf2.h     |   2 +
 drivers/net/dsa/bcm_sf2_cfp.c | 144 +++++++++++++++++++++++++++++++---
 3 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 2eb68769562c..89722dbfafb8 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1061,6 +1061,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev)
 	spin_lock_init(&priv->indir_lock);
 	mutex_init(&priv->stats_mutex);
 	mutex_init(&priv->cfp.lock);
+	INIT_LIST_HEAD(&priv->cfp.rules_list);
 
 	/* CFP rule #0 cannot be used for specific classifications, flag it as
 	 * permanently used
@@ -1166,6 +1167,7 @@ static int bcm_sf2_sw_remove(struct platform_device *pdev)
 
 	priv->wol_ports_mask = 0;
 	dsa_unregister_switch(priv->dev->ds);
+	bcm_sf2_cfp_exit(priv->dev->ds);
 	/* Disable all ports and interrupts */
 	bcm_sf2_sw_suspend(priv->dev->ds);
 	bcm_sf2_mdio_unregister(priv);
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index cc31e986e6e3..03444982c25e 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -56,6 +56,7 @@ struct bcm_sf2_cfp_priv {
 	DECLARE_BITMAP(used, CFP_NUM_RULES);
 	DECLARE_BITMAP(unique, CFP_NUM_RULES);
 	unsigned int rules_cnt;
+	struct list_head rules_list;
 };
 
 struct bcm_sf2_priv {
@@ -213,5 +214,6 @@ int bcm_sf2_get_rxnfc(struct dsa_switch *ds, int port,
 int bcm_sf2_set_rxnfc(struct dsa_switch *ds, int port,
 		      struct ethtool_rxnfc *nfc);
 int bcm_sf2_cfp_rst(struct bcm_sf2_priv *priv);
+void bcm_sf2_cfp_exit(struct dsa_switch *ds);
 
 #endif /* __BCM_SF2_H */
diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 47c5f272a084..29b6b4204662 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -20,6 +20,12 @@
 #include "bcm_sf2.h"
 #include "bcm_sf2_regs.h"
 
+struct cfp_rule {
+	int port;
+	struct ethtool_rx_flow_spec fs;
+	struct list_head next;
+};
+
 struct cfp_udf_slice_layout {
 	u8 slices[UDFS_PER_SLICE];
 	u32 mask_value;
@@ -515,6 +521,61 @@ static void bcm_sf2_cfp_slice_ipv6(struct bcm_sf2_priv *priv,
 	core_writel(priv, reg, offset);
 }
 
+static struct cfp_rule *bcm_sf2_cfp_rule_find(struct bcm_sf2_priv *priv,
+					      int port, u32 location)
+{
+	struct cfp_rule *rule = NULL;
+
+	list_for_each_entry(rule, &priv->cfp.rules_list, next) {
+		if (rule->port == port && rule->fs.location == location)
+			break;
+	};
+
+	return rule;
+}
+
+static int bcm_sf2_cfp_rule_cmp(struct bcm_sf2_priv *priv, int port,
+				struct ethtool_rx_flow_spec *fs)
+{
+	struct cfp_rule *rule = NULL;
+	size_t fs_size = 0;
+	int ret = 1;
+
+	if (list_empty(&priv->cfp.rules_list))
+		return ret;
+
+	list_for_each_entry(rule, &priv->cfp.rules_list, next) {
+		ret = 1;
+		if (rule->port != port)
+			continue;
+
+		if (rule->fs.flow_type != fs->flow_type ||
+		    rule->fs.ring_cookie != fs->ring_cookie ||
+		    rule->fs.m_ext.data[0] != fs->m_ext.data[0])
+			continue;
+
+		switch (fs->flow_type & ~FLOW_EXT) {
+		case TCP_V6_FLOW:
+		case UDP_V6_FLOW:
+			fs_size = sizeof(struct ethtool_tcpip6_spec);
+			break;
+		case TCP_V4_FLOW:
+		case UDP_V4_FLOW:
+			fs_size = sizeof(struct ethtool_tcpip4_spec);
+			break;
+		default:
+			continue;
+		}
+
+		ret = memcmp(&rule->fs.h_u, &fs->h_u, fs_size);
+		ret |= memcmp(&rule->fs.m_u, &fs->m_u, fs_size);
+		if (ret == 0)
+			break;
+	}
+
+	return ret;
+}
+
 static int bcm_sf2_cfp_ipv6_rule_set(struct bcm_sf2_priv *priv, int port,
 				     unsigned int port_num,
 				     unsigned int queue_num,
@@ -735,6 +796,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 	s8 cpu_port = ds->ports[port].cpu_dp->index;
 	__u64 ring_cookie = fs->ring_cookie;
 	unsigned int queue_num, port_num;
+	struct cfp_rule *rule = NULL;
 	int ret = -EINVAL;
 
 	/* Check for unsupported extensions */
@@ -750,6 +812,10 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 	    fs->location > bcm_sf2_cfp_rule_size(priv))
 		return -EINVAL;
 
+	ret = bcm_sf2_cfp_rule_cmp(priv, port, fs);
+	if (ret == 0)
+		return -EEXIST;
+
 	/* This rule is a Wake-on-LAN filter and we must specifically
 	 * target the CPU port in order for it to be working.
 	 */
@@ -775,6 +841,10 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 	if (port_num >= 7)
 		port_num -= 1;
 
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+	if (!rule)
+		return -ENOMEM;
+
 	switch (fs->flow_type & ~FLOW_EXT) {
 	case TCP_V4_FLOW:
 	case UDP_V4_FLOW:
@@ -787,9 +857,19 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 						queue_num, fs);
 		break;
 	default:
+		ret = -EINVAL;
 		break;
 	}
 
+	if (ret) {
+		kfree(rule);
+		return ret;
+	}
+
+	rule->port = port;
+	memcpy(&rule->fs, fs, sizeof(*fs));
+	list_add_tail(&rule->next, &priv->cfp.rules_list);
+
 	return ret;
 }
 
@@ -833,6 +913,7 @@ static int bcm_sf2_cfp_rule_del_one(struct bcm_sf2_priv *priv, int port,
 static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
 				u32 loc)
 {
+	struct cfp_rule *rule;
 	u32 next_loc = 0;
 	int ret;
 
@@ -843,6 +924,10 @@ static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
 	if (!test_bit(loc, priv->cfp.unique) || loc == 0)
 		return -EINVAL;
 
+	rule = bcm_sf2_cfp_rule_find(priv, port, loc);
+	if (!rule)
+		return -EINVAL;
+
 	ret = bcm_sf2_cfp_rule_del_one(priv, port, loc, &next_loc);
 	if (ret)
 		return ret;
@@ -851,6 +936,9 @@ static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
 	if (next_loc)
 		ret = bcm_sf2_cfp_rule_del_one(priv, port, next_loc, NULL);
 
+	list_del(&rule->next);
+	kfree(rule);
+
 	return ret;
 }
 
@@ -867,9 +955,9 @@ static void bcm_sf2_invert_masks(struct ethtool_rx_flow_spec *flow)
 	flow->m_ext.data[1] ^= cpu_to_be32(~0);
 }
 
-static int bcm_sf2_cfp_unslice_ipv4(struct bcm_sf2_priv *priv,
-				    struct ethtool_tcpip4_spec *v4_spec,
-				    bool mask)
+static int __maybe_unused bcm_sf2_cfp_unslice_ipv4(struct bcm_sf2_priv *priv,
+						   struct ethtool_tcpip4_spec *v4_spec,
+						   bool mask)
 {
 	u32 reg, offset, ipv4;
 	u16 src_dst_port;
@@ -969,9 +1057,10 @@ static int bcm_sf2_cfp_ipv4_rule_get(struct bcm_sf2_priv *priv, int port,
 	return bcm_sf2_cfp_unslice_ipv4(priv, v4_m_spec, true);
 }
 
-static int bcm_sf2_cfp_unslice_ipv6(struct bcm_sf2_priv *priv,
-				     __be32 *ip6_addr, __be16 *port,
-				     bool mask)
+static int __maybe_unused bcm_sf2_cfp_unslice_ipv6(struct bcm_sf2_priv *priv,
+						   __be32 *ip6_addr,
+						   __be16 *port,
+						   bool mask)
 {
 	u32 reg, tmp, offset;
 
@@ -1050,9 +1139,10 @@ static int bcm_sf2_cfp_unslice_ipv6(struct bcm_sf2_priv *priv,
 	return 0;
 }
 
-static int bcm_sf2_cfp_ipv6_rule_get(struct bcm_sf2_priv *priv, int port,
-				     struct ethtool_rx_flow_spec *fs,
-				     u32 next_loc)
+static int __maybe_unused bcm_sf2_cfp_ipv6_rule_get(struct bcm_sf2_priv *priv,
+						    int port,
+						    struct ethtool_rx_flow_spec *fs,
+						    u32 next_loc)
 {
 	struct ethtool_tcpip6_spec *v6_spec = NULL, *v6_m_spec = NULL;
 	u32 reg;
@@ -1111,8 +1201,9 @@ static int bcm_sf2_cfp_ipv6_rule_get(struct bcm_sf2_priv *priv, int port,
 					&v6_m_spec->psrc, true);
 }
 
-static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port,
-				struct ethtool_rxnfc *nfc)
+static int __maybe_unused bcm_sf2_cfp_rule_get_hw(struct bcm_sf2_priv *priv,
+						  int port,
+						  struct ethtool_rxnfc *nfc)
 {
 	u32 reg, ipv4_or_chain_id;
 	unsigned int queue_num;
@@ -1174,6 +1265,25 @@ static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port,
 	return 0;
 }
 
+static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port,
+				struct ethtool_rxnfc *nfc)
+{
+	struct cfp_rule *rule;
+
+	rule = bcm_sf2_cfp_rule_find(priv, port, nfc->fs.location);
+	if (!rule)
+		return -EINVAL;
+
+	memcpy(&nfc->fs, &rule->fs, sizeof(rule->fs));
+
+	bcm_sf2_invert_masks(&nfc->fs);
+
+	/* Put the TCAM size here */
+	nfc->data = bcm_sf2_cfp_rule_size(priv);
+
+	return 0;
+}
+
 /* We implement the search doing a TCAM search operation */
 static int bcm_sf2_cfp_rule_get_all(struct bcm_sf2_priv *priv,
 				    int port, struct ethtool_rxnfc *nfc,
@@ -1302,3 +1412,15 @@ int bcm_sf2_cfp_rst(struct bcm_sf2_priv *priv)
 
 	return 0;
 }
+
+void bcm_sf2_cfp_exit(struct dsa_switch *ds)
+{
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+	struct cfp_rule *rule, *n;
+
+	if (list_empty(&priv->cfp.rules_list))
+		return;
+
+	list_for_each_entry_safe_reverse(rule, n, &priv->cfp.rules_list, next)
+		bcm_sf2_cfp_rule_del(priv, rule->port, rule->fs.location);
+}
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 2/5] net: dsa: bcm_sf2: Split rule handling from HW operation
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli
In-Reply-To: <20181106205841.14308-1-f.fainelli@gmail.com>

In preparation for restoring CFP rules during system wide system
suspend/resume where the hardware loses its context, split the rule
validation from its actual insertion as well as the rule removal from
its actual hardware deletion operation.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 89 +++++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 35 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 29b6b4204662..396bfa43c2e1 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -789,32 +789,14 @@ static int bcm_sf2_cfp_ipv6_rule_set(struct bcm_sf2_priv *priv, int port,
 	return ret;
 }
 
-static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
-				struct ethtool_rx_flow_spec *fs)
+static int bcm_sf2_cfp_rule_insert(struct dsa_switch *ds, int port,
+				   struct ethtool_rx_flow_spec *fs)
 {
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	s8 cpu_port = ds->ports[port].cpu_dp->index;
 	__u64 ring_cookie = fs->ring_cookie;
 	unsigned int queue_num, port_num;
-	struct cfp_rule *rule = NULL;
-	int ret = -EINVAL;
-
-	/* Check for unsupported extensions */
-	if ((fs->flow_type & FLOW_EXT) && (fs->m_ext.vlan_etype ||
-	     fs->m_ext.data[1]))
-		return -EINVAL;
-
-	if (fs->location != RX_CLS_LOC_ANY &&
-	    test_bit(fs->location, priv->cfp.used))
-		return -EBUSY;
-
-	if (fs->location != RX_CLS_LOC_ANY &&
-	    fs->location > bcm_sf2_cfp_rule_size(priv))
-		return -EINVAL;
-
-	ret = bcm_sf2_cfp_rule_cmp(priv, port, fs);
-	if (ret == 0)
-		return -EEXIST;
+	int ret;
 
 	/* This rule is a Wake-on-LAN filter and we must specifically
 	 * target the CPU port in order for it to be working.
@@ -841,10 +823,6 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 	if (port_num >= 7)
 		port_num -= 1;
 
-	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
-	if (!rule)
-		return -ENOMEM;
-
 	switch (fs->flow_type & ~FLOW_EXT) {
 	case TCP_V4_FLOW:
 	case UDP_V4_FLOW:
@@ -861,6 +839,38 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 		break;
 	}
 
+	return ret;
+}
+
+static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
+				struct ethtool_rx_flow_spec *fs)
+{
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+	struct cfp_rule *rule = NULL;
+	int ret = -EINVAL;
+
+	/* Check for unsupported extensions */
+	if ((fs->flow_type & FLOW_EXT) && (fs->m_ext.vlan_etype ||
+	     fs->m_ext.data[1]))
+		return -EINVAL;
+
+	if (fs->location != RX_CLS_LOC_ANY &&
+	    test_bit(fs->location, priv->cfp.used))
+		return -EBUSY;
+
+	if (fs->location != RX_CLS_LOC_ANY &&
+	    fs->location > bcm_sf2_cfp_rule_size(priv))
+		return -EINVAL;
+
+	ret = bcm_sf2_cfp_rule_cmp(priv, port, fs);
+	if (ret == 0)
+		return -EEXIST;
+
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+	if (!rule)
+		return -ENOMEM;
+
+	ret = bcm_sf2_cfp_rule_insert(ds, port, fs);
 	if (ret) {
 		kfree(rule);
 		return ret;
@@ -910,13 +920,28 @@ static int bcm_sf2_cfp_rule_del_one(struct bcm_sf2_priv *priv, int port,
 	return 0;
 }
 
-static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
-				u32 loc)
+static int bcm_sf2_cfp_rule_remove(struct bcm_sf2_priv *priv, int port,
+				   u32 loc)
 {
-	struct cfp_rule *rule;
 	u32 next_loc = 0;
 	int ret;
 
+	ret = bcm_sf2_cfp_rule_del_one(priv, port, loc, &next_loc);
+	if (ret)
+		return ret;
+
+	/* If this was an IPv6 rule, delete is companion rule too */
+	if (next_loc)
+		ret = bcm_sf2_cfp_rule_del_one(priv, port, next_loc, NULL);
+
+	return ret;
+}
+
+static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port, u32 loc)
+{
+	struct cfp_rule *rule;
+	int ret;
+
 	/* Refuse deleting unused rules, and those that are not unique since
 	 * that could leave IPv6 rules with one of the chained rule in the
 	 * table.
@@ -928,13 +953,7 @@ static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
 	if (!rule)
 		return -EINVAL;
 
-	ret = bcm_sf2_cfp_rule_del_one(priv, port, loc, &next_loc);
-	if (ret)
-		return ret;
-
-	/* If this was an IPv6 rule, delete is companion rule too */
-	if (next_loc)
-		ret = bcm_sf2_cfp_rule_del_one(priv, port, next_loc, NULL);
+	ret = bcm_sf2_cfp_rule_remove(priv, port, loc);
 
 	list_del(&rule->next);
 	kfree(rule);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 3/5] net: dsa: bcm_sf2: Restore CFP rules during system resume
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli
In-Reply-To: <20181106205841.14308-1-f.fainelli@gmail.com>

The hardware can lose its context during system suspend, and depending
on the switch generation (7445 vs. 7278), while the rules are still
there, they will have their valid bit cleared (because that's the
fastest way for the HW to reset things). Just make sure we re-apply them
coming back from resume. The 7445 switch is an older version of the core
that has some quirky RAM technology requiring a delete then re-inser to
guarantee the RAM entries are properly latched.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/bcm_sf2.c     |  4 ++++
 drivers/net/dsa/bcm_sf2.h     |  1 +
 drivers/net/dsa/bcm_sf2_cfp.c | 36 +++++++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 89722dbfafb8..d8b93043b789 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -710,6 +710,10 @@ static int bcm_sf2_sw_resume(struct dsa_switch *ds)
 		return ret;
 	}
 
+	ret = bcm_sf2_cfp_resume(ds);
+	if (ret)
+		return ret;
+
 	if (priv->hw_params.num_gphy == 1)
 		bcm_sf2_gphy_enable_set(ds, true);
 
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index 03444982c25e..faaef320ec48 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -215,5 +215,6 @@ int bcm_sf2_set_rxnfc(struct dsa_switch *ds, int port,
 		      struct ethtool_rxnfc *nfc);
 int bcm_sf2_cfp_rst(struct bcm_sf2_priv *priv);
 void bcm_sf2_cfp_exit(struct dsa_switch *ds);
+int bcm_sf2_cfp_resume(struct dsa_switch *ds);
 
 #endif /* __BCM_SF2_H */
diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 396bfa43c2e1..5034e4f56fd5 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -1443,3 +1443,39 @@ void bcm_sf2_cfp_exit(struct dsa_switch *ds)
 	list_for_each_entry_safe_reverse(rule, n, &priv->cfp.rules_list, next)
 		bcm_sf2_cfp_rule_del(priv, rule->port, rule->fs.location);
 }
+
+int bcm_sf2_cfp_resume(struct dsa_switch *ds)
+{
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+	struct cfp_rule *rule;
+	int ret = 0;
+	u32 reg;
+
+	if (list_empty(&priv->cfp.rules_list))
+		return ret;
+
+	reg = core_readl(priv, CORE_CFP_CTL_REG);
+	reg &= ~CFP_EN_MAP_MASK;
+	core_writel(priv, reg, CORE_CFP_CTL_REG);
+
+	ret = bcm_sf2_cfp_rst(priv);
+	if (ret)
+		return ret;
+
+	list_for_each_entry(rule, &priv->cfp.rules_list, next) {
+		ret = bcm_sf2_cfp_rule_remove(priv, rule->port,
+					      rule->fs.location);
+		if (ret) {
+			dev_err(ds->dev, "failed to remove rule\n");
+			return ret;
+		}
+
+		ret = bcm_sf2_cfp_rule_insert(ds, rule->port, &rule->fs);
+		if (ret) {
+			dev_err(ds->dev, "failed to restore rule\n");
+			return ret;
+		}
+	};
+
+	return ret;
+}
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 4/5] net: dsa: bcm_sf2: Get rid of unmarshalling functions
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli
In-Reply-To: <20181106205841.14308-1-f.fainelli@gmail.com>

Now that we have migrated the CFP rule handling to a list with a
software copy, the delete/get operation just returns what is on the
list, no need to read from the hardware which is both slow and more
error prone.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 310 ----------------------------------
 1 file changed, 310 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 5034e4f56fd5..8f734abde7b3 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -974,316 +974,6 @@ static void bcm_sf2_invert_masks(struct ethtool_rx_flow_spec *flow)
 	flow->m_ext.data[1] ^= cpu_to_be32(~0);
 }
 
-static int __maybe_unused bcm_sf2_cfp_unslice_ipv4(struct bcm_sf2_priv *priv,
-						   struct ethtool_tcpip4_spec *v4_spec,
-						   bool mask)
-{
-	u32 reg, offset, ipv4;
-	u16 src_dst_port;
-
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(3);
-	else
-		offset = CORE_CFP_DATA_PORT(3);
-
-	reg = core_readl(priv, offset);
-	/* src port [15:8] */
-	src_dst_port = reg << 8;
-
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(2);
-	else
-		offset = CORE_CFP_DATA_PORT(2);
-
-	reg = core_readl(priv, offset);
-	/* src port [7:0] */
-	src_dst_port |= (reg >> 24);
-
-	v4_spec->pdst = cpu_to_be16(src_dst_port);
-	v4_spec->psrc = cpu_to_be16((u16)(reg >> 8));
-
-	/* IPv4 dst [15:8] */
-	ipv4 = (reg & 0xff) << 8;
-
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(1);
-	else
-		offset = CORE_CFP_DATA_PORT(1);
-
-	reg = core_readl(priv, offset);
-	/* IPv4 dst [31:16] */
-	ipv4 |= ((reg >> 8) & 0xffff) << 16;
-	/* IPv4 dst [7:0] */
-	ipv4 |= (reg >> 24) & 0xff;
-	v4_spec->ip4dst = cpu_to_be32(ipv4);
-
-	/* IPv4 src [15:8] */
-	ipv4 = (reg & 0xff) << 8;
-
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(0);
-	else
-		offset = CORE_CFP_DATA_PORT(0);
-	reg = core_readl(priv, offset);
-
-	/* Once the TCAM is programmed, the mask reflects the slice number
-	 * being matched, don't bother checking it when reading back the
-	 * mask spec
-	 */
-	if (!mask && !(reg & SLICE_VALID))
-		return -EINVAL;
-
-	/* IPv4 src [7:0] */
-	ipv4 |= (reg >> 24) & 0xff;
-	/* IPv4 src [31:16] */
-	ipv4 |= ((reg >> 8) & 0xffff) << 16;
-	v4_spec->ip4src = cpu_to_be32(ipv4);
-
-	return 0;
-}
-
-static int bcm_sf2_cfp_ipv4_rule_get(struct bcm_sf2_priv *priv, int port,
-				     struct ethtool_rx_flow_spec *fs)
-{
-	struct ethtool_tcpip4_spec *v4_spec = NULL, *v4_m_spec = NULL;
-	u32 reg;
-	int ret;
-
-	reg = core_readl(priv, CORE_CFP_DATA_PORT(6));
-
-	switch ((reg & IPPROTO_MASK) >> IPPROTO_SHIFT) {
-	case IPPROTO_TCP:
-		fs->flow_type = TCP_V4_FLOW;
-		v4_spec = &fs->h_u.tcp_ip4_spec;
-		v4_m_spec = &fs->m_u.tcp_ip4_spec;
-		break;
-	case IPPROTO_UDP:
-		fs->flow_type = UDP_V4_FLOW;
-		v4_spec = &fs->h_u.udp_ip4_spec;
-		v4_m_spec = &fs->m_u.udp_ip4_spec;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	fs->m_ext.data[0] = cpu_to_be32((reg >> IP_FRAG_SHIFT) & 1);
-	v4_spec->tos = (reg >> IPTOS_SHIFT) & IPTOS_MASK;
-
-	ret = bcm_sf2_cfp_unslice_ipv4(priv, v4_spec, false);
-	if (ret)
-		return ret;
-
-	return bcm_sf2_cfp_unslice_ipv4(priv, v4_m_spec, true);
-}
-
-static int __maybe_unused bcm_sf2_cfp_unslice_ipv6(struct bcm_sf2_priv *priv,
-						   __be32 *ip6_addr,
-						   __be16 *port,
-						   bool mask)
-{
-	u32 reg, tmp, offset;
-
-	/* C-Tag		[31:24]
-	 * UDF_n_B8		[23:8] (port)
-	 * UDF_n_B7 (upper)	[7:0] (addr[15:8])
-	 */
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(4);
-	else
-		offset = CORE_CFP_DATA_PORT(4);
-	reg = core_readl(priv, offset);
-	*port = cpu_to_be32(reg) >> 8;
-	tmp = (u32)(reg & 0xff) << 8;
-
-	/* UDF_n_B7 (lower)	[31:24] (addr[7:0])
-	 * UDF_n_B6		[23:8] (addr[31:16])
-	 * UDF_n_B5 (upper)	[7:0] (addr[47:40])
-	 */
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(3);
-	else
-		offset = CORE_CFP_DATA_PORT(3);
-	reg = core_readl(priv, offset);
-	tmp |= (reg >> 24) & 0xff;
-	tmp |= (u32)((reg >> 8) << 16);
-	ip6_addr[3] = cpu_to_be32(tmp);
-	tmp = (u32)(reg & 0xff) << 8;
-
-	/* UDF_n_B5 (lower)	[31:24] (addr[39:32])
-	 * UDF_n_B4		[23:8] (addr[63:48])
-	 * UDF_n_B3 (upper)	[7:0] (addr[79:72])
-	 */
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(2);
-	else
-		offset = CORE_CFP_DATA_PORT(2);
-	reg = core_readl(priv, offset);
-	tmp |= (reg >> 24) & 0xff;
-	tmp |= (u32)((reg >> 8) << 16);
-	ip6_addr[2] = cpu_to_be32(tmp);
-	tmp = (u32)(reg & 0xff) << 8;
-
-	/* UDF_n_B3 (lower)	[31:24] (addr[71:64])
-	 * UDF_n_B2		[23:8] (addr[95:80])
-	 * UDF_n_B1 (upper)	[7:0] (addr[111:104])
-	 */
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(1);
-	else
-		offset = CORE_CFP_DATA_PORT(1);
-	reg = core_readl(priv, offset);
-	tmp |= (reg >> 24) & 0xff;
-	tmp |= (u32)((reg >> 8) << 16);
-	ip6_addr[1] = cpu_to_be32(tmp);
-	tmp = (u32)(reg & 0xff) << 8;
-
-	/* UDF_n_B1 (lower)	[31:24] (addr[103:96])
-	 * UDF_n_B0		[23:8] (addr[127:112])
-	 * Reserved		[7:4]
-	 * Slice ID		[3:2]
-	 * Slice valid		[1:0]
-	 */
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(0);
-	else
-		offset = CORE_CFP_DATA_PORT(0);
-	reg = core_readl(priv, offset);
-	tmp |= (reg >> 24) & 0xff;
-	tmp |= (u32)((reg >> 8) << 16);
-	ip6_addr[0] = cpu_to_be32(tmp);
-
-	if (!mask && !(reg & SLICE_VALID))
-		return -EINVAL;
-
-	return 0;
-}
-
-static int __maybe_unused bcm_sf2_cfp_ipv6_rule_get(struct bcm_sf2_priv *priv,
-						    int port,
-						    struct ethtool_rx_flow_spec *fs,
-						    u32 next_loc)
-{
-	struct ethtool_tcpip6_spec *v6_spec = NULL, *v6_m_spec = NULL;
-	u32 reg;
-	int ret;
-
-	/* UDPv6 and TCPv6 both use ethtool_tcpip6_spec so we are fine
-	 * assuming tcp_ip6_spec here being an union.
-	 */
-	v6_spec = &fs->h_u.tcp_ip6_spec;
-	v6_m_spec = &fs->m_u.tcp_ip6_spec;
-
-	/* Read the second half first */
-	ret = bcm_sf2_cfp_unslice_ipv6(priv, v6_spec->ip6dst, &v6_spec->pdst,
-				       false);
-	if (ret)
-		return ret;
-
-	ret = bcm_sf2_cfp_unslice_ipv6(priv, v6_m_spec->ip6dst,
-				       &v6_m_spec->pdst, true);
-	if (ret)
-		return ret;
-
-	/* Read last to avoid next entry clobbering the results during search
-	 * operations. We would not have the port enabled for this rule, so
-	 * don't bother checking it.
-	 */
-	(void)core_readl(priv, CORE_CFP_DATA_PORT(7));
-
-	/* The slice number is valid, so read the rule we are chained from now
-	 * which is our first half.
-	 */
-	bcm_sf2_cfp_rule_addr_set(priv, next_loc);
-	ret = bcm_sf2_cfp_op(priv, OP_SEL_READ | TCAM_SEL);
-	if (ret)
-		return ret;
-
-	reg = core_readl(priv, CORE_CFP_DATA_PORT(6));
-
-	switch ((reg & IPPROTO_MASK) >> IPPROTO_SHIFT) {
-	case IPPROTO_TCP:
-		fs->flow_type = TCP_V6_FLOW;
-		break;
-	case IPPROTO_UDP:
-		fs->flow_type = UDP_V6_FLOW;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	ret = bcm_sf2_cfp_unslice_ipv6(priv, v6_spec->ip6src, &v6_spec->psrc,
-				       false);
-	if (ret)
-		return ret;
-
-	return bcm_sf2_cfp_unslice_ipv6(priv, v6_m_spec->ip6src,
-					&v6_m_spec->psrc, true);
-}
-
-static int __maybe_unused bcm_sf2_cfp_rule_get_hw(struct bcm_sf2_priv *priv,
-						  int port,
-						  struct ethtool_rxnfc *nfc)
-{
-	u32 reg, ipv4_or_chain_id;
-	unsigned int queue_num;
-	int ret;
-
-	bcm_sf2_cfp_rule_addr_set(priv, nfc->fs.location);
-
-	ret = bcm_sf2_cfp_op(priv, OP_SEL_READ | ACT_POL_RAM);
-	if (ret)
-		return ret;
-
-	reg = core_readl(priv, CORE_ACT_POL_DATA0);
-
-	ret = bcm_sf2_cfp_op(priv, OP_SEL_READ | TCAM_SEL);
-	if (ret)
-		return ret;
-
-	/* Extract the destination port */
-	nfc->fs.ring_cookie = fls((reg >> DST_MAP_IB_SHIFT) &
-				  DST_MAP_IB_MASK) - 1;
-
-	/* There is no Port 6, so we compensate for that here */
-	if (nfc->fs.ring_cookie >= 6)
-		nfc->fs.ring_cookie++;
-	nfc->fs.ring_cookie *= SF2_NUM_EGRESS_QUEUES;
-
-	/* Extract the destination queue */
-	queue_num = (reg >> NEW_TC_SHIFT) & NEW_TC_MASK;
-	nfc->fs.ring_cookie += queue_num;
-
-	/* Extract the L3_FRAMING or CHAIN_ID */
-	reg = core_readl(priv, CORE_CFP_DATA_PORT(6));
-
-	/* With IPv6 rules this would contain a non-zero chain ID since
-	 * we reserve entry 0 and it cannot be used. So if we read 0 here
-	 * this means an IPv4 rule.
-	 */
-	ipv4_or_chain_id = (reg >> L3_FRAMING_SHIFT) & 0xff;
-	if (ipv4_or_chain_id == 0)
-		ret = bcm_sf2_cfp_ipv4_rule_get(priv, port, &nfc->fs);
-	else
-		ret = bcm_sf2_cfp_ipv6_rule_get(priv, port, &nfc->fs,
-						ipv4_or_chain_id);
-	if (ret)
-		return ret;
-
-	/* Read last to avoid next entry clobbering the results during search
-	 * operations
-	 */
-	reg = core_readl(priv, CORE_CFP_DATA_PORT(7));
-	if (!(reg & 1 << port))
-		return -EINVAL;
-
-	bcm_sf2_invert_masks(&nfc->fs);
-
-	/* Put the TCAM size here */
-	nfc->data = bcm_sf2_cfp_rule_size(priv);
-
-	return 0;
-}
-
 static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port,
 				struct ethtool_rxnfc *nfc)
 {
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 5/5] net: systemport: Restore Broadcom tag match filters upon resume
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli
In-Reply-To: <20181106205841.14308-1-f.fainelli@gmail.com>

Some of the system suspend states that we support wipe out entirely the
HW contents. If we had a Wake-on-LAN filter programmed prior to going
into suspend, but we did not actually wake-up from Wake-on-LAN and
instead used a deeper suspend state, make sure we restore the CID number
that we need to match against.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 12 ++++++++++++
 drivers/net/ethernet/broadcom/bcmsysport.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index 0e2d99c737e3..2e60dda32adc 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1068,6 +1068,7 @@ static void mpd_enable_set(struct bcm_sysport_priv *priv, bool enable)
 
 static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
 {
+	unsigned int index;
 	u32 reg;
 
 	/* Disable RXCHK, active filters and Broadcom tag matching */
@@ -1076,6 +1077,15 @@ static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
 		 RXCHK_BRCM_TAG_MATCH_SHIFT | RXCHK_EN | RXCHK_BRCM_TAG_EN);
 	rxchk_writel(priv, reg, RXCHK_CONTROL);
 
+	/* Make sure we restore correct CID index in case HW lost
+	 * its context during deep idle state
+	 */
+	for_each_set_bit(index, priv->filters, RXCHK_BRCM_TAG_MAX) {
+		rxchk_writel(priv, priv->filters_loc[index] <<
+			     RXCHK_BRCM_TAG_CID_SHIFT, RXCHK_BRCM_TAG(index));
+		rxchk_writel(priv, 0xff00ffff, RXCHK_BRCM_TAG_MASK(index));
+	}
+
 	/* Clear the MagicPacket detection logic */
 	mpd_enable_set(priv, false);
 
@@ -2189,6 +2199,7 @@ static int bcm_sysport_rule_set(struct bcm_sysport_priv *priv,
 	rxchk_writel(priv, reg, RXCHK_BRCM_TAG(index));
 	rxchk_writel(priv, 0xff00ffff, RXCHK_BRCM_TAG_MASK(index));
 
+	priv->filters_loc[index] = nfc->fs.location;
 	set_bit(index, priv->filters);
 
 	return 0;
@@ -2208,6 +2219,7 @@ static int bcm_sysport_rule_del(struct bcm_sysport_priv *priv,
 	 * be taken care of during suspend time by bcm_sysport_suspend_to_wol
 	 */
 	clear_bit(index, priv->filters);
+	priv->filters_loc[index] = 0;
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h
index a7a230884a87..7a0b7bfedd19 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -786,6 +786,7 @@ struct bcm_sysport_priv {
 	/* Ethtool */
 	u32			msg_enable;
 	DECLARE_BITMAP(filters, RXCHK_BRCM_TAG_MAX);
+	u32			filters_loc[RXCHK_BRCM_TAG_MAX];
 
 	struct bcm_sysport_stats64	stats64;
 
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH rdma] net/mlx5: Fix XRC SRQ umem valid bits
From: Doug Ledford @ 2018-11-06 21:31 UTC (permalink / raw)
  To: Leon Romanovsky, Jason Gunthorpe
  Cc: Yishai Hadas, RDMA mailing list, Artemy Kovalyov, Saeed Mahameed,
	linux-netdev, Leon Romanovsky
In-Reply-To: <20181031102028.9142-1-leon@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 1175 bytes --]

On Wed, 2018-10-31 at 12:20 +0200, Leon Romanovsky wrote:
> From: Yishai Hadas <yishaih@mellanox.com>
> 
> Adapt XRC SRQ to the latest HW specification with fixed definition
> around umem valid bits. The previous definition relied on a bit which
> was taken for other purposes in legacy FW.
> 
> Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits")
> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> ---
> Hi Doug, Jason
> 
> This commit fixes code sent in this merge window, so I'm not marking it
> with any rdma-rc/rdma-next. It will be better to be sent during this merge
> window if you have extra pull request to issue, or as a -rc material, if
> not.
> 
> BTW, we didn't combine reserved fields, because our convention is to align such
> fields to 32 bits for better readability.
> 
> Thanks

This looks fine.  Let me know when it's in the mlx5-next tree to pull.

-- 
Doug Ledford <dledford@redhat.com>
    GPG KeyID: B826A3330E572FDD
    Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH] e1000e: Change watchdog task to be delayed work
From: Neftin, Sasha @ 2018-11-07  7:01 UTC (permalink / raw)
  To: Robert Eshleman; +Cc: netdev, linux-kernel, intel-wired-lan, David S. Miller
In-Reply-To: <1541290645-25033-1-git-send-email-bobbyeshleman@gmail.com>

On 11/4/2018 02:17, Robert Eshleman wrote:
> This completes a pending TODO to use queue_delayed_work() instead of
> schedule_work().
> 
> Signed-off-by: Robert Eshleman <bobbyeshleman@gmail.com>
> ---
>   drivers/net/ethernet/intel/e1000e/netdev.c | 14 ++++++++++++--
>   1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
> index 16a73bd..a387b21 100644
> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
> @@ -39,6 +39,8 @@ static int debug = -1;
>   module_param(debug, int, 0);
>   MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
>   
> +struct workqueue_struct *e1000e_workqueue;
> +
>   static const struct e1000_info *e1000_info_tbl[] = {
>   	[board_82571]		= &e1000_82571_info,
>   	[board_82572]		= &e1000_82572_info,
> @@ -5137,9 +5139,9 @@ static void e1000_watchdog(struct timer_list *t)
>   	struct e1000_adapter *adapter = from_timer(adapter, t, watchdog_timer);
>   
>   	/* Do the rest outside of interrupt context */
> -	schedule_work(&adapter->watchdog_task);
> +	struct delayed_work *dwork = to_delayed_work(&adapter->watchdog_task);
>   
> -	/* TODO: make this use queue_delayed_work() */
> +	queue_delayed_work(e1000e_workqueue, dwork, 1);
>   }
>   
>   static void e1000_watchdog_task(struct work_struct *work)
> @@ -7572,6 +7574,13 @@ static int __init e1000_init_module(void)
>   		e1000e_driver_version);
>   	pr_info("Copyright(c) 1999 - 2015 Intel Corporation.\n");
>   
> +	e1000e_workqueue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0,
> +					   e1000e_driver_name);
> +	if (!e1000e_workqueue) {
> +		pr_err("%s: Failed to create workqueue\n", e1000e_driver_name);
> +		return -ENOMEM;
> +	}
> +
>   	return pci_register_driver(&e1000_driver);
>   }
>   module_init(e1000_init_module);
> @@ -7585,6 +7594,7 @@ module_init(e1000_init_module);
>   static void __exit e1000_exit_module(void)
>   {
>   	pci_unregister_driver(&e1000_driver);
> +	destroy_workqueue(e1000e_workqueue);
>   }
>   module_exit(e1000_exit_module);
>   
> 
Hello,
What is benefit of using 'delayed_work' mechanism instead of 
'schedue_work'? Some improvement in the driver behavior?
Thanks,
Sasha

^ permalink raw reply

* [PATCH net-next 00/11] ICMP error handling for UDP tunnels
From: Stefano Brivio @ 2018-11-06 21:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev

This series introduces ICMP error handling for UDP tunnels and
encapsulations and related selftests. We need to handle ICMP errors to
support PMTU discovery and route redirection -- this support is entirely
missing right now:

- patch 1/11 adds a socket lookup for UDP tunnels that use, by design,
  the same destination port on both endpoints -- i.e. VxLAN and GENEVE
- patches 2/11 to 7/11 are specific to VxLAN and GENEVE
- patches 8/11 and 9/11 add infrastructure for lookup of encapsulations
  where sent packets cannot be matched via receiving socket lookup, i.e.
  FoU and GUE
- patches 10/11 and 11/11 are specific to FoU and GUE

Stefano Brivio (11):
  udp: Handle ICMP errors for tunnels with same destination port on both
    endpoints
  vxlan: ICMP error lookup handler
  vxlan: Allow configuration of DF behaviour
  selftests: pmtu: Introduce tests for IPv4/IPv6 over VxLAN over IPv6
  geneve: ICMP error lookup handler
  geneve: Allow configuration of DF behaviour
  selftests: pmtu: Introduce tests for IPv4/IPv6 over GENEVE over IPv6
  net: Convert protocol error handlers from void to int
  udp: Support for error handlers of tunnels with arbitrary destination
    port
  fou, fou6: ICMP error handlers for FoU and GUE
  selftests: pmtu: Introduce FoU and GUE PMTU exceptions tests

 drivers/net/geneve.c                | 104 ++++++++-
 drivers/net/vxlan.c                 |  58 +++++
 include/linux/udp.h                 |   1 +
 include/net/icmp.h                  |   2 +-
 include/net/ip6_tunnel.h            |   2 +
 include/net/ip_tunnels.h            |   1 +
 include/net/protocol.h              |   9 +-
 include/net/sctp/sctp.h             |   2 +-
 include/net/tcp.h                   |   2 +-
 include/net/udp.h                   |   2 +-
 include/net/udp_tunnel.h            |   3 +
 include/net/vxlan.h                 |   1 +
 include/uapi/linux/if_link.h        |  18 ++
 net/dccp/ipv4.c                     |  13 +-
 net/dccp/ipv6.c                     |  13 +-
 net/ipv4/fou.c                      |  68 ++++++
 net/ipv4/gre_demux.c                |   9 +-
 net/ipv4/icmp.c                     |   6 +-
 net/ipv4/ip_gre.c                   |  48 ++--
 net/ipv4/ipip.c                     |  14 +-
 net/ipv4/protocol.c                 |   1 +
 net/ipv4/tcp_ipv4.c                 |  22 +-
 net/ipv4/tunnel4.c                  |  18 +-
 net/ipv4/udp.c                      | 119 ++++++++--
 net/ipv4/udp_impl.h                 |   2 +-
 net/ipv4/udp_tunnel.c               |   1 +
 net/ipv4/udplite.c                  |   4 +-
 net/ipv4/xfrm4_protocol.c           |  18 +-
 net/ipv6/fou6.c                     |  74 +++++++
 net/ipv6/icmp.c                     |   4 +-
 net/ipv6/ip6_gre.c                  |  18 +-
 net/ipv6/tcp_ipv6.c                 |  13 +-
 net/ipv6/tunnel6.c                  |  12 +-
 net/ipv6/udp.c                      | 141 ++++++++++--
 net/ipv6/udp_impl.h                 |   4 +-
 net/ipv6/udplite.c                  |   5 +-
 net/ipv6/xfrm6_protocol.c           |  18 +-
 net/sctp/input.c                    |   5 +-
 net/sctp/ipv6.c                     |   7 +-
 tools/testing/selftests/net/pmtu.sh | 326 ++++++++++++++++++++++++++--
 40 files changed, 1025 insertions(+), 163 deletions(-)

-- 
2.19.1

^ permalink raw reply

* [PATCH net-next 01/11] udp: Handle ICMP errors for tunnels with same destination port on both endpoints
From: Stefano Brivio @ 2018-11-06 21:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VxLAN and GENEVE.

For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.

Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/linux/udp.h      |  1 +
 include/net/udp_tunnel.h |  3 ++
 net/ipv4/udp.c           | 76 +++++++++++++++++++++++++++++++-----
 net/ipv4/udp_tunnel.c    |  1 +
 net/ipv6/udp.c           | 83 ++++++++++++++++++++++++++++++++++------
 5 files changed, 144 insertions(+), 20 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 320d49d85484..c8410837f044 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -71,6 +71,7 @@ struct udp_sock {
 	 * For encapsulation sockets.
 	 */
 	int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+	int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb);
 	void (*encap_destroy)(struct sock *sk);
 
 	/* GRO functions for UDP socket */
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index fe680ab6b15a..bf2f84984392 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -64,6 +64,8 @@ static inline int udp_sock_create(struct net *net,
 }
 
 typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
+typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk,
+					     struct sk_buff *skb);
 typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk);
 typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk,
 						    struct list_head *head,
@@ -76,6 +78,7 @@ struct udp_tunnel_sock_cfg {
 	/* Used for setting up udp_sock fields, see udp.h for details */
 	__u8  encap_type;
 	udp_tunnel_encap_rcv_t encap_rcv;
+	udp_tunnel_encap_err_lookup_t encap_err_lookup;
 	udp_tunnel_encap_destroy_t encap_destroy;
 	udp_tunnel_gro_receive_t gro_receive;
 	udp_tunnel_gro_complete_t gro_complete;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ca3ed931f2a9..1f054a85062d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -585,6 +585,59 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 	return true;
 }
 
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+void udp_encap_enable(void)
+{
+	static_branch_enable(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VxLAN, GENEVE). Then ask the
+ * tunnel implementation to match the error against a valid association.
+ *
+ * Return the socket if we have a match.
+ */
+static struct sock *__udp4_lib_err_encap(struct net *net,
+					 const struct iphdr *iph,
+					 struct udphdr *uh,
+					 struct udp_table *udptable,
+					 struct sk_buff *skb)
+{
+	int (*lookup)(struct sock *sk, struct sk_buff *skb);
+	int network_offset, transport_offset;
+	struct udp_sock *up;
+	struct sock *sk;
+
+	sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+			       iph->saddr, uh->dest, skb->dev->ifindex, 0,
+			       udptable, NULL);
+	if (!sk)
+		return NULL;
+
+	network_offset = skb_network_offset(skb);
+	transport_offset = skb_transport_offset(skb);
+
+	skb_reset_network_header(skb);
+
+	/* Network header needs to point to the outer IPv4 header inside ICMP */
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	/* Transport header needs to point to the UDP header */
+	skb_set_transport_header(skb, iph->ihl << 2);
+
+	up = udp_sk(sk);
+	lookup = READ_ONCE(up->encap_err_lookup);
+	if (!lookup || lookup(sk, skb))
+		sk = NULL;
+
+	skb_set_transport_header(skb, transport_offset);
+	skb_set_network_header(skb, network_offset);
+
+	return sk;
+}
+
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition.  If err < 0 then the socket should
@@ -603,6 +656,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
 	const int type = icmp_hdr(skb)->type;
 	const int code = icmp_hdr(skb)->code;
+	bool tunnel = false;
 	struct sock *sk;
 	int harderr;
 	int err;
@@ -612,8 +666,15 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 			       iph->saddr, uh->source, skb->dev->ifindex,
 			       inet_sdif(skb), udptable, NULL);
 	if (!sk) {
-		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return;	/* No socket for error */
+		/* No socket for error: try tunnels before discarding */
+		if (static_branch_unlikely(&udp_encap_needed_key))
+			sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb);
+
+		if (!sk) {
+			__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+			return;
+		}
+		tunnel = true;
 	}
 
 	err = 0;
@@ -656,6 +717,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
 	 *	4.1.3.3.
 	 */
+	if (tunnel) {
+		/* ...not for tunnels though: we don't have a sending socket */
+		goto out;
+	}
 	if (!inet->recverr) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 			goto out;
@@ -1889,13 +1954,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
-void udp_encap_enable(void)
-{
-	static_branch_enable(&udp_encap_needed_key);
-}
-EXPORT_SYMBOL(udp_encap_enable);
-
 /* returns:
  *  -1: error
  *   0: success
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..d0c412fc56ad 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -68,6 +68,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 
 	udp_sk(sk)->encap_type = cfg->encap_type;
 	udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+	udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
 	udp_sk(sk)->encap_destroy = cfg->encap_destroy;
 	udp_sk(sk)->gro_receive = cfg->gro_receive;
 	udp_sk(sk)->gro_complete = cfg->gro_complete;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d2d97d07ef27..eebd90111646 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -463,6 +463,55 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	goto try_again;
 }
 
+DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+void udpv6_encap_enable(void)
+{
+	static_branch_enable(&udpv6_encap_needed_key);
+}
+EXPORT_SYMBOL(udpv6_encap_enable);
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VxLAN, GENEVE). Then ask the
+ * tunnel implementation to match the error against a valid association.
+ *
+ * Return the socket if we have a match.
+ */
+static struct sock *__udp6_lib_err_encap(struct net *net,
+					 const struct ipv6hdr *hdr, int offset,
+					 struct udphdr *uh,
+					 struct udp_table *udptable,
+					 struct sk_buff *skb)
+{
+	int (*lookup)(struct sock *sk, struct sk_buff *skb);
+	int network_offset, transport_offset;
+	struct udp_sock *up;
+	struct sock *sk;
+
+	sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
+			       &hdr->saddr, uh->dest,
+			       inet6_iif(skb), 0, udptable, skb);
+	if (!sk)
+		return NULL;
+
+	network_offset = skb_network_offset(skb);
+	transport_offset = skb_transport_offset(skb);
+
+	/* Network header needs to point to the outer IPv6 header inside ICMP */
+	skb_reset_network_header(skb);
+	/* Transport header needs to point to the UDP header */
+	skb_set_transport_header(skb, offset);
+
+	up = udp_sk(sk);
+	lookup = READ_ONCE(up->encap_err_lookup);
+	if (!lookup || lookup(sk, skb))
+		sk = NULL;
+
+	skb_set_transport_header(skb, transport_offset);
+	skb_set_network_header(skb, network_offset);
+	return sk;
+}
+
 void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		    u8 type, u8 code, int offset, __be32 info,
 		    struct udp_table *udptable)
@@ -472,6 +521,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	const struct in6_addr *saddr = &hdr->saddr;
 	const struct in6_addr *daddr = &hdr->daddr;
 	struct udphdr *uh = (struct udphdr *)(skb->data+offset);
+	bool tunnel = false;
 	struct sock *sk;
 	int harderr;
 	int err;
@@ -480,9 +530,18 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
 			       inet6_iif(skb), inet6_sdif(skb), udptable, skb);
 	if (!sk) {
-		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
-				  ICMP6_MIB_INERRORS);
-		return;
+		/* No socket for error: try tunnels before discarding */
+		if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+			sk = __udp6_lib_err_encap(net, hdr, offset, uh,
+						  udptable, skb);
+		}
+
+		if (!sk) {
+			__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+					  ICMP6_MIB_INERRORS);
+			return;
+		}
+		tunnel = true;
 	}
 
 	harderr = icmpv6_err_convert(type, code, &err);
@@ -496,10 +555,19 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			harderr = 1;
 	}
 	if (type == NDISC_REDIRECT) {
-		ip6_sk_redirect(skb, sk);
+		if (tunnel) {
+			ip6_redirect(skb, sock_net(sk), inet6_iif(skb),
+				     sk->sk_mark, sk->sk_uid);
+		} else {
+			ip6_sk_redirect(skb, sk);
+		}
 		goto out;
 	}
 
+	/* Tunnels don't have an application socket: don't pass errors back */
+	if (tunnel)
+		goto out;
+
 	if (!np->recverr) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 			goto out;
@@ -548,13 +616,6 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
 	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
 
-DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
-void udpv6_encap_enable(void)
-{
-	static_branch_enable(&udpv6_encap_needed_key);
-}
-EXPORT_SYMBOL(udpv6_encap_enable);
-
 static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 02/11] vxlan: ICMP error lookup handler
From: Stefano Brivio @ 2018-11-06 21:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Export an encap_err_lookup() operation to match an ICMP error against a
valid VNI.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 drivers/net/vxlan.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 297cdeaef479..7706e392b2a7 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1552,6 +1552,34 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
+/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */
+static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
+{
+	struct vxlan_dev *vxlan;
+	struct vxlan_sock *vs;
+	struct vxlanhdr *hdr;
+	__be32 vni;
+
+	if (skb->len < VXLAN_HLEN)
+		return -EINVAL;
+
+	hdr = vxlan_hdr(skb);
+
+	if (!(hdr->vx_flags & VXLAN_HF_VNI))
+		return -EINVAL;
+
+	vs = rcu_dereference_sk_user_data(sk);
+	if (!vs)
+		return -ENOENT;
+
+	vni = vxlan_vni(hdr->vx_vni);
+	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
+	if (!vxlan)
+		return -ENOENT;
+
+	return 0;
+}
+
 static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
@@ -2948,6 +2976,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 	tunnel_cfg.sk_user_data = vs;
 	tunnel_cfg.encap_type = 1;
 	tunnel_cfg.encap_rcv = vxlan_rcv;
+	tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
 	tunnel_cfg.encap_destroy = NULL;
 	tunnel_cfg.gro_receive = vxlan_gro_receive;
 	tunnel_cfg.gro_complete = vxlan_gro_complete;
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 03/11] vxlan: Allow configuration of DF behaviour
From: Stefano Brivio @ 2018-11-06 21:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Allow users to set the IPv4 DF bit in outgoing packets, or to inherit its
value from the IPv4 inner header. If the encapsulated protocol is IPv6 and
DF is configured to be inherited, always set it.

For IPv4, inheriting DF from the inner header was probably intended from
the very beginning judging by the comment to vxlan_xmit(), but it wasn't
actually implemented -- also because it would have done more harm than
good, without handling for ICMP Fragmentation Needed messages.

According to RFC 7348, "Path MTU discovery MAY be used". An expired RFC
draft, draft-saum-nvo3-pmtud-over-vxlan-05, whose purpose was to describe
PMTUD implementation, says that "is a MUST that Vxlan gateways [...]
SHOULD set the DF-bit [...]", whatever that means.

Given this background, the only sane option is probably to let the user
decide, and keep the current behaviour as default.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 drivers/net/vxlan.c          | 29 +++++++++++++++++++++++++++++
 include/net/vxlan.h          |  1 +
 include/uapi/linux/if_link.h |  9 +++++++++
 3 files changed, 39 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 7706e392b2a7..ccb19b833706 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2289,6 +2289,19 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			df = htons(IP_DF);
 		}
 
+		if (!df) {
+			if (vxlan->cfg.df == VXLAN_DF_SET) {
+				df = htons(IP_DF);
+			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
+				struct ethhdr *eth = eth_hdr(skb);
+
+				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
+				    (ntohs(eth->h_proto) == ETH_P_IP &&
+				     old_iph->frag_off & htons(IP_DF)))
+					df = htons(IP_DF);
+			}
+		}
+
 		ndst = &rt->dst;
 		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
 
@@ -2837,6 +2850,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
 	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
 	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
+	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -2893,6 +2907,16 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
 		}
 	}
 
+	if (data[IFLA_VXLAN_DF]) {
+		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);
+
+		if (df < 0 || df > VXLAN_DF_MAX) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_DF],
+					    "Invalid DF attribute");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -3538,6 +3562,9 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
 	}
 
+	if (data[IFLA_VXLAN_DF])
+		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);
+
 	return 0;
 }
 
@@ -3630,6 +3657,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
 		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
@@ -3696,6 +3724,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
 		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
 	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 			!!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 03431c148e16..ec999c49df1f 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -216,6 +216,7 @@ struct vxlan_config {
 	unsigned long		age_interval;
 	unsigned int		addrmax;
 	bool			no_share;
+	enum ifla_vxlan_df	df;
 };
 
 struct vxlan_dev_node {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 1debfa42cba1..efc588949431 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -533,6 +533,7 @@ enum {
 	IFLA_VXLAN_LABEL,
 	IFLA_VXLAN_GPE,
 	IFLA_VXLAN_TTL_INHERIT,
+	IFLA_VXLAN_DF,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
@@ -542,6 +543,14 @@ struct ifla_vxlan_port_range {
 	__be16	high;
 };
 
+enum ifla_vxlan_df {
+	VXLAN_DF_UNSET = 0,
+	VXLAN_DF_SET,
+	VXLAN_DF_INHERIT,
+	__VXLAN_DF_END,
+	VXLAN_DF_MAX = __VXLAN_DF_END - 1,
+};
+
 /* GENEVE section */
 enum {
 	IFLA_GENEVE_UNSPEC,
-- 
2.19.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox