Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next 1/3] devlink: Add fw_version_check generic parameter
From: Ido Schimmel @ 2018-11-06 20:37 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Ido Schimmel, netdev@vger.kernel.org, davem@davemloft.net,
	Jiri Pirko, Shalom Toledo, Moshe Shemesh, dsahern@gmail.com,
	andrew@lunn.ch, f.fainelli@gmail.com, mlxsw
In-Reply-To: <20181106121913.036b8c4d@cakuba.netronome.com>

On Tue, Nov 06, 2018 at 12:19:13PM -0800, Jakub Kicinski wrote:
> On Tue, 6 Nov 2018 20:05:00 +0000, Ido Schimmel wrote:
> > From: Shalom Toledo <shalomt@mellanox.com>
> > 
> > Many drivers checking the device's firmware version during the
> > initialization flow and flashing a compatible version if the current
> > version is not.
> > 
> > fw_version_check gives the ability to skip this check which allows to run
> > the device with a different firmware version than required by the driver
> > for testing and/or debugging purposes.
> > 
> > Signed-off-by: Shalom Toledo <shalomt@mellanox.com>
> > Reviewed-by: Jiri Pirko <jiri@mellanox.com>
> > Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> 
> The documentation is missing, so it's hard to comment on the definition
> of the parameter...  

I assume you mean Documentation/networking/devlink-params.txt ?

> We have a FW loading policy for NFP, too, so it'd be good to see if we
> can find a common ground.

If the parameter is set, then device runs with whatever firmware version
was last flashed (via ethtool, for example). Otherwise, the driver will
flash a version according to its policy. In mlxsw, it is a specific
version.

Will that work for you?

^ permalink raw reply

* [PATCH net-next 0/3] net: More extack messages
From: David Ahern @ 2018-11-06 20:51 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Add more extack messages for several link create errors (e.g., invalid
number of queues, unknown link kind) and invalid metrics argument.

David Ahern (3):
  net: Add extack argument to rtnl_create_link
  net: Add extack argument to ip_fib_metrics_init
  rtnetlink: Add more extack messages to rtnl_newlink

 drivers/net/can/vxcan.c  |  2 +-
 drivers/net/geneve.c     |  2 +-
 drivers/net/veth.c       |  2 +-
 drivers/net/vxlan.c      |  2 +-
 include/net/ip.h         |  3 ++-
 include/net/rtnetlink.h  |  3 ++-
 net/core/rtnetlink.c     | 24 ++++++++++++++++--------
 net/ipv4/fib_semantics.c |  2 +-
 net/ipv4/ip_gre.c        |  2 +-
 net/ipv4/metrics.c       | 26 +++++++++++++++++++-------
 net/ipv6/route.c         |  5 +++--
 11 files changed, 48 insertions(+), 25 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH net-next 3/3] rtnetlink: Add more extack messages to rtnl_newlink
From: David Ahern @ 2018-11-06 20:51 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern
In-Reply-To: <20181106205116.7718-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add extack arg to the nla_parse_nested calls in rtnl_newlink, and
add messages for unknown device type and link network namespace id.
In particular, it improves the failure message when the wrong link
type is used. From
    $ ip li add bond1 type bonding
    RTNETLINK answers: Operation not supported
to
    $ ip li add bond1 type bonding
    Error: Unknown device type.

(The module name is bonding but the link type is bond.)

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/core/rtnetlink.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f787b7640d49..86f2d9cbdae3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3054,7 +3054,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 			if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
 				err = nla_parse_nested(attr, ops->maxtype,
 						       linkinfo[IFLA_INFO_DATA],
-						       ops->policy, NULL);
+						       ops->policy, extack);
 				if (err < 0)
 					return err;
 				data = attr;
@@ -3076,7 +3076,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 						       m_ops->slave_maxtype,
 						       linkinfo[IFLA_INFO_SLAVE_DATA],
 						       m_ops->slave_policy,
-						       NULL);
+						       extack);
 				if (err < 0)
 					return err;
 				slave_data = slave_attr;
@@ -3140,6 +3140,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 					goto replay;
 			}
 #endif
+			NL_SET_ERR_MSG(extack, "Unknown device type");
 			return -EOPNOTSUPP;
 		}
 
@@ -3160,6 +3161,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 			link_net = get_net_ns_by_id(dest_net, id);
 			if (!link_net) {
+				NL_SET_ERR_MSG(extack, "Unknown network namespace id");
 				err =  -EINVAL;
 				goto out;
 			}
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/3] net: Add extack argument to rtnl_create_link
From: David Ahern @ 2018-11-06 20:51 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern
In-Reply-To: <20181106205116.7718-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add extack arg to rtnl_create_link and add messages for invalid
number of Tx or Rx queues.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 drivers/net/can/vxcan.c |  2 +-
 drivers/net/geneve.c    |  2 +-
 drivers/net/veth.c      |  2 +-
 drivers/net/vxlan.c     |  2 +-
 include/net/rtnetlink.h |  3 ++-
 net/core/rtnetlink.c    | 18 ++++++++++++------
 net/ipv4/ip_gre.c       |  2 +-
 7 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c
index ed6828821fbd..80af658e530d 100644
--- a/drivers/net/can/vxcan.c
+++ b/drivers/net/can/vxcan.c
@@ -207,7 +207,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev,
 		return PTR_ERR(peer_net);
 
 	peer = rtnl_create_link(peer_net, ifname, name_assign_type,
-				&vxcan_link_ops, tbp);
+				&vxcan_link_ops, tbp, extack);
 	if (IS_ERR(peer)) {
 		put_net(peer_net);
 		return PTR_ERR(peer);
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index a0cd1c41cf5f..fbfc13d81f66 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1666,7 +1666,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 
 	memset(tb, 0, sizeof(tb));
 	dev = rtnl_create_link(net, name, name_assign_type,
-			       &geneve_link_ops, tb);
+			       &geneve_link_ops, tb, NULL);
 	if (IS_ERR(dev))
 		return dev;
 
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 890fa5b905e2..f412ea1cef18 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1253,7 +1253,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
 		return PTR_ERR(net);
 
 	peer = rtnl_create_link(net, ifname, name_assign_type,
-				&veth_link_ops, tbp);
+				&veth_link_ops, tbp, extack);
 	if (IS_ERR(peer)) {
 		put_net(net);
 		return PTR_ERR(peer);
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 297cdeaef479..ae969f806d56 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3749,7 +3749,7 @@ struct net_device *vxlan_dev_create(struct net *net, const char *name,
 	memset(&tb, 0, sizeof(tb));
 
 	dev = rtnl_create_link(net, name, name_assign_type,
-			       &vxlan_link_ops, tb);
+			       &vxlan_link_ops, tb, NULL);
 	if (IS_ERR(dev))
 		return dev;
 
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index cf26e5aacac4..e2091bb2b3a8 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -159,7 +159,8 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]);
 struct net_device *rtnl_create_link(struct net *net, const char *ifname,
 				    unsigned char name_assign_type,
 				    const struct rtnl_link_ops *ops,
-				    struct nlattr *tb[]);
+				    struct nlattr *tb[],
+				    struct netlink_ext_ack *extack);
 int rtnl_delete_link(struct net_device *dev);
 int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 33d9227a8b80..f787b7640d49 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2885,9 +2885,11 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
 }
 EXPORT_SYMBOL(rtnl_configure_link);
 
-struct net_device *rtnl_create_link(struct net *net,
-	const char *ifname, unsigned char name_assign_type,
-	const struct rtnl_link_ops *ops, struct nlattr *tb[])
+struct net_device *rtnl_create_link(struct net *net, const char *ifname,
+				    unsigned char name_assign_type,
+				    const struct rtnl_link_ops *ops,
+				    struct nlattr *tb[],
+				    struct netlink_ext_ack *extack)
 {
 	struct net_device *dev;
 	unsigned int num_tx_queues = 1;
@@ -2903,11 +2905,15 @@ struct net_device *rtnl_create_link(struct net *net,
 	else if (ops->get_num_rx_queues)
 		num_rx_queues = ops->get_num_rx_queues();
 
-	if (num_tx_queues < 1 || num_tx_queues > 4096)
+	if (num_tx_queues < 1 || num_tx_queues > 4096) {
+		NL_SET_ERR_MSG(extack, "Invalid number of transmit queues");
 		return ERR_PTR(-EINVAL);
+	}
 
-	if (num_rx_queues < 1 || num_rx_queues > 4096)
+	if (num_rx_queues < 1 || num_rx_queues > 4096) {
+		NL_SET_ERR_MSG(extack, "Invalid number of receive queues");
 		return ERR_PTR(-EINVAL);
+	}
 
 	dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
 			       ops->setup, num_tx_queues, num_rx_queues);
@@ -3163,7 +3169,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 		}
 
 		dev = rtnl_create_link(link_net ? : dest_net, ifname,
-				       name_assign_type, ops, tb);
+				       name_assign_type, ops, tb, extack);
 		if (IS_ERR(dev)) {
 			err = PTR_ERR(dev);
 			goto out;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 38befe829caf..2c67af644e64 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1601,7 +1601,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
 	memset(&tb, 0, sizeof(tb));
 
 	dev = rtnl_create_link(net, name, name_assign_type,
-			       &ipgre_tap_ops, tb);
+			       &ipgre_tap_ops, tb, NULL);
 	if (IS_ERR(dev))
 		return dev;
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 2/3] net: Add extack argument to ip_fib_metrics_init
From: David Ahern @ 2018-11-06 20:51 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern
In-Reply-To: <20181106205116.7718-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add extack argument to ip_fib_metrics_init and add messages for invalid
metrics.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip.h         |  3 ++-
 net/ipv4/fib_semantics.c |  2 +-
 net/ipv4/metrics.c       | 26 +++++++++++++++++++-------
 net/ipv6/route.c         |  5 +++--
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 72593e171d14..462182f78236 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -421,7 +421,8 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 }
 
 struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
-					int fc_mx_len);
+					int fc_mx_len,
+					struct netlink_ext_ack *extack);
 static inline void ip_fib_metrics_put(struct dst_metrics *fib_metrics)
 {
 	if (fib_metrics != &dst_default_metrics &&
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b5c3937ca6ec..5022bc63863a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1076,7 +1076,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 	if (!fi)
 		goto failure;
 	fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
-					      cfg->fc_mx_len);
+					      cfg->fc_mx_len, extack);
 	if (unlikely(IS_ERR(fi->fib_metrics))) {
 		err = PTR_ERR(fi->fib_metrics);
 		kfree(fi);
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 6d218f5a2e71..ca9a5fefdefa 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -6,7 +6,8 @@
 #include <net/tcp.h>
 
 static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
-			      int fc_mx_len, u32 *metrics)
+			      int fc_mx_len, u32 *metrics,
+			      struct netlink_ext_ack *extack)
 {
 	bool ecn_ca = false;
 	struct nlattr *nla;
@@ -21,19 +22,26 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
 
 		if (!type)
 			continue;
-		if (type > RTAX_MAX)
+		if (type > RTAX_MAX) {
+			NL_SET_ERR_MSG(extack, "Invalid metric type");
 			return -EINVAL;
+		}
 
 		if (type == RTAX_CC_ALGO) {
 			char tmp[TCP_CA_NAME_MAX];
 
 			nla_strlcpy(tmp, nla, sizeof(tmp));
 			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
-			if (val == TCP_CA_UNSPEC)
+			if (val == TCP_CA_UNSPEC) {
+				NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
 				return -EINVAL;
+			}
 		} else {
-			if (nla_len(nla) != sizeof(u32))
+			if (nla_len(nla) != sizeof(u32)) {
+				NL_SET_ERR_MSG_ATTR(extack, nla,
+						    "Invalid attribute in metrics");
 				return -EINVAL;
+			}
 			val = nla_get_u32(nla);
 		}
 		if (type == RTAX_ADVMSS && val > 65535 - 40)
@@ -42,8 +50,10 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
 			val = 65535 - 15;
 		if (type == RTAX_HOPLIMIT && val > 255)
 			val = 255;
-		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) {
+			NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute");
 			return -EINVAL;
+		}
 		metrics[type - 1] = val;
 	}
 
@@ -54,7 +64,8 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx,
 }
 
 struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
-					int fc_mx_len)
+					int fc_mx_len,
+					struct netlink_ext_ack *extack)
 {
 	struct dst_metrics *fib_metrics;
 	int err;
@@ -66,7 +77,8 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx,
 	if (unlikely(!fib_metrics))
 		return ERR_PTR(-ENOMEM);
 
-	err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics);
+	err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics,
+				 extack);
 	if (!err) {
 		refcount_set(&fib_metrics->refcnt, 1);
 	} else {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2a7423c39456..b2447b7c7303 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2975,7 +2975,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
 	if (!rt)
 		goto out;
 
-	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len);
+	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
+					       extack);
 	if (IS_ERR(rt->fib6_metrics)) {
 		err = PTR_ERR(rt->fib6_metrics);
 		/* Do not leave garbage there. */
@@ -3708,7 +3709,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
 	if (!f6i)
 		return ERR_PTR(-ENOMEM);
 
-	f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0);
+	f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
 	f6i->dst_nocount = true;
 	f6i->dst_host = true;
 	f6i->fib6_protocol = RTPROT_KERNEL;
-- 
2.11.0

^ permalink raw reply related

* [RFC perf,bpf 0/5] reveal invisible bpf programs
From: Song Liu @ 2018-11-06 20:52 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: kernel-team, Song Liu, ast, daniel, peterz, acme

This is to follow up Alexei's early effort to show bpf programs

   https://www.spinics.net/lists/netdev/msg524232.html

In this version, PERF_RECORD_BPF_EVENT is introduced to send real time BPF
load/unload events to user space. In user space, perf-record is modified
to listen to these events (through a dedicated ring buffer) and generate
detailed information about the program (struct bpf_prog_info_event). Then,
perf-report translates these events into proper symbols.

With this set, perf-report will show bpf program as:

   18.49%     0.16%  test  [kernel.vmlinux]    [k] ksys_write
   18.01%     0.47%  test  [kernel.vmlinux]    [k] vfs_write
   17.02%     0.40%  test  bpf_prog            [k] bpf_prog_F
   16.97%     0.10%  test  [kernel.vmlinux]    [k] __vfs_write
   16.86%     0.12%  test  [kernel.vmlinux]    [k] comm_write
   16.67%     0.39%  test  [kernel.vmlinux]    [k] bpf_probe_read

Note that, the program name is still work in progress, it will be cleaner
with function types in BTF.

Please share your comments on this.

Thanks,
Song

Song Liu (5):
  perf, bpf: Introduce PERF_RECORD_BPF_EVENT
  perf: sync tools/include/uapi/linux/perf_event.h
  perf util: basic handling of PERF_RECORD_BPF_EVENT
  perf util: introduce bpf_prog_info_event
  perf util: generate bpf_prog_info_event for short living bpf programs

 include/linux/perf_event.h            |   5 +
 include/uapi/linux/perf_event.h       |  27 ++-
 kernel/bpf/syscall.c                  |   4 +
 kernel/events/core.c                  |  82 +++++++-
 tools/include/uapi/linux/perf_event.h |  27 ++-
 tools/perf/builtin-record.c           |  55 +++++
 tools/perf/builtin-report.c           |   2 +
 tools/perf/util/Build                 |   2 +
 tools/perf/util/bpf-info.c            | 287 ++++++++++++++++++++++++++
 tools/perf/util/bpf-info.h            |  29 +++
 tools/perf/util/event.c               |  20 ++
 tools/perf/util/event.h               |  29 +++
 tools/perf/util/evlist.c              |  42 +++-
 tools/perf/util/evlist.h              |   4 +
 tools/perf/util/evsel.c               |   9 +
 tools/perf/util/evsel.h               |   3 +
 tools/perf/util/machine.c             |  10 +
 tools/perf/util/machine.h             |   2 +
 tools/perf/util/session.c             |   8 +
 tools/perf/util/tool.h                |   7 +-
 20 files changed, 646 insertions(+), 8 deletions(-)
 create mode 100644 tools/perf/util/bpf-info.c
 create mode 100644 tools/perf/util/bpf-info.h

^ permalink raw reply

* [RFC perf,bpf 3/5] perf util: basic handling of PERF_RECORD_BPF_EVENT
From: Song Liu @ 2018-11-06 20:52 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: kernel-team, Song Liu, ast, daniel, peterz, acme
In-Reply-To: <20181106205246.567448-1-songliubraving@fb.com>

This patch adds basic handling of PERF_RECORD_BPF_EVENT in perf util.
Following patches add more logic that leverages these events.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 tools/perf/util/event.c   | 19 +++++++++++++++++++
 tools/perf/util/event.h   | 15 +++++++++++++++
 tools/perf/util/evsel.c   |  1 +
 tools/perf/util/machine.c | 10 ++++++++++
 tools/perf/util/machine.h |  2 ++
 tools/perf/util/session.c |  4 ++++
 tools/perf/util/tool.h    |  4 +++-
 7 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 0cd42150f712..601432afbfb2 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -43,6 +43,7 @@ static const char *perf_event__names[] = {
 	[PERF_RECORD_SWITCH]			= "SWITCH",
 	[PERF_RECORD_SWITCH_CPU_WIDE]		= "SWITCH_CPU_WIDE",
 	[PERF_RECORD_NAMESPACES]		= "NAMESPACES",
+	[PERF_RECORD_BPF_EVENT]			= "BPF_EVENT",
 	[PERF_RECORD_HEADER_ATTR]		= "ATTR",
 	[PERF_RECORD_HEADER_EVENT_TYPE]		= "EVENT_TYPE",
 	[PERF_RECORD_HEADER_TRACING_DATA]	= "TRACING_DATA",
@@ -1333,6 +1334,14 @@ int perf_event__process_switch(struct perf_tool *tool __maybe_unused,
 	return machine__process_switch_event(machine, event);
 }
 
+int perf_event__process_bpf_event(struct perf_tool *tool __maybe_unused,
+				  union perf_event *event,
+				  struct perf_sample *sample __maybe_unused,
+				  struct machine *machine)
+{
+	return machine__process_bpf_event(machine, event);
+}
+
 size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp)
 {
 	return fprintf(fp, " %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %c %s\n",
@@ -1465,6 +1474,13 @@ static size_t perf_event__fprintf_lost(union perf_event *event, FILE *fp)
 	return fprintf(fp, " lost %" PRIu64 "\n", event->lost.lost);
 }
 
+size_t perf_event__fprintf_bpf_event(union perf_event *event, FILE *fp)
+{
+	return fprintf(fp, " bpf event with type %u, flags %u, id %u\n",
+		       event->bpf_event.type, event->bpf_event.flags,
+		       event->bpf_event.id);
+}
+
 size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 {
 	size_t ret = fprintf(fp, "PERF_RECORD_%s",
@@ -1500,6 +1516,9 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
 	case PERF_RECORD_LOST:
 		ret += perf_event__fprintf_lost(event, fp);
 		break;
+	case PERF_RECORD_BPF_EVENT:
+		ret += perf_event__fprintf_bpf_event(event, fp);
+		break;
 	default:
 		ret += fprintf(fp, "\n");
 	}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index bfa60bcafbde..13a0c64dd0ed 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -84,6 +84,15 @@ struct throttle_event {
 	u64 stream_id;
 };
 
+
+/* Record different types of bpf events, see enum perf_bpf_event_type */
+struct bpf_event {
+	struct perf_event_header header;
+	u16 type;
+	u16 flags;
+	u32 id;
+};
+
 #define PERF_SAMPLE_MASK				\
 	(PERF_SAMPLE_IP | PERF_SAMPLE_TID |		\
 	 PERF_SAMPLE_TIME | PERF_SAMPLE_ADDR |		\
@@ -651,6 +660,7 @@ union perf_event {
 	struct stat_round_event		stat_round;
 	struct time_conv_event		time_conv;
 	struct feature_event		feat;
+	struct bpf_event		bpf_event;
 };
 
 void perf_event__print_totals(void);
@@ -750,6 +760,10 @@ int perf_event__process_exit(struct perf_tool *tool,
 			     union perf_event *event,
 			     struct perf_sample *sample,
 			     struct machine *machine);
+int perf_event__process_bpf_event(struct perf_tool *tool,
+				  union perf_event *event,
+				  struct perf_sample *sample,
+				  struct machine *machine);
 int perf_tool__process_synth_event(struct perf_tool *tool,
 				   union perf_event *event,
 				   struct machine *machine,
@@ -814,6 +828,7 @@ size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_bpf_event(union perf_event *event, FILE *fp);
 size_t perf_event__fprintf(union perf_event *event, FILE *fp);
 
 int kallsyms__get_function_start(const char *kallsyms_filename,
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index cb7f01059940..af9d539e4b6a 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1635,6 +1635,7 @@ int perf_event_attr__fprintf(FILE *fp, struct perf_event_attr *attr,
 	PRINT_ATTRf(context_switch, p_unsigned);
 	PRINT_ATTRf(write_backward, p_unsigned);
 	PRINT_ATTRf(namespaces, p_unsigned);
+	PRINT_ATTRf(bpf_event, p_unsigned);
 
 	PRINT_ATTRn("{ wakeup_events, wakeup_watermark }", wakeup_events, p_unsigned);
 	PRINT_ATTRf(bp_type, p_unsigned);
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 111ae858cbcb..74c295f6fdc4 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -681,6 +681,14 @@ int machine__process_switch_event(struct machine *machine __maybe_unused,
 	return 0;
 }
 
+int machine__process_bpf_event(struct machine *machine __maybe_unused,
+			       union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_bpf_event(event, stdout);
+	return 0;
+}
+
 static void dso__adjust_kmod_long_name(struct dso *dso, const char *filename)
 {
 	const char *dup_filename;
@@ -1795,6 +1803,8 @@ int machine__process_event(struct machine *machine, union perf_event *event,
 	case PERF_RECORD_SWITCH:
 	case PERF_RECORD_SWITCH_CPU_WIDE:
 		ret = machine__process_switch_event(machine, event); break;
+	case PERF_RECORD_BPF_EVENT:
+		ret = machine__process_bpf_event(machine, event); break;
 	default:
 		ret = -1;
 		break;
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index d856b85862e2..0ed237d6fd0a 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -127,6 +127,8 @@ int machine__process_mmap_event(struct machine *machine, union perf_event *event
 				struct perf_sample *sample);
 int machine__process_mmap2_event(struct machine *machine, union perf_event *event,
 				 struct perf_sample *sample);
+int machine__process_bpf_event(struct machine *machine,
+			       union perf_event *event);
 int machine__process_event(struct machine *machine, union perf_event *event,
 				struct perf_sample *sample);
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7d2c8ce6cfad..dffe5120d2d3 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -371,6 +371,8 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 		tool->itrace_start = perf_event__process_itrace_start;
 	if (tool->context_switch == NULL)
 		tool->context_switch = perf_event__process_switch;
+	if (tool->bpf_event == NULL)
+		tool->bpf_event = perf_event__process_bpf_event;
 	if (tool->read == NULL)
 		tool->read = process_event_sample_stub;
 	if (tool->throttle == NULL)
@@ -1300,6 +1302,8 @@ static int machines__deliver_event(struct machines *machines,
 	case PERF_RECORD_SWITCH:
 	case PERF_RECORD_SWITCH_CPU_WIDE:
 		return tool->context_switch(tool, event, sample, machine);
+	case PERF_RECORD_BPF_EVENT:
+		return tool->bpf_event(tool, event, sample, machine);
 	default:
 		++evlist->stats.nr_unknown_events;
 		return -1;
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index 56e4ca54020a..69ae898ca024 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -53,7 +53,9 @@ struct perf_tool {
 			itrace_start,
 			context_switch,
 			throttle,
-			unthrottle;
+			unthrottle,
+			bpf_event;
+
 	event_attr_op	attr;
 	event_attr_op	event_update;
 	event_op2	tracing_data;
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 0/5] net: dsa: bcm_sf2: Store rules in lists
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli

Hi all,

This patch series changes the bcm-sf2 driver to keep a copy of the
inserted rules as opposed to using the HW as a storage area for a number
of reasons:

- this helps us with doing duplicate rule detection in a faster way, it
  would have required a full rule read before

- this helps with Pablo's on-going work to convert ethtool_rx_flow_spec
  to a more generic flow rule structure by having fewer code paths to
  convert to the new structure/helpers

- we need to cache copies to restore them during drive resumption,
  because depending on the low power mode the system has entered, the
  switch may have lost all of its context

Florian Fainelli (5):
  net: dsa: bcm_sf2: Keep copy of inserted rules
  net: dsa: bcm_sf2: Split rule handling from HW operation
  net: dsa: bcm_sf2: Restore CFP rules during system resume
  net: dsa: bcm_sf2: Get rid of unmarshalling functions
  net: systemport: Restore Broadcom tag match filters upon resume

 drivers/net/dsa/bcm_sf2.c                  |   6 +
 drivers/net/dsa/bcm_sf2.h                  |   3 +
 drivers/net/dsa/bcm_sf2_cfp.c              | 497 ++++++++-------------
 drivers/net/ethernet/broadcom/bcmsysport.c |  12 +
 drivers/net/ethernet/broadcom/bcmsysport.h |   1 +
 5 files changed, 204 insertions(+), 315 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [PATCH net-next 1/5] net: dsa: bcm_sf2: Keep copy of inserted rules
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli
In-Reply-To: <20181106205841.14308-1-f.fainelli@gmail.com>

We tried hard to use the hardware as a storage area, which made things
needlessly complex in that we had to both marshall and unmarshall the
ethtool_rx_flow_spec into what the CFP hardware understands but it did
not require any driver level allocations, so that was nice.

Keep a copy of the ethtool_rx_flow_spec rule we want to insert, and also
make sure we don't have a duplicate rule already. This greatly speeds up
the deletion time since we only need to clear the slice's valid bit and
not perform a full read.

This is a preparatory step for being able to restore rules upon system
resumption where the hardware loses its context partially or entirely.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/bcm_sf2.c     |   2 +
 drivers/net/dsa/bcm_sf2.h     |   2 +
 drivers/net/dsa/bcm_sf2_cfp.c | 144 +++++++++++++++++++++++++++++++---
 3 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 2eb68769562c..89722dbfafb8 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -1061,6 +1061,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev)
 	spin_lock_init(&priv->indir_lock);
 	mutex_init(&priv->stats_mutex);
 	mutex_init(&priv->cfp.lock);
+	INIT_LIST_HEAD(&priv->cfp.rules_list);
 
 	/* CFP rule #0 cannot be used for specific classifications, flag it as
 	 * permanently used
@@ -1166,6 +1167,7 @@ static int bcm_sf2_sw_remove(struct platform_device *pdev)
 
 	priv->wol_ports_mask = 0;
 	dsa_unregister_switch(priv->dev->ds);
+	bcm_sf2_cfp_exit(priv->dev->ds);
 	/* Disable all ports and interrupts */
 	bcm_sf2_sw_suspend(priv->dev->ds);
 	bcm_sf2_mdio_unregister(priv);
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index cc31e986e6e3..03444982c25e 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -56,6 +56,7 @@ struct bcm_sf2_cfp_priv {
 	DECLARE_BITMAP(used, CFP_NUM_RULES);
 	DECLARE_BITMAP(unique, CFP_NUM_RULES);
 	unsigned int rules_cnt;
+	struct list_head rules_list;
 };
 
 struct bcm_sf2_priv {
@@ -213,5 +214,6 @@ int bcm_sf2_get_rxnfc(struct dsa_switch *ds, int port,
 int bcm_sf2_set_rxnfc(struct dsa_switch *ds, int port,
 		      struct ethtool_rxnfc *nfc);
 int bcm_sf2_cfp_rst(struct bcm_sf2_priv *priv);
+void bcm_sf2_cfp_exit(struct dsa_switch *ds);
 
 #endif /* __BCM_SF2_H */
diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 47c5f272a084..29b6b4204662 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -20,6 +20,12 @@
 #include "bcm_sf2.h"
 #include "bcm_sf2_regs.h"
 
+struct cfp_rule {
+	int port;
+	struct ethtool_rx_flow_spec fs;
+	struct list_head next;
+};
+
 struct cfp_udf_slice_layout {
 	u8 slices[UDFS_PER_SLICE];
 	u32 mask_value;
@@ -515,6 +521,61 @@ static void bcm_sf2_cfp_slice_ipv6(struct bcm_sf2_priv *priv,
 	core_writel(priv, reg, offset);
 }
 
+static struct cfp_rule *bcm_sf2_cfp_rule_find(struct bcm_sf2_priv *priv,
+					      int port, u32 location)
+{
+	struct cfp_rule *rule = NULL;
+
+	list_for_each_entry(rule, &priv->cfp.rules_list, next) {
+		if (rule->port == port && rule->fs.location == location)
+			break;
+	};
+
+	return rule;
+}
+
+static int bcm_sf2_cfp_rule_cmp(struct bcm_sf2_priv *priv, int port,
+				struct ethtool_rx_flow_spec *fs)
+{
+	struct cfp_rule *rule = NULL;
+	size_t fs_size = 0;
+	int ret = 1;
+
+	if (list_empty(&priv->cfp.rules_list))
+		return ret;
+
+	list_for_each_entry(rule, &priv->cfp.rules_list, next) {
+		ret = 1;
+		if (rule->port != port)
+			continue;
+
+		if (rule->fs.flow_type != fs->flow_type ||
+		    rule->fs.ring_cookie != fs->ring_cookie ||
+		    rule->fs.m_ext.data[0] != fs->m_ext.data[0])
+			continue;
+
+		switch (fs->flow_type & ~FLOW_EXT) {
+		case TCP_V6_FLOW:
+		case UDP_V6_FLOW:
+			fs_size = sizeof(struct ethtool_tcpip6_spec);
+			break;
+		case TCP_V4_FLOW:
+		case UDP_V4_FLOW:
+			fs_size = sizeof(struct ethtool_tcpip4_spec);
+			break;
+		default:
+			continue;
+		}
+
+		ret = memcmp(&rule->fs.h_u, &fs->h_u, fs_size);
+		ret |= memcmp(&rule->fs.m_u, &fs->m_u, fs_size);
+		if (ret == 0)
+			break;
+	}
+
+	return ret;
+}
+
 static int bcm_sf2_cfp_ipv6_rule_set(struct bcm_sf2_priv *priv, int port,
 				     unsigned int port_num,
 				     unsigned int queue_num,
@@ -735,6 +796,7 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 	s8 cpu_port = ds->ports[port].cpu_dp->index;
 	__u64 ring_cookie = fs->ring_cookie;
 	unsigned int queue_num, port_num;
+	struct cfp_rule *rule = NULL;
 	int ret = -EINVAL;
 
 	/* Check for unsupported extensions */
@@ -750,6 +812,10 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 	    fs->location > bcm_sf2_cfp_rule_size(priv))
 		return -EINVAL;
 
+	ret = bcm_sf2_cfp_rule_cmp(priv, port, fs);
+	if (ret == 0)
+		return -EEXIST;
+
 	/* This rule is a Wake-on-LAN filter and we must specifically
 	 * target the CPU port in order for it to be working.
 	 */
@@ -775,6 +841,10 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 	if (port_num >= 7)
 		port_num -= 1;
 
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+	if (!rule)
+		return -ENOMEM;
+
 	switch (fs->flow_type & ~FLOW_EXT) {
 	case TCP_V4_FLOW:
 	case UDP_V4_FLOW:
@@ -787,9 +857,19 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 						queue_num, fs);
 		break;
 	default:
+		ret = -EINVAL;
 		break;
 	}
 
+	if (ret) {
+		kfree(rule);
+		return ret;
+	}
+
+	rule->port = port;
+	memcpy(&rule->fs, fs, sizeof(*fs));
+	list_add_tail(&rule->next, &priv->cfp.rules_list);
+
 	return ret;
 }
 
@@ -833,6 +913,7 @@ static int bcm_sf2_cfp_rule_del_one(struct bcm_sf2_priv *priv, int port,
 static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
 				u32 loc)
 {
+	struct cfp_rule *rule;
 	u32 next_loc = 0;
 	int ret;
 
@@ -843,6 +924,10 @@ static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
 	if (!test_bit(loc, priv->cfp.unique) || loc == 0)
 		return -EINVAL;
 
+	rule = bcm_sf2_cfp_rule_find(priv, port, loc);
+	if (!rule)
+		return -EINVAL;
+
 	ret = bcm_sf2_cfp_rule_del_one(priv, port, loc, &next_loc);
 	if (ret)
 		return ret;
@@ -851,6 +936,9 @@ static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
 	if (next_loc)
 		ret = bcm_sf2_cfp_rule_del_one(priv, port, next_loc, NULL);
 
+	list_del(&rule->next);
+	kfree(rule);
+
 	return ret;
 }
 
@@ -867,9 +955,9 @@ static void bcm_sf2_invert_masks(struct ethtool_rx_flow_spec *flow)
 	flow->m_ext.data[1] ^= cpu_to_be32(~0);
 }
 
-static int bcm_sf2_cfp_unslice_ipv4(struct bcm_sf2_priv *priv,
-				    struct ethtool_tcpip4_spec *v4_spec,
-				    bool mask)
+static int __maybe_unused bcm_sf2_cfp_unslice_ipv4(struct bcm_sf2_priv *priv,
+						   struct ethtool_tcpip4_spec *v4_spec,
+						   bool mask)
 {
 	u32 reg, offset, ipv4;
 	u16 src_dst_port;
@@ -969,9 +1057,10 @@ static int bcm_sf2_cfp_ipv4_rule_get(struct bcm_sf2_priv *priv, int port,
 	return bcm_sf2_cfp_unslice_ipv4(priv, v4_m_spec, true);
 }
 
-static int bcm_sf2_cfp_unslice_ipv6(struct bcm_sf2_priv *priv,
-				     __be32 *ip6_addr, __be16 *port,
-				     bool mask)
+static int __maybe_unused bcm_sf2_cfp_unslice_ipv6(struct bcm_sf2_priv *priv,
+						   __be32 *ip6_addr,
+						   __be16 *port,
+						   bool mask)
 {
 	u32 reg, tmp, offset;
 
@@ -1050,9 +1139,10 @@ static int bcm_sf2_cfp_unslice_ipv6(struct bcm_sf2_priv *priv,
 	return 0;
 }
 
-static int bcm_sf2_cfp_ipv6_rule_get(struct bcm_sf2_priv *priv, int port,
-				     struct ethtool_rx_flow_spec *fs,
-				     u32 next_loc)
+static int __maybe_unused bcm_sf2_cfp_ipv6_rule_get(struct bcm_sf2_priv *priv,
+						    int port,
+						    struct ethtool_rx_flow_spec *fs,
+						    u32 next_loc)
 {
 	struct ethtool_tcpip6_spec *v6_spec = NULL, *v6_m_spec = NULL;
 	u32 reg;
@@ -1111,8 +1201,9 @@ static int bcm_sf2_cfp_ipv6_rule_get(struct bcm_sf2_priv *priv, int port,
 					&v6_m_spec->psrc, true);
 }
 
-static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port,
-				struct ethtool_rxnfc *nfc)
+static int __maybe_unused bcm_sf2_cfp_rule_get_hw(struct bcm_sf2_priv *priv,
+						  int port,
+						  struct ethtool_rxnfc *nfc)
 {
 	u32 reg, ipv4_or_chain_id;
 	unsigned int queue_num;
@@ -1174,6 +1265,25 @@ static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port,
 	return 0;
 }
 
+static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port,
+				struct ethtool_rxnfc *nfc)
+{
+	struct cfp_rule *rule;
+
+	rule = bcm_sf2_cfp_rule_find(priv, port, nfc->fs.location);
+	if (!rule)
+		return -EINVAL;
+
+	memcpy(&nfc->fs, &rule->fs, sizeof(rule->fs));
+
+	bcm_sf2_invert_masks(&nfc->fs);
+
+	/* Put the TCAM size here */
+	nfc->data = bcm_sf2_cfp_rule_size(priv);
+
+	return 0;
+}
+
 /* We implement the search doing a TCAM search operation */
 static int bcm_sf2_cfp_rule_get_all(struct bcm_sf2_priv *priv,
 				    int port, struct ethtool_rxnfc *nfc,
@@ -1302,3 +1412,15 @@ int bcm_sf2_cfp_rst(struct bcm_sf2_priv *priv)
 
 	return 0;
 }
+
+void bcm_sf2_cfp_exit(struct dsa_switch *ds)
+{
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+	struct cfp_rule *rule, *n;
+
+	if (list_empty(&priv->cfp.rules_list))
+		return;
+
+	list_for_each_entry_safe_reverse(rule, n, &priv->cfp.rules_list, next)
+		bcm_sf2_cfp_rule_del(priv, rule->port, rule->fs.location);
+}
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 2/5] net: dsa: bcm_sf2: Split rule handling from HW operation
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli
In-Reply-To: <20181106205841.14308-1-f.fainelli@gmail.com>

In preparation for restoring CFP rules during system wide system
suspend/resume where the hardware loses its context, split the rule
validation from its actual insertion as well as the rule removal from
its actual hardware deletion operation.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 89 +++++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 35 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 29b6b4204662..396bfa43c2e1 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -789,32 +789,14 @@ static int bcm_sf2_cfp_ipv6_rule_set(struct bcm_sf2_priv *priv, int port,
 	return ret;
 }
 
-static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
-				struct ethtool_rx_flow_spec *fs)
+static int bcm_sf2_cfp_rule_insert(struct dsa_switch *ds, int port,
+				   struct ethtool_rx_flow_spec *fs)
 {
 	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 	s8 cpu_port = ds->ports[port].cpu_dp->index;
 	__u64 ring_cookie = fs->ring_cookie;
 	unsigned int queue_num, port_num;
-	struct cfp_rule *rule = NULL;
-	int ret = -EINVAL;
-
-	/* Check for unsupported extensions */
-	if ((fs->flow_type & FLOW_EXT) && (fs->m_ext.vlan_etype ||
-	     fs->m_ext.data[1]))
-		return -EINVAL;
-
-	if (fs->location != RX_CLS_LOC_ANY &&
-	    test_bit(fs->location, priv->cfp.used))
-		return -EBUSY;
-
-	if (fs->location != RX_CLS_LOC_ANY &&
-	    fs->location > bcm_sf2_cfp_rule_size(priv))
-		return -EINVAL;
-
-	ret = bcm_sf2_cfp_rule_cmp(priv, port, fs);
-	if (ret == 0)
-		return -EEXIST;
+	int ret;
 
 	/* This rule is a Wake-on-LAN filter and we must specifically
 	 * target the CPU port in order for it to be working.
@@ -841,10 +823,6 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 	if (port_num >= 7)
 		port_num -= 1;
 
-	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
-	if (!rule)
-		return -ENOMEM;
-
 	switch (fs->flow_type & ~FLOW_EXT) {
 	case TCP_V4_FLOW:
 	case UDP_V4_FLOW:
@@ -861,6 +839,38 @@ static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
 		break;
 	}
 
+	return ret;
+}
+
+static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port,
+				struct ethtool_rx_flow_spec *fs)
+{
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+	struct cfp_rule *rule = NULL;
+	int ret = -EINVAL;
+
+	/* Check for unsupported extensions */
+	if ((fs->flow_type & FLOW_EXT) && (fs->m_ext.vlan_etype ||
+	     fs->m_ext.data[1]))
+		return -EINVAL;
+
+	if (fs->location != RX_CLS_LOC_ANY &&
+	    test_bit(fs->location, priv->cfp.used))
+		return -EBUSY;
+
+	if (fs->location != RX_CLS_LOC_ANY &&
+	    fs->location > bcm_sf2_cfp_rule_size(priv))
+		return -EINVAL;
+
+	ret = bcm_sf2_cfp_rule_cmp(priv, port, fs);
+	if (ret == 0)
+		return -EEXIST;
+
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+	if (!rule)
+		return -ENOMEM;
+
+	ret = bcm_sf2_cfp_rule_insert(ds, port, fs);
 	if (ret) {
 		kfree(rule);
 		return ret;
@@ -910,13 +920,28 @@ static int bcm_sf2_cfp_rule_del_one(struct bcm_sf2_priv *priv, int port,
 	return 0;
 }
 
-static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
-				u32 loc)
+static int bcm_sf2_cfp_rule_remove(struct bcm_sf2_priv *priv, int port,
+				   u32 loc)
 {
-	struct cfp_rule *rule;
 	u32 next_loc = 0;
 	int ret;
 
+	ret = bcm_sf2_cfp_rule_del_one(priv, port, loc, &next_loc);
+	if (ret)
+		return ret;
+
+	/* If this was an IPv6 rule, delete is companion rule too */
+	if (next_loc)
+		ret = bcm_sf2_cfp_rule_del_one(priv, port, next_loc, NULL);
+
+	return ret;
+}
+
+static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port, u32 loc)
+{
+	struct cfp_rule *rule;
+	int ret;
+
 	/* Refuse deleting unused rules, and those that are not unique since
 	 * that could leave IPv6 rules with one of the chained rule in the
 	 * table.
@@ -928,13 +953,7 @@ static int bcm_sf2_cfp_rule_del(struct bcm_sf2_priv *priv, int port,
 	if (!rule)
 		return -EINVAL;
 
-	ret = bcm_sf2_cfp_rule_del_one(priv, port, loc, &next_loc);
-	if (ret)
-		return ret;
-
-	/* If this was an IPv6 rule, delete is companion rule too */
-	if (next_loc)
-		ret = bcm_sf2_cfp_rule_del_one(priv, port, next_loc, NULL);
+	ret = bcm_sf2_cfp_rule_remove(priv, port, loc);
 
 	list_del(&rule->next);
 	kfree(rule);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 3/5] net: dsa: bcm_sf2: Restore CFP rules during system resume
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli
In-Reply-To: <20181106205841.14308-1-f.fainelli@gmail.com>

The hardware can lose its context during system suspend, and depending
on the switch generation (7445 vs. 7278), while the rules are still
there, they will have their valid bit cleared (because that's the
fastest way for the HW to reset things). Just make sure we re-apply them
coming back from resume. The 7445 switch is an older version of the core
that has some quirky RAM technology requiring a delete then re-inser to
guarantee the RAM entries are properly latched.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/bcm_sf2.c     |  4 ++++
 drivers/net/dsa/bcm_sf2.h     |  1 +
 drivers/net/dsa/bcm_sf2_cfp.c | 36 +++++++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 89722dbfafb8..d8b93043b789 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -710,6 +710,10 @@ static int bcm_sf2_sw_resume(struct dsa_switch *ds)
 		return ret;
 	}
 
+	ret = bcm_sf2_cfp_resume(ds);
+	if (ret)
+		return ret;
+
 	if (priv->hw_params.num_gphy == 1)
 		bcm_sf2_gphy_enable_set(ds, true);
 
diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h
index 03444982c25e..faaef320ec48 100644
--- a/drivers/net/dsa/bcm_sf2.h
+++ b/drivers/net/dsa/bcm_sf2.h
@@ -215,5 +215,6 @@ int bcm_sf2_set_rxnfc(struct dsa_switch *ds, int port,
 		      struct ethtool_rxnfc *nfc);
 int bcm_sf2_cfp_rst(struct bcm_sf2_priv *priv);
 void bcm_sf2_cfp_exit(struct dsa_switch *ds);
+int bcm_sf2_cfp_resume(struct dsa_switch *ds);
 
 #endif /* __BCM_SF2_H */
diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 396bfa43c2e1..5034e4f56fd5 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -1443,3 +1443,39 @@ void bcm_sf2_cfp_exit(struct dsa_switch *ds)
 	list_for_each_entry_safe_reverse(rule, n, &priv->cfp.rules_list, next)
 		bcm_sf2_cfp_rule_del(priv, rule->port, rule->fs.location);
 }
+
+int bcm_sf2_cfp_resume(struct dsa_switch *ds)
+{
+	struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
+	struct cfp_rule *rule;
+	int ret = 0;
+	u32 reg;
+
+	if (list_empty(&priv->cfp.rules_list))
+		return ret;
+
+	reg = core_readl(priv, CORE_CFP_CTL_REG);
+	reg &= ~CFP_EN_MAP_MASK;
+	core_writel(priv, reg, CORE_CFP_CTL_REG);
+
+	ret = bcm_sf2_cfp_rst(priv);
+	if (ret)
+		return ret;
+
+	list_for_each_entry(rule, &priv->cfp.rules_list, next) {
+		ret = bcm_sf2_cfp_rule_remove(priv, rule->port,
+					      rule->fs.location);
+		if (ret) {
+			dev_err(ds->dev, "failed to remove rule\n");
+			return ret;
+		}
+
+		ret = bcm_sf2_cfp_rule_insert(ds, rule->port, &rule->fs);
+		if (ret) {
+			dev_err(ds->dev, "failed to restore rule\n");
+			return ret;
+		}
+	};
+
+	return ret;
+}
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 4/5] net: dsa: bcm_sf2: Get rid of unmarshalling functions
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli
In-Reply-To: <20181106205841.14308-1-f.fainelli@gmail.com>

Now that we have migrated the CFP rule handling to a list with a
software copy, the delete/get operation just returns what is on the
list, no need to read from the hardware which is both slow and more
error prone.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 310 ----------------------------------
 1 file changed, 310 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index 5034e4f56fd5..8f734abde7b3 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -974,316 +974,6 @@ static void bcm_sf2_invert_masks(struct ethtool_rx_flow_spec *flow)
 	flow->m_ext.data[1] ^= cpu_to_be32(~0);
 }
 
-static int __maybe_unused bcm_sf2_cfp_unslice_ipv4(struct bcm_sf2_priv *priv,
-						   struct ethtool_tcpip4_spec *v4_spec,
-						   bool mask)
-{
-	u32 reg, offset, ipv4;
-	u16 src_dst_port;
-
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(3);
-	else
-		offset = CORE_CFP_DATA_PORT(3);
-
-	reg = core_readl(priv, offset);
-	/* src port [15:8] */
-	src_dst_port = reg << 8;
-
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(2);
-	else
-		offset = CORE_CFP_DATA_PORT(2);
-
-	reg = core_readl(priv, offset);
-	/* src port [7:0] */
-	src_dst_port |= (reg >> 24);
-
-	v4_spec->pdst = cpu_to_be16(src_dst_port);
-	v4_spec->psrc = cpu_to_be16((u16)(reg >> 8));
-
-	/* IPv4 dst [15:8] */
-	ipv4 = (reg & 0xff) << 8;
-
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(1);
-	else
-		offset = CORE_CFP_DATA_PORT(1);
-
-	reg = core_readl(priv, offset);
-	/* IPv4 dst [31:16] */
-	ipv4 |= ((reg >> 8) & 0xffff) << 16;
-	/* IPv4 dst [7:0] */
-	ipv4 |= (reg >> 24) & 0xff;
-	v4_spec->ip4dst = cpu_to_be32(ipv4);
-
-	/* IPv4 src [15:8] */
-	ipv4 = (reg & 0xff) << 8;
-
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(0);
-	else
-		offset = CORE_CFP_DATA_PORT(0);
-	reg = core_readl(priv, offset);
-
-	/* Once the TCAM is programmed, the mask reflects the slice number
-	 * being matched, don't bother checking it when reading back the
-	 * mask spec
-	 */
-	if (!mask && !(reg & SLICE_VALID))
-		return -EINVAL;
-
-	/* IPv4 src [7:0] */
-	ipv4 |= (reg >> 24) & 0xff;
-	/* IPv4 src [31:16] */
-	ipv4 |= ((reg >> 8) & 0xffff) << 16;
-	v4_spec->ip4src = cpu_to_be32(ipv4);
-
-	return 0;
-}
-
-static int bcm_sf2_cfp_ipv4_rule_get(struct bcm_sf2_priv *priv, int port,
-				     struct ethtool_rx_flow_spec *fs)
-{
-	struct ethtool_tcpip4_spec *v4_spec = NULL, *v4_m_spec = NULL;
-	u32 reg;
-	int ret;
-
-	reg = core_readl(priv, CORE_CFP_DATA_PORT(6));
-
-	switch ((reg & IPPROTO_MASK) >> IPPROTO_SHIFT) {
-	case IPPROTO_TCP:
-		fs->flow_type = TCP_V4_FLOW;
-		v4_spec = &fs->h_u.tcp_ip4_spec;
-		v4_m_spec = &fs->m_u.tcp_ip4_spec;
-		break;
-	case IPPROTO_UDP:
-		fs->flow_type = UDP_V4_FLOW;
-		v4_spec = &fs->h_u.udp_ip4_spec;
-		v4_m_spec = &fs->m_u.udp_ip4_spec;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	fs->m_ext.data[0] = cpu_to_be32((reg >> IP_FRAG_SHIFT) & 1);
-	v4_spec->tos = (reg >> IPTOS_SHIFT) & IPTOS_MASK;
-
-	ret = bcm_sf2_cfp_unslice_ipv4(priv, v4_spec, false);
-	if (ret)
-		return ret;
-
-	return bcm_sf2_cfp_unslice_ipv4(priv, v4_m_spec, true);
-}
-
-static int __maybe_unused bcm_sf2_cfp_unslice_ipv6(struct bcm_sf2_priv *priv,
-						   __be32 *ip6_addr,
-						   __be16 *port,
-						   bool mask)
-{
-	u32 reg, tmp, offset;
-
-	/* C-Tag		[31:24]
-	 * UDF_n_B8		[23:8] (port)
-	 * UDF_n_B7 (upper)	[7:0] (addr[15:8])
-	 */
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(4);
-	else
-		offset = CORE_CFP_DATA_PORT(4);
-	reg = core_readl(priv, offset);
-	*port = cpu_to_be32(reg) >> 8;
-	tmp = (u32)(reg & 0xff) << 8;
-
-	/* UDF_n_B7 (lower)	[31:24] (addr[7:0])
-	 * UDF_n_B6		[23:8] (addr[31:16])
-	 * UDF_n_B5 (upper)	[7:0] (addr[47:40])
-	 */
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(3);
-	else
-		offset = CORE_CFP_DATA_PORT(3);
-	reg = core_readl(priv, offset);
-	tmp |= (reg >> 24) & 0xff;
-	tmp |= (u32)((reg >> 8) << 16);
-	ip6_addr[3] = cpu_to_be32(tmp);
-	tmp = (u32)(reg & 0xff) << 8;
-
-	/* UDF_n_B5 (lower)	[31:24] (addr[39:32])
-	 * UDF_n_B4		[23:8] (addr[63:48])
-	 * UDF_n_B3 (upper)	[7:0] (addr[79:72])
-	 */
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(2);
-	else
-		offset = CORE_CFP_DATA_PORT(2);
-	reg = core_readl(priv, offset);
-	tmp |= (reg >> 24) & 0xff;
-	tmp |= (u32)((reg >> 8) << 16);
-	ip6_addr[2] = cpu_to_be32(tmp);
-	tmp = (u32)(reg & 0xff) << 8;
-
-	/* UDF_n_B3 (lower)	[31:24] (addr[71:64])
-	 * UDF_n_B2		[23:8] (addr[95:80])
-	 * UDF_n_B1 (upper)	[7:0] (addr[111:104])
-	 */
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(1);
-	else
-		offset = CORE_CFP_DATA_PORT(1);
-	reg = core_readl(priv, offset);
-	tmp |= (reg >> 24) & 0xff;
-	tmp |= (u32)((reg >> 8) << 16);
-	ip6_addr[1] = cpu_to_be32(tmp);
-	tmp = (u32)(reg & 0xff) << 8;
-
-	/* UDF_n_B1 (lower)	[31:24] (addr[103:96])
-	 * UDF_n_B0		[23:8] (addr[127:112])
-	 * Reserved		[7:4]
-	 * Slice ID		[3:2]
-	 * Slice valid		[1:0]
-	 */
-	if (mask)
-		offset = CORE_CFP_MASK_PORT(0);
-	else
-		offset = CORE_CFP_DATA_PORT(0);
-	reg = core_readl(priv, offset);
-	tmp |= (reg >> 24) & 0xff;
-	tmp |= (u32)((reg >> 8) << 16);
-	ip6_addr[0] = cpu_to_be32(tmp);
-
-	if (!mask && !(reg & SLICE_VALID))
-		return -EINVAL;
-
-	return 0;
-}
-
-static int __maybe_unused bcm_sf2_cfp_ipv6_rule_get(struct bcm_sf2_priv *priv,
-						    int port,
-						    struct ethtool_rx_flow_spec *fs,
-						    u32 next_loc)
-{
-	struct ethtool_tcpip6_spec *v6_spec = NULL, *v6_m_spec = NULL;
-	u32 reg;
-	int ret;
-
-	/* UDPv6 and TCPv6 both use ethtool_tcpip6_spec so we are fine
-	 * assuming tcp_ip6_spec here being an union.
-	 */
-	v6_spec = &fs->h_u.tcp_ip6_spec;
-	v6_m_spec = &fs->m_u.tcp_ip6_spec;
-
-	/* Read the second half first */
-	ret = bcm_sf2_cfp_unslice_ipv6(priv, v6_spec->ip6dst, &v6_spec->pdst,
-				       false);
-	if (ret)
-		return ret;
-
-	ret = bcm_sf2_cfp_unslice_ipv6(priv, v6_m_spec->ip6dst,
-				       &v6_m_spec->pdst, true);
-	if (ret)
-		return ret;
-
-	/* Read last to avoid next entry clobbering the results during search
-	 * operations. We would not have the port enabled for this rule, so
-	 * don't bother checking it.
-	 */
-	(void)core_readl(priv, CORE_CFP_DATA_PORT(7));
-
-	/* The slice number is valid, so read the rule we are chained from now
-	 * which is our first half.
-	 */
-	bcm_sf2_cfp_rule_addr_set(priv, next_loc);
-	ret = bcm_sf2_cfp_op(priv, OP_SEL_READ | TCAM_SEL);
-	if (ret)
-		return ret;
-
-	reg = core_readl(priv, CORE_CFP_DATA_PORT(6));
-
-	switch ((reg & IPPROTO_MASK) >> IPPROTO_SHIFT) {
-	case IPPROTO_TCP:
-		fs->flow_type = TCP_V6_FLOW;
-		break;
-	case IPPROTO_UDP:
-		fs->flow_type = UDP_V6_FLOW;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	ret = bcm_sf2_cfp_unslice_ipv6(priv, v6_spec->ip6src, &v6_spec->psrc,
-				       false);
-	if (ret)
-		return ret;
-
-	return bcm_sf2_cfp_unslice_ipv6(priv, v6_m_spec->ip6src,
-					&v6_m_spec->psrc, true);
-}
-
-static int __maybe_unused bcm_sf2_cfp_rule_get_hw(struct bcm_sf2_priv *priv,
-						  int port,
-						  struct ethtool_rxnfc *nfc)
-{
-	u32 reg, ipv4_or_chain_id;
-	unsigned int queue_num;
-	int ret;
-
-	bcm_sf2_cfp_rule_addr_set(priv, nfc->fs.location);
-
-	ret = bcm_sf2_cfp_op(priv, OP_SEL_READ | ACT_POL_RAM);
-	if (ret)
-		return ret;
-
-	reg = core_readl(priv, CORE_ACT_POL_DATA0);
-
-	ret = bcm_sf2_cfp_op(priv, OP_SEL_READ | TCAM_SEL);
-	if (ret)
-		return ret;
-
-	/* Extract the destination port */
-	nfc->fs.ring_cookie = fls((reg >> DST_MAP_IB_SHIFT) &
-				  DST_MAP_IB_MASK) - 1;
-
-	/* There is no Port 6, so we compensate for that here */
-	if (nfc->fs.ring_cookie >= 6)
-		nfc->fs.ring_cookie++;
-	nfc->fs.ring_cookie *= SF2_NUM_EGRESS_QUEUES;
-
-	/* Extract the destination queue */
-	queue_num = (reg >> NEW_TC_SHIFT) & NEW_TC_MASK;
-	nfc->fs.ring_cookie += queue_num;
-
-	/* Extract the L3_FRAMING or CHAIN_ID */
-	reg = core_readl(priv, CORE_CFP_DATA_PORT(6));
-
-	/* With IPv6 rules this would contain a non-zero chain ID since
-	 * we reserve entry 0 and it cannot be used. So if we read 0 here
-	 * this means an IPv4 rule.
-	 */
-	ipv4_or_chain_id = (reg >> L3_FRAMING_SHIFT) & 0xff;
-	if (ipv4_or_chain_id == 0)
-		ret = bcm_sf2_cfp_ipv4_rule_get(priv, port, &nfc->fs);
-	else
-		ret = bcm_sf2_cfp_ipv6_rule_get(priv, port, &nfc->fs,
-						ipv4_or_chain_id);
-	if (ret)
-		return ret;
-
-	/* Read last to avoid next entry clobbering the results during search
-	 * operations
-	 */
-	reg = core_readl(priv, CORE_CFP_DATA_PORT(7));
-	if (!(reg & 1 << port))
-		return -EINVAL;
-
-	bcm_sf2_invert_masks(&nfc->fs);
-
-	/* Put the TCAM size here */
-	nfc->data = bcm_sf2_cfp_rule_size(priv);
-
-	return 0;
-}
-
 static int bcm_sf2_cfp_rule_get(struct bcm_sf2_priv *priv, int port,
 				struct ethtool_rxnfc *nfc)
 {
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 5/5] net: systemport: Restore Broadcom tag match filters upon resume
From: Florian Fainelli @ 2018-11-06 20:58 UTC (permalink / raw)
  To: netdev; +Cc: andrew, vivien.didelot, davem, pablo, Florian Fainelli
In-Reply-To: <20181106205841.14308-1-f.fainelli@gmail.com>

Some of the system suspend states that we support wipe out entirely the
HW contents. If we had a Wake-on-LAN filter programmed prior to going
into suspend, but we did not actually wake-up from Wake-on-LAN and
instead used a deeper suspend state, make sure we restore the CID number
that we need to match against.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/ethernet/broadcom/bcmsysport.c | 12 ++++++++++++
 drivers/net/ethernet/broadcom/bcmsysport.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
index 0e2d99c737e3..2e60dda32adc 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -1068,6 +1068,7 @@ static void mpd_enable_set(struct bcm_sysport_priv *priv, bool enable)
 
 static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
 {
+	unsigned int index;
 	u32 reg;
 
 	/* Disable RXCHK, active filters and Broadcom tag matching */
@@ -1076,6 +1077,15 @@ static void bcm_sysport_resume_from_wol(struct bcm_sysport_priv *priv)
 		 RXCHK_BRCM_TAG_MATCH_SHIFT | RXCHK_EN | RXCHK_BRCM_TAG_EN);
 	rxchk_writel(priv, reg, RXCHK_CONTROL);
 
+	/* Make sure we restore correct CID index in case HW lost
+	 * its context during deep idle state
+	 */
+	for_each_set_bit(index, priv->filters, RXCHK_BRCM_TAG_MAX) {
+		rxchk_writel(priv, priv->filters_loc[index] <<
+			     RXCHK_BRCM_TAG_CID_SHIFT, RXCHK_BRCM_TAG(index));
+		rxchk_writel(priv, 0xff00ffff, RXCHK_BRCM_TAG_MASK(index));
+	}
+
 	/* Clear the MagicPacket detection logic */
 	mpd_enable_set(priv, false);
 
@@ -2189,6 +2199,7 @@ static int bcm_sysport_rule_set(struct bcm_sysport_priv *priv,
 	rxchk_writel(priv, reg, RXCHK_BRCM_TAG(index));
 	rxchk_writel(priv, 0xff00ffff, RXCHK_BRCM_TAG_MASK(index));
 
+	priv->filters_loc[index] = nfc->fs.location;
 	set_bit(index, priv->filters);
 
 	return 0;
@@ -2208,6 +2219,7 @@ static int bcm_sysport_rule_del(struct bcm_sysport_priv *priv,
 	 * be taken care of during suspend time by bcm_sysport_suspend_to_wol
 	 */
 	clear_bit(index, priv->filters);
+	priv->filters_loc[index] = 0;
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h
index a7a230884a87..7a0b7bfedd19 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.h
+++ b/drivers/net/ethernet/broadcom/bcmsysport.h
@@ -786,6 +786,7 @@ struct bcm_sysport_priv {
 	/* Ethtool */
 	u32			msg_enable;
 	DECLARE_BITMAP(filters, RXCHK_BRCM_TAG_MAX);
+	u32			filters_loc[RXCHK_BRCM_TAG_MAX];
 
 	struct bcm_sysport_stats64	stats64;
 
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH rdma] net/mlx5: Fix XRC SRQ umem valid bits
From: Doug Ledford @ 2018-11-06 21:31 UTC (permalink / raw)
  To: Leon Romanovsky, Jason Gunthorpe
  Cc: Yishai Hadas, RDMA mailing list, Artemy Kovalyov, Saeed Mahameed,
	linux-netdev, Leon Romanovsky
In-Reply-To: <20181031102028.9142-1-leon@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 1175 bytes --]

On Wed, 2018-10-31 at 12:20 +0200, Leon Romanovsky wrote:
> From: Yishai Hadas <yishaih@mellanox.com>
> 
> Adapt XRC SRQ to the latest HW specification with fixed definition
> around umem valid bits. The previous definition relied on a bit which
> was taken for other purposes in legacy FW.
> 
> Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits")
> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> Reviewed-by: Artemy Kovalyov <artemyko@mellanox.com>
> Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
> ---
> Hi Doug, Jason
> 
> This commit fixes code sent in this merge window, so I'm not marking it
> with any rdma-rc/rdma-next. It will be better to be sent during this merge
> window if you have extra pull request to issue, or as a -rc material, if
> not.
> 
> BTW, we didn't combine reserved fields, because our convention is to align such
> fields to 32 bits for better readability.
> 
> Thanks

This looks fine.  Let me know when it's in the mlx5-next tree to pull.

-- 
Doug Ledford <dledford@redhat.com>
    GPG KeyID: B826A3330E572FDD
    Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH] e1000e: Change watchdog task to be delayed work
From: Neftin, Sasha @ 2018-11-07  7:01 UTC (permalink / raw)
  To: Robert Eshleman; +Cc: netdev, linux-kernel, intel-wired-lan, David S. Miller
In-Reply-To: <1541290645-25033-1-git-send-email-bobbyeshleman@gmail.com>

On 11/4/2018 02:17, Robert Eshleman wrote:
> This completes a pending TODO to use queue_delayed_work() instead of
> schedule_work().
> 
> Signed-off-by: Robert Eshleman <bobbyeshleman@gmail.com>
> ---
>   drivers/net/ethernet/intel/e1000e/netdev.c | 14 ++++++++++++--
>   1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
> index 16a73bd..a387b21 100644
> --- a/drivers/net/ethernet/intel/e1000e/netdev.c
> +++ b/drivers/net/ethernet/intel/e1000e/netdev.c
> @@ -39,6 +39,8 @@ static int debug = -1;
>   module_param(debug, int, 0);
>   MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
>   
> +struct workqueue_struct *e1000e_workqueue;
> +
>   static const struct e1000_info *e1000_info_tbl[] = {
>   	[board_82571]		= &e1000_82571_info,
>   	[board_82572]		= &e1000_82572_info,
> @@ -5137,9 +5139,9 @@ static void e1000_watchdog(struct timer_list *t)
>   	struct e1000_adapter *adapter = from_timer(adapter, t, watchdog_timer);
>   
>   	/* Do the rest outside of interrupt context */
> -	schedule_work(&adapter->watchdog_task);
> +	struct delayed_work *dwork = to_delayed_work(&adapter->watchdog_task);
>   
> -	/* TODO: make this use queue_delayed_work() */
> +	queue_delayed_work(e1000e_workqueue, dwork, 1);
>   }
>   
>   static void e1000_watchdog_task(struct work_struct *work)
> @@ -7572,6 +7574,13 @@ static int __init e1000_init_module(void)
>   		e1000e_driver_version);
>   	pr_info("Copyright(c) 1999 - 2015 Intel Corporation.\n");
>   
> +	e1000e_workqueue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0,
> +					   e1000e_driver_name);
> +	if (!e1000e_workqueue) {
> +		pr_err("%s: Failed to create workqueue\n", e1000e_driver_name);
> +		return -ENOMEM;
> +	}
> +
>   	return pci_register_driver(&e1000_driver);
>   }
>   module_init(e1000_init_module);
> @@ -7585,6 +7594,7 @@ module_init(e1000_init_module);
>   static void __exit e1000_exit_module(void)
>   {
>   	pci_unregister_driver(&e1000_driver);
> +	destroy_workqueue(e1000e_workqueue);
>   }
>   module_exit(e1000_exit_module);
>   
> 
Hello,
What is benefit of using 'delayed_work' mechanism instead of 
'schedue_work'? Some improvement in the driver behavior?
Thanks,
Sasha

^ permalink raw reply

* [PATCH net-next 00/11] ICMP error handling for UDP tunnels
From: Stefano Brivio @ 2018-11-06 21:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev

This series introduces ICMP error handling for UDP tunnels and
encapsulations and related selftests. We need to handle ICMP errors to
support PMTU discovery and route redirection -- this support is entirely
missing right now:

- patch 1/11 adds a socket lookup for UDP tunnels that use, by design,
  the same destination port on both endpoints -- i.e. VxLAN and GENEVE
- patches 2/11 to 7/11 are specific to VxLAN and GENEVE
- patches 8/11 and 9/11 add infrastructure for lookup of encapsulations
  where sent packets cannot be matched via receiving socket lookup, i.e.
  FoU and GUE
- patches 10/11 and 11/11 are specific to FoU and GUE

Stefano Brivio (11):
  udp: Handle ICMP errors for tunnels with same destination port on both
    endpoints
  vxlan: ICMP error lookup handler
  vxlan: Allow configuration of DF behaviour
  selftests: pmtu: Introduce tests for IPv4/IPv6 over VxLAN over IPv6
  geneve: ICMP error lookup handler
  geneve: Allow configuration of DF behaviour
  selftests: pmtu: Introduce tests for IPv4/IPv6 over GENEVE over IPv6
  net: Convert protocol error handlers from void to int
  udp: Support for error handlers of tunnels with arbitrary destination
    port
  fou, fou6: ICMP error handlers for FoU and GUE
  selftests: pmtu: Introduce FoU and GUE PMTU exceptions tests

 drivers/net/geneve.c                | 104 ++++++++-
 drivers/net/vxlan.c                 |  58 +++++
 include/linux/udp.h                 |   1 +
 include/net/icmp.h                  |   2 +-
 include/net/ip6_tunnel.h            |   2 +
 include/net/ip_tunnels.h            |   1 +
 include/net/protocol.h              |   9 +-
 include/net/sctp/sctp.h             |   2 +-
 include/net/tcp.h                   |   2 +-
 include/net/udp.h                   |   2 +-
 include/net/udp_tunnel.h            |   3 +
 include/net/vxlan.h                 |   1 +
 include/uapi/linux/if_link.h        |  18 ++
 net/dccp/ipv4.c                     |  13 +-
 net/dccp/ipv6.c                     |  13 +-
 net/ipv4/fou.c                      |  68 ++++++
 net/ipv4/gre_demux.c                |   9 +-
 net/ipv4/icmp.c                     |   6 +-
 net/ipv4/ip_gre.c                   |  48 ++--
 net/ipv4/ipip.c                     |  14 +-
 net/ipv4/protocol.c                 |   1 +
 net/ipv4/tcp_ipv4.c                 |  22 +-
 net/ipv4/tunnel4.c                  |  18 +-
 net/ipv4/udp.c                      | 119 ++++++++--
 net/ipv4/udp_impl.h                 |   2 +-
 net/ipv4/udp_tunnel.c               |   1 +
 net/ipv4/udplite.c                  |   4 +-
 net/ipv4/xfrm4_protocol.c           |  18 +-
 net/ipv6/fou6.c                     |  74 +++++++
 net/ipv6/icmp.c                     |   4 +-
 net/ipv6/ip6_gre.c                  |  18 +-
 net/ipv6/tcp_ipv6.c                 |  13 +-
 net/ipv6/tunnel6.c                  |  12 +-
 net/ipv6/udp.c                      | 141 ++++++++++--
 net/ipv6/udp_impl.h                 |   4 +-
 net/ipv6/udplite.c                  |   5 +-
 net/ipv6/xfrm6_protocol.c           |  18 +-
 net/sctp/input.c                    |   5 +-
 net/sctp/ipv6.c                     |   7 +-
 tools/testing/selftests/net/pmtu.sh | 326 ++++++++++++++++++++++++++--
 40 files changed, 1025 insertions(+), 163 deletions(-)

-- 
2.19.1

^ permalink raw reply

* [PATCH net-next 01/11] udp: Handle ICMP errors for tunnels with same destination port on both endpoints
From: Stefano Brivio @ 2018-11-06 21:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload,
this will work for tunnels that force the same destination port for both
endpoints, i.e. VxLAN and GENEVE.

For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.

Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/linux/udp.h      |  1 +
 include/net/udp_tunnel.h |  3 ++
 net/ipv4/udp.c           | 76 +++++++++++++++++++++++++++++++-----
 net/ipv4/udp_tunnel.c    |  1 +
 net/ipv6/udp.c           | 83 ++++++++++++++++++++++++++++++++++------
 5 files changed, 144 insertions(+), 20 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 320d49d85484..c8410837f044 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -71,6 +71,7 @@ struct udp_sock {
 	 * For encapsulation sockets.
 	 */
 	int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+	int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb);
 	void (*encap_destroy)(struct sock *sk);
 
 	/* GRO functions for UDP socket */
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index fe680ab6b15a..bf2f84984392 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -64,6 +64,8 @@ static inline int udp_sock_create(struct net *net,
 }
 
 typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
+typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk,
+					     struct sk_buff *skb);
 typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk);
 typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk,
 						    struct list_head *head,
@@ -76,6 +78,7 @@ struct udp_tunnel_sock_cfg {
 	/* Used for setting up udp_sock fields, see udp.h for details */
 	__u8  encap_type;
 	udp_tunnel_encap_rcv_t encap_rcv;
+	udp_tunnel_encap_err_lookup_t encap_err_lookup;
 	udp_tunnel_encap_destroy_t encap_destroy;
 	udp_tunnel_gro_receive_t gro_receive;
 	udp_tunnel_gro_complete_t gro_complete;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ca3ed931f2a9..1f054a85062d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -585,6 +585,59 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 	return true;
 }
 
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+void udp_encap_enable(void)
+{
+	static_branch_enable(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VxLAN, GENEVE). Then ask the
+ * tunnel implementation to match the error against a valid association.
+ *
+ * Return the socket if we have a match.
+ */
+static struct sock *__udp4_lib_err_encap(struct net *net,
+					 const struct iphdr *iph,
+					 struct udphdr *uh,
+					 struct udp_table *udptable,
+					 struct sk_buff *skb)
+{
+	int (*lookup)(struct sock *sk, struct sk_buff *skb);
+	int network_offset, transport_offset;
+	struct udp_sock *up;
+	struct sock *sk;
+
+	sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+			       iph->saddr, uh->dest, skb->dev->ifindex, 0,
+			       udptable, NULL);
+	if (!sk)
+		return NULL;
+
+	network_offset = skb_network_offset(skb);
+	transport_offset = skb_transport_offset(skb);
+
+	skb_reset_network_header(skb);
+
+	/* Network header needs to point to the outer IPv4 header inside ICMP */
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	/* Transport header needs to point to the UDP header */
+	skb_set_transport_header(skb, iph->ihl << 2);
+
+	up = udp_sk(sk);
+	lookup = READ_ONCE(up->encap_err_lookup);
+	if (!lookup || lookup(sk, skb))
+		sk = NULL;
+
+	skb_set_transport_header(skb, transport_offset);
+	skb_set_network_header(skb, network_offset);
+
+	return sk;
+}
+
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition.  If err < 0 then the socket should
@@ -603,6 +656,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
 	const int type = icmp_hdr(skb)->type;
 	const int code = icmp_hdr(skb)->code;
+	bool tunnel = false;
 	struct sock *sk;
 	int harderr;
 	int err;
@@ -612,8 +666,15 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 			       iph->saddr, uh->source, skb->dev->ifindex,
 			       inet_sdif(skb), udptable, NULL);
 	if (!sk) {
-		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return;	/* No socket for error */
+		/* No socket for error: try tunnels before discarding */
+		if (static_branch_unlikely(&udp_encap_needed_key))
+			sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb);
+
+		if (!sk) {
+			__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+			return;
+		}
+		tunnel = true;
 	}
 
 	err = 0;
@@ -656,6 +717,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
 	 *	4.1.3.3.
 	 */
+	if (tunnel) {
+		/* ...not for tunnels though: we don't have a sending socket */
+		goto out;
+	}
 	if (!inet->recverr) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 			goto out;
@@ -1889,13 +1954,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
-void udp_encap_enable(void)
-{
-	static_branch_enable(&udp_encap_needed_key);
-}
-EXPORT_SYMBOL(udp_encap_enable);
-
 /* returns:
  *  -1: error
  *   0: success
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..d0c412fc56ad 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -68,6 +68,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 
 	udp_sk(sk)->encap_type = cfg->encap_type;
 	udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+	udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
 	udp_sk(sk)->encap_destroy = cfg->encap_destroy;
 	udp_sk(sk)->gro_receive = cfg->gro_receive;
 	udp_sk(sk)->gro_complete = cfg->gro_complete;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d2d97d07ef27..eebd90111646 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -463,6 +463,55 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	goto try_again;
 }
 
+DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+void udpv6_encap_enable(void)
+{
+	static_branch_enable(&udpv6_encap_needed_key);
+}
+EXPORT_SYMBOL(udpv6_encap_enable);
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VxLAN, GENEVE). Then ask the
+ * tunnel implementation to match the error against a valid association.
+ *
+ * Return the socket if we have a match.
+ */
+static struct sock *__udp6_lib_err_encap(struct net *net,
+					 const struct ipv6hdr *hdr, int offset,
+					 struct udphdr *uh,
+					 struct udp_table *udptable,
+					 struct sk_buff *skb)
+{
+	int (*lookup)(struct sock *sk, struct sk_buff *skb);
+	int network_offset, transport_offset;
+	struct udp_sock *up;
+	struct sock *sk;
+
+	sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
+			       &hdr->saddr, uh->dest,
+			       inet6_iif(skb), 0, udptable, skb);
+	if (!sk)
+		return NULL;
+
+	network_offset = skb_network_offset(skb);
+	transport_offset = skb_transport_offset(skb);
+
+	/* Network header needs to point to the outer IPv6 header inside ICMP */
+	skb_reset_network_header(skb);
+	/* Transport header needs to point to the UDP header */
+	skb_set_transport_header(skb, offset);
+
+	up = udp_sk(sk);
+	lookup = READ_ONCE(up->encap_err_lookup);
+	if (!lookup || lookup(sk, skb))
+		sk = NULL;
+
+	skb_set_transport_header(skb, transport_offset);
+	skb_set_network_header(skb, network_offset);
+	return sk;
+}
+
 void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		    u8 type, u8 code, int offset, __be32 info,
 		    struct udp_table *udptable)
@@ -472,6 +521,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	const struct in6_addr *saddr = &hdr->saddr;
 	const struct in6_addr *daddr = &hdr->daddr;
 	struct udphdr *uh = (struct udphdr *)(skb->data+offset);
+	bool tunnel = false;
 	struct sock *sk;
 	int harderr;
 	int err;
@@ -480,9 +530,18 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
 			       inet6_iif(skb), inet6_sdif(skb), udptable, skb);
 	if (!sk) {
-		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
-				  ICMP6_MIB_INERRORS);
-		return;
+		/* No socket for error: try tunnels before discarding */
+		if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+			sk = __udp6_lib_err_encap(net, hdr, offset, uh,
+						  udptable, skb);
+		}
+
+		if (!sk) {
+			__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+					  ICMP6_MIB_INERRORS);
+			return;
+		}
+		tunnel = true;
 	}
 
 	harderr = icmpv6_err_convert(type, code, &err);
@@ -496,10 +555,19 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			harderr = 1;
 	}
 	if (type == NDISC_REDIRECT) {
-		ip6_sk_redirect(skb, sk);
+		if (tunnel) {
+			ip6_redirect(skb, sock_net(sk), inet6_iif(skb),
+				     sk->sk_mark, sk->sk_uid);
+		} else {
+			ip6_sk_redirect(skb, sk);
+		}
 		goto out;
 	}
 
+	/* Tunnels don't have an application socket: don't pass errors back */
+	if (tunnel)
+		goto out;
+
 	if (!np->recverr) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 			goto out;
@@ -548,13 +616,6 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
 	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
 
-DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
-void udpv6_encap_enable(void)
-{
-	static_branch_enable(&udpv6_encap_needed_key);
-}
-EXPORT_SYMBOL(udpv6_encap_enable);
-
 static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 02/11] vxlan: ICMP error lookup handler
From: Stefano Brivio @ 2018-11-06 21:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Export an encap_err_lookup() operation to match an ICMP error against a
valid VNI.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 drivers/net/vxlan.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 297cdeaef479..7706e392b2a7 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1552,6 +1552,34 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
+/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */
+static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
+{
+	struct vxlan_dev *vxlan;
+	struct vxlan_sock *vs;
+	struct vxlanhdr *hdr;
+	__be32 vni;
+
+	if (skb->len < VXLAN_HLEN)
+		return -EINVAL;
+
+	hdr = vxlan_hdr(skb);
+
+	if (!(hdr->vx_flags & VXLAN_HF_VNI))
+		return -EINVAL;
+
+	vs = rcu_dereference_sk_user_data(sk);
+	if (!vs)
+		return -ENOENT;
+
+	vni = vxlan_vni(hdr->vx_vni);
+	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
+	if (!vxlan)
+		return -ENOENT;
+
+	return 0;
+}
+
 static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
@@ -2948,6 +2976,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
 	tunnel_cfg.sk_user_data = vs;
 	tunnel_cfg.encap_type = 1;
 	tunnel_cfg.encap_rcv = vxlan_rcv;
+	tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
 	tunnel_cfg.encap_destroy = NULL;
 	tunnel_cfg.gro_receive = vxlan_gro_receive;
 	tunnel_cfg.gro_complete = vxlan_gro_complete;
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 03/11] vxlan: Allow configuration of DF behaviour
From: Stefano Brivio @ 2018-11-06 21:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Allow users to set the IPv4 DF bit in outgoing packets, or to inherit its
value from the IPv4 inner header. If the encapsulated protocol is IPv6 and
DF is configured to be inherited, always set it.

For IPv4, inheriting DF from the inner header was probably intended from
the very beginning judging by the comment to vxlan_xmit(), but it wasn't
actually implemented -- also because it would have done more harm than
good, without handling for ICMP Fragmentation Needed messages.

According to RFC 7348, "Path MTU discovery MAY be used". An expired RFC
draft, draft-saum-nvo3-pmtud-over-vxlan-05, whose purpose was to describe
PMTUD implementation, says that "is a MUST that Vxlan gateways [...]
SHOULD set the DF-bit [...]", whatever that means.

Given this background, the only sane option is probably to let the user
decide, and keep the current behaviour as default.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 drivers/net/vxlan.c          | 29 +++++++++++++++++++++++++++++
 include/net/vxlan.h          |  1 +
 include/uapi/linux/if_link.h |  9 +++++++++
 3 files changed, 39 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 7706e392b2a7..ccb19b833706 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2289,6 +2289,19 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			df = htons(IP_DF);
 		}
 
+		if (!df) {
+			if (vxlan->cfg.df == VXLAN_DF_SET) {
+				df = htons(IP_DF);
+			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
+				struct ethhdr *eth = eth_hdr(skb);
+
+				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
+				    (ntohs(eth->h_proto) == ETH_P_IP &&
+				     old_iph->frag_off & htons(IP_DF)))
+					df = htons(IP_DF);
+			}
+		}
+
 		ndst = &rt->dst;
 		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
 
@@ -2837,6 +2850,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
 	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
 	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
+	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -2893,6 +2907,16 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
 		}
 	}
 
+	if (data[IFLA_VXLAN_DF]) {
+		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);
+
+		if (df < 0 || df > VXLAN_DF_MAX) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_DF],
+					    "Invalid DF attribute");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -3538,6 +3562,9 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
 	}
 
+	if (data[IFLA_VXLAN_DF])
+		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);
+
 	return 0;
 }
 
@@ -3630,6 +3657,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
 		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
@@ -3696,6 +3724,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
 		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
+	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
 	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 			!!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 03431c148e16..ec999c49df1f 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -216,6 +216,7 @@ struct vxlan_config {
 	unsigned long		age_interval;
 	unsigned int		addrmax;
 	bool			no_share;
+	enum ifla_vxlan_df	df;
 };
 
 struct vxlan_dev_node {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 1debfa42cba1..efc588949431 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -533,6 +533,7 @@ enum {
 	IFLA_VXLAN_LABEL,
 	IFLA_VXLAN_GPE,
 	IFLA_VXLAN_TTL_INHERIT,
+	IFLA_VXLAN_DF,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
@@ -542,6 +543,14 @@ struct ifla_vxlan_port_range {
 	__be16	high;
 };
 
+enum ifla_vxlan_df {
+	VXLAN_DF_UNSET = 0,
+	VXLAN_DF_SET,
+	VXLAN_DF_INHERIT,
+	__VXLAN_DF_END,
+	VXLAN_DF_MAX = __VXLAN_DF_END - 1,
+};
+
 /* GENEVE section */
 enum {
 	IFLA_GENEVE_UNSPEC,
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 04/11] selftests: pmtu: Introduce tests for IPv4/IPv6 over VxLAN over IPv6
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Use a router between endpoints, implemented via namespaces, set a low MTU
between router and destination endpoint, exceed it and check PMTU value in
route exceptions.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
This only introduces tests over VxLAN over IPv6 right now. I'll introduce
tests over IPv4 (they can be added trivially) once DF configuration support
is accepted into iproute2.

 tools/testing/selftests/net/pmtu.sh | 115 +++++++++++++++++++++++-----
 1 file changed, 97 insertions(+), 18 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index a369d616b390..19ede74af560 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -26,6 +26,17 @@
 # - pmtu_ipv6
 #	Same as pmtu_ipv4, except for locked PMTU tests, using IPv6
 #
+# - pmtu_ipv4_vxlan6_exception
+#	Set up the same network topology as pmtu_ipv4, create a VxLAN tunnel
+#	over IPv6 between A and B, routed via R1. On the link between R1 and B,
+#	set a MTU lower than the VxLAN MTU and the MTU on the link between A and
+#	R1. Send IPv4 packets, exceeding the MTU between R1 and B, over VxLAN
+#	from A to B and check that the PMTU exception is created with the right
+#	value on A
+#
+# - pmtu_ipv6_vxlan6_exception
+#	Same as pmtu_ipv4_vxlan6_exception, but send IPv6 packets from A to B
+#
 # - pmtu_vti4_exception
 #	Set up vti tunnel on top of veth, with xfrm states and policies, in two
 #	namespaces with matching endpoints. Check that route exception is not
@@ -72,6 +83,8 @@ which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
 tests="
 	pmtu_ipv4_exception		ipv4: PMTU exceptions
 	pmtu_ipv6_exception		ipv6: PMTU exceptions
+	pmtu_ipv4_vxlan6_exception	IPv4 over vxlan6: PMTU exceptions
+	pmtu_ipv6_vxlan6_exception	IPv6 over vxlan6: PMTU exceptions
 	pmtu_vti6_exception		vti6: PMTU exceptions
 	pmtu_vti4_exception		vti4: PMTU exceptions
 	pmtu_vti4_default_mtu		vti4: default MTU assignment
@@ -95,8 +108,8 @@ ns_r2="ip netns exec ${NS_R2}"
 # Addresses are:
 # - IPv4: PREFIX4.SEGMENT.ID (/24)
 # - IPv6: PREFIX6:SEGMENT::ID (/64)
-prefix4="192.168"
-prefix6="fd00"
+prefix4="10.0"
+prefix6="fc00"
 a_r1=1
 a_r2=2
 b_r1=3
@@ -129,12 +142,12 @@ veth6_a_addr="fd00:1::a"
 veth6_b_addr="fd00:1::b"
 veth6_mask="64"
 
-vti4_a_addr="192.168.2.1"
-vti4_b_addr="192.168.2.2"
-vti4_mask="24"
-vti6_a_addr="fd00:2::a"
-vti6_b_addr="fd00:2::b"
-vti6_mask="64"
+tunnel4_a_addr="192.168.2.1"
+tunnel4_b_addr="192.168.2.2"
+tunnel4_mask="24"
+tunnel6_a_addr="fd00:2::a"
+tunnel6_b_addr="fd00:2::b"
+tunnel6_mask="64"
 
 dummy6_0_addr="fc00:1000::0"
 dummy6_1_addr="fc00:1001::0"
@@ -202,11 +215,34 @@ setup_vti() {
 }
 
 setup_vti4() {
-	setup_vti 4 ${veth4_a_addr} ${veth4_b_addr} ${vti4_a_addr} ${vti4_b_addr} ${vti4_mask}
+	setup_vti 4 ${veth4_a_addr} ${veth4_b_addr} ${tunnel4_a_addr} ${tunnel4_b_addr} ${tunnel4_mask}
 }
 
 setup_vti6() {
-	setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} ${vti6_a_addr} ${vti6_b_addr} ${vti6_mask}
+	setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} ${tunnel6_a_addr} ${tunnel6_b_addr} ${tunnel6_mask}
+}
+
+setup_vxlan() {
+	a_addr="${1}"
+	b_addr="${2}"
+
+	${ns_a} ip link add vxlan_a type vxlan id 1 local ${a_addr} remote ${b_addr} ttl 64 dstport 4789 || return 1
+	${ns_b} ip link add vxlan_b type vxlan id 1 local ${b_addr} remote ${a_addr} ttl 64 dstport 4789
+
+	${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask}   dev vxlan_a
+	${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask}   dev vxlan_b
+
+	${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask}   dev vxlan_a
+	${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask}   dev vxlan_b
+
+	${ns_a} ip link set vxlan_a up
+	${ns_b} ip link set vxlan_b up
+
+	sleep 1
+}
+
+setup_vxlan6() {
+	setup_vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
 }
 
 setup_xfrm() {
@@ -465,6 +501,49 @@ test_pmtu_ipv6_exception() {
 	test_pmtu_ipvX 6
 }
 
+test_pmtu_ipvX_over_vxlan6_exception() {
+	family=${1}
+	ll_mtu=4000
+
+	setup namespaces routing vxlan6 || return 2
+	#                      IPv6 header   UDP header   VxLAN header   Ethernet header
+	exp_mtu=$((${ll_mtu} - 40          - 8          - 8            - 14))
+
+	trace "${ns_a}" vxlan_a      "${ns_b}"  vxlan_b \
+	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
+	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B
+
+	if [ ${family} -eq 4 ]; then
+		ping=ping
+		dst=${tunnel4_b_addr}
+	else
+		ping=${ping6}
+		dst=${tunnel6_b_addr}
+	fi
+
+	# Create route exception by exceeding link layer MTU
+	mtu "${ns_a}"  veth_A-R1 $((${ll_mtu} + 1000))
+	mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
+	mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+	mtu "${ns_a}" vxlan_a $((${ll_mtu} + 1000))
+	mtu "${ns_b}" vxlan_b $((${ll_mtu} + 1000))
+	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null
+
+	# Check that exception was created
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on VxLAN interface"
+}
+
+test_pmtu_ipv4_vxlan6_exception() {
+	test_pmtu_ipvX_over_vxlan6_exception 4
+}
+
+test_pmtu_ipv6_vxlan6_exception() {
+	test_pmtu_ipvX_over_vxlan6_exception 6
+}
+
 test_pmtu_vti4_exception() {
 	setup namespaces veth vti4 xfrm4 || return 2
 	trace "${ns_a}" veth_a    "${ns_b}" veth_b \
@@ -484,14 +563,14 @@ test_pmtu_vti4_exception() {
 
 	# Send DF packet without exceeding link layer MTU, check that no
 	# exception is created
-	${ns_a} ping -q -M want -i 0.1 -w 2 -s ${ping_payload} ${vti4_b_addr} > /dev/null
-	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti4_b_addr})"
+	${ns_a} ping -q -M want -i 0.1 -w 2 -s ${ping_payload} ${tunnel4_b_addr} > /dev/null
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})"
 	check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})" || return 1
 
 	# Now exceed link layer MTU by one byte, check that exception is created
 	# with the right PMTU value
-	${ns_a} ping -q -M want -i 0.1 -w 2 -s $((ping_payload + 1)) ${vti4_b_addr} > /dev/null
-	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti4_b_addr})"
+	${ns_a} ping -q -M want -i 0.1 -w 2 -s $((ping_payload + 1)) ${tunnel4_b_addr} > /dev/null
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})"
 	check_pmtu_value "${esp_payload_rfc4106}" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106 + 1)))"
 }
 
@@ -506,20 +585,20 @@ test_pmtu_vti6_exception() {
 	mtu "${ns_b}" veth_b 4000
 	mtu "${ns_a}" vti6_a 5000
 	mtu "${ns_b}" vti6_b 5000
-	${ns_a} ${ping6} -q -i 0.1 -w 2 -s 60000 ${vti6_b_addr} > /dev/null
+	${ns_a} ${ping6} -q -i 0.1 -w 2 -s 60000 ${tunnel6_b_addr} > /dev/null
 
 	# Check that exception was created
-	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})"
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
 	check_pmtu_value any "${pmtu}" "creating tunnel exceeding link layer MTU" || return 1
 
 	# Decrease tunnel MTU, check for PMTU decrease in route exception
 	mtu "${ns_a}" vti6_a 3000
-	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})"
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
 	check_pmtu_value "3000" "${pmtu}" "decreasing tunnel MTU" || fail=1
 
 	# Increase tunnel MTU, check for PMTU increase in route exception
 	mtu "${ns_a}" vti6_a 9000
-	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})"
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
 	check_pmtu_value "9000" "${pmtu}" "increasing tunnel MTU" || fail=1
 
 	return ${fail}
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 05/11] geneve: ICMP error lookup handler
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Export an encap_err_lookup() operation to match an ICMP error against a
valid VNI.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 drivers/net/geneve.c | 52 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index a0cd1c41cf5f..8a69879d516a 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -387,6 +387,57 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
+/* Callback from net/ipv{4,6}/udp.c to check that we have a tunnel for errors */
+static int geneve_udp_encap_err_lookup(struct sock *sk, struct sk_buff *skb)
+{
+	struct genevehdr *geneveh;
+	struct geneve_sock *gs;
+	u8 zero_vni[3] = { 0 };
+	u8 *vni = zero_vni;
+
+	if (skb->len < GENEVE_BASE_HLEN)
+		return -EINVAL;
+
+	geneveh = geneve_hdr(skb);
+	if (geneveh->ver != GENEVE_VER)
+		return -EINVAL;
+
+	if (geneveh->proto_type != htons(ETH_P_TEB))
+		return -EINVAL;
+
+	gs = rcu_dereference_sk_user_data(sk);
+	if (!gs)
+		return -ENOENT;
+
+	if (geneve_get_sk_family(gs) == AF_INET) {
+		struct iphdr *iph = ip_hdr(skb);
+		__be32 addr4 = 0;
+
+		if (!gs->collect_md) {
+			vni = geneve_hdr(skb)->vni;
+			addr4 = iph->daddr;
+		}
+
+		return geneve_lookup(gs, addr4, vni) ? 0 : -ENOENT;
+	}
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (geneve_get_sk_family(gs) == AF_INET6) {
+		struct ipv6hdr *ip6h = ipv6_hdr(skb);
+		struct in6_addr addr6 = { 0 };
+
+		if (!gs->collect_md) {
+			vni = geneve_hdr(skb)->vni;
+			addr6 = ip6h->daddr;
+		}
+
+		return geneve6_lookup(gs, addr6, vni) ? 0 : -ENOENT;
+	}
+#endif
+
+	return -EPFNOSUPPORT;
+}
+
 static struct socket *geneve_create_sock(struct net *net, bool ipv6,
 					 __be16 port, bool ipv6_rx_csum)
 {
@@ -544,6 +595,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
 	tunnel_cfg.gro_receive = geneve_gro_receive;
 	tunnel_cfg.gro_complete = geneve_gro_complete;
 	tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
+	tunnel_cfg.encap_err_lookup = geneve_udp_encap_err_lookup;
 	tunnel_cfg.encap_destroy = NULL;
 	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
 	list_add(&gs->list, &gn->sock_list);
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 06/11] geneve: Allow configuration of DF behaviour
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

draft-ietf-nvo3-geneve-08 says:

   It is strongly RECOMMENDED that Path MTU Discovery ([RFC1191],
   [RFC1981]) be used by setting the DF bit in the IP header when Geneve
   packets are transmitted over IPv4 (this is the default with IPv6).

Now that ICMP error handling is working for GENEVE, we can comply with
this recommendation.

Make this configurable, though, to avoid breaking existing setups. By
default, DF won't be set. It can be set or inherited from inner IPv4
packets. If it's configured to be inherited and we are encapsulating IPv6,
it will be set.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 drivers/net/geneve.c         | 52 +++++++++++++++++++++++++++++++-----
 include/uapi/linux/if_link.h |  9 +++++++
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 8a69879d516a..cafdee06b5c8 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -70,6 +70,7 @@ struct geneve_dev {
 	bool		   collect_md;
 	bool		   use_udp6_rx_checksums;
 	bool		   ttl_inherit;
+	enum ifla_geneve_df df;
 };
 
 struct geneve_sock {
@@ -898,7 +899,24 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 			ttl = key->ttl;
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
 	}
+
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+	if (!df) {
+		if (geneve->df == GENEVE_DF_SET) {
+			df = htons(IP_DF);
+		} else if (geneve->df == GENEVE_DF_INHERIT) {
+			struct ethhdr *eth = eth_hdr(skb);
+
+			if (ntohs(eth->h_proto) == ETH_P_IPV6) {
+				df = htons(IP_DF);
+			} else if (ntohs(eth->h_proto) == ETH_P_IP) {
+				struct iphdr *iph = ip_hdr(skb);
+
+				if (iph->frag_off & htons(IP_DF))
+					df = htons(IP_DF);
+			}
+		}
+	}
 
 	err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
 	if (unlikely(err))
@@ -1145,6 +1163,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
 	[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
 	[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
 	[IFLA_GENEVE_TTL_INHERIT]	= { .type = NLA_U8 },
+	[IFLA_GENEVE_DF]		= { .type = NLA_U8 },
 };
 
 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1180,6 +1199,16 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
 		}
 	}
 
+	if (data[IFLA_GENEVE_DF]) {
+		enum ifla_geneve_df df = nla_get_u8(data[IFLA_GENEVE_DF]);
+
+		if (df < 0 || df > GENEVE_DF_MAX) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_GENEVE_DF],
+					    "Invalid DF attribute");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -1225,7 +1254,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 			    struct netlink_ext_ack *extack,
 			    const struct ip_tunnel_info *info,
 			    bool metadata, bool ipv6_rx_csum,
-			    bool ttl_inherit)
+			    bool ttl_inherit, enum ifla_geneve_df df)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *t, *geneve = netdev_priv(dev);
@@ -1275,6 +1304,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	geneve->collect_md = metadata;
 	geneve->use_udp6_rx_checksums = ipv6_rx_csum;
 	geneve->ttl_inherit = ttl_inherit;
+	geneve->df = df;
 
 	err = register_netdevice(dev);
 	if (err)
@@ -1294,7 +1324,7 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack,
 			  struct ip_tunnel_info *info, bool *metadata,
 			  bool *use_udp6_rx_checksums, bool *ttl_inherit,
-			  bool changelink)
+			  enum ifla_geneve_df *df, bool changelink)
 {
 	int attrtype;
 
@@ -1382,6 +1412,9 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 	if (data[IFLA_GENEVE_TOS])
 		info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
+	if (data[IFLA_GENEVE_DF])
+		*df = nla_get_u8(data[IFLA_GENEVE_DF]);
+
 	if (data[IFLA_GENEVE_LABEL]) {
 		info->key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
 				  IPV6_FLOWLABEL_MASK;
@@ -1500,6 +1533,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack)
 {
+	enum ifla_geneve_df df = GENEVE_DF_UNSET;
 	bool use_udp6_rx_checksums = false;
 	struct ip_tunnel_info info;
 	bool ttl_inherit = false;
@@ -1508,12 +1542,12 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 
 	init_tnl_info(&info, GENEVE_UDP_PORT);
 	err = geneve_nl2info(tb, data, extack, &info, &metadata,
-			     &use_udp6_rx_checksums, &ttl_inherit, false);
+			     &use_udp6_rx_checksums, &ttl_inherit, &df, false);
 	if (err)
 		return err;
 
 	err = geneve_configure(net, dev, extack, &info, metadata,
-			       use_udp6_rx_checksums, ttl_inherit);
+			       use_udp6_rx_checksums, ttl_inherit, df);
 	if (err)
 		return err;
 
@@ -1576,6 +1610,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct ip_tunnel_info info;
 	bool metadata;
 	bool use_udp6_rx_checksums;
+	enum ifla_geneve_df df;
 	bool ttl_inherit;
 	int err;
 
@@ -1591,7 +1626,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
 	use_udp6_rx_checksums = geneve->use_udp6_rx_checksums;
 	ttl_inherit = geneve->ttl_inherit;
 	err = geneve_nl2info(tb, data, extack, &info, &metadata,
-			     &use_udp6_rx_checksums, &ttl_inherit, true);
+			     &use_udp6_rx_checksums, &ttl_inherit, &df, true);
 	if (err)
 		return err;
 
@@ -1624,6 +1659,7 @@ static size_t geneve_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_GENEVE_DF */
 		nla_total_size(sizeof(__be32)) +  /* IFLA_GENEVE_LABEL */
 		nla_total_size(sizeof(__be16)) +  /* IFLA_GENEVE_PORT */
 		nla_total_size(0) +	 /* IFLA_GENEVE_COLLECT_METADATA */
@@ -1672,6 +1708,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label))
 		goto nla_put_failure;
 
+	if (nla_put_u8(skb, IFLA_GENEVE_DF, geneve->df))
+		goto nla_put_failure;
+
 	if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst))
 		goto nla_put_failure;
 
@@ -1723,7 +1762,8 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 		return dev;
 
 	init_tnl_info(&info, dst_port);
-	err = geneve_configure(net, dev, NULL, &info, true, true, false);
+	err = geneve_configure(net, dev, NULL, &info,
+			       true, true, false, GENEVE_DF_UNSET);
 	if (err) {
 		free_netdev(dev);
 		return ERR_PTR(err);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index efc588949431..f42c069d81db 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -566,10 +566,19 @@ enum {
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
 	IFLA_GENEVE_LABEL,
 	IFLA_GENEVE_TTL_INHERIT,
+	IFLA_GENEVE_DF,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
 
+enum ifla_geneve_df {
+	GENEVE_DF_UNSET = 0,
+	GENEVE_DF_SET,
+	GENEVE_DF_INHERIT,
+	__GENEVE_DF_END,
+	GENEVE_DF_MAX = __GENEVE_DF_END - 1,
+};
+
 /* PPP section */
 enum {
 	IFLA_PPP_UNSPEC,
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 07/11] selftests: pmtu: Introduce tests for IPv4/IPv6 over GENEVE over IPv6
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

Use a router between endpoints, implemented via namespaces, set a low MTU
between router and destination endpoint, exceed it and check PMTU value in
route exceptions.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
This only introduces tests over GENEVE over IPv6 right now. I'll introduce
tests over IPv4 (they can be added trivially) once DF configuration support
is accepted into iproute2.

 tools/testing/selftests/net/pmtu.sh | 78 ++++++++++++++++++++---------
 1 file changed, 55 insertions(+), 23 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 19ede74af560..e9bb0c37bdfc 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -37,6 +37,12 @@
 # - pmtu_ipv6_vxlan6_exception
 #	Same as pmtu_ipv4_vxlan6_exception, but send IPv6 packets from A to B
 #
+# - pmtu_ipv4_geneve6_exception
+#	Same as pmtu_ipv4_vxlan6, but using a GENEVE tunnel instead of VxLAN
+#
+# - pmtu_ipv6_geneve6_exception
+#	Same as pmtu_ipv6_vxlan6, but using a GENEVE tunnel instead of VxLAN
+#
 # - pmtu_vti4_exception
 #	Set up vti tunnel on top of veth, with xfrm states and policies, in two
 #	namespaces with matching endpoints. Check that route exception is not
@@ -85,6 +91,8 @@ tests="
 	pmtu_ipv6_exception		ipv6: PMTU exceptions
 	pmtu_ipv4_vxlan6_exception	IPv4 over vxlan6: PMTU exceptions
 	pmtu_ipv6_vxlan6_exception	IPv6 over vxlan6: PMTU exceptions
+	pmtu_ipv4_geneve6_exception	IPv4 over geneve6: PMTU exceptions
+	pmtu_ipv6_geneve6_exception	IPv6 over geneve6: PMTU exceptions
 	pmtu_vti6_exception		vti6: PMTU exceptions
 	pmtu_vti4_exception		vti4: PMTU exceptions
 	pmtu_vti4_default_mtu		vti4: default MTU assignment
@@ -222,27 +230,42 @@ setup_vti6() {
 	setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} ${tunnel6_a_addr} ${tunnel6_b_addr} ${tunnel6_mask}
 }
 
-setup_vxlan() {
-	a_addr="${1}"
-	b_addr="${2}"
+setup_vxlan_or_geneve() {
+	type="${1}"
+	a_addr="${2}"
+	b_addr="${3}"
+
+	if [ "${type}" = "vxlan" ]; then
+		opts="ttl 64 dstport 4789"
+		opts_a="local ${a_addr}"
+		opts_b="local ${b_addr}"
+	else
+		opts=""
+		opts_a=""
+		opts_b=""
+	fi
 
-	${ns_a} ip link add vxlan_a type vxlan id 1 local ${a_addr} remote ${b_addr} ttl 64 dstport 4789 || return 1
-	${ns_b} ip link add vxlan_b type vxlan id 1 local ${b_addr} remote ${a_addr} ttl 64 dstport 4789
+	${ns_a} ip link add ${type}_a type ${type} id 1 ${opts_a} remote ${b_addr} ${opts} || return 1
+	${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts}
 
-	${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask}   dev vxlan_a
-	${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask}   dev vxlan_b
+	${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${type}_a
+	${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b
 
-	${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask}   dev vxlan_a
-	${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask}   dev vxlan_b
+	${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${type}_a
+	${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${type}_b
 
-	${ns_a} ip link set vxlan_a up
-	${ns_b} ip link set vxlan_b up
+	${ns_a} ip link set ${type}_a up
+	${ns_b} ip link set ${type}_b up
 
 	sleep 1
 }
 
+setup_geneve6() {
+	setup_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
+}
+
 setup_vxlan6() {
-	setup_vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
+	setup_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1
 }
 
 setup_xfrm() {
@@ -501,15 +524,16 @@ test_pmtu_ipv6_exception() {
 	test_pmtu_ipvX 6
 }
 
-test_pmtu_ipvX_over_vxlan6_exception() {
-	family=${1}
+test_pmtu_ipvX_over_vxlan6_or_geneve6_exception() {
+	type=${1}
+	family=${2}
 	ll_mtu=4000
 
-	setup namespaces routing vxlan6 || return 2
-	#                      IPv6 header   UDP header   VxLAN header   Ethernet header
-	exp_mtu=$((${ll_mtu} - 40          - 8          - 8            - 14))
+	setup namespaces routing ${type}6 || return 2
+	#                      IPv6 header   UDP header   VxLAN/GENEVE header   Ethernet header
+	exp_mtu=$((${ll_mtu} - 40          - 8          - 8                   - 14))
 
-	trace "${ns_a}" vxlan_a      "${ns_b}"  vxlan_b \
+	trace "${ns_a}" ${type}_a    "${ns_b}"  ${type}_b \
 	      "${ns_a}" veth_A-R1    "${ns_r1}" veth_R1-A \
 	      "${ns_b}" veth_B-R1    "${ns_r1}" veth_R1-B
 
@@ -527,21 +551,29 @@ test_pmtu_ipvX_over_vxlan6_exception() {
 	mtu "${ns_b}"  veth_B-R1 ${ll_mtu}
 	mtu "${ns_r1}" veth_R1-B ${ll_mtu}
 
-	mtu "${ns_a}" vxlan_a $((${ll_mtu} + 1000))
-	mtu "${ns_b}" vxlan_b $((${ll_mtu} + 1000))
+	mtu "${ns_a}" ${type}_a $((${ll_mtu} + 1000))
+	mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))
 	${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null
 
 	# Check that exception was created
 	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
-	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on VxLAN interface"
+	check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${type} interface"
 }
 
 test_pmtu_ipv4_vxlan6_exception() {
-	test_pmtu_ipvX_over_vxlan6_exception 4
+	test_pmtu_ipvX_over_vxlan6_or_geneve6_exception vxlan 4
 }
 
 test_pmtu_ipv6_vxlan6_exception() {
-	test_pmtu_ipvX_over_vxlan6_exception 6
+	test_pmtu_ipvX_over_vxlan6_or_geneve6_exception vxlan 6
+}
+
+test_pmtu_ipv4_geneve6_exception() {
+	test_pmtu_ipvX_over_vxlan6_or_geneve6_exception geneve 4
+}
+
+test_pmtu_ipv6_geneve6_exception() {
+	test_pmtu_ipvX_over_vxlan6_or_geneve6_exception geneve 6
 }
 
 test_pmtu_vti4_exception() {
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 08/11] net: Convert protocol error handlers from void to int
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

We'll need this to handle ICMP errors for tunnels without a sending socket
(i.e. FoU and GUE). There, we might have to look up different types of IP
tunnels, registered as network protocols, before we get a match, so we
want this for the error handlers of IPPROTO_IPIP and IPPROTO_IPV6 in both
inet_protos and inet6_protos. These error codes will be used in the next
patch.

For consistency, return sensible error codes in protocol error handlers
whenever errors don't match a protocol or any of its states.

This has no effect on existing error handling paths.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/net/icmp.h        |  2 +-
 include/net/protocol.h    |  9 ++++++--
 include/net/sctp/sctp.h   |  2 +-
 include/net/tcp.h         |  2 +-
 include/net/udp.h         |  2 +-
 net/dccp/ipv4.c           | 13 +++++++----
 net/dccp/ipv6.c           | 13 +++++++----
 net/ipv4/gre_demux.c      |  9 ++++++--
 net/ipv4/icmp.c           |  6 +++--
 net/ipv4/ip_gre.c         | 48 ++++++++++++++++++++-------------------
 net/ipv4/ipip.c           | 14 ++++++------
 net/ipv4/tcp_ipv4.c       | 22 ++++++++++--------
 net/ipv4/tunnel4.c        | 18 ++++++++++-----
 net/ipv4/udp.c            | 10 ++++----
 net/ipv4/udp_impl.h       |  2 +-
 net/ipv4/udplite.c        |  4 ++--
 net/ipv4/xfrm4_protocol.c | 18 ++++++++++-----
 net/ipv6/icmp.c           |  4 +++-
 net/ipv6/ip6_gre.c        | 18 ++++++++-------
 net/ipv6/tcp_ipv6.c       | 13 +++++++----
 net/ipv6/tunnel6.c        | 12 ++++++----
 net/ipv6/udp.c            | 18 +++++++--------
 net/ipv6/udp_impl.h       |  4 ++--
 net/ipv6/udplite.c        |  5 ++--
 net/ipv6/xfrm6_protocol.c | 18 ++++++++++-----
 net/sctp/input.c          |  5 ++--
 net/sctp/ipv6.c           |  7 ++++--
 27 files changed, 177 insertions(+), 121 deletions(-)

diff --git a/include/net/icmp.h b/include/net/icmp.h
index 3ef2743a8eec..6ac3a5bd0117 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -41,7 +41,7 @@ struct net;
 
 void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info);
 int icmp_rcv(struct sk_buff *skb);
-void icmp_err(struct sk_buff *skb, u32 info);
+int icmp_err(struct sk_buff *skb, u32 info);
 int icmp_init(void);
 void icmp_out_count(struct net *net, unsigned char type);
 
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 4fc75f7ae23b..92b3eaad6088 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -42,7 +42,10 @@ struct net_protocol {
 	int			(*early_demux)(struct sk_buff *skb);
 	int			(*early_demux_handler)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
-	void			(*err_handler)(struct sk_buff *skb, u32 info);
+
+	/* This returns an error if we weren't able to handle the error. */
+	int			(*err_handler)(struct sk_buff *skb, u32 info);
+
 	unsigned int		no_policy:1,
 				netns_ok:1,
 				/* does the protocol do more stringent
@@ -58,10 +61,12 @@ struct inet6_protocol {
 	void    (*early_demux_handler)(struct sk_buff *skb);
 	int	(*handler)(struct sk_buff *skb);
 
-	void	(*err_handler)(struct sk_buff *skb,
+	/* This returns an error if we weren't able to handle the error. */
+	int	(*err_handler)(struct sk_buff *skb,
 			       struct inet6_skb_parm *opt,
 			       u8 type, u8 code, int offset,
 			       __be32 info);
+
 	unsigned int	flags;	/* INET6_PROTO_xxx */
 };
 
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 8c2caa370e0f..9a3b48a35e90 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -151,7 +151,7 @@ int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc,
  * sctp/input.c
  */
 int sctp_rcv(struct sk_buff *skb);
-void sctp_v4_err(struct sk_buff *skb, u32 info);
+int sctp_v4_err(struct sk_buff *skb, u32 info);
 void sctp_hash_endpoint(struct sctp_endpoint *);
 void sctp_unhash_endpoint(struct sctp_endpoint *);
 struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index a18914d20486..4743836bed2e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -313,7 +313,7 @@ extern struct proto tcp_prot;
 
 void tcp_tasklet_init(void);
 
-void tcp_v4_err(struct sk_buff *skb, u32);
+int tcp_v4_err(struct sk_buff *skb, u32);
 
 void tcp_shutdown(struct sock *sk, int how);
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 9e82cb391dea..7e3f1e2b68eb 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -272,7 +272,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
 int udp_get_port(struct sock *sk, unsigned short snum,
 		 int (*saddr_cmp)(const struct sock *,
 				  const struct sock *));
-void udp_err(struct sk_buff *, u32);
+int udp_err(struct sk_buff *, u32);
 int udp_abort(struct sock *sk, int err);
 int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
 int udp_push_pending_frames(struct sock *sk);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 8e08cea6f178..26a21d97b6b0 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -231,7 +231,7 @@ EXPORT_SYMBOL(dccp_req_err);
  * check at all. A more general error queue to queue errors for later handling
  * is probably better.
  */
-static void dccp_v4_err(struct sk_buff *skb, u32 info)
+static int dccp_v4_err(struct sk_buff *skb, u32 info)
 {
 	const struct iphdr *iph = (struct iphdr *)skb->data;
 	const u8 offset = iph->ihl << 2;
@@ -259,16 +259,18 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
 				       inet_iif(skb), 0);
 	if (!sk) {
 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 
 	if (sk->sk_state == DCCP_TIME_WAIT) {
 		inet_twsk_put(inet_twsk(sk));
-		return;
+		return 0;
 	}
 	seq = dccp_hdr_seq(dh);
-	if (sk->sk_state == DCCP_NEW_SYN_RECV)
-		return dccp_req_err(sk, seq);
+	if (sk->sk_state == DCCP_NEW_SYN_RECV) {
+		dccp_req_err(sk, seq);
+		return 0;
+	}
 
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
@@ -357,6 +359,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info)
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	return 0;
 }
 
 static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6344f1b18a6a..d5740bad5b18 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -68,7 +68,7 @@ static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
 
 }
 
-static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			u8 type, u8 code, int offset, __be32 info)
 {
 	const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
@@ -96,16 +96,18 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!sk) {
 		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
 				  ICMP6_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 
 	if (sk->sk_state == DCCP_TIME_WAIT) {
 		inet_twsk_put(inet_twsk(sk));
-		return;
+		return 0;
 	}
 	seq = dccp_hdr_seq(dh);
-	if (sk->sk_state == DCCP_NEW_SYN_RECV)
-		return dccp_req_err(sk, seq);
+	if (sk->sk_state == DCCP_NEW_SYN_RECV) {
+		dccp_req_err(sk, seq);
+		return 0;
+	}
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk))
@@ -183,6 +185,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	return 0;
 }
 
 
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 7efe740c06eb..a4bf22ee3aed 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -151,20 +151,25 @@ static int gre_rcv(struct sk_buff *skb)
 	return NET_RX_DROP;
 }
 
-static void gre_err(struct sk_buff *skb, u32 info)
+static int gre_err(struct sk_buff *skb, u32 info)
 {
 	const struct gre_protocol *proto;
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
 	u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+	int err = 0;
 
 	if (ver >= GREPROTO_MAX)
-		return;
+		return -EINVAL;
 
 	rcu_read_lock();
 	proto = rcu_dereference(gre_proto[ver]);
 	if (proto && proto->err_handler)
 		proto->err_handler(skb, info);
+	else
+		err = -EPROTONOSUPPORT;
 	rcu_read_unlock();
+
+	return err;
 }
 
 static const struct net_protocol net_gre_protocol = {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index d832beed6e3a..065997f414e6 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1079,7 +1079,7 @@ int icmp_rcv(struct sk_buff *skb)
 	goto drop;
 }
 
-void icmp_err(struct sk_buff *skb, u32 info)
+int icmp_err(struct sk_buff *skb, u32 info)
 {
 	struct iphdr *iph = (struct iphdr *)skb->data;
 	int offset = iph->ihl<<2;
@@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info)
 	 */
 	if (icmph->type != ICMP_ECHOREPLY) {
 		ping_err(skb, offset, info);
-		return;
+		return 0;
 	}
 
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
 		ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
 	else if (type == ICMP_REDIRECT)
 		ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
+
+	return 0;
 }
 
 /*
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 38befe829caf..46eeb9663051 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly;
 static unsigned int gre_tap_net_id __read_mostly;
 static unsigned int erspan_net_id __read_mostly;
 
-static void ipgre_err(struct sk_buff *skb, u32 info,
-		      const struct tnl_ptk_info *tpi)
+static int ipgre_err(struct sk_buff *skb, u32 info,
+		     const struct tnl_ptk_info *tpi)
 {
 
 	/* All the routers (except for Linux) return only
@@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
 	unsigned int data_len = 0;
 	struct ip_tunnel *t;
 
+	if (tpi->proto == htons(ETH_P_TEB))
+		itn = net_generic(net, gre_tap_net_id);
+	else if (tpi->proto == htons(ETH_P_ERSPAN) ||
+		 tpi->proto == htons(ETH_P_ERSPAN2))
+		itn = net_generic(net, erspan_net_id);
+	else
+		itn = net_generic(net, ipgre_net_id);
+
+	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+			     iph->daddr, iph->saddr, tpi->key);
+
+	if (!t)
+		return -ENOENT;
+
 	switch (type) {
 	default:
 	case ICMP_PARAMETERPROB:
-		return;
+		return 0;
 
 	case ICMP_DEST_UNREACH:
 		switch (code) {
 		case ICMP_SR_FAILED:
 		case ICMP_PORT_UNREACH:
 			/* Impossible event. */
-			return;
+			return 0;
 		default:
 			/* All others are translated to HOST_UNREACH.
 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
 
 	case ICMP_TIME_EXCEEDED:
 		if (code != ICMP_EXC_TTL)
-			return;
+			return 0;
 		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
 		break;
 
@@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
 		break;
 	}
 
-	if (tpi->proto == htons(ETH_P_TEB))
-		itn = net_generic(net, gre_tap_net_id);
-	else if (tpi->proto == htons(ETH_P_ERSPAN) ||
-		 tpi->proto == htons(ETH_P_ERSPAN2))
-		itn = net_generic(net, erspan_net_id);
-	else
-		itn = net_generic(net, ipgre_net_id);
-
-	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
-	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
-			     iph->daddr, iph->saddr, tpi->key);
-
-	if (!t)
-		return;
-
 #if IS_ENABLED(CONFIG_IPV6)
        if (tpi->proto == htons(ETH_P_IPV6) &&
            !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
 				       type, data_len))
-               return;
+               return 0;
 #endif
 
 	if (t->parms.iph.daddr == 0 ||
 	    ipv4_is_multicast(t->parms.iph.daddr))
-		return;
+		return 0;
 
 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
-		return;
+		return 0;
 
 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
 		t->err_count++;
 	else
 		t->err_count = 1;
 	t->err_time = jiffies;
+
+	return 0;
 }
 
 static void gre_err(struct sk_buff *skb, u32 info)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index e65287c27e3d..57c5dd283a2c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 	struct ip_tunnel *t;
 	int err = 0;
 
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+			     iph->daddr, iph->saddr, 0);
+	if (!t) {
+		err = -ENOENT;
+		goto out;
+	}
+
 	switch (type) {
 	case ICMP_DEST_UNREACH:
 		switch (code) {
@@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 		goto out;
 	}
 
-	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
-			     iph->daddr, iph->saddr, 0);
-	if (!t) {
-		err = -ENOENT;
-		goto out;
-	}
-
 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 		ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
 		goto out;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index de47038afdf0..a336787d75e5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err);
  *
  */
 
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 {
 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
@@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 				       inet_iif(icmp_skb), 0);
 	if (!sk) {
 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 	if (sk->sk_state == TCP_TIME_WAIT) {
 		inet_twsk_put(inet_twsk(sk));
-		return;
+		return 0;
 	}
 	seq = ntohl(th->seq);
-	if (sk->sk_state == TCP_NEW_SYN_RECV)
-		return tcp_req_err(sk, seq,
-				  type == ICMP_PARAMETERPROB ||
-				  type == ICMP_TIME_EXCEEDED ||
-				  (type == ICMP_DEST_UNREACH &&
-				   (code == ICMP_NET_UNREACH ||
-				    code == ICMP_HOST_UNREACH)));
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+				     type == ICMP_TIME_EXCEEDED ||
+				     (type == ICMP_DEST_UNREACH &&
+				      (code == ICMP_NET_UNREACH ||
+				       code == ICMP_HOST_UNREACH)));
+		return 0;
+	}
 
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
@@ -613,6 +614,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	return 0;
 }
 
 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index c0630013c1ae..33bf8e9c8663 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -149,34 +149,40 @@ static int tunnelmpls4_rcv(struct sk_buff *skb)
 }
 #endif
 
-static void tunnel4_err(struct sk_buff *skb, u32 info)
+static int tunnel4_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler;
 
 	for_each_tunnel_rcu(tunnel4_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static void tunnel64_err(struct sk_buff *skb, u32 info)
+static int tunnel64_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler;
 
 	for_each_tunnel_rcu(tunnel64_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 #endif
 
 #if IS_ENABLED(CONFIG_MPLS)
-static void tunnelmpls4_err(struct sk_buff *skb, u32 info)
+static int tunnelmpls4_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler;
 
 	for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 #endif
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1f054a85062d..b89c4cfd7c62 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -649,7 +649,7 @@ static struct sock *__udp4_lib_err_encap(struct net *net,
  * to find the appropriate port.
  */
 
-void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 {
 	struct inet_sock *inet;
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
@@ -672,7 +672,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 
 		if (!sk) {
 			__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-			return;
+			return -ENOENT;
 		}
 		tunnel = true;
 	}
@@ -730,12 +730,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
-	return;
+	return 0;
 }
 
-void udp_err(struct sk_buff *skb, u32 info)
+int udp_err(struct sk_buff *skb, u32 info)
 {
-	__udp4_lib_err(skb, info, &udp_table);
+	return __udp4_lib_err(skb, info, &udp_table);
 }
 
 /*
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index e7d18b140287..322672655419 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -7,7 +7,7 @@
 #include <net/inet_common.h>
 
 int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
 
 int udp_v4_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 8545457752fb..39c7f17d916f 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb)
 	return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
 }
 
-static void udplite_err(struct sk_buff *skb, u32 info)
+static int udplite_err(struct sk_buff *skb, u32 info)
 {
-	__udp4_lib_err(skb, info, &udplite_table);
+	return __udp4_lib_err(skb, info, &udplite_table);
 }
 
 static const struct net_protocol udplite_protocol = {
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8dd0e6ab8606..35c54865dc42 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_esp_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm4_protocol *handler;
 
 	for_each_protocol_rcu(esp4_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static int xfrm4_ah_rcv(struct sk_buff *skb)
@@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ah_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm4_protocol *handler;
 
 	for_each_protocol_rcu(ah4_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
@@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm4_protocol *handler;
 
 	for_each_protocol_rcu(ipcomp4_handlers, handler)
 		if (!handler->err_handler(skb, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static const struct net_protocol esp4_protocol = {
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index c9c53ade55c3..5d7aa2c2770c 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -84,7 +84,7 @@ static inline struct sock *icmpv6_sk(struct net *net)
 	return net->ipv6.icmp_sk[smp_processor_id()];
 }
 
-static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		       u8 type, u8 code, int offset, __be32 info)
 {
 	/* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */
@@ -100,6 +100,8 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!(type & ICMPV6_INFOMSG_MASK))
 		if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
 			ping_err(skb, offset, ntohl(info));
+
+	return 0;
 }
 
 static int icmpv6_rcv(struct sk_buff *skb);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 515adbdba1d2..81b69bcee714 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -423,7 +423,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
 }
 
 
-static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		       u8 type, u8 code, int offset, __be32 info)
 {
 	struct net *net = dev_net(skb->dev);
@@ -433,13 +433,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 	if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
 			     offset) < 0)
-		return;
+		return -EINVAL;
 
 	ipv6h = (const struct ipv6hdr *)skb->data;
 	t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
 				 tpi.key, tpi.proto);
 	if (!t)
-		return;
+		return -ENOENT;
 
 	switch (type) {
 		struct ipv6_tlv_tnl_enc_lim *tel;
@@ -449,14 +449,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 				    t->parms.name);
 		if (code != ICMPV6_PORT_UNREACH)
 			break;
-		return;
+		return 0;
 	case ICMPV6_TIME_EXCEED:
 		if (code == ICMPV6_EXC_HOPLIMIT) {
 			net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
 					    t->parms.name);
 			break;
 		}
-		return;
+		return 0;
 	case ICMPV6_PARAMPROB:
 		teli = 0;
 		if (code == ICMPV6_HDR_FIELD)
@@ -472,14 +472,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
 					    t->parms.name);
 		}
-		return;
+		return 0;
 	case ICMPV6_PKT_TOOBIG:
 		ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
-		return;
+		return 0;
 	case NDISC_REDIRECT:
 		ip6_redirect(skb, net, skb->dev->ifindex, 0,
 			     sock_net_uid(net, NULL));
-		return;
+		return 0;
 	}
 
 	if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO))
@@ -487,6 +487,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	else
 		t->err_count = 1;
 	t->err_time = jiffies;
+
+	return 0;
 }
 
 static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 03e6b7a2bc53..a3f559162521 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -349,7 +349,7 @@ static void tcp_v6_mtu_reduced(struct sock *sk)
 	}
 }
 
-static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		u8 type, u8 code, int offset, __be32 info)
 {
 	const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
@@ -371,17 +371,19 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!sk) {
 		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
 				  ICMP6_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 
 	if (sk->sk_state == TCP_TIME_WAIT) {
 		inet_twsk_put(inet_twsk(sk));
-		return;
+		return 0;
 	}
 	seq = ntohl(th->seq);
 	fatal = icmpv6_err_convert(type, code, &err);
-	if (sk->sk_state == TCP_NEW_SYN_RECV)
-		return tcp_req_err(sk, seq, fatal);
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		tcp_req_err(sk, seq, fatal);
+		return 0;
+	}
 
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
@@ -467,6 +469,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	return 0;
 }
 
 
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index dae25cad05cd..1991dede7367 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -134,24 +134,28 @@ static int tunnel46_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_tunnel *handler;
 
 	for_each_tunnel_rcu(tunnel6_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
-static void tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			 u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_tunnel *handler;
 
 	for_each_tunnel_rcu(tunnel46_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static const struct inet6_protocol tunnel6_protocol = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index eebd90111646..34641cb5d358 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -512,9 +512,9 @@ static struct sock *__udp6_lib_err_encap(struct net *net,
 	return sk;
 }
 
-void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-		    u8 type, u8 code, int offset, __be32 info,
-		    struct udp_table *udptable)
+int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+		   u8 type, u8 code, int offset, __be32 info,
+		   struct udp_table *udptable)
 {
 	struct ipv6_pinfo *np;
 	const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
@@ -539,7 +539,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (!sk) {
 			__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
 					  ICMP6_MIB_INERRORS);
-			return;
+			return -ENOENT;
 		}
 		tunnel = true;
 	}
@@ -578,7 +578,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
-	return;
+	return 0;
 }
 
 static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -609,11 +609,11 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static __inline__ void udpv6_err(struct sk_buff *skb,
-				 struct inet6_skb_parm *opt, u8 type,
-				 u8 code, int offset, __be32 info)
+static __inline__ int udpv6_err(struct sk_buff *skb,
+				struct inet6_skb_parm *opt, u8 type,
+				u8 code, int offset, __be32 info)
 {
-	__udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
+	return __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
 
 static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 7903e21c178b..5730e6503cb4 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -9,8 +9,8 @@
 #include <net/transp_v6.h>
 
 int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
-		    __be32, struct udp_table *);
+int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
+		   __be32, struct udp_table *);
 
 int udp_v6_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 5000ad6878e6..a125aebc29e5 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -20,11 +20,12 @@ static int udplitev6_rcv(struct sk_buff *skb)
 	return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
 }
 
-static void udplitev6_err(struct sk_buff *skb,
+static int udplitev6_err(struct sk_buff *skb,
 			  struct inet6_skb_parm *opt,
 			  u8 type, u8 code, int offset, __be32 info)
 {
-	__udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
+	return __udp6_lib_err(skb, opt, type, code, offset, info,
+			      &udplite_table);
 }
 
 static const struct inet6_protocol udplitev6_protocol = {
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index b2dc8ce49378..cc979b702c89 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -80,14 +80,16 @@ static int xfrm6_esp_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			  u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_protocol *handler;
 
 	for_each_protocol_rcu(esp6_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static int xfrm6_ah_rcv(struct sk_buff *skb)
@@ -107,14 +109,16 @@ static int xfrm6_ah_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			 u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_protocol *handler;
 
 	for_each_protocol_rcu(ah6_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
@@ -134,14 +138,16 @@ static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
 	return 0;
 }
 
-static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			     u8 type, u8 code, int offset, __be32 info)
 {
 	struct xfrm6_protocol *handler;
 
 	for_each_protocol_rcu(ipcomp6_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
-			break;
+			return 0;
+
+	return -ENOENT;
 }
 
 static const struct inet6_protocol esp6_protocol = {
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 5c36a99882ed..7ab08a5b36dc 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -574,7 +574,7 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t)
  * is probably better.
  *
  */
-void sctp_v4_err(struct sk_buff *skb, __u32 info)
+int sctp_v4_err(struct sk_buff *skb, __u32 info)
 {
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
 	const int ihlen = iph->ihl * 4;
@@ -599,7 +599,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
 	skb->transport_header = savesctp;
 	if (!sk) {
 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 	/* Warning:  The sock lock is held.  Remember to call
 	 * sctp_err_finish!
@@ -653,6 +653,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
 
 out_unlock:
 	sctp_err_finish(sk, transport);
+	return 0;
 }
 
 /*
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index fc6c5e4bffa5..6e27c62646e9 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -138,7 +138,7 @@ static struct notifier_block sctp_inet6addr_notifier = {
 };
 
 /* ICMP error handler. */
-static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			u8 type, u8 code, int offset, __be32 info)
 {
 	struct inet6_dev *idev;
@@ -147,7 +147,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	struct sctp_transport *transport;
 	struct ipv6_pinfo *np;
 	__u16 saveip, savesctp;
-	int err;
+	int err, ret = 0;
 	struct net *net = dev_net(skb->dev);
 
 	idev = in6_dev_get(skb->dev);
@@ -163,6 +163,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	skb->transport_header = savesctp;
 	if (!sk) {
 		__ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS);
+		ret = -ENOENT;
 		goto out;
 	}
 
@@ -202,6 +203,8 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 out:
 	if (likely(idev != NULL))
 		in6_dev_put(idev);
+
+	return ret;
 }
 
 static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 09/11] udp: Support for error handlers of tunnels with arbitrary destination port
From: Stefano Brivio @ 2018-11-06 21:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: Sabrina Dubroca, Xin Long, netdev
In-Reply-To: <cover.1541533786.git.sbrivio@redhat.com>

ICMP error handling is currently not possible for UDP tunnels not
employing a receiving socket with local destination port matching the
remote one, because we have no way to look them up.

Add an err_handler tunnel encapsulation operation that can be exported by
tunnels in order to pass the error to the protocol implementing the
encapsulation. We can't easily use a lookup function as we did for VxLAN
and GENEVE, as protocol error handlers, which would be in turn called by
implementations of this new operation, handle the errors themselves,
together with the tunnel lookup.

Without a socket, we can't be sure which encapsulation error handler is
the appropriate one: encapsulation handlers (the ones for FoU and GUE
introduced in the next patch, e.g.) will need to check the new error codes
returned by protocol handlers to figure out if errors match the given
encapsulation, and, in turn, report this error back, so that we can try
all of them in __udp{4,6}_lib_err_encap_no_sk() until we have a match.

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/net/ip6_tunnel.h |  2 +
 include/net/ip_tunnels.h |  1 +
 net/ipv4/udp.c           | 75 +++++++++++++++++++++++++++----------
 net/ipv6/udp.c           | 80 ++++++++++++++++++++++++++++++----------
 4 files changed, 119 insertions(+), 39 deletions(-)

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 236e40ba06bf..7855966b4a19 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -69,6 +69,8 @@ struct ip6_tnl_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
 	int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
 			    u8 *protocol, struct flowi6 *fl6);
+	int (*err_handler)(struct sk_buff *, struct inet6_skb_parm *opt,
+			   u8 type, u8 code, int offset, __be32 info);
 };
 
 #ifdef CONFIG_INET
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index b0d022ff6ea1..5980659312e5 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -311,6 +311,7 @@ struct ip_tunnel_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
 	int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
 			    u8 *protocol, struct flowi4 *fl4);
+	int (*err_handler)(struct sk_buff *, u32);
 };
 
 #define MAX_IPTUN_ENCAP_OPS 8
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b89c4cfd7c62..83950b4faced 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -105,6 +105,7 @@
 #include <net/net_namespace.h>
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
+#include <net/ip_tunnels.h>
 #include <net/route.h>
 #include <net/checksum.h>
 #include <net/xfrm.h>
@@ -592,30 +593,48 @@ void udp_encap_enable(void)
 }
 EXPORT_SYMBOL(udp_encap_enable);
 
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
+{
+	int i;
+
+	for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+		int (*handler)(struct sk_buff *skb, u32 info);
+
+		if (!iptun_encaps[i])
+			continue;
+		handler = rcu_dereference(iptun_encaps[i]->err_handler);
+		if (handler && !handler(skb, info))
+			return 0;
+	}
+
+	return -ENOENT;
+}
+
 /* Try to match ICMP errors to UDP tunnels by looking up a socket without
  * reversing source and destination port: this will match tunnels that force the
- * same destination port on both endpoints (e.g. VxLAN, GENEVE). Then ask the
- * tunnel implementation to match the error against a valid association.
+ * same destination port on both endpoints (e.g. VxLAN, GENEVE). If this doesn't
+ * match any socket, probe tunnels with arbitrary destination ports (e.g. FoU,
+ * GUE): there, the receiving socket is useless, as the port we've sent packets
+ * to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
  *
- * Return the socket if we have a match.
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
  */
 static struct sock *__udp4_lib_err_encap(struct net *net,
 					 const struct iphdr *iph,
 					 struct udphdr *uh,
 					 struct udp_table *udptable,
-					 struct sk_buff *skb)
+					 struct sk_buff *skb, u32 info)
 {
-	int (*lookup)(struct sock *sk, struct sk_buff *skb);
 	int network_offset, transport_offset;
-	struct udp_sock *up;
 	struct sock *sk;
 
-	sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
-			       iph->saddr, uh->dest, skb->dev->ifindex, 0,
-			       udptable, NULL);
-	if (!sk)
-		return NULL;
-
 	network_offset = skb_network_offset(skb);
 	transport_offset = skb_transport_offset(skb);
 
@@ -627,10 +646,20 @@ static struct sock *__udp4_lib_err_encap(struct net *net,
 	/* Transport header needs to point to the UDP header */
 	skb_set_transport_header(skb, iph->ihl << 2);
 
-	up = udp_sk(sk);
-	lookup = READ_ONCE(up->encap_err_lookup);
-	if (!lookup || lookup(sk, skb))
-		sk = NULL;
+	sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+			       iph->saddr, uh->dest, skb->dev->ifindex, 0,
+			       udptable, NULL);
+	if (sk) {
+		int (*lookup)(struct sock *sk, struct sk_buff *skb);
+		struct udp_sock *up = udp_sk(sk);
+
+		lookup = READ_ONCE(up->encap_err_lookup);
+		if (!lookup || lookup(sk, skb))
+			sk = NULL;
+	}
+
+	if (!sk)
+		sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
 
 	skb_set_transport_header(skb, transport_offset);
 	skb_set_network_header(skb, network_offset);
@@ -667,13 +696,19 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 			       inet_sdif(skb), udptable, NULL);
 	if (!sk) {
 		/* No socket for error: try tunnels before discarding */
-		if (static_branch_unlikely(&udp_encap_needed_key))
-			sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb);
+		sk = ERR_PTR(-ENOENT);
+		if (static_branch_unlikely(&udp_encap_needed_key)) {
+			sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb,
+						  info);
+			if (!sk)
+				return 0;
+		}
 
-		if (!sk) {
+		if (IS_ERR(sk)) {
 			__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-			return -ENOENT;
+			return PTR_ERR(sk);
 		}
+
 		tunnel = true;
 	}
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 34641cb5d358..a3beeb6ee85a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -45,6 +45,7 @@
 #include <net/raw.h>
 #include <net/tcp_states.h>
 #include <net/ip6_checksum.h>
+#include <net/ip6_tunnel.h>
 #include <net/xfrm.h>
 #include <net/inet_hashtables.h>
 #include <net/inet6_hashtables.h>
@@ -470,30 +471,53 @@ void udpv6_encap_enable(void)
 }
 EXPORT_SYMBOL(udpv6_encap_enable);
 
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb,
+				      struct inet6_skb_parm *opt,
+				      u8 type, u8 code, int offset, u32 info)
+{
+	int i;
+
+	for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+		int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
+			       u8 type, u8 code, int offset, u32 info);
+
+		if (!ip6tun_encaps[i])
+			continue;
+		handler = rcu_dereference(ip6tun_encaps[i]->err_handler);
+		if (handler && !handler(skb, opt, type, code, offset, info))
+			return 0;
+	}
+
+	return -ENOENT;
+}
+
 /* Try to match ICMP errors to UDP tunnels by looking up a socket without
  * reversing source and destination port: this will match tunnels that force the
- * same destination port on both endpoints (e.g. VxLAN, GENEVE). Then ask the
- * tunnel implementation to match the error against a valid association.
+ * same destination port on both endpoints (e.g. VxLAN, GENEVE). If this doesn't
+ * match any socket, probe tunnels with arbitrary destination ports (e.g. FoU,
+ * GUE): there, the receiving socket is useless, as the port we've sent packets
+ * to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
  *
- * Return the socket if we have a match.
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
  */
 static struct sock *__udp6_lib_err_encap(struct net *net,
 					 const struct ipv6hdr *hdr, int offset,
 					 struct udphdr *uh,
 					 struct udp_table *udptable,
-					 struct sk_buff *skb)
+					 struct sk_buff *skb,
+					 struct inet6_skb_parm *opt,
+					 u8 type, u8 code, __be32 info)
 {
-	int (*lookup)(struct sock *sk, struct sk_buff *skb);
 	int network_offset, transport_offset;
-	struct udp_sock *up;
 	struct sock *sk;
 
-	sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
-			       &hdr->saddr, uh->dest,
-			       inet6_iif(skb), 0, udptable, skb);
-	if (!sk)
-		return NULL;
-
 	network_offset = skb_network_offset(skb);
 	transport_offset = skb_transport_offset(skb);
 
@@ -502,13 +526,26 @@ static struct sock *__udp6_lib_err_encap(struct net *net,
 	/* Transport header needs to point to the UDP header */
 	skb_set_transport_header(skb, offset);
 
-	up = udp_sk(sk);
-	lookup = READ_ONCE(up->encap_err_lookup);
-	if (!lookup || lookup(sk, skb))
-		sk = NULL;
+	sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
+			       &hdr->saddr, uh->dest,
+			       inet6_iif(skb), 0, udptable, skb);
+	if (sk) {
+		int (*lookup)(struct sock *sk, struct sk_buff *skb);
+		struct udp_sock *up = udp_sk(sk);
+
+		lookup = READ_ONCE(up->encap_err_lookup);
+		if (!lookup || lookup(sk, skb))
+			sk = NULL;
+	}
+
+	if (!sk) {
+		sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code,
+							offset, info));
+	}
 
 	skb_set_transport_header(skb, transport_offset);
 	skb_set_network_header(skb, network_offset);
+
 	return sk;
 }
 
@@ -531,16 +568,21 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			       inet6_iif(skb), inet6_sdif(skb), udptable, skb);
 	if (!sk) {
 		/* No socket for error: try tunnels before discarding */
+		sk = ERR_PTR(-ENOENT);
 		if (static_branch_unlikely(&udpv6_encap_needed_key)) {
 			sk = __udp6_lib_err_encap(net, hdr, offset, uh,
-						  udptable, skb);
+						  udptable, skb,
+						  opt, type, code, info);
+			if (!sk)
+				return 0;
 		}
 
-		if (!sk) {
+		if (IS_ERR(sk)) {
 			__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
 					  ICMP6_MIB_INERRORS);
-			return -ENOENT;
+			return PTR_ERR(sk);
 		}
+
 		tunnel = true;
 	}
 
-- 
2.19.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox