Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 5/6] net/ncsi: Reset channel state in ncsi_start_dev()
From: Samuel Mendoza-Jonas @ 2018-10-18  3:59 UTC (permalink / raw)
  To: netdev
  Cc: Samuel Mendoza-Jonas, David S . Miller, Justin.Lee1, linux-kernel,
	openbmc
In-Reply-To: <20181018035917.19413-1-sam@mendozajonas.com>

When the NCSI driver is stopped with ncsi_stop_dev() the channel
monitors are stopped and the state set to "inactive". However the
channels are still configured and active from the perspective of the
network controller. We should suspend each active channel but in the
context of ncsi_stop_dev() the transmit queue has been or is about to be
stopped so we won't have time to do so.

Instead when ncsi_start_dev() is called if the NCSI topology has already
been probed then call ncsi_reset_dev() to suspend any channels that were
previously active. This resets the network controller to a known state,
provides an up to date view of channel link state, and makes sure that
mode flags such as NCSI_MODE_TX_ENABLE are properly reset.

In addition to ncsi_start_dev() use ncsi_reset_dev() in ncsi-netlink.c
to update the channel configuration more cleanly.

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
---
 net/ncsi/internal.h     |  2 ++
 net/ncsi/ncsi-manage.c  | 59 +++++++++++++++++++++++++++++++++++++----
 net/ncsi/ncsi-netlink.c | 10 +++----
 3 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 592e3cd40728..ed68ca677e41 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -279,6 +279,7 @@ struct ncsi_dev_priv {
 #define NCSI_DEV_PROBED		1            /* Finalized NCSI topology    */
 #define NCSI_DEV_HWA		2            /* Enabled HW arbitration     */
 #define NCSI_DEV_RESHUFFLE	4
+#define NCSI_DEV_RESET		8            /* Reset state of NC          */
 	spinlock_t          lock;            /* Protect the NCSI device    */
 #if IS_ENABLED(CONFIG_IPV6)
 	unsigned int        inet6_addr_num;  /* Number of IPv6 addresses   */
@@ -333,6 +334,7 @@ extern spinlock_t ncsi_dev_lock;
 	list_for_each_entry_rcu(nc, &np->channels, node)
 
 /* Resources */
+int ncsi_reset_dev(struct ncsi_dev *nd);
 void ncsi_start_channel_monitor(struct ncsi_channel *nc);
 void ncsi_stop_channel_monitor(struct ncsi_channel *nc);
 struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 632caeea672f..af23e2cc6c1d 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -550,8 +550,10 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
 		spin_lock_irqsave(&nc->lock, flags);
 		nc->state = NCSI_CHANNEL_INACTIVE;
 		spin_unlock_irqrestore(&nc->lock, flags);
-		ncsi_process_next_channel(ndp);
-
+		if (ndp->flags & NCSI_DEV_RESET)
+			ncsi_reset_dev(nd);
+		else
+			ncsi_process_next_channel(ndp);
 		break;
 	default:
 		netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n",
@@ -1472,7 +1474,7 @@ int ncsi_start_dev(struct ncsi_dev *nd)
 		return 0;
 	}
 
-	return ncsi_choose_active_channel(nd);
+	return ncsi_reset_dev(nd);
 }
 EXPORT_SYMBOL_GPL(ncsi_start_dev);
 
@@ -1485,7 +1487,10 @@ void ncsi_stop_dev(struct ncsi_dev *nd)
 	int old_state;
 	unsigned long flags;
 
-	/* Stop the channel monitor and reset channel's state */
+	/* Stop the channel monitor on any active channels. Don't reset the
+	 * channel state so we know which were active when ncsi_start_dev()
+	 * is next called.
+	 */
 	NCSI_FOR_EACH_PACKAGE(ndp, np) {
 		NCSI_FOR_EACH_CHANNEL(np, nc) {
 			ncsi_stop_channel_monitor(nc);
@@ -1493,7 +1498,6 @@ void ncsi_stop_dev(struct ncsi_dev *nd)
 			spin_lock_irqsave(&nc->lock, flags);
 			chained = !list_empty(&nc->link);
 			old_state = nc->state;
-			nc->state = NCSI_CHANNEL_INACTIVE;
 			spin_unlock_irqrestore(&nc->lock, flags);
 
 			WARN_ON_ONCE(chained ||
@@ -1506,6 +1510,51 @@ void ncsi_stop_dev(struct ncsi_dev *nd)
 }
 EXPORT_SYMBOL_GPL(ncsi_stop_dev);
 
+int ncsi_reset_dev(struct ncsi_dev *nd)
+{
+	struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
+	struct ncsi_channel *nc, *active;
+	struct ncsi_package *np;
+	unsigned long flags;
+	bool enabled;
+	int state;
+
+	active = NULL;
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			spin_lock_irqsave(&nc->lock, flags);
+			enabled = nc->monitor.enabled;
+			state = nc->state;
+			spin_unlock_irqrestore(&nc->lock, flags);
+
+			if (enabled)
+				ncsi_stop_channel_monitor(nc);
+			if (state == NCSI_CHANNEL_ACTIVE) {
+				active = nc;
+				break;
+			}
+		}
+	}
+
+	if (!active) {
+		/* Done */
+		spin_lock_irqsave(&ndp->lock, flags);
+		ndp->flags &= ~NCSI_DEV_RESET;
+		spin_unlock_irqrestore(&ndp->lock, flags);
+		return ncsi_choose_active_channel(ndp);
+	}
+
+	spin_lock_irqsave(&ndp->lock, flags);
+	ndp->flags |= NCSI_DEV_RESET;
+	ndp->active_channel = active;
+	ndp->active_package = active->package;
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	nd->state = ncsi_dev_state_suspend;
+	schedule_work(&ndp->work);
+	return 0;
+}
+
 void ncsi_unregister_dev(struct ncsi_dev *nd)
 {
 	struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 33314381b4f5..06bb8bc2c798 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -330,9 +330,8 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
 		    package_id, channel_id,
 		    channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : "");
 
-	/* Bounce the NCSI channel to set changes */
-	ncsi_stop_dev(&ndp->ndev);
-	ncsi_start_dev(&ndp->ndev);
+	/* Update channel configuration */
+	ncsi_reset_dev(&ndp->ndev);
 
 	return 0;
 }
@@ -360,9 +359,8 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
 	spin_unlock_irqrestore(&ndp->lock, flags);
 	netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n");
 
-	/* Bounce the NCSI channel to set changes */
-	ncsi_stop_dev(&ndp->ndev);
-	ncsi_start_dev(&ndp->ndev);
+	/* Update channel configuration */
+	ncsi_reset_dev(&ndp->ndev);
 
 	return 0;
 }
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 4/6] net/ncsi: Don't mark configured channels inactive
From: Samuel Mendoza-Jonas @ 2018-10-18  3:59 UTC (permalink / raw)
  To: netdev
  Cc: Samuel Mendoza-Jonas, David S . Miller, Justin.Lee1, linux-kernel,
	openbmc
In-Reply-To: <20181018035917.19413-1-sam@mendozajonas.com>

The concepts of a channel being 'active' and it having link are slightly
muddled in the NCSI driver. Tweak this slightly so that
NCSI_CHANNEL_ACTIVE represents a channel that has been configured and
enabled, and NCSI_CHANNEL_INACTIVE represents a de-configured channel.
This distinction is important because a channel can be 'active' but have
its link down; in this case the channel may still need to be configured
so that it may receive AEN link-state-change packets.

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
---
 net/ncsi/ncsi-aen.c    | 17 +++++++++++------
 net/ncsi/ncsi-manage.c |  3 +--
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index 65f47a648be3..57f77e5d381a 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -57,6 +57,7 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
 	int state;
 	unsigned long old_data, data;
 	unsigned long flags;
+	bool had_link, has_link;
 
 	/* Find the NCSI channel */
 	ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc);
@@ -73,6 +74,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
 	ncm->data[2] = data;
 	ncm->data[4] = ntohl(lsc->oem_status);
 
+	had_link = !!(old_data & 0x1);
+	has_link = !!(data & 0x1);
+
 	netdev_dbg(ndp->ndev.dev, "NCSI: LSC AEN - channel %u state %s\n",
 		   nc->id, data & 0x1 ? "up" : "down");
 
@@ -80,15 +84,16 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
 	state = nc->state;
 	spin_unlock_irqrestore(&nc->lock, flags);
 
-	if (!((old_data ^ data) & 0x1) || chained)
-		return 0;
-	if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) &&
-	    !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1)))
+	if (state == NCSI_CHANNEL_INACTIVE)
+		netdev_warn(ndp->ndev.dev,
+			    "NCSI: Inactive channel %u received AEN!\n",
+			    nc->id);
+
+	if ((had_link == has_link) || chained)
 		return 0;
 
-	if (state == NCSI_CHANNEL_ACTIVE)
+	if (had_link)
 		ndp->flags |= NCSI_DEV_RESHUFFLE;
-
 	ncsi_stop_channel_monitor(nc);
 	spin_lock_irqsave(&ndp->lock, flags);
 	list_add_tail_rcu(&nc->link, &ndp->channel_queue);
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 602af74bee81..632caeea672f 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -834,12 +834,11 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
 			break;
 		}
 
+		nc->state = NCSI_CHANNEL_ACTIVE;
 		if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
 			hot_nc = nc;
-			nc->state = NCSI_CHANNEL_ACTIVE;
 		} else {
 			hot_nc = NULL;
-			nc->state = NCSI_CHANNEL_INACTIVE;
 			netdev_dbg(ndp->ndev.dev,
 				   "NCSI: channel %u link down after config\n",
 				   nc->id);
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 3/6] net/ncsi: Don't deselect package in suspend if active
From: Samuel Mendoza-Jonas @ 2018-10-18  3:59 UTC (permalink / raw)
  To: netdev
  Cc: Samuel Mendoza-Jonas, David S . Miller, Justin.Lee1, linux-kernel,
	openbmc
In-Reply-To: <20181018035917.19413-1-sam@mendozajonas.com>

When a package is deselected all channels of that package cease
communication. If there are other channels active on the package of the
suspended channel this will disable them as well, so only send a
deselect-package command if no other channels are active.

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
---
 net/ncsi/ncsi-manage.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index fbbfed559f9c..602af74bee81 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -440,12 +440,14 @@ static void ncsi_request_timeout(struct timer_list *t)
 static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
 {
 	struct ncsi_dev *nd = &ndp->ndev;
-	struct ncsi_package *np = ndp->active_package;
-	struct ncsi_channel *nc = ndp->active_channel;
+	struct ncsi_package *np;
+	struct ncsi_channel *nc, *tmp;
 	struct ncsi_cmd_arg nca;
 	unsigned long flags;
 	int ret;
 
+	np = ndp->active_package;
+	nc = ndp->active_channel;
 	nca.ndp = ndp;
 	nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN;
 	switch (nd->state) {
@@ -521,6 +523,15 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
 		if (ret)
 			goto error;
 
+		NCSI_FOR_EACH_CHANNEL(np, tmp) {
+			/* If there is another channel active on this package
+			 * do not deselect the package.
+			 */
+			if (tmp != nc && tmp->state == NCSI_CHANNEL_ACTIVE) {
+				nd->state = ncsi_dev_state_suspend_done;
+				break;
+			}
+		}
 		break;
 	case ncsi_dev_state_suspend_deselect:
 		ndp->pending_req_num = 1;
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next 2/6] net/ncsi: Probe single packages to avoid conflict
From: Samuel Mendoza-Jonas @ 2018-10-18  3:59 UTC (permalink / raw)
  To: netdev
  Cc: Samuel Mendoza-Jonas, David S . Miller, Justin.Lee1, linux-kernel,
	openbmc
In-Reply-To: <20181018035917.19413-1-sam@mendozajonas.com>

Currently the NCSI driver sends a select-package command to all possible
packages simultaneously to discover what packages are available. However
at this stage in the probe process the driver does not know if
hardware arbitration is available: if it isn't then this process could
cause collisions on the RMII bus when packages try to respond.

Update the probe loop to probe each package one by one, and once
complete check if HWA is universally supported.

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
---
 net/ncsi/internal.h    |  1 +
 net/ncsi/ncsi-manage.c | 88 ++++++++++++++----------------------------
 2 files changed, 31 insertions(+), 58 deletions(-)

diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 13c9b5eeb3b7..592e3cd40728 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -283,6 +283,7 @@ struct ncsi_dev_priv {
 #if IS_ENABLED(CONFIG_IPV6)
 	unsigned int        inet6_addr_num;  /* Number of IPv6 addresses   */
 #endif
+	unsigned int        package_probe_id;/* Current ID during probe    */
 	unsigned int        package_num;     /* Number of packages         */
 	struct list_head    packages;        /* List of packages           */
 	struct ncsi_channel *hot_channel;    /* Channel was ever active    */
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 65ee46c4d8f4..fbbfed559f9c 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -997,70 +997,28 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
 		nd->state = ncsi_dev_state_probe_package;
 		break;
 	case ncsi_dev_state_probe_package:
-		ndp->pending_req_num = 16;
+		ndp->pending_req_num = 1;
 
-		/* Select all possible packages */
 		nca.type = NCSI_PKT_CMD_SP;
 		nca.bytes[0] = 1;
+		nca.package = ndp->package_probe_id;
 		nca.channel = NCSI_RESERVED_CHANNEL;
-		for (index = 0; index < 8; index++) {
-			nca.package = index;
-			ret = ncsi_xmit_cmd(&nca);
-			if (ret)
-				goto error;
-		}
-
-		/* Disable all possible packages */
-		nca.type = NCSI_PKT_CMD_DP;
-		for (index = 0; index < 8; index++) {
-			nca.package = index;
-			ret = ncsi_xmit_cmd(&nca);
-			if (ret)
-				goto error;
-		}
-
+		ret = ncsi_xmit_cmd(&nca);
+		if (ret)
+			goto error;
 		nd->state = ncsi_dev_state_probe_channel;
 		break;
 	case ncsi_dev_state_probe_channel:
-		if (!ndp->active_package)
-			ndp->active_package = list_first_or_null_rcu(
-				&ndp->packages, struct ncsi_package, node);
-		else if (list_is_last(&ndp->active_package->node,
-				      &ndp->packages))
-			ndp->active_package = NULL;
-		else
-			ndp->active_package = list_next_entry(
-				ndp->active_package, node);
-
-		/* All available packages and channels are enumerated. The
-		 * enumeration happens for once when the NCSI interface is
-		 * started. So we need continue to start the interface after
-		 * the enumeration.
-		 *
-		 * We have to choose an active channel before configuring it.
-		 * Note that we possibly don't have active channel in extreme
-		 * situation.
-		 */
+		ndp->active_package = ncsi_find_package(ndp,
+							ndp->package_probe_id);
 		if (!ndp->active_package) {
-			ndp->flags |= NCSI_DEV_PROBED;
-			if (ncsi_check_hwa(ndp))
-				ncsi_enable_hwa(ndp);
-			else
-				ncsi_choose_active_channel(ndp);
-			return;
+			/* No response */
+			nd->state = ncsi_dev_state_probe_dp;
+			schedule_work(&ndp->work);
+			break;
 		}
-
-		/* Select the active package */
-		ndp->pending_req_num = 1;
-		nca.type = NCSI_PKT_CMD_SP;
-		nca.bytes[0] = 1;
-		nca.package = ndp->active_package->id;
-		nca.channel = NCSI_RESERVED_CHANNEL;
-		ret = ncsi_xmit_cmd(&nca);
-		if (ret)
-			goto error;
-
 		nd->state = ncsi_dev_state_probe_cis;
+		schedule_work(&ndp->work);
 		break;
 	case ncsi_dev_state_probe_cis:
 		ndp->pending_req_num = NCSI_RESERVED_CHANNEL;
@@ -1109,22 +1067,35 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
 	case ncsi_dev_state_probe_dp:
 		ndp->pending_req_num = 1;
 
-		/* Deselect the active package */
+		/* Deselect the current package */
 		nca.type = NCSI_PKT_CMD_DP;
-		nca.package = ndp->active_package->id;
+		nca.package = ndp->package_probe_id;
 		nca.channel = NCSI_RESERVED_CHANNEL;
 		ret = ncsi_xmit_cmd(&nca);
 		if (ret)
 			goto error;
 
-		/* Scan channels in next package */
-		nd->state = ncsi_dev_state_probe_channel;
+		/* Probe next package */
+		ndp->package_probe_id++;
+		if (ndp->package_probe_id >= 8) {
+			/* Probe finished */
+			ndp->flags |= NCSI_DEV_PROBED;
+			break;
+		}
+		nd->state = ncsi_dev_state_probe_package;
+		ndp->active_package = NULL;
 		break;
 	default:
 		netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n",
 			    nd->state);
 	}
 
+	if (ndp->flags & NCSI_DEV_PROBED) {
+		/* Check if all packages have HWA support */
+		ncsi_check_hwa(ndp);
+		ncsi_choose_active_channel(ndp);
+	}
+
 	return;
 error:
 	netdev_err(ndp->ndev.dev,
@@ -1485,6 +1456,7 @@ int ncsi_start_dev(struct ncsi_dev *nd)
 		return -ENOTTY;
 
 	if (!(ndp->flags & NCSI_DEV_PROBED)) {
+		ndp->package_probe_id = 0;
 		nd->state = ncsi_dev_state_probe;
 		schedule_work(&ndp->work);
 		return 0;
-- 
2.19.1

^ permalink raw reply related

* [bpf-next PATCH 2/2] bpf: test_maps add a test to catch kcm + sockmap
From: John Fastabend @ 2018-10-18  3:37 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, eric.dumazet
In-Reply-To: <20181018032125.22427.85747.stgit@john-Precision-Tower-5810>

Adding a socket to both sockmap and kcm is not supported due to
collision on sk_user_data usage.

If selftests is run without KCM support we will issue a warning
and continue with the tests.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 tools/testing/selftests/bpf/Makefile      |    2 -
 tools/testing/selftests/bpf/sockmap_kcm.c |   14 ++++++
 tools/testing/selftests/bpf/test_maps.c   |   64 ++++++++++++++++++++++++++++-
 3 files changed, 77 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/sockmap_kcm.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index d99dd6f..f290554 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -28,7 +28,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
-	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
+	sockmap_verdict_prog.o sockmap_kcm.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
diff --git a/tools/testing/selftests/bpf/sockmap_kcm.c b/tools/testing/selftests/bpf/sockmap_kcm.c
new file mode 100644
index 0000000..4377adc
--- /dev/null
+++ b/tools/testing/selftests/bpf/sockmap_kcm.c
@@ -0,0 +1,14 @@
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+#include "bpf_util.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+
+SEC("socket_kcm")
+int bpf_prog1(struct __sk_buff *skb)
+{
+	return skb->len;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 9b552c0..be20f1d 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -20,6 +20,7 @@
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <linux/bpf.h>
+#include <linux/kcm.h>
 
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
@@ -479,14 +480,16 @@ static void test_devmap(int task, void *data)
 #define SOCKMAP_PARSE_PROG "./sockmap_parse_prog.o"
 #define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o"
 #define SOCKMAP_TCP_MSG_PROG "./sockmap_tcp_msg_prog.o"
+#define KCM_PROG "./sockmap_kcm.o"
 static void test_sockmap(int tasks, void *data)
 {
 	struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_msg, *bpf_map_break;
-	int map_fd_msg = 0, map_fd_rx = 0, map_fd_tx = 0, map_fd_break;
+	int map_fd_msg = 0, map_fd_rx = 0, map_fd_tx = 0, map_fd_break, kcm;
 	int ports[] = {50200, 50201, 50202, 50204};
 	int err, i, fd, udp, sfd[6] = {0xdeadbeef};
 	u8 buf[20] = {0x0, 0x5, 0x3, 0x2, 0x1, 0x0};
-	int parse_prog, verdict_prog, msg_prog;
+	int parse_prog, verdict_prog, msg_prog, kcm_prog;
+	struct kcm_attach attach_info;
 	struct sockaddr_in addr;
 	int one = 1, s, sc, rc;
 	struct bpf_object *obj;
@@ -744,6 +747,62 @@ static void test_sockmap(int tasks, void *data)
 		goto out_sockmap;
 	}
 
+	/* Test adding a KCM socket into map */
+#define AF_KCM 41
+	kcm = socket(AF_KCM, SOCK_DGRAM, KCMPROTO_CONNECTED);
+	if (kcm == -1) {
+		printf("Warning, KCM+Sockmap could not be tested.\n");
+		goto skip_kcm;
+	}
+
+	err = bpf_prog_load(KCM_PROG,
+			    BPF_PROG_TYPE_SOCKET_FILTER,
+			    &obj, &kcm_prog);
+	if (err) {
+		printf("Failed to load SK_SKB parse prog\n");
+		goto out_sockmap;
+	}
+
+	i = 2;
+	memset(&attach_info, 0, sizeof(attach_info));
+	attach_info.fd = sfd[i];
+	attach_info.bpf_fd = kcm_prog;
+	err = ioctl(kcm, SIOCKCMATTACH, &attach_info);
+	if (!err) {
+		perror("Failed KCM attached to sockmap fd: ");
+		goto out_sockmap;
+	}
+
+	err = bpf_map_delete_elem(fd, &i);
+	if (err) {
+		printf("Failed delete sockmap from empty map %i %i\n", err, errno);
+		goto out_sockmap;
+	}
+
+	err = ioctl(kcm, SIOCKCMATTACH, &attach_info);
+	if (err) {
+		perror("Failed KCM attach");
+		goto out_sockmap;
+	}
+
+	err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
+	if (!err) {
+		printf("Failed sockmap attached KCM sock!\n");
+		goto out_sockmap;
+	}
+	err = ioctl(kcm, SIOCKCMUNATTACH, &attach_info);
+	if (err) {
+		printf("Failed detach KCM sock!\n");
+		goto out_sockmap;
+	}
+
+	err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
+	if (err) {
+		printf("Failed post-kcm update sockmap '%i:%i'\n",
+		       i, sfd[i]);
+		goto out_sockmap;
+	}
+
 	/* Test map update elem afterwards fd lives in fd and map_fd */
 	for (i = 2; i < 6; i++) {
 		err = bpf_map_update_elem(map_fd_rx, &i, &sfd[i], BPF_ANY);
@@ -776,6 +835,7 @@ static void test_sockmap(int tasks, void *data)
 		}
 	}
 
+skip_kcm:
 	/* Put sfd[2] (sending fd below) into msg map to test sendmsg bpf */
 	i = 0;
 	err = bpf_map_update_elem(map_fd_msg, &i, &sfd[2], BPF_ANY);

^ permalink raw reply related

* [bpf-next PATCH 1/2] bpf: skmsg, fix psock create on existing kcm/tls port
From: John Fastabend @ 2018-10-18  3:37 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, eric.dumazet
In-Reply-To: <20181018032125.22427.85747.stgit@john-Precision-Tower-5810>

Before using the psock returned by sk_psock_get() when adding it to a
sockmap we need to ensure it is actually a sockmap based psock.
Previously we were only checking this after incrementing the reference
counter which was an error. This resulted in a slab-out-of-bounds
error when the psock was not actually a sockmap type.

This moves the check up so the reference counter is only used
if it is a sockmap psock.

Eric reported the following KASAN BUG,

BUG: KASAN: slab-out-of-bounds in atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
BUG: KASAN: slab-out-of-bounds in refcount_inc_not_zero_checked+0x97/0x2f0 lib/refcount.c:120
Read of size 4 at addr ffff88019548be58 by task syz-executor4/22387

CPU: 1 PID: 22387 Comm: syz-executor4 Not tainted 4.19.0-rc7+ #264
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113
 print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412
 check_memory_region_inline mm/kasan/kasan.c:260 [inline]
 check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
 kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272
 atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
 refcount_inc_not_zero_checked+0x97/0x2f0 lib/refcount.c:120
 sk_psock_get include/linux/skmsg.h:379 [inline]
 sock_map_link.isra.6+0x41f/0xe30 net/core/sock_map.c:178
 sock_hash_update_common+0x19b/0x11e0 net/core/sock_map.c:669
 sock_hash_update_elem+0x306/0x470 net/core/sock_map.c:738
 map_update_elem+0x819/0xdf0 kernel/bpf/syscall.c:818

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
---
 0 files changed

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 677b673..f44ca6b 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -275,11 +275,6 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
-static inline bool sk_has_psock(struct sock *sk)
-{
-	return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
-}
-
 static inline void sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
@@ -379,6 +374,26 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock,
 	return test_bit(bit, &psock->state);
 }
 
+static inline struct sk_psock *sk_psock_get_checked(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (psock) {
+		if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg) {
+			psock = ERR_PTR(-EBUSY);
+			goto out;
+		}
+
+		if (!refcount_inc_not_zero(&psock->refcnt))
+			psock = NULL;
+	}
+out:
+	rcu_read_unlock();
+	return psock;
+}
+
 static inline struct sk_psock *sk_psock_get(struct sock *sk)
 {
 	struct sk_psock *psock;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 3c0e44c..be6092a 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -175,12 +175,13 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 		}
 	}
 
-	psock = sk_psock_get(sk);
+	psock = sk_psock_get_checked(sk);
+	if (IS_ERR(psock)) {
+		ret = PTR_ERR(psock);
+		goto out_progs;
+	}
+
 	if (psock) {
-		if (!sk_has_psock(sk)) {
-			ret = -EBUSY;
-			goto out_progs;
-		}
 		if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
 		    (skb_progs  && READ_ONCE(psock->progs.skb_parser))) {
 			sk_psock_put(sk, psock);

^ permalink raw reply related

* [bpf-next PATCH 0/2] Fix kcm + sockmap by checking psock type
From: John Fastabend @ 2018-10-18  3:37 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, eric.dumazet

We check if the sk_user_data (the psock in skmsg) is in fact a sockmap
type to late, after we read the refcnt which is an error. This
series moves the check up before reading refcnt and also adds a test
to test_maps to test trying to add a KCM socket into a sockmap.

While reviewig this code I also found an issue with KCM and kTLS
where each uses sk_data_ready hooks and associated stream parser
breaking expectations in kcm, ktls or both. But that fix will need
to go to net.

Thanks to Eric for reporting.

---

John Fastabend (2):
      bpf: skmsg, fix psock create on existing kcm/tls port
      bpf: test_maps add a test to catch kcm + sockmap


 tools/testing/selftests/bpf/Makefile      |    2 -
 tools/testing/selftests/bpf/sockmap_kcm.c |   14 ++++++
 tools/testing/selftests/bpf/test_maps.c   |   64 ++++++++++++++++++++++++++++-
 3 files changed, 77 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/sockmap_kcm.c

^ permalink raw reply

* Re: [PATCH bpf-next v3 00/13] bpf: add btf func info support
From: Alexei Starovoitov @ 2018-10-18  3:24 UTC (permalink / raw)
  To: Yonghong Song; +Cc: ast, kafai, daniel, netdev, kernel-team
In-Reply-To: <20181017233026.1071519-1-yhs@fb.com>

On Wed, Oct 17, 2018 at 04:30:22PM -0700, Yonghong Song wrote:
> 
> This patch added func info support to the kernel so we can
> get better ksym's for bpf function calls. Basically,
> pairs of bpf function calls and their corresponding types
> are passed to the kernel. Extracting function names from
> the types, the kernel is able to construct a ksym for
> each function call with embedded function name.
...
> Below is a demonstration from Patch #13.
>   $ bpftool prog dump jited id 1
>   int _dummy_tracepoint(struct dummy_tracepoint_args * ):
>   bpf_prog_b07ccb89267cf242__dummy_tracepoint:
>          0:   push   %rbp
>          1:   mov    %rsp,%rbp
>         ......
>         3c:   add    $0x28,%rbp
>         40:   leaveq
>         41:   retq
>   
>   int test_long_fname_1(struct dummy_tracepoint_args * ):
>   bpf_prog_2dcecc18072623fc_test_long_fname_1:
>          0:   push   %rbp
>          1:   mov    %rsp,%rbp

Considering we only had 16-byte names for main prog and
no names at all for subprogs, this is huge improvement
in BPF introspection.
Cannot wait for the followup patches with line info support.

For the set:
Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* Re: [RFC] VSOCK: The performance problem of vhost_vsock.
From: Jason Wang @ 2018-10-18  2:45 UTC (permalink / raw)
  To: jiangyiwen, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <5BC7E04E.9000002@huawei.com>


On 2018/10/18 上午9:22, jiangyiwen wrote:
> On 2018/10/17 20:31, Jason Wang wrote:
>> On 2018/10/17 下午7:41, jiangyiwen wrote:
>>> On 2018/10/17 17:51, Jason Wang wrote:
>>>> On 2018/10/17 下午5:39, Jason Wang wrote:
>>>>>> Hi Jason and Stefan,
>>>>>>
>>>>>> Maybe I find the reason of bad performance.
>>>>>>
>>>>>> I found pkt_len is limited to VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE(4K),
>>>>>> it will cause the bandwidth is limited to 500~600MB/s. And once I
>>>>>> increase to 64k, it can improve about 3 times(~1500MB/s).
>>>>> Looks like the value was chosen for a balance between rx buffer size and performance. Allocating 64K always even for small packet is kind of waste and stress for guest memory. Virito-net try to avoid this by inventing the merge able rx buffer which allows big packet to be scattered in into different buffers. We can reuse this idea or revisit the idea of using virtio-net/vhost-net as a transport of vsock.
>>>>>
>>>>> What interesting is the performance is still behind vhost-net.
>>>>>
>>>>> Thanks
>>>>>
>>>>>> By the way, I send to 64K in application once, and I don't use
>>>>>> sg_init_one and rewrite function to packet sg list because pkt_len
>>>>>> include multiple pages.
>>>>>>
>>>>>> Thanks,
>>>>>> Yiwen.
>>>> Btw, if you're using vsock for transferring large files, maybe it's more efficient to implement sendpage() for vsock to allow sendfile()/splice() work.
>>>>
>>>> Thanks
>>>>
>>> I can't agree more.
>>>
>>> why vhost_vsock is still behind vhost_net?
>>> Because I use sendfile() to test performance at first, and then
>>> I found vsock don't implement sendpage() and cause the bandwidth
>>> can't be increased. So I use read() and send() to replace sendfile(),
>>> it will increase some switch between kernel and user mode, and sendfile()
>>> can support zero copy. I think this is main reason.
>>>
>>> Thanks.
>>
>> Want to post patches for this then :) ?
>>
>> Thanks
>>
> I may not post patches at the moment because there are other tasks.
>
> After a period of time, I will consider implement the feature.
>
> Thanks.


That's fine.

Thanks

^ permalink raw reply

* Re: [RFC] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Toshiaki Makita @ 2018-10-18  9:53 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Michael S. Tsirkin, netdev, virtualization, tglx, David S. Miller
In-Reply-To: <20181018093035.i56byogaslsnzcnl@linutronix.de>

On 2018/10/18 18:30, Sebastian Andrzej Siewior wrote:
> On 2018-10-18 18:19:21 [+0900], Toshiaki Makita wrote:
>> On 2018/10/18 18:08, Sebastian Andrzej Siewior wrote:
>>> Again: lockdep saw the lock in softirq context once and in process
>>> context once and this is what triggers the warning. It does not matter
>>> if NAPI is enabled or not during the access in process context. If you
>>> want to allow this you need further lockdep annotation…
>>>
>>> … but: refill_work() disables NAPI for &vi->rq[1] and refills + updates
>>> stats while NAPI is enabled for &vi->rq[0].
>>
>> Do you mean this is false positive? rq[0] and rq[1] never race with each
>> other...
> 
> Why? So you can't refill rq[1] and then be interrupted and process NAPI
> for rq[0]?

Probably I don't understand what you are trying to say, but rq[0] and
rq[1] are different objects so they can be processed concurrently but
should not affect each other.

> 
> But as I said. If lockdep saw the lock in acquired in softirq (what it
> did) _and_ in process context (what it did as well) _once_ then this is
> enough evidence for the warning.
> If you claim that this can not happen due to NAPI guard [0] then this is
> something lockdep does not know about.

The point is that it is not the problem of stats. try_fill_recv() should
not be called for the same rq concurrently in the first place. This
requirement should be satisfied by NAPI guard, so if the NAPI guard
logic is buggy then we need to fix it.

> [0] which I currently don't understand and therefore sent the patch [1]
>     as Jason pointed out that in the ->ndo_open case the work is
>     scheduled and then NAPI is enabled (which means the worker could
>     disable NAPI and refill but before it finishes, ->ndo_open would
>     continue and enable NAPI)).

It may be related but I have not investigated it deeply. I'll check if
it can cause this problem later.

> [1] 20181018084753.wefvsypdevbzoadg@linutronix.de

-- 
Toshiaki Makita

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Toshiaki Makita @ 2018-10-18  9:26 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Michael S. Tsirkin, netdev, virtualization, tglx, David S. Miller
In-Reply-To: <20181018091114.jqiedzds2bqezxtb@linutronix.de>

On 2018/10/18 18:11, Sebastian Andrzej Siewior wrote:
> On 2018-10-18 18:06:57 [+0900], Toshiaki Makita wrote:
>> NACK. Again, this race should not happen because of NAPI guard.
>> We need to investigate why this warning happened.
> 
> I tried to explain this. Please see
> 	20181018090812.rry5qgnqxxrjxaii@linutronix.de
Anyway, if this really happens, then it means try_fill_recv() can be
called concurrently for the same rq, which looks like a far more severe
problem to me. If so, we need to fix it instead.

-- 
Toshiaki Makita

^ permalink raw reply

* Re: [RFC] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Toshiaki Makita @ 2018-10-18  9:19 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Michael S. Tsirkin, netdev, virtualization, tglx, David S. Miller
In-Reply-To: <20181018090812.rry5qgnqxxrjxaii@linutronix.de>

On 2018/10/18 18:08, Sebastian Andrzej Siewior wrote:
> On 2018-10-18 18:00:05 [+0900], Toshiaki Makita wrote:
>> On 2018/10/18 17:47, Sebastian Andrzej Siewior wrote:
>>> On 2018-10-17 14:48:02 [+0800], Jason Wang wrote:
>>>>
>>>> On 2018/10/17 上午9:13, Toshiaki Makita wrote:
>>>>> I'm not sure what condition triggered this warning.
>>>
>>> If the seqlock is acquired once in softirq and then in process context
>>> again it is enough evidence for lockdep to trigger this warning.
>>
>> No. As I said that should not happen because of NAPI guard.
> Again: lockdep saw the lock in softirq context once and in process
> context once and this is what triggers the warning. It does not matter
> if NAPI is enabled or not during the access in process context. If you
> want to allow this you need further lockdep annotation…
> 
> … but: refill_work() disables NAPI for &vi->rq[1] and refills + updates
> stats while NAPI is enabled for &vi->rq[0].

Do you mean this is false positive? rq[0] and rq[1] never race with each
other...

-- 
Toshiaki Makita

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Toshiaki Makita @ 2018-10-18  9:06 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Michael S. Tsirkin, netdev, virtualization, tglx, David S. Miller
In-Reply-To: <20181018084313.oopu34iwfwgkcwwc@linutronix.de>

On 2018/10/18 17:43, Sebastian Andrzej Siewior wrote:
> on 32bit, lockdep notices that virtnet_open() and refill_work() invoke
> try_fill_recv() from process context while virtnet_receive() invokes the
> same function from BH context. The problem that the seqcounter within
> u64_stats_update_begin() may deadlock if it is interrupted by BH and
> then acquired again.
> 
> Introduce u64_stats_update_begin_bh() which disables BH on 32bit
> architectures. Since the BH might interrupt the worker, this new
> function should not limited to SMP like the others which are expected
> to be used in softirq.
> 
> With this change we might lose increments but this is okay. The
> important part that the two 32bit parts of the 64bit counter are not
> corrupted.
> 
> Fixes: 461f03dc99cf6 ("virtio_net: Add kick stats").
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

NACK. Again, this race should not happen because of NAPI guard.
We need to investigate why this warning happened.

-- 
Toshiaki Makita

^ permalink raw reply

* Re: [RFC] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Toshiaki Makita @ 2018-10-18  9:00 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Jason Wang
  Cc: netdev, tglx, Michael S. Tsirkin, David S. Miller, virtualization
In-Reply-To: <20181018084753.wefvsypdevbzoadg@linutronix.de>

On 2018/10/18 17:47, Sebastian Andrzej Siewior wrote:
> On 2018-10-17 14:48:02 [+0800], Jason Wang wrote:
>>
>> On 2018/10/17 上午9:13, Toshiaki Makita wrote:
>>> I'm not sure what condition triggered this warning.
> 
> If the seqlock is acquired once in softirq and then in process context
> again it is enough evidence for lockdep to trigger this warning.

No. As I said that should not happen because of NAPI guard.

-- 
Toshiaki Makita

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [RFC PATCH] cgroup, netclassid: add a preemption point to write_classid
From: Michal Hocko @ 2018-10-18  8:56 UTC (permalink / raw)
  To: Tejun Heo, David S. Miller
  Cc: Johannes Weiner, cgroups, netdev, LKML, Michal Hocko

From: Michal Hocko <mhocko@suse.com>

We have seen a customer complaining about soft lockups on !PREEMPT
kernel config with 4.4 based kernel

[1072141.435366] NMI watchdog: BUG: soft lockup - CPU#21 stuck for 22s! [systemd:1]
[1072141.444090] Modules linked in: mpt3sas raid_class binfmt_misc af_packet 8021q garp mrp stp llc xfs libcrc32c bonding iscsi_ibft iscsi_boot_sysfs msr ext4 crc16 jbd2 mbcache cdc_ether usbnet mii joydev hid_generic usbhid intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel ipmi_ssif mgag200 i2c_algo_bit ttm ipmi_devintf drbg ixgbe drm_kms_helper vxlan ansi_cprng ip6_udp_tunnel drm aesni_intel udp_tunnel aes_x86_64 iTCO_wdt syscopyarea ptp xhci_pci lrw iTCO_vendor_support pps_core gf128mul ehci_pci glue_helper sysfillrect mdio pcspkr sb_edac ablk_helper cryptd ehci_hcd sysimgblt xhci_hcd fb_sys_fops edac_core mei_me lpc_ich ses usbcore enclosure dca mfd_core ipmi_si mei i2c_i801 scsi_transport_sas usb_common ipmi_msghandler shpchp f
 jes wmi processor button acpi_pad btrfs xor raid6_pq sd_mod crc32c_intel megaraid_sas sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod md_mod autofs4
[1072141.444146] Supported: Yes
[1072141.444149] CPU: 21 PID: 1 Comm: systemd Not tainted 4.4.121-92.80-default #1
[1072141.444150] Hardware name: LENOVO Lenovo System x3650 M5 -[5462P4U]- -[5462P4U]-/01GR451, BIOS -[TCE136H-2.70]- 06/13/2018
[1072141.444151] task: ffff880191bd0040 ti: ffff880191bd4000 task.ti: ffff880191bd4000
[1072141.444153] RIP: 0010:[<ffffffff815229f9>]  [<ffffffff815229f9>] update_classid_sock+0x29/0x40
[1072141.444157] RSP: 0018:ffff880191bd7d58  EFLAGS: 00000286
[1072141.444158] RAX: ffff883b177cb7c0 RBX: 0000000000000000 RCX: 0000000000000000
[1072141.444159] RDX: 00000000000009c7 RSI: ffff880191bd7d5c RDI: ffff8822e29bb200
[1072141.444160] RBP: ffff883a72230980 R08: 0000000000000101 R09: 0000000000000000
[1072141.444161] R10: 0000000000000008 R11: f000000000000000 R12: ffffffff815229d0
[1072141.444162] R13: 0000000000000000 R14: ffff881fd0a47ac0 R15: ffff880191bd7f28
[1072141.444163] FS:  00007f3e2f1eb8c0(0000) GS:ffff882000340000(0000) knlGS:0000000000000000
[1072141.444164] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1072141.444165] CR2: 00007f3e2f200000 CR3: 0000001ffea4e000 CR4: 00000000001606f0
[1072141.444166] Stack:
[1072141.444166]  ffffffa800000246 00000000000009c7 ffffffff8121d583 ffff8818312a05c0
[1072141.444168]  ffff8818312a1100 ffff880197c3b280 ffff881861422858 ffffffffffffffea
[1072141.444170]  ffffffff81522b1c ffffffff81d0ca20 ffff8817fa17b950 ffff883fdd8121e0
[1072141.444171] Call Trace:
[1072141.444179]  [<ffffffff8121d583>] iterate_fd+0x53/0x80
[1072141.444182]  [<ffffffff81522b1c>] write_classid+0x4c/0x80
[1072141.444187]  [<ffffffff8111328b>] cgroup_file_write+0x9b/0x100
[1072141.444193]  [<ffffffff81278bcb>] kernfs_fop_write+0x11b/0x150
[1072141.444198]  [<ffffffff81201566>] __vfs_write+0x26/0x100
[1072141.444201]  [<ffffffff81201bed>] vfs_write+0x9d/0x190
[1072141.444203]  [<ffffffff812028c2>] SyS_write+0x42/0xa0
[1072141.444207]  [<ffffffff815f58c3>] entry_SYSCALL_64_fastpath+0x1e/0xca
[1072141.445490] DWARF2 unwinder stuck at entry_SYSCALL_64_fastpath+0x1e/0xca

If a cgroup has many tasks with many open file descriptors then we would
end up in a large loop without any rescheduling point throught the
operation. Add cond_resched once per task.

Signed-off-by: Michal Hocko <mhocko@suse.com>
---
Hi,
I am sending this as an RFC because I am not sure this is the best
approach to be honest. The customer who reported this issue was not
entirely thrilled about testing this so it might be possible that the
patch is not sufficient. Details about the workload are not very
detailed either.

Any comment would be highly appreciated.

 net/core/netclassid_cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 5e4f04004a49..7bf833598615 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -106,6 +106,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
 		iterate_fd(p->files, 0, update_classid_sock,
 			   (void *)(unsigned long)cs->classid);
 		task_unlock(p);
+		cond_resched();
 	}
 	css_task_iter_end(&it);
 
-- 
2.19.0

^ permalink raw reply related

* [net-next 13/14] net/mlx5e: Use a slow path rule instead if vxlan neighbour isn't available
From: Saeed Mahameed @ 2018-10-18  0:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>

From: Paul Blakey <paulb@mellanox.com>

When adding a vxlan tc rule, and a neighbour isn't available, we
don't insert any rule to hardware. Once we enable offloading flows
with multiple priorities, a packet that should have matched this rule
will continue in hardware pipeline and might match a wrong one.

This is unlike in tc software path where it will be matched and
forwarded to the vxlan device (which will cause a ARP lookup
eventually) and stop processing further tc filters.

To address that, when when a neighbour isn't available (EAGAIN from
attach_encap), or gets deleted, change the original action to be a
forward to slow path instead. Neighbour update will restore the original
action once the neighbour becomes available. This will be done atomically
so at any given time we will have a the correct match.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 104 ++++++++++++++----
 1 file changed, 82 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 1786e25644ac..cb66964aa1ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -74,6 +74,7 @@ enum {
 	MLX5E_TC_FLOW_OFFLOADED	= BIT(MLX5E_TC_FLOW_BASE + 2),
 	MLX5E_TC_FLOW_HAIRPIN	= BIT(MLX5E_TC_FLOW_BASE + 3),
 	MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(MLX5E_TC_FLOW_BASE + 4),
+	MLX5E_TC_FLOW_SLOW	  = BIT(MLX5E_TC_FLOW_BASE + 5),
 };
 
 #define MLX5E_TC_MAX_SPLITS 1
@@ -82,7 +83,7 @@ struct mlx5e_tc_flow {
 	struct rhash_head	node;
 	struct mlx5e_priv	*priv;
 	u64			cookie;
-	u8			flags;
+	u16			flags;
 	struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1];
 	struct list_head	encap;   /* flows sharing the same encap ID */
 	struct list_head	mod_hdr; /* flows sharing the same mod hdr ID */
@@ -860,6 +861,36 @@ mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw,
 	mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
 }
 
+static struct mlx5_flow_handle *
+mlx5e_tc_offload_to_slow_path(struct mlx5_eswitch *esw,
+			      struct mlx5e_tc_flow *flow,
+			      struct mlx5_flow_spec *spec,
+			      struct mlx5_esw_flow_attr *slow_attr)
+{
+	struct mlx5_flow_handle *rule;
+
+	memcpy(slow_attr, flow->esw_attr, sizeof(*slow_attr));
+	slow_attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+	slow_attr->mirror_count = 0,
+	slow_attr->dest_chain = FDB_SLOW_PATH_CHAIN,
+
+	rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, slow_attr);
+	if (!IS_ERR(rule))
+		flow->flags |= MLX5E_TC_FLOW_SLOW;
+
+	return rule;
+}
+
+static void
+mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw,
+				  struct mlx5e_tc_flow *flow,
+				  struct mlx5_esw_flow_attr *slow_attr)
+{
+	memcpy(slow_attr, flow->esw_attr, sizeof(*slow_attr));
+	mlx5e_tc_unoffload_fdb_rules(esw, flow, slow_attr);
+	flow->flags &= ~MLX5E_TC_FLOW_SLOW;
+}
+
 static int
 mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 		      struct mlx5e_tc_flow_parse_attr *parse_attr,
@@ -917,15 +948,21 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 	/* we get here if (1) there's no error or when
 	 * (2) there's an encap action and we're on -EAGAIN (no valid neigh)
 	 */
-	if (encap_err != -EAGAIN) {
+	if (encap_err == -EAGAIN) {
+		/* continue with goto slow path rule instead */
+		struct mlx5_esw_flow_attr slow_attr;
+
+		flow->rule[0] = mlx5e_tc_offload_to_slow_path(esw, flow, &parse_attr->spec, &slow_attr);
+	} else {
 		flow->rule[0] = mlx5e_tc_offload_fdb_rules(esw, flow, &parse_attr->spec, attr);
-		if (IS_ERR(flow->rule[0])) {
-			err = PTR_ERR(flow->rule[0]);
-			goto err_add_rule;
-		}
 	}
 
-	return encap_err;
+	if (IS_ERR(flow->rule[0])) {
+		err = PTR_ERR(flow->rule[0]);
+		goto err_add_rule;
+	}
+
+	return 0;
 
 err_add_rule:
 	mlx5_fc_destroy(esw->dev, counter);
@@ -946,9 +983,14 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+	struct mlx5_esw_flow_attr slow_attr;
 
-	if (flow->flags & MLX5E_TC_FLOW_OFFLOADED)
-		mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr);
+	if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
+		if (flow->flags & MLX5E_TC_FLOW_SLOW)
+			mlx5e_tc_unoffload_from_slow_path(esw, flow, &slow_attr);
+		else
+			mlx5e_tc_unoffload_fdb_rules(esw, flow, attr);
+	}
 
 	mlx5_eswitch_del_vlan_action(esw, attr);
 
@@ -968,7 +1010,7 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
 			      struct mlx5e_encap_entry *e)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-	struct mlx5_esw_flow_attr *esw_attr;
+	struct mlx5_esw_flow_attr slow_attr, *esw_attr;
 	struct mlx5_flow_handle *rule;
 	struct mlx5_flow_spec *spec;
 	struct mlx5e_tc_flow *flow;
@@ -991,6 +1033,7 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
 		esw_attr->encap_id = e->encap_id;
 		spec = &esw_attr->parse_attr->spec;
 
+		/* update from slow path rule to encap rule */
 		rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, esw_attr);
 		if (IS_ERR(rule)) {
 			err = PTR_ERR(rule);
@@ -998,6 +1041,9 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
 				       err);
 			continue;
 		}
+
+		mlx5e_tc_unoffload_from_slow_path(esw, flow, &slow_attr);
+		flow->flags |= MLX5E_TC_FLOW_OFFLOADED; /* was unset when slow path rule removed */
 		flow->rule[0] = rule;
 	}
 }
@@ -1006,11 +1052,28 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
 			      struct mlx5e_encap_entry *e)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5_esw_flow_attr slow_attr;
+	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_spec *spec;
 	struct mlx5e_tc_flow *flow;
+	int err;
 
 	list_for_each_entry(flow, &e->flows, encap) {
-		if (flow->flags & MLX5E_TC_FLOW_OFFLOADED)
-			mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr);
+		spec = &flow->esw_attr->parse_attr->spec;
+
+		/* update from encap rule to slow path rule */
+		rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec, &slow_attr);
+
+		if (IS_ERR(rule)) {
+			err = PTR_ERR(rule);
+			mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n",
+				       err);
+			continue;
+		}
+
+		mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr);
+		flow->flags |= MLX5E_TC_FLOW_OFFLOADED; /* was unset when fast path rule removed */
+		flow->rule[0] = rule;
 	}
 
 	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
@@ -2888,9 +2951,9 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 	return 0;
 }
 
-static void get_flags(int flags, u8 *flow_flags)
+static void get_flags(int flags, u16 *flow_flags)
 {
-	u8 __flow_flags = 0;
+	u16 __flow_flags = 0;
 
 	if (flags & MLX5E_TC_INGRESS)
 		__flow_flags |= MLX5E_TC_FLOW_INGRESS;
@@ -2921,7 +2984,7 @@ static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv)
 
 static int
 mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size,
-		 struct tc_cls_flower_offload *f, u8 flow_flags,
+		 struct tc_cls_flower_offload *f, u16 flow_flags,
 		 struct mlx5e_tc_flow_parse_attr **__parse_attr,
 		 struct mlx5e_tc_flow **__flow)
 {
@@ -2958,7 +3021,7 @@ mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size,
 static int
 mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
 		   struct tc_cls_flower_offload *f,
-		   u8 flow_flags,
+		   u16 flow_flags,
 		   struct mlx5e_tc_flow **__flow)
 {
 	struct netlink_ext_ack *extack = f->common.extack;
@@ -2978,12 +3041,9 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
 		goto err_free;
 
 	err = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow, extack);
-	if (err && err != -EAGAIN)
+	if (err)
 		goto err_free;
 
-	if (!err)
-		flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
-
 	if (!(flow->esw_attr->action &
 	      MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT))
 		kvfree(parse_attr);
@@ -3002,7 +3062,7 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
 static int
 mlx5e_add_nic_flow(struct mlx5e_priv *priv,
 		   struct tc_cls_flower_offload *f,
-		   u8 flow_flags,
+		   u16 flow_flags,
 		   struct mlx5e_tc_flow **__flow)
 {
 	struct netlink_ext_ack *extack = f->common.extack;
@@ -3045,7 +3105,7 @@ mlx5e_tc_add_flow(struct mlx5e_priv *priv,
 		  struct mlx5e_tc_flow **flow)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-	u8 flow_flags;
+	u16 flow_flags;
 	int err;
 
 	get_flags(flags, &flow_flags);
-- 
2.17.2

^ permalink raw reply related

* [net-next 14/14] net/mlx5e: Support offloading tc priorities and chains for eswitch flows
From: Saeed Mahameed @ 2018-10-18  0:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>

From: Paul Blakey <paulb@mellanox.com>

Currently we fail when user specify a non-zero chain, this patch adds the
support for it and tc priorities. To get to a new chain, use the tc
goto action.

Currently we support a fixed prio range 1-16, and chain range 0-3.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  3 --
 .../net/ethernet/mellanox/mlx5/core/en_rep.c  |  6 ---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 51 ++++++++++++++++++-
 .../mellanox/mlx5/core/eswitch_offloads.c     |  2 +-
 4 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index c9848e333450..1243edbedc9e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3392,9 +3392,6 @@ static int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
 {
 	struct mlx5e_priv *priv = cb_priv;
 
-	if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
-		return -EOPNOTSUPP;
-
 	switch (type) {
 	case TC_SETUP_CLSFLOWER:
 		return mlx5e_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 64c2b9ea8b1e..c3c657548824 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -853,9 +853,6 @@ static int mlx5e_rep_setup_tc_cb_egdev(enum tc_setup_type type, void *type_data,
 {
 	struct mlx5e_priv *priv = cb_priv;
 
-	if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
-		return -EOPNOTSUPP;
-
 	switch (type) {
 	case TC_SETUP_CLSFLOWER:
 		return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_EGRESS);
@@ -869,9 +866,6 @@ static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
 {
 	struct mlx5e_priv *priv = cb_priv;
 
-	if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
-		return -EOPNOTSUPP;
-
 	switch (type) {
 	case TC_SETUP_CLSFLOWER:
 		return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index cb66964aa1ff..608025ca5c04 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -898,15 +898,32 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 		      struct netlink_ext_ack *extack)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	u32 max_chain = mlx5_eswitch_get_chain_range(esw);
 	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+	u16 max_prio = mlx5_eswitch_get_prio_range(esw);
 	struct net_device *out_dev, *encap_dev = NULL;
 	struct mlx5_fc *counter = NULL;
 	struct mlx5e_rep_priv *rpriv;
 	struct mlx5e_priv *out_priv;
 	int err = 0, encap_err = 0;
 
-	/* keep the old behaviour, use same prio for all offloaded rules */
-	attr->prio = 1;
+	/* if prios are not supported, keep the old behaviour of using same prio
+	 * for all offloaded rules.
+	 */
+	if (!mlx5_eswitch_prios_supported(esw))
+		attr->prio = 1;
+
+	if (attr->chain > max_chain) {
+		NL_SET_ERR_MSG(extack, "Requested chain is out of supported range");
+		err = -EOPNOTSUPP;
+		goto err_max_prio_chain;
+	}
+
+	if (attr->prio > max_prio) {
+		NL_SET_ERR_MSG(extack, "Requested priority is out of supported range");
+		err = -EOPNOTSUPP;
+		goto err_max_prio_chain;
+	}
 
 	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) {
 		out_dev = __dev_get_by_index(dev_net(priv->netdev),
@@ -975,6 +992,7 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
 		mlx5e_detach_encap(priv, flow);
 err_attach_encap:
+err_max_prio_chain:
 	return err;
 }
 
@@ -2826,6 +2844,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				struct mlx5e_tc_flow *flow,
 				struct netlink_ext_ack *extack)
 {
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
 	struct mlx5e_rep_priv *rpriv = priv->ppriv;
 	struct ip_tunnel_info *info = NULL;
@@ -2934,6 +2953,25 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 			continue;
 		}
 
+		if (is_tcf_gact_goto_chain(a)) {
+			u32 dest_chain = tcf_gact_goto_chain_index(a);
+			u32 max_chain = mlx5_eswitch_get_chain_range(esw);
+
+			if (dest_chain <= attr->chain) {
+				NL_SET_ERR_MSG(extack, "Goto earlier chain isn't supported");
+				return -EOPNOTSUPP;
+			}
+			if (dest_chain > max_chain) {
+				NL_SET_ERR_MSG(extack, "Requested destination chain is out of supported range");
+				return -EOPNOTSUPP;
+			}
+			action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+				  MLX5_FLOW_CONTEXT_ACTION_COUNT;
+			attr->dest_chain = dest_chain;
+
+			continue;
+		}
+
 		return -EINVAL;
 	}
 
@@ -3036,6 +3074,8 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
 	if (err)
 		goto out;
 
+	flow->esw_attr->chain = f->common.chain_index;
+	flow->esw_attr->prio = TC_H_MAJ(f->common.prio) >> 16;
 	err = parse_tc_fdb_actions(priv, f->exts, parse_attr, flow, extack);
 	if (err)
 		goto err_free;
@@ -3070,6 +3110,10 @@ mlx5e_add_nic_flow(struct mlx5e_priv *priv,
 	struct mlx5e_tc_flow *flow;
 	int attr_size, err;
 
+	/* multi-chain not supported for NIC rules */
+	if (!tc_cls_can_offload_and_chain0(priv->netdev, &f->common))
+		return -EOPNOTSUPP;
+
 	flow_flags |= MLX5E_TC_FLOW_NIC;
 	attr_size  = sizeof(struct mlx5_nic_flow_attr);
 	err = mlx5e_alloc_flow(priv, attr_size, f, flow_flags,
@@ -3110,6 +3154,9 @@ mlx5e_tc_add_flow(struct mlx5e_priv *priv,
 
 	get_flags(flags, &flow_flags);
 
+	if (!tc_can_offload_extack(priv->netdev, f->common.extack))
+		return -EOPNOTSUPP;
+
 	if (esw && esw->mode == SRIOV_OFFLOADS)
 		err = mlx5e_add_fdb_flow(priv, f, flow_flags, flow);
 	else
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 42a130455ef8..9eac137790f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -71,7 +71,7 @@ u16 mlx5_eswitch_get_prio_range(struct mlx5_eswitch *esw)
 	if (esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)
 		return FDB_MAX_PRIO;
 
-	return U16_MAX;
+	return 1;
 }
 
 struct mlx5_flow_handle *
-- 
2.17.2

^ permalink raw reply related

* [net-next 11/14] net/mlx5e: Avoid duplicated code for tc offloads add/del fdb rule
From: Saeed Mahameed @ 2018-10-18  0:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Or Gerlitz, Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

The code for adding/deleting fdb flow is repeated when
user-space does flow add/del and when we add/del from
the neigh update path - unify them to avoid the duplication.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 91 ++++++++++---------
 1 file changed, 50 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 8b25850cbf6a..1786e25644ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -823,6 +823,43 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
 			      struct mlx5e_tc_flow *flow,
 			      struct netlink_ext_ack *extack);
 
+static struct mlx5_flow_handle *
+mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw,
+			   struct mlx5e_tc_flow *flow,
+			   struct mlx5_flow_spec *spec,
+			   struct mlx5_esw_flow_attr *attr)
+{
+	struct mlx5_flow_handle *rule;
+
+	rule = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
+	if (IS_ERR(rule))
+		return rule;
+
+	if (attr->mirror_count) {
+		flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, spec, attr);
+		if (IS_ERR(flow->rule[1])) {
+			mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+			return flow->rule[1];
+		}
+	}
+
+	flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
+	return rule;
+}
+
+static void
+mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw,
+			     struct mlx5e_tc_flow *flow,
+			   struct mlx5_esw_flow_attr *attr)
+{
+	flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
+
+	if (attr->mirror_count)
+		mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
+
+	mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
+}
+
 static int
 mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 		      struct mlx5e_tc_flow_parse_attr *parse_attr,
@@ -881,25 +918,15 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 	 * (2) there's an encap action and we're on -EAGAIN (no valid neigh)
 	 */
 	if (encap_err != -EAGAIN) {
-		flow->rule[0] = mlx5_eswitch_add_offloaded_rule(esw, &parse_attr->spec, attr);
+		flow->rule[0] = mlx5e_tc_offload_fdb_rules(esw, flow, &parse_attr->spec, attr);
 		if (IS_ERR(flow->rule[0])) {
 			err = PTR_ERR(flow->rule[0]);
 			goto err_add_rule;
 		}
-
-		if (attr->mirror_count) {
-			flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, &parse_attr->spec, attr);
-			if (IS_ERR(flow->rule[1])) {
-				err = PTR_ERR(flow->rule[1]);
-				goto err_fwd_rule;
-			}
-		}
 	}
 
 	return encap_err;
 
-err_fwd_rule:
-	mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
 err_add_rule:
 	mlx5_fc_destroy(esw->dev, counter);
 err_create_counter:
@@ -920,12 +947,8 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
 
-	if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
-		flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
-		if (attr->mirror_count)
-			mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
-		mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
-	}
+	if (flow->flags & MLX5E_TC_FLOW_OFFLOADED)
+		mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr);
 
 	mlx5_eswitch_del_vlan_action(esw, attr);
 
@@ -946,6 +969,8 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5_esw_flow_attr *esw_attr;
+	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_spec *spec;
 	struct mlx5e_tc_flow *flow;
 	int err;
 
@@ -964,26 +989,16 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
 	list_for_each_entry(flow, &e->flows, encap) {
 		esw_attr = flow->esw_attr;
 		esw_attr->encap_id = e->encap_id;
-		flow->rule[0] = mlx5_eswitch_add_offloaded_rule(esw, &esw_attr->parse_attr->spec, esw_attr);
-		if (IS_ERR(flow->rule[0])) {
-			err = PTR_ERR(flow->rule[0]);
+		spec = &esw_attr->parse_attr->spec;
+
+		rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, esw_attr);
+		if (IS_ERR(rule)) {
+			err = PTR_ERR(rule);
 			mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
 				       err);
 			continue;
 		}
-
-		if (esw_attr->mirror_count) {
-			flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, &esw_attr->parse_attr->spec, esw_attr);
-			if (IS_ERR(flow->rule[1])) {
-				mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], esw_attr);
-				err = PTR_ERR(flow->rule[1]);
-				mlx5_core_warn(priv->mdev, "Failed to update cached mirror flow, %d\n",
-					       err);
-				continue;
-			}
-		}
-
-		flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
+		flow->rule[0] = rule;
 	}
 }
 
@@ -994,14 +1009,8 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
 	struct mlx5e_tc_flow *flow;
 
 	list_for_each_entry(flow, &e->flows, encap) {
-		if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
-			struct mlx5_esw_flow_attr *attr = flow->esw_attr;
-
-			flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
-			if (attr->mirror_count)
-				mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
-			mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
-		}
+		if (flow->flags & MLX5E_TC_FLOW_OFFLOADED)
+			mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr);
 	}
 
 	if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
-- 
2.17.2

^ permalink raw reply related

* [net-next 12/14] net/mlx5: E-Switch, Enable setting goto slow path chain action
From: Saeed Mahameed @ 2018-10-18  0:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>

From: Paul Blakey <paulb@mellanox.com>

A pre-step for the tc offloads code to use this when a neigh is
not available for encap rules.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 2 ++
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 54215f4312fa..aaafc9f17115 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -60,6 +60,7 @@
 	MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_to_table)
 
 #define FDB_MAX_CHAIN 3
+#define FDB_SLOW_PATH_CHAIN (FDB_MAX_CHAIN + 1)
 #define FDB_MAX_PRIO 16
 
 struct vport_ingress {
@@ -356,6 +357,7 @@ static inline int  mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs,
 static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {}
 
 #define FDB_MAX_CHAIN 1
+#define FDB_SLOW_PATH_CHAIN (FDB_MAX_CHAIN + 1)
 #define FDB_MAX_PRIO 1
 
 #endif /* CONFIG_MLX5_ESWITCH */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 289f1992f624..42a130455ef8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -671,6 +671,9 @@ esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level)
 	int table_prio, l = 0;
 	u32 flags = 0;
 
+	if (chain == FDB_SLOW_PATH_CHAIN)
+		return esw->fdb_table.offloads.slow_fdb;
+
 	mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock);
 
 	fdb = fdb_prio_table(esw, chain, prio, level).fdb;
@@ -730,6 +733,9 @@ esw_put_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level)
 {
 	int l;
 
+	if (chain == FDB_SLOW_PATH_CHAIN)
+		return;
+
 	mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock);
 
 	for (l = level; l >= 0; l--) {
-- 
2.17.2

^ permalink raw reply related

* [net-next 09/14] net/mlx5: Add a no-append flow insertion mode
From: Saeed Mahameed @ 2018-10-18  0:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>

From: Paul Blakey <paulb@mellanox.com>

If no-append flag is set, we will add a new FTE, instead of appending
the actions of the inserted rule when the same match already exists.

While here, move the has_flow_tag boolean indicator to be a flag too.

This patch doesn't change any functionality.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanmox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c                  |  6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  2 +-
 .../net/ethernet/mellanox/mlx5/core/fpga/ipsec.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |  9 ++++++++-
 include/linux/mlx5/fs.h                            | 14 +++++++++++---
 5 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5ced0cc46ba1..af32899bb72a 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2793,7 +2793,7 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
 			return -EINVAL;
 
 		action->flow_tag = ib_spec->flow_tag.tag_id;
-		action->has_flow_tag = true;
+		action->flags |= FLOW_ACT_HAS_TAG;
 		break;
 	case IB_FLOW_SPEC_ACTION_DROP:
 		if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
@@ -2886,7 +2886,7 @@ is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
 		return egress ? VALID_SPEC_INVALID : VALID_SPEC_NA;
 
 	return is_crypto && is_ipsec &&
-		(!egress || (!is_drop && !flow_act->has_flow_tag)) ?
+		(!egress || (!is_drop && !(flow_act->flags & FLOW_ACT_HAS_TAG))) ?
 		VALID_SPEC_VALID : VALID_SPEC_INVALID;
 }
 
@@ -3349,7 +3349,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
 					MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
 	}
 
-	if (flow_act.has_flow_tag &&
+	if ((flow_act.flags & FLOW_ACT_HAS_TAG)  &&
 	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
 	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
 		mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 6c04e11f9a05..a9c68b7859b4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -684,9 +684,9 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 	struct mlx5_flow_destination dest[2] = {};
 	struct mlx5_flow_act flow_act = {
 		.action = attr->action,
-		.has_flow_tag = true,
 		.flow_tag = attr->flow_tag,
 		.reformat_id = 0,
+		.flags    = FLOW_ACT_HAS_TAG,
 	};
 	struct mlx5_fc *counter = NULL;
 	bool table_created = false;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
index 5645a4facad2..28aa8c968a80 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
@@ -650,7 +650,7 @@ static bool mlx5_is_fpga_egress_ipsec_rule(struct mlx5_core_dev *dev,
 	    (match_criteria_enable &
 	     ~(MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS)) ||
 	    (flow_act->action & ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | MLX5_FLOW_CONTEXT_ACTION_ALLOW)) ||
-	     flow_act->has_flow_tag)
+	     (flow_act->flags & FLOW_ACT_HAS_TAG))
 		return false;
 
 	return true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 7eb6d58733ac..67ba4c975d81 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1428,7 +1428,7 @@ static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act
 		return -EEXIST;
 	}
 
-	if (flow_act->has_flow_tag &&
+	if ((flow_act->flags & FLOW_ACT_HAS_TAG) &&
 	    fte->action.flow_tag != flow_act->flow_tag) {
 		mlx5_core_warn(get_dev(&fte->node),
 			       "FTE flow tag %u already exists with different flow tag %u\n",
@@ -1628,6 +1628,8 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
 
 search_again_locked:
 	version = matched_fgs_get_version(match_head);
+	if (flow_act->flags & FLOW_ACT_NO_APPEND)
+		goto skip_search;
 	/* Try to find a fg that already contains a matching fte */
 	list_for_each_entry(iter, match_head, list) {
 		struct fs_fte *fte_tmp;
@@ -1644,6 +1646,11 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
 		return rule;
 	}
 
+skip_search:
+	/* No group with matching fte found, or we skipped the search.
+	 * Try to add a new fte to any matching fg.
+	 */
+
 	/* Check the ft version, for case that new flow group
 	 * was added while the fgs weren't locked
 	 */
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index f8d00872c7d3..5660f07d3be0 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -158,20 +158,28 @@ struct mlx5_fs_vlan {
 
 #define MLX5_FS_VLAN_DEPTH	2
 
+enum {
+	FLOW_ACT_HAS_TAG   = BIT(0),
+	FLOW_ACT_NO_APPEND = BIT(1),
+};
+
 struct mlx5_flow_act {
 	u32 action;
-	bool has_flow_tag;
 	u32 flow_tag;
 	u32 reformat_id;
 	u32 modify_id;
 	uintptr_t esp_id;
+	u32 flags;
 	struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH];
 	struct ib_counters *counters;
 };
 
 #define MLX5_DECLARE_FLOW_ACT(name) \
-	struct mlx5_flow_act name = {MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\
-				     MLX5_FS_DEFAULT_FLOW_TAG, 0, 0}
+	struct mlx5_flow_act name = { .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\
+				      .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG, \
+				      .reformat_id = 0, \
+				      .modify_id = 0, \
+				      .flags =  0, }
 
 /* Single destination per rule.
  * Group ID is implied by the match criteria.
-- 
2.17.2

^ permalink raw reply related

* [net-next 10/14] net/mlx5e: For TC offloads, always add new flow instead of appending the actions
From: Saeed Mahameed @ 2018-10-18  0:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>

From: Paul Blakey <paulb@mellanox.com>

When replacing a tc flower rule, flower first requests to add the
new rule (new action), then deletes the old one.
But currently when asked to add a new tc flower flow, we append the
actions (and counters to it).

This can result in a fte with two flow counters or conflicting
actions (drop and encap action) which firmware complains/errs
about and isn't achieving what the user aimed for.

Instead, insert the flow using the new no-append flag which will add a
new HW rule, the old flow and rule will be deleted later by flower

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanmox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c            | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index a9c68b7859b4..8b25850cbf6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -686,7 +686,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 		.action = attr->action,
 		.flow_tag = attr->flow_tag,
 		.reformat_id = 0,
-		.flags    = FLOW_ACT_HAS_TAG,
+		.flags    = FLOW_ACT_HAS_TAG | FLOW_ACT_NO_APPEND,
 	};
 	struct mlx5_fc *counter = NULL;
 	bool table_created = false;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 8501b6c31c02..289f1992f624 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -80,8 +80,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
+	struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, };
 	bool mirror = !!(attr->mirror_count);
-	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_handle *rule;
 	struct mlx5_flow_table *fdb;
 	int j, i = 0;
@@ -195,7 +195,7 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 			  struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
-	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, };
 	struct mlx5_flow_table *fast_fdb;
 	struct mlx5_flow_table *fwd_fdb;
 	struct mlx5_flow_handle *rule;
-- 
2.17.2

^ permalink raw reply related

* [net-next 08/14] net/mlx5: E-Switch, Add chains and priorities
From: Saeed Mahameed @ 2018-10-18  0:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>

From: Paul Blakey <paulb@mellanox.com>

A chain is a group of priorities, so use the fdb parallel
sub namespaces to implement chains, and a flow table for each
priority in them.

Because these namespaces are parallel and in series to the slow path
fdb, the chains aren't connected to one another (but to the slow path),
and one must use a explicit goto action to reach a different chain.

Flow tables for the priorities will be created on demand and destroyed
once not used.

The Firmware has four pools of tables for sizes S/XS/M/L (4k, 64k, 1m, 4m).
We maintain ghost copies of the pools occupancy.

When a new table is to be created, we scan the pools from large to small
and find the 1st table size which can be now created. When a table is
destroyed, we update the relevant pool.

Multi chain/prio isn't enabled yet by this patch, for now all flows
will use the default chain 0, and prio 1.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   |   3 +
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  32 +-
 .../mellanox/mlx5/core/eswitch_offloads.c     | 390 ++++++++++++++----
 3 files changed, 339 insertions(+), 86 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 7487bdd55f23..6c04e11f9a05 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -837,6 +837,9 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 	struct mlx5e_priv *out_priv;
 	int err = 0, encap_err = 0;
 
+	/* keep the old behaviour, use same prio for all offloaded rules */
+	attr->prio = 1;
+
 	if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) {
 		out_dev = __dev_get_by_index(dev_net(priv->netdev),
 					     attr->parse_attr->mirred_ifindex);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 584e735bbad1..54215f4312fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -123,6 +123,13 @@ struct mlx5_vport {
 	u16                     enabled_events;
 };
 
+enum offloads_fdb_flags {
+	ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED = BIT(0),
+};
+
+extern const unsigned int ESW_POOLS[4];
+
+#define PRIO_LEVELS 2
 struct mlx5_eswitch_fdb {
 	union {
 		struct legacy_fdb {
@@ -133,16 +140,24 @@ struct mlx5_eswitch_fdb {
 		} legacy;
 
 		struct offloads_fdb {
-			struct mlx5_flow_table *fast_fdb;
-			struct mlx5_flow_table *fwd_fdb;
 			struct mlx5_flow_table *slow_fdb;
 			struct mlx5_flow_group *send_to_vport_grp;
 			struct mlx5_flow_group *miss_grp;
 			struct mlx5_flow_handle *miss_rule_uni;
 			struct mlx5_flow_handle *miss_rule_multi;
 			int vlan_push_pop_refcount;
+
+			struct {
+				struct mlx5_flow_table *fdb;
+				u32 num_rules;
+			} fdb_prio[FDB_MAX_CHAIN + 1][FDB_MAX_PRIO + 1][PRIO_LEVELS];
+			/* Protects fdb_prio table */
+			struct mutex fdb_prio_lock;
+
+			int fdb_left[ARRAY_SIZE(ESW_POOLS)];
 		} offloads;
 	};
+	u32 flags;
 };
 
 struct mlx5_esw_offload {
@@ -184,6 +199,7 @@ struct mlx5_eswitch {
 
 	struct mlx5_esw_offload offloads;
 	int                     mode;
+	int                     nvports;
 };
 
 void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports);
@@ -236,6 +252,15 @@ mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
 			  struct mlx5_flow_handle *rule,
 			  struct mlx5_esw_flow_attr *attr);
 
+bool
+mlx5_eswitch_prios_supported(struct mlx5_eswitch *esw);
+
+u16
+mlx5_eswitch_get_prio_range(struct mlx5_eswitch *esw);
+
+u32
+mlx5_eswitch_get_chain_range(struct mlx5_eswitch *esw);
+
 struct mlx5_flow_handle *
 mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport,
 				  struct mlx5_flow_destination *dest);
@@ -274,6 +299,9 @@ struct mlx5_esw_flow_attr {
 	u32	mod_hdr_id;
 	u8	match_level;
 	struct mlx5_fc *counter;
+	u32	chain;
+	u16	prio;
+	u32	dest_chain;
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 983bb8a80f75..8501b6c31c02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -37,32 +37,59 @@
 #include <linux/mlx5/fs.h>
 #include "mlx5_core.h"
 #include "eswitch.h"
+#include "en.h"
+#include "fs_core.h"
 
 enum {
 	FDB_FAST_PATH = 0,
 	FDB_SLOW_PATH
 };
 
+#define fdb_prio_table(esw, chain, prio, level) \
+	(esw)->fdb_table.offloads.fdb_prio[(chain)][(prio)][(level)]
+
+static struct mlx5_flow_table *
+esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level);
+static void
+esw_put_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level);
+
+bool mlx5_eswitch_prios_supported(struct mlx5_eswitch *esw)
+{
+	return (!!(esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED));
+}
+
+u32 mlx5_eswitch_get_chain_range(struct mlx5_eswitch *esw)
+{
+	if (esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)
+		return FDB_MAX_CHAIN;
+
+	return 0;
+}
+
+u16 mlx5_eswitch_get_prio_range(struct mlx5_eswitch *esw)
+{
+	if (esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)
+		return FDB_MAX_PRIO;
+
+	return U16_MAX;
+}
+
 struct mlx5_flow_handle *
 mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
 				struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
+	bool mirror = !!(attr->mirror_count);
 	struct mlx5_flow_act flow_act = {0};
-	struct mlx5_flow_table *ft = NULL;
 	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_table *fdb;
 	int j, i = 0;
 	void *misc;
 
 	if (esw->mode != SRIOV_OFFLOADS)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	if (attr->mirror_count)
-		ft = esw->fdb_table.offloads.fwd_fdb;
-	else
-		ft = esw->fdb_table.offloads.fast_fdb;
-
 	flow_act.action = attr->action;
 	/* if per flow vlan pop/push is emulated, don't set that into the firmware */
 	if (!mlx5_eswitch_vlan_actions_supported(esw->dev, 1))
@@ -80,13 +107,28 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	}
 
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
-		for (j = attr->mirror_count; j < attr->out_count; j++) {
-			dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-			dest[i].vport.num = attr->out_rep[j]->vport;
-			dest[i].vport.vhca_id =
-				MLX5_CAP_GEN(attr->out_mdev[j], vhca_id);
-			dest[i].vport.vhca_id_valid = !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
+		if (attr->dest_chain) {
+			struct mlx5_flow_table *ft;
+
+			ft = esw_get_prio_table(esw, attr->dest_chain, 1, 0);
+			if (IS_ERR(ft)) {
+				rule = ERR_CAST(ft);
+				goto err_create_goto_table;
+			}
+
+			dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+			dest[i].ft = ft;
 			i++;
+		} else {
+			for (j = attr->mirror_count; j < attr->out_count; j++) {
+				dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+				dest[i].vport.num = attr->out_rep[j]->vport;
+				dest[i].vport.vhca_id =
+					MLX5_CAP_GEN(attr->out_mdev[j], vhca_id);
+				dest[i].vport.vhca_id_valid =
+					!!MLX5_CAP_ESW(esw->dev, merged_eswitch);
+				i++;
+			}
 		}
 	}
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
@@ -124,13 +166,26 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
 		flow_act.reformat_id = attr->encap_id;
 
-	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, i);
+	fdb = esw_get_prio_table(esw, attr->chain, attr->prio, !!mirror);
+	if (IS_ERR(fdb)) {
+		rule = ERR_CAST(fdb);
+		goto err_esw_get;
+	}
+
+	rule = mlx5_add_flow_rules(fdb, spec, &flow_act, dest, i);
 	if (IS_ERR(rule))
-		goto out;
+		goto err_add_rule;
 	else
 		esw->offloads.num_flows++;
 
-out:
+	return rule;
+
+err_add_rule:
+	esw_put_prio_table(esw, attr->chain, attr->prio, !!mirror);
+err_esw_get:
+	if (attr->dest_chain)
+		esw_put_prio_table(esw, attr->dest_chain, 1, 0);
+err_create_goto_table:
 	return rule;
 }
 
@@ -141,10 +196,24 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 {
 	struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
 	struct mlx5_flow_act flow_act = {0};
+	struct mlx5_flow_table *fast_fdb;
+	struct mlx5_flow_table *fwd_fdb;
 	struct mlx5_flow_handle *rule;
 	void *misc;
 	int i;
 
+	fast_fdb = esw_get_prio_table(esw, attr->chain, attr->prio, 0);
+	if (IS_ERR(fast_fdb)) {
+		rule = ERR_CAST(fast_fdb);
+		goto err_get_fast;
+	}
+
+	fwd_fdb = esw_get_prio_table(esw, attr->chain, attr->prio, 1);
+	if (IS_ERR(fwd_fdb)) {
+		rule = ERR_CAST(fwd_fdb);
+		goto err_get_fwd;
+	}
+
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	for (i = 0; i < attr->mirror_count; i++) {
 		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
@@ -154,7 +223,7 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 		dest[i].vport.vhca_id_valid = !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
 	}
 	dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
-	dest[i].ft = esw->fdb_table.offloads.fwd_fdb,
+	dest[i].ft = fwd_fdb,
 	i++;
 
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
@@ -177,21 +246,49 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
 		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
 					      MLX5_MATCH_MISC_PARAMETERS;
 
-	rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fast_fdb, spec, &flow_act, dest, i);
+	rule = mlx5_add_flow_rules(fast_fdb, spec, &flow_act, dest, i);
 
-	if (!IS_ERR(rule))
-		esw->offloads.num_flows++;
+	if (IS_ERR(rule))
+		goto add_err;
 
+	esw->offloads.num_flows++;
+
+	return rule;
+add_err:
+	esw_put_prio_table(esw, attr->chain, attr->prio, 1);
+err_get_fwd:
+	esw_put_prio_table(esw, attr->chain, attr->prio, 0);
+err_get_fast:
 	return rule;
 }
 
+static void
+__mlx5_eswitch_del_rule(struct mlx5_eswitch *esw,
+			struct mlx5_flow_handle *rule,
+			struct mlx5_esw_flow_attr *attr,
+			bool fwd_rule)
+{
+	bool mirror = (attr->mirror_count > 0);
+
+	mlx5_del_flow_rules(rule);
+	esw->offloads.num_flows--;
+
+	if (fwd_rule)  {
+		esw_put_prio_table(esw, attr->chain, attr->prio, 1);
+		esw_put_prio_table(esw, attr->chain, attr->prio, 0);
+	} else {
+		esw_put_prio_table(esw, attr->chain, attr->prio, !!mirror);
+		if (attr->dest_chain)
+			esw_put_prio_table(esw, attr->dest_chain, 1, 0);
+	}
+}
+
 void
 mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_handle *rule,
 				struct mlx5_esw_flow_attr *attr)
 {
-	mlx5_del_flow_rules(rule);
-	esw->offloads.num_flows--;
+	__mlx5_eswitch_del_rule(esw, rule, attr, false);
 }
 
 void
@@ -199,7 +296,7 @@ mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
 			  struct mlx5_flow_handle *rule,
 			  struct mlx5_esw_flow_attr *attr)
 {
-	mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+	__mlx5_eswitch_del_rule(esw, rule, attr, true);
 }
 
 static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
@@ -288,7 +385,8 @@ int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
 
 	push = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH);
 	pop  = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP);
-	fwd  = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST);
+	fwd  = !!((attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) &&
+		   !attr->dest_chain);
 
 	err = esw_add_vlan_action_check(attr, push, pop, fwd);
 	if (err)
@@ -495,75 +593,164 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 
 #define ESW_OFFLOADS_NUM_GROUPS  4
 
-static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
+/* Firmware currently has 4 pool of 4 sizes that it supports (ESW_POOLS),
+ * and a virtual memory region of 16M (ESW_SIZE), this region is duplicated
+ * for each flow table pool. We can allocate up to 16M of each pool,
+ * and we keep track of how much we used via put/get_sz_to_pool.
+ * Firmware doesn't report any of this for now.
+ * ESW_POOL is expected to be sorted from large to small
+ */
+#define ESW_SIZE (16 * 1024 * 1024)
+const unsigned int ESW_POOLS[4] = { 4 * 1024 * 1024, 1 * 1024 * 1024,
+				    64 * 1024, 4 * 1024 };
+
+static int
+get_sz_from_pool(struct mlx5_eswitch *esw)
+{
+	int sz = 0, i;
+
+	for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++) {
+		if (esw->fdb_table.offloads.fdb_left[i]) {
+			--esw->fdb_table.offloads.fdb_left[i];
+			sz = ESW_POOLS[i];
+			break;
+		}
+	}
+
+	return sz;
+}
+
+static void
+put_sz_to_pool(struct mlx5_eswitch *esw, int sz)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++) {
+		if (sz >= ESW_POOLS[i]) {
+			++esw->fdb_table.offloads.fdb_left[i];
+			break;
+		}
+	}
+}
+
+static struct mlx5_flow_table *
+create_next_size_table(struct mlx5_eswitch *esw,
+		       struct mlx5_flow_namespace *ns,
+		       u16 table_prio,
+		       int level,
+		       u32 flags)
+{
+	struct mlx5_flow_table *fdb;
+	int sz;
+
+	sz = get_sz_from_pool(esw);
+	if (!sz)
+		return ERR_PTR(-ENOSPC);
+
+	fdb = mlx5_create_auto_grouped_flow_table(ns,
+						  table_prio,
+						  sz,
+						  ESW_OFFLOADS_NUM_GROUPS,
+						  level,
+						  flags);
+	if (IS_ERR(fdb)) {
+		esw_warn(esw->dev, "Failed to create FDB Table err %d (table prio: %d, level: %d, size: %d)\n",
+			 (int)PTR_ERR(fdb), table_prio, level, sz);
+		put_sz_to_pool(esw, sz);
+	}
+
+	return fdb;
+}
+
+static struct mlx5_flow_table *
+esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level)
 {
 	struct mlx5_core_dev *dev = esw->dev;
-	struct mlx5_flow_namespace *root_ns;
 	struct mlx5_flow_table *fdb = NULL;
-	int esw_size, err = 0;
+	struct mlx5_flow_namespace *ns;
+	int table_prio, l = 0;
 	u32 flags = 0;
-	u32 max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
-				MLX5_CAP_GEN(dev, max_flow_counter_15_0);
 
-	root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
-	if (!root_ns) {
-		esw_warn(dev, "Failed to get FDB flow namespace\n");
-		err = -EOPNOTSUPP;
-		goto out_namespace;
-	}
-
-	esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n",
-		  MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size),
-		  max_flow_counter, ESW_OFFLOADS_NUM_GROUPS);
+	mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock);
 
-	esw_size = min_t(int, max_flow_counter * ESW_OFFLOADS_NUM_GROUPS,
-			 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
+	fdb = fdb_prio_table(esw, chain, prio, level).fdb;
+	if (fdb) {
+		/* take ref on earlier levels as well */
+		while (level >= 0)
+			fdb_prio_table(esw, chain, prio, level--).num_rules++;
+		mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+		return fdb;
+	}
 
-	if (mlx5_esw_has_fwd_fdb(dev))
-		esw_size >>= 1;
+	ns = mlx5_get_fdb_sub_ns(dev, chain);
+	if (!ns) {
+		esw_warn(dev, "Failed to get FDB sub namespace\n");
+		mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+		return ERR_PTR(-EOPNOTSUPP);
+	}
 
 	if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
 		flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
 			  MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
 
-	fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
-						  esw_size,
-						  ESW_OFFLOADS_NUM_GROUPS, 0,
-						  flags);
-	if (IS_ERR(fdb)) {
-		err = PTR_ERR(fdb);
-		esw_warn(dev, "Failed to create Fast path FDB Table err %d\n", err);
-		goto out_namespace;
-	}
-	esw->fdb_table.offloads.fast_fdb = fdb;
+	table_prio = (chain * FDB_MAX_PRIO) + prio - 1;
 
-	if (!mlx5_esw_has_fwd_fdb(dev))
-		goto out_namespace;
+	/* create earlier levels for correct fs_core lookup when
+	 * connecting tables
+	 */
+	for (l = 0; l <= level; l++) {
+		if (fdb_prio_table(esw, chain, prio, l).fdb) {
+			fdb_prio_table(esw, chain, prio, l).num_rules++;
+			continue;
+		}
 
-	fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
-						  esw_size,
-						  ESW_OFFLOADS_NUM_GROUPS, 1,
-						  flags);
-	if (IS_ERR(fdb)) {
-		err = PTR_ERR(fdb);
-		esw_warn(dev, "Failed to create fwd table err %d\n", err);
-		goto out_ft;
+		fdb = create_next_size_table(esw, ns, table_prio, l, flags);
+		if (IS_ERR(fdb)) {
+			l--;
+			goto err_create_fdb;
+		}
+
+		fdb_prio_table(esw, chain, prio, l).fdb = fdb;
+		fdb_prio_table(esw, chain, prio, l).num_rules = 1;
 	}
-	esw->fdb_table.offloads.fwd_fdb = fdb;
 
-	return err;
+	mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+	return fdb;
 
-out_ft:
-	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
-out_namespace:
-	return err;
+err_create_fdb:
+	mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+	if (l >= 0)
+		esw_put_prio_table(esw, chain, prio, l);
+
+	return fdb;
 }
 
-static void esw_destroy_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
+static void
+esw_put_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level)
 {
-	if (mlx5_esw_has_fwd_fdb(esw->dev))
-		mlx5_destroy_flow_table(esw->fdb_table.offloads.fwd_fdb);
-	mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
+	int l;
+
+	mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock);
+
+	for (l = level; l >= 0; l--) {
+		if (--(fdb_prio_table(esw, chain, prio, l).num_rules) > 0)
+			continue;
+
+		put_sz_to_pool(esw, fdb_prio_table(esw, chain, prio, l).fdb->max_fte);
+		mlx5_destroy_flow_table(fdb_prio_table(esw, chain, prio, l).fdb);
+		fdb_prio_table(esw, chain, prio, l).fdb = NULL;
+	}
+
+	mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+}
+
+static void esw_destroy_offloads_fast_fdb_tables(struct mlx5_eswitch *esw)
+{
+	/* If lazy creation isn't supported, deref the fast path tables */
+	if (!(esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)) {
+		esw_put_prio_table(esw, 0, 1, 1);
+		esw_put_prio_table(esw, 0, 1, 0);
+	}
 }
 
 #define MAX_PF_SQ 256
@@ -574,12 +761,13 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
 	struct mlx5_flow_table_attr ft_attr = {};
 	struct mlx5_core_dev *dev = esw->dev;
+	u32 *flow_group_in, max_flow_counter;
 	struct mlx5_flow_namespace *root_ns;
 	struct mlx5_flow_table *fdb = NULL;
-	int table_size, ix, err = 0;
+	int table_size, ix, err = 0, i;
 	struct mlx5_flow_group *g;
+	u32 flags = 0, fdb_max;
 	void *match_criteria;
-	u32 *flow_group_in;
 	u8 *dmac;
 
 	esw_debug(esw->dev, "Create offloads FDB Tables\n");
@@ -594,12 +782,29 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 		goto ns_err;
 	}
 
-	err = esw_create_offloads_fast_fdb_table(esw);
-	if (err)
-		goto fast_fdb_err;
+	max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
+			    MLX5_CAP_GEN(dev, max_flow_counter_15_0);
+	fdb_max = 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size);
+
+	esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d), groups(%d), max flow table size(2^%d))\n",
+		  MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size),
+		  max_flow_counter, ESW_OFFLOADS_NUM_GROUPS,
+		  fdb_max);
+
+	for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++)
+		esw->fdb_table.offloads.fdb_left[i] =
+			ESW_POOLS[i] <= fdb_max ? ESW_SIZE / ESW_POOLS[i] : 0;
 
 	table_size = nvports * MAX_SQ_NVPORTS + MAX_PF_SQ + 2;
 
+	/* create the slow path fdb with encap set, so further table instances
+	 * can be created at run time while VFs are probed if the FW allows that.
+	 */
+	if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
+		flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
+			  MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
+
+	ft_attr.flags = flags;
 	ft_attr.max_fte = table_size;
 	ft_attr.prio = FDB_SLOW_PATH;
 
@@ -611,6 +816,18 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 	}
 	esw->fdb_table.offloads.slow_fdb = fdb;
 
+	/* If lazy creation isn't supported, open the fast path tables now */
+	if (!MLX5_CAP_ESW_FLOWTABLE(esw->dev, multi_fdb_encap) &&
+	    esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) {
+		esw->fdb_table.flags &= ~ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED;
+		esw_warn(dev, "Lazy creation of flow tables isn't supported, ignoring priorities\n");
+		esw_get_prio_table(esw, 0, 1, 0);
+		esw_get_prio_table(esw, 0, 1, 1);
+	} else {
+		esw_debug(dev, "Lazy creation of flow tables supported, deferring table opening\n");
+		esw->fdb_table.flags |= ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED;
+	}
+
 	/* create send-to-vport group */
 	memset(flow_group_in, 0, inlen);
 	MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
@@ -658,6 +875,7 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 	if (err)
 		goto miss_rule_err;
 
+	esw->nvports = nvports;
 	kvfree(flow_group_in);
 	return 0;
 
@@ -666,10 +884,9 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 miss_err:
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
 send_vport_err:
+	esw_destroy_offloads_fast_fdb_tables(esw);
 	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
 slow_fdb_err:
-	esw_destroy_offloads_fast_fdb_table(esw);
-fast_fdb_err:
 ns_err:
 	kvfree(flow_group_in);
 	return err;
@@ -677,7 +894,7 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 
 static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
 {
-	if (!esw->fdb_table.offloads.fast_fdb)
+	if (!esw->fdb_table.offloads.slow_fdb)
 		return;
 
 	esw_debug(esw->dev, "Destroy offloads FDB Tables\n");
@@ -687,7 +904,7 @@ static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
 
 	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
-	esw_destroy_offloads_fast_fdb_table(esw);
+	esw_destroy_offloads_fast_fdb_tables(esw);
 }
 
 static int esw_create_offloads_table(struct mlx5_eswitch *esw)
@@ -944,6 +1161,8 @@ int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
 {
 	int err;
 
+	mutex_init(&esw->fdb_table.offloads.fdb_prio_lock);
+
 	err = esw_create_offloads_fdb_tables(esw, nvports);
 	if (err)
 		return err;
@@ -1272,16 +1491,19 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap,
 		return -EOPNOTSUPP;
 	}
 
-	esw_destroy_offloads_fast_fdb_table(esw);
+	esw_destroy_offloads_fdb_tables(esw);
 
 	esw->offloads.encap = encap;
-	err = esw_create_offloads_fast_fdb_table(esw);
+
+	err = esw_create_offloads_fdb_tables(esw, esw->nvports);
+
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Failed re-creating fast FDB table");
 		esw->offloads.encap = !encap;
-		(void)esw_create_offloads_fast_fdb_table(esw);
+		(void)esw_create_offloads_fdb_tables(esw, esw->nvports);
 	}
+
 	return err;
 }
 
-- 
2.17.2

^ permalink raw reply related

* [net-next 07/14] net/mlx5: E-Switch, Have explicit API to delete fwd rules
From: Saeed Mahameed @ 2018-10-18  0:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Or Gerlitz, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

Be symmetric with the e-switch API to add rules which has a
specific function to add fwd rules which are used as part of
vport mirroring.

This patch doesn't change any functionality.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c           | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h         | 4 ++++
 .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c    | 8 ++++++++
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index a4a432f02930..7487bdd55f23 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -920,7 +920,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
 	if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
 		flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
 		if (attr->mirror_count)
-			mlx5_eswitch_del_offloaded_rule(esw, flow->rule[1], attr);
+			mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
 		mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
 	}
 
@@ -996,7 +996,7 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
 
 			flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
 			if (attr->mirror_count)
-				mlx5_eswitch_del_offloaded_rule(esw, flow->rule[1], attr);
+				mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
 			mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
 		}
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 1698a322a7c4..584e735bbad1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -231,6 +231,10 @@ void
 mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_handle *rule,
 				struct mlx5_esw_flow_attr *attr);
+void
+mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
+			  struct mlx5_flow_handle *rule,
+			  struct mlx5_esw_flow_attr *attr);
 
 struct mlx5_flow_handle *
 mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 39932dce15cb..983bb8a80f75 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -194,6 +194,14 @@ mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
 	esw->offloads.num_flows--;
 }
 
+void
+mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
+			  struct mlx5_flow_handle *rule,
+			  struct mlx5_esw_flow_attr *attr)
+{
+	mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+}
+
 static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
 {
 	struct mlx5_eswitch_rep *rep;
-- 
2.17.2

^ permalink raw reply related

* [net-next 06/14] net/mlx5: Split FDB fast path prio to multiple namespaces
From: Saeed Mahameed @ 2018-10-18  0:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>

From: Paul Blakey <paulb@mellanox.com>

Towards supporting multi-chains and priorities, split the FDB fast path
to multiple namespaces (sub namespaces), each with multiple priorities.

This patch adds a new flow steering type, FS_TYPE_PRIO_CHAINS, which is
like current FS_TYPE_PRIO, but may contain only namespaces, and those
will be in parallel to one another in terms of managing of the flow
tables connections inside them. Meaning, while searching for the next
or previous flow table to connect for a new table inside such namespace
we skip the parallel namespaces in the same level under the
FS_TYPE_PRIO_CHAINS prio we originated from.

We use this new type for splitting the fast path prio into multiple
parallel namespaces, each containing normal prios.
The prios inside them (and their tables) will be connected to one
another, but not from one parallel namespace to another, instead the
last prio in each namespace will be connected to the next prio in
the containing FDB namespace, which is the slow path prio.

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/eswitch.c |  2 +-
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  7 ++
 .../net/ethernet/mellanox/mlx5/core/fs_core.c | 88 ++++++++++++++++---
 .../net/ethernet/mellanox/mlx5/core/fs_core.h | 13 +++
 include/linux/mlx5/fs.h                       |  2 +
 5 files changed, 101 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 9c893d7d273e..d004957328f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -263,7 +263,7 @@ static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw)
 	esw_debug(dev, "Create FDB log_max_size(%d)\n",
 		  MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
 
-	root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
+	root_ns = mlx5_get_fdb_sub_ns(dev, 0);
 	if (!root_ns) {
 		esw_warn(dev, "Failed to get FDB flow namespace\n");
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index c1b627577003..1698a322a7c4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -59,6 +59,9 @@
 #define mlx5_esw_has_fwd_fdb(dev) \
 	MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_to_table)
 
+#define FDB_MAX_CHAIN 3
+#define FDB_MAX_PRIO 16
+
 struct vport_ingress {
 	struct mlx5_flow_table *acl;
 	struct mlx5_flow_group *allow_untagged_spoofchk_grp;
@@ -319,6 +322,10 @@ static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
 static inline void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe) {}
 static inline int  mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; }
 static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {}
+
+#define FDB_MAX_CHAIN 1
+#define FDB_MAX_PRIO 1
+
 #endif /* CONFIG_MLX5_ESWITCH */
 
 #endif /* __MLX5_ESWITCH_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index cdcbf9d0ae6c..7eb6d58733ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -40,6 +40,7 @@
 #include "diag/fs_tracepoint.h"
 #include "accel/ipsec.h"
 #include "fpga/ipsec.h"
+#include "eswitch.h"
 
 #define INIT_TREE_NODE_ARRAY_SIZE(...)	(sizeof((struct init_tree_node[]){__VA_ARGS__}) /\
 					 sizeof(struct init_tree_node))
@@ -713,7 +714,7 @@ static struct mlx5_flow_table *find_closest_ft_recursive(struct fs_node  *root,
 	struct fs_node *iter = list_entry(start, struct fs_node, list);
 	struct mlx5_flow_table *ft = NULL;
 
-	if (!root)
+	if (!root || root->type == FS_TYPE_PRIO_CHAINS)
 		return NULL;
 
 	list_for_each_advance_continue(iter, &root->children, reverse) {
@@ -1973,6 +1974,18 @@ void mlx5_destroy_flow_group(struct mlx5_flow_group *fg)
 			       fg->id);
 }
 
+struct mlx5_flow_namespace *mlx5_get_fdb_sub_ns(struct mlx5_core_dev *dev,
+						int n)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+
+	if (!steering || !steering->fdb_sub_ns)
+		return NULL;
+
+	return steering->fdb_sub_ns[n];
+}
+EXPORT_SYMBOL(mlx5_get_fdb_sub_ns);
+
 struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 						    enum mlx5_flow_namespace_type type)
 {
@@ -2051,8 +2064,10 @@ struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_d
 	}
 }
 
-static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
-				      unsigned int prio, int num_levels)
+static struct fs_prio *_fs_create_prio(struct mlx5_flow_namespace *ns,
+				       unsigned int prio,
+				       int num_levels,
+				       enum fs_node_type type)
 {
 	struct fs_prio *fs_prio;
 
@@ -2060,7 +2075,7 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
 	if (!fs_prio)
 		return ERR_PTR(-ENOMEM);
 
-	fs_prio->node.type = FS_TYPE_PRIO;
+	fs_prio->node.type = type;
 	tree_init_node(&fs_prio->node, NULL, del_sw_prio);
 	tree_add_node(&fs_prio->node, &ns->node);
 	fs_prio->num_levels = num_levels;
@@ -2070,6 +2085,19 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
 	return fs_prio;
 }
 
+static struct fs_prio *fs_create_prio_chained(struct mlx5_flow_namespace *ns,
+					      unsigned int prio,
+					      int num_levels)
+{
+	return _fs_create_prio(ns, prio, num_levels, FS_TYPE_PRIO_CHAINS);
+}
+
+static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
+				      unsigned int prio, int num_levels)
+{
+	return _fs_create_prio(ns, prio, num_levels, FS_TYPE_PRIO);
+}
+
 static struct mlx5_flow_namespace *fs_init_namespace(struct mlx5_flow_namespace
 						     *ns)
 {
@@ -2374,6 +2402,9 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
 	cleanup_egress_acls_root_ns(dev);
 	cleanup_ingress_acls_root_ns(dev);
 	cleanup_root_ns(steering->fdb_root_ns);
+	steering->fdb_root_ns = NULL;
+	kfree(steering->fdb_sub_ns);
+	steering->fdb_sub_ns = NULL;
 	cleanup_root_ns(steering->sniffer_rx_root_ns);
 	cleanup_root_ns(steering->sniffer_tx_root_ns);
 	cleanup_root_ns(steering->egress_root_ns);
@@ -2419,27 +2450,64 @@ static int init_sniffer_rx_root_ns(struct mlx5_flow_steering *steering)
 
 static int init_fdb_root_ns(struct mlx5_flow_steering *steering)
 {
-	struct fs_prio *prio;
+	struct mlx5_flow_namespace *ns;
+	struct fs_prio *maj_prio;
+	struct fs_prio *min_prio;
+	int levels;
+	int chain;
+	int prio;
+	int err;
 
 	steering->fdb_root_ns = create_root_ns(steering, FS_FT_FDB);
 	if (!steering->fdb_root_ns)
 		return -ENOMEM;
 
-	prio = fs_create_prio(&steering->fdb_root_ns->ns, 0, 2);
-	if (IS_ERR(prio))
+	steering->fdb_sub_ns = kzalloc(sizeof(steering->fdb_sub_ns) *
+				       FDB_MAX_CHAIN + 1, GFP_KERNEL);
+	if (!steering->fdb_sub_ns)
+		return -ENOMEM;
+
+	levels = 2 * FDB_MAX_PRIO * (FDB_MAX_CHAIN + 1);
+	maj_prio = fs_create_prio_chained(&steering->fdb_root_ns->ns, 0,
+					  levels);
+	if (IS_ERR(maj_prio)) {
+		err = PTR_ERR(maj_prio);
 		goto out_err;
+	}
 
-	prio = fs_create_prio(&steering->fdb_root_ns->ns, 1, 1);
-	if (IS_ERR(prio))
+	for (chain = 0; chain <= FDB_MAX_CHAIN; chain++) {
+		ns = fs_create_namespace(maj_prio);
+		if (IS_ERR(ns)) {
+			err = PTR_ERR(ns);
+			goto out_err;
+		}
+
+		for (prio = 0; prio < FDB_MAX_PRIO * (chain + 1); prio++) {
+			min_prio = fs_create_prio(ns, prio, 2);
+			if (IS_ERR(min_prio)) {
+				err = PTR_ERR(min_prio);
+				goto out_err;
+			}
+		}
+
+		steering->fdb_sub_ns[chain] = ns;
+	}
+
+	maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, 1, 1);
+	if (IS_ERR(maj_prio)) {
+		err = PTR_ERR(maj_prio);
 		goto out_err;
+	}
 
 	set_prio_attrs(steering->fdb_root_ns);
 	return 0;
 
 out_err:
 	cleanup_root_ns(steering->fdb_root_ns);
+	kfree(steering->fdb_sub_ns);
+	steering->fdb_sub_ns = NULL;
 	steering->fdb_root_ns = NULL;
-	return PTR_ERR(prio);
+	return err;
 }
 
 static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering, int vport)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index a06f83c0c2b6..b51ad217da32 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -38,9 +38,21 @@
 #include <linux/rhashtable.h>
 #include <linux/llist.h>
 
+/* FS_TYPE_PRIO_CHAINS is a PRIO that will have namespaces only,
+ * and those are in parallel to one another when going over them to connect
+ * a new flow table. Meaning the last flow table in a TYPE_PRIO prio in one
+ * parallel namespace will not automatically connect to the first flow table
+ * found in any prio in any next namespace, but skip the entire containing
+ * TYPE_PRIO_CHAINS prio.
+ *
+ * This is used to implement tc chains, each chain of prios is a different
+ * namespace inside a containing TYPE_PRIO_CHAINS prio.
+ */
+
 enum fs_node_type {
 	FS_TYPE_NAMESPACE,
 	FS_TYPE_PRIO,
+	FS_TYPE_PRIO_CHAINS,
 	FS_TYPE_FLOW_TABLE,
 	FS_TYPE_FLOW_GROUP,
 	FS_TYPE_FLOW_ENTRY,
@@ -73,6 +85,7 @@ struct mlx5_flow_steering {
 	struct kmem_cache               *ftes_cache;
 	struct mlx5_flow_root_namespace *root_ns;
 	struct mlx5_flow_root_namespace *fdb_root_ns;
+	struct mlx5_flow_namespace	**fdb_sub_ns;
 	struct mlx5_flow_root_namespace **esw_egress_root_ns;
 	struct mlx5_flow_root_namespace **esw_ingress_root_ns;
 	struct mlx5_flow_root_namespace	*sniffer_tx_root_ns;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index a5fc62184195..f8d00872c7d3 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -101,6 +101,8 @@ struct mlx5_flow_destination {
 	};
 };
 
+struct mlx5_flow_namespace *
+mlx5_get_fdb_sub_ns(struct mlx5_core_dev *dev, int n);
 struct mlx5_flow_namespace *
 mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 			enum mlx5_flow_namespace_type type);
-- 
2.17.2

^ permalink raw reply related

* [net-next 05/14] net/mlx5: Add cap bits for multi fdb encap
From: Saeed Mahameed @ 2018-10-18  0:08 UTC (permalink / raw)
  To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>

From: Paul Blakey <paulb@mellanox.com>

If set, the firmware supports creating of flow tables with encap
enabled while VFs are configured, if we already created one
(restriction still applies on the first creation).

Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 15e36198f85f..963611820006 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -584,7 +584,9 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 struct mlx5_ifc_flow_table_eswitch_cap_bits {
 	u8      reserved_at_0[0x1c];
 	u8      fdb_multi_path_to_table[0x1];
-	u8      reserved_at_1d[0x1e3];
+	u8      reserved_at_1d[0x1];
+	u8      multi_fdb_encap[0x1];
+	u8      reserved_at_1e[0x1e1];
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb;
 
-- 
2.17.2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox