Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 6/8] nfp: flower: monitor and offload LAG groups
From: Jakub Kicinski @ 2018-05-24  2:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, John Hurley
In-Reply-To: <20180524022255.18548-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Monitor LAG events via the NETDEV_CHANGEUPPER/NETDEV_CHANGELOWERSTATE
notifiers to maintain a list of offloadable groups. Sync these groups with
HW via a delayed workqueue to prevent excessive re-configuration. When the
workqueue is triggered it may generate multiple control messages for
different groups. These messages are linked via a batch ID and flags to
indicate a new batch and the end of a batch.

Update private data in each repr to track their LAG lower state flags. The
state of a repr is used to determine the active netdevs that can be
offloaded. For example, in active-backup mode, we only offload the netdev
currently active.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/Makefile   |   1 +
 .../ethernet/netronome/nfp/flower/lag_conf.c  | 589 ++++++++++++++++++
 .../net/ethernet/netronome/nfp/flower/main.c  |  29 +-
 .../net/ethernet/netronome/nfp/flower/main.h  |  30 +
 4 files changed, 646 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/netronome/nfp/flower/lag_conf.c

diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile
index 6373f56205fd..4afb10375397 100644
--- a/drivers/net/ethernet/netronome/nfp/Makefile
+++ b/drivers/net/ethernet/netronome/nfp/Makefile
@@ -37,6 +37,7 @@ ifeq ($(CONFIG_NFP_APP_FLOWER),y)
 nfp-objs += \
 	    flower/action.o \
 	    flower/cmsg.o \
+	    flower/lag_conf.o \
 	    flower/main.o \
 	    flower/match.o \
 	    flower/metadata.o \
diff --git a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
new file mode 100644
index 000000000000..35a700b879d7
--- /dev/null
+++ b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
@@ -0,0 +1,589 @@
+/*
+ * Copyright (C) 2018 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "main.h"
+
+/* LAG group config flags. */
+#define NFP_FL_LAG_LAST			BIT(1)
+#define NFP_FL_LAG_FIRST		BIT(2)
+#define NFP_FL_LAG_SWITCH		BIT(6)
+#define NFP_FL_LAG_RESET		BIT(7)
+
+/* LAG port state flags. */
+#define NFP_PORT_LAG_LINK_UP		BIT(0)
+#define NFP_PORT_LAG_TX_ENABLED		BIT(1)
+#define NFP_PORT_LAG_CHANGED		BIT(2)
+
+enum nfp_fl_lag_batch {
+	NFP_FL_LAG_BATCH_FIRST,
+	NFP_FL_LAG_BATCH_MEMBER,
+	NFP_FL_LAG_BATCH_FINISHED
+};
+
+/**
+ * struct nfp_flower_cmsg_lag_config - control message payload for LAG config
+ * @ctrl_flags:	Configuration flags
+ * @reserved:	Reserved for future use
+ * @ttl:	Time to live of packet - host always sets to 0xff
+ * @pkt_number:	Config message packet number - increment for each message
+ * @batch_ver:	Batch version of messages - increment for each batch of messages
+ * @group_id:	Group ID applicable
+ * @group_inst:	Group instance number - increment when group is reused
+ * @members:	Array of 32-bit words listing all active group members
+ */
+struct nfp_flower_cmsg_lag_config {
+	u8 ctrl_flags;
+	u8 reserved[2];
+	u8 ttl;
+	__be32 pkt_number;
+	__be32 batch_ver;
+	__be32 group_id;
+	__be32 group_inst;
+	__be32 members[];
+};
+
+/**
+ * struct nfp_fl_lag_group - list entry for each LAG group
+ * @group_id:		Assigned group ID for host/kernel sync
+ * @group_inst:		Group instance in case of ID reuse
+ * @list:		List entry
+ * @master_ndev:	Group master Netdev
+ * @dirty:		Marked if the group needs synced to HW
+ * @offloaded:		Marked if the group is currently offloaded to NIC
+ * @to_remove:		Marked if the group should be removed from NIC
+ * @to_destroy:		Marked if the group should be removed from driver
+ * @slave_cnt:		Number of slaves in group
+ */
+struct nfp_fl_lag_group {
+	unsigned int group_id;
+	u8 group_inst;
+	struct list_head list;
+	struct net_device *master_ndev;
+	bool dirty;
+	bool offloaded;
+	bool to_remove;
+	bool to_destroy;
+	unsigned int slave_cnt;
+};
+
+#define NFP_FL_LAG_PKT_NUMBER_MASK	GENMASK(30, 0)
+#define NFP_FL_LAG_VERSION_MASK		GENMASK(22, 0)
+#define NFP_FL_LAG_HOST_TTL		0xff
+
+/* Use this ID with zero members to ack a batch config */
+#define NFP_FL_LAG_SYNC_ID		0
+#define NFP_FL_LAG_GROUP_MIN		1 /* ID 0 reserved */
+#define NFP_FL_LAG_GROUP_MAX		32 /* IDs 1 to 31 are valid */
+
+/* wait for more config */
+#define NFP_FL_LAG_DELAY		(msecs_to_jiffies(2))
+
+static unsigned int nfp_fl_get_next_pkt_number(struct nfp_fl_lag *lag)
+{
+	lag->pkt_num++;
+	lag->pkt_num &= NFP_FL_LAG_PKT_NUMBER_MASK;
+
+	return lag->pkt_num;
+}
+
+static void nfp_fl_increment_version(struct nfp_fl_lag *lag)
+{
+	/* LSB is not considered by firmware so add 2 for each increment. */
+	lag->batch_ver += 2;
+	lag->batch_ver &= NFP_FL_LAG_VERSION_MASK;
+
+	/* Zero is reserved by firmware. */
+	if (!lag->batch_ver)
+		lag->batch_ver += 2;
+}
+
+static struct nfp_fl_lag_group *
+nfp_fl_lag_group_create(struct nfp_fl_lag *lag, struct net_device *master)
+{
+	struct nfp_fl_lag_group *group;
+	struct nfp_flower_priv *priv;
+	int id;
+
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+
+	id = ida_simple_get(&lag->ida_handle, NFP_FL_LAG_GROUP_MIN,
+			    NFP_FL_LAG_GROUP_MAX, GFP_KERNEL);
+	if (id < 0) {
+		nfp_flower_cmsg_warn(priv->app,
+				     "No more bonding groups available\n");
+		return ERR_PTR(id);
+	}
+
+	group = kmalloc(sizeof(*group), GFP_KERNEL);
+	if (!group) {
+		ida_simple_remove(&lag->ida_handle, id);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	group->group_id = id;
+	group->master_ndev = master;
+	group->dirty = true;
+	group->offloaded = false;
+	group->to_remove = false;
+	group->to_destroy = false;
+	group->slave_cnt = 0;
+	group->group_inst = ++lag->global_inst;
+	list_add_tail(&group->list, &lag->group_list);
+
+	return group;
+}
+
+static struct nfp_fl_lag_group *
+nfp_fl_lag_find_group_for_master_with_lag(struct nfp_fl_lag *lag,
+					  struct net_device *master)
+{
+	struct nfp_fl_lag_group *entry;
+
+	if (!master)
+		return NULL;
+
+	list_for_each_entry(entry, &lag->group_list, list)
+		if (entry->master_ndev == master)
+			return entry;
+
+	return NULL;
+}
+
+static int
+nfp_fl_lag_config_group(struct nfp_fl_lag *lag, struct nfp_fl_lag_group *group,
+			struct net_device **active_members,
+			unsigned int member_cnt, enum nfp_fl_lag_batch *batch)
+{
+	struct nfp_flower_cmsg_lag_config *cmsg_payload;
+	struct nfp_flower_priv *priv;
+	unsigned long int flags;
+	unsigned int size, i;
+	struct sk_buff *skb;
+
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+	size = sizeof(*cmsg_payload) + sizeof(__be32) * member_cnt;
+	skb = nfp_flower_cmsg_alloc(priv->app, size,
+				    NFP_FLOWER_CMSG_TYPE_LAG_CONFIG,
+				    GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	cmsg_payload = nfp_flower_cmsg_get_data(skb);
+	flags = 0;
+
+	/* Increment batch version for each new batch of config messages. */
+	if (*batch == NFP_FL_LAG_BATCH_FIRST) {
+		flags |= NFP_FL_LAG_FIRST;
+		nfp_fl_increment_version(lag);
+		*batch = NFP_FL_LAG_BATCH_MEMBER;
+	}
+
+	/* If it is a reset msg then it is also the end of the batch. */
+	if (lag->rst_cfg) {
+		flags |= NFP_FL_LAG_RESET;
+		*batch = NFP_FL_LAG_BATCH_FINISHED;
+	}
+
+	/* To signal the end of a batch, both the switch and last flags are set
+	 * and the the reserved SYNC group ID is used.
+	 */
+	if (*batch == NFP_FL_LAG_BATCH_FINISHED) {
+		flags |= NFP_FL_LAG_SWITCH | NFP_FL_LAG_LAST;
+		lag->rst_cfg = false;
+		cmsg_payload->group_id = cpu_to_be32(NFP_FL_LAG_SYNC_ID);
+		cmsg_payload->group_inst = 0;
+	} else {
+		cmsg_payload->group_id = cpu_to_be32(group->group_id);
+		cmsg_payload->group_inst = cpu_to_be32(group->group_inst);
+	}
+
+	cmsg_payload->reserved[0] = 0;
+	cmsg_payload->reserved[1] = 0;
+	cmsg_payload->ttl = NFP_FL_LAG_HOST_TTL;
+	cmsg_payload->ctrl_flags = flags;
+	cmsg_payload->batch_ver = cpu_to_be32(lag->batch_ver);
+	cmsg_payload->pkt_number = cpu_to_be32(nfp_fl_get_next_pkt_number(lag));
+
+	for (i = 0; i < member_cnt; i++)
+		cmsg_payload->members[i] =
+			cpu_to_be32(nfp_repr_get_port_id(active_members[i]));
+
+	nfp_ctrl_tx(priv->app->ctrl, skb);
+	return 0;
+}
+
+static void nfp_fl_lag_do_work(struct work_struct *work)
+{
+	enum nfp_fl_lag_batch batch = NFP_FL_LAG_BATCH_FIRST;
+	struct nfp_fl_lag_group *entry, *storage;
+	struct delayed_work *delayed_work;
+	struct nfp_flower_priv *priv;
+	struct nfp_fl_lag *lag;
+	int err;
+
+	delayed_work = to_delayed_work(work);
+	lag = container_of(delayed_work, struct nfp_fl_lag, work);
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+
+	mutex_lock(&lag->lock);
+	list_for_each_entry_safe(entry, storage, &lag->group_list, list) {
+		struct net_device *iter_netdev, **acti_netdevs;
+		struct nfp_flower_repr_priv *repr_priv;
+		int active_count = 0, slaves = 0;
+		struct nfp_repr *repr;
+		unsigned long *flags;
+
+		if (entry->to_remove) {
+			/* Active count of 0 deletes group on hw. */
+			err = nfp_fl_lag_config_group(lag, entry, NULL, 0,
+						      &batch);
+			if (!err) {
+				entry->to_remove = false;
+				entry->offloaded = false;
+			} else {
+				nfp_flower_cmsg_warn(priv->app,
+						     "group delete failed\n");
+				schedule_delayed_work(&lag->work,
+						      NFP_FL_LAG_DELAY);
+				continue;
+			}
+
+			if (entry->to_destroy) {
+				ida_simple_remove(&lag->ida_handle,
+						  entry->group_id);
+				list_del(&entry->list);
+				kfree(entry);
+			}
+			continue;
+		}
+
+		acti_netdevs = kmalloc_array(entry->slave_cnt,
+					     sizeof(*acti_netdevs), GFP_KERNEL);
+
+		/* Include sanity check in the loop. It may be that a bond has
+		 * changed between processing the last notification and the
+		 * work queue triggering. If the number of slaves has changed
+		 * or it now contains netdevs that cannot be offloaded, ignore
+		 * the group until pending notifications are processed.
+		 */
+		rcu_read_lock();
+		for_each_netdev_in_bond_rcu(entry->master_ndev, iter_netdev) {
+			if (!nfp_netdev_is_nfp_repr(iter_netdev)) {
+				slaves = 0;
+				break;
+			}
+
+			repr = netdev_priv(iter_netdev);
+
+			if (repr->app != priv->app) {
+				slaves = 0;
+				break;
+			}
+
+			slaves++;
+			if (slaves > entry->slave_cnt)
+				break;
+
+			/* Check the ports for state changes. */
+			repr_priv = repr->app_priv;
+			flags = &repr_priv->lag_port_flags;
+
+			if (*flags & NFP_PORT_LAG_CHANGED) {
+				*flags &= ~NFP_PORT_LAG_CHANGED;
+				entry->dirty = true;
+			}
+
+			if ((*flags & NFP_PORT_LAG_TX_ENABLED) &&
+			    (*flags & NFP_PORT_LAG_LINK_UP))
+				acti_netdevs[active_count++] = iter_netdev;
+		}
+		rcu_read_unlock();
+
+		if (slaves != entry->slave_cnt || !entry->dirty) {
+			kfree(acti_netdevs);
+			continue;
+		}
+
+		err = nfp_fl_lag_config_group(lag, entry, acti_netdevs,
+					      active_count, &batch);
+		if (!err) {
+			entry->offloaded = true;
+			entry->dirty = false;
+		} else {
+			nfp_flower_cmsg_warn(priv->app,
+					     "group offload failed\n");
+			schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY);
+		}
+
+		kfree(acti_netdevs);
+	}
+
+	/* End the config batch if at least one packet has been batched. */
+	if (batch == NFP_FL_LAG_BATCH_MEMBER) {
+		batch = NFP_FL_LAG_BATCH_FINISHED;
+		err = nfp_fl_lag_config_group(lag, NULL, NULL, 0, &batch);
+		if (err)
+			nfp_flower_cmsg_warn(priv->app,
+					     "group batch end cmsg failed\n");
+	}
+
+	mutex_unlock(&lag->lock);
+}
+
+static void
+nfp_fl_lag_schedule_group_remove(struct nfp_fl_lag *lag,
+				 struct nfp_fl_lag_group *group)
+{
+	group->to_remove = true;
+
+	schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY);
+}
+
+static int
+nfp_fl_lag_schedule_group_delete(struct nfp_fl_lag *lag,
+				 struct net_device *master)
+{
+	struct nfp_fl_lag_group *group;
+
+	mutex_lock(&lag->lock);
+	group = nfp_fl_lag_find_group_for_master_with_lag(lag, master);
+	if (!group) {
+		mutex_unlock(&lag->lock);
+		return -ENOENT;
+	}
+
+	group->to_remove = true;
+	group->to_destroy = true;
+	mutex_unlock(&lag->lock);
+
+	schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY);
+	return 0;
+}
+
+static int
+nfp_fl_lag_changeupper_event(struct nfp_fl_lag *lag,
+			     struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *upper = info->upper_dev, *iter_netdev;
+	struct netdev_lag_upper_info *lag_upper_info;
+	struct nfp_fl_lag_group *group;
+	struct nfp_flower_priv *priv;
+	unsigned int slave_count = 0;
+	bool can_offload = true;
+	struct nfp_repr *repr;
+
+	if (!netif_is_lag_master(upper))
+		return 0;
+
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+
+	rcu_read_lock();
+	for_each_netdev_in_bond_rcu(upper, iter_netdev) {
+		if (!nfp_netdev_is_nfp_repr(iter_netdev)) {
+			can_offload = false;
+			break;
+		}
+		repr = netdev_priv(iter_netdev);
+
+		/* Ensure all ports are created by the same app/on same card. */
+		if (repr->app != priv->app) {
+			can_offload = false;
+			break;
+		}
+
+		slave_count++;
+	}
+	rcu_read_unlock();
+
+	lag_upper_info = info->upper_info;
+
+	/* Firmware supports active/backup and L3/L4 hash bonds. */
+	if (lag_upper_info &&
+	    lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
+	    (lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH ||
+	    (lag_upper_info->hash_type != NETDEV_LAG_HASH_L34 &&
+	    lag_upper_info->hash_type != NETDEV_LAG_HASH_E34))) {
+		can_offload = false;
+		nfp_flower_cmsg_warn(priv->app,
+				     "Unable to offload tx_type %u hash %u\n",
+				     lag_upper_info->tx_type,
+				     lag_upper_info->hash_type);
+	}
+
+	mutex_lock(&lag->lock);
+	group = nfp_fl_lag_find_group_for_master_with_lag(lag, upper);
+
+	if (slave_count == 0 || !can_offload) {
+		/* Cannot offload the group - remove if previously offloaded. */
+		if (group && group->offloaded)
+			nfp_fl_lag_schedule_group_remove(lag, group);
+
+		mutex_unlock(&lag->lock);
+		return 0;
+	}
+
+	if (!group) {
+		group = nfp_fl_lag_group_create(lag, upper);
+		if (IS_ERR(group)) {
+			mutex_unlock(&lag->lock);
+			return PTR_ERR(group);
+		}
+	}
+
+	group->dirty = true;
+	group->slave_cnt = slave_count;
+
+	/* Group may have been on queue for removal but is now offfloable. */
+	group->to_remove = false;
+	mutex_unlock(&lag->lock);
+
+	schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY);
+	return 0;
+}
+
+static int
+nfp_fl_lag_changels_event(struct nfp_fl_lag *lag, struct net_device *netdev,
+			  struct netdev_notifier_changelowerstate_info *info)
+{
+	struct netdev_lag_lower_state_info *lag_lower_info;
+	struct nfp_flower_repr_priv *repr_priv;
+	struct nfp_flower_priv *priv;
+	struct nfp_repr *repr;
+	unsigned long *flags;
+
+	if (!netif_is_lag_port(netdev) || !nfp_netdev_is_nfp_repr(netdev))
+		return 0;
+
+	lag_lower_info = info->lower_state_info;
+	if (!lag_lower_info)
+		return 0;
+
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+	repr = netdev_priv(netdev);
+
+	/* Verify that the repr is associated with this app. */
+	if (repr->app != priv->app)
+		return 0;
+
+	repr_priv = repr->app_priv;
+	flags = &repr_priv->lag_port_flags;
+
+	mutex_lock(&lag->lock);
+	if (lag_lower_info->link_up)
+		*flags |= NFP_PORT_LAG_LINK_UP;
+	else
+		*flags &= ~NFP_PORT_LAG_LINK_UP;
+
+	if (lag_lower_info->tx_enabled)
+		*flags |= NFP_PORT_LAG_TX_ENABLED;
+	else
+		*flags &= ~NFP_PORT_LAG_TX_ENABLED;
+
+	*flags |= NFP_PORT_LAG_CHANGED;
+	mutex_unlock(&lag->lock);
+
+	schedule_delayed_work(&lag->work, NFP_FL_LAG_DELAY);
+	return 0;
+}
+
+static int
+nfp_fl_lag_netdev_event(struct notifier_block *nb, unsigned long event,
+			void *ptr)
+{
+	struct net_device *netdev;
+	struct nfp_fl_lag *lag;
+	int err;
+
+	netdev = netdev_notifier_info_to_dev(ptr);
+	lag = container_of(nb, struct nfp_fl_lag, lag_nb);
+
+	switch (event) {
+	case NETDEV_CHANGEUPPER:
+		err = nfp_fl_lag_changeupper_event(lag, ptr);
+		if (err)
+			return NOTIFY_BAD;
+		return NOTIFY_OK;
+	case NETDEV_CHANGELOWERSTATE:
+		err = nfp_fl_lag_changels_event(lag, netdev, ptr);
+		if (err)
+			return NOTIFY_BAD;
+		return NOTIFY_OK;
+	case NETDEV_UNREGISTER:
+		if (netif_is_bond_master(netdev)) {
+			err = nfp_fl_lag_schedule_group_delete(lag, netdev);
+			if (err)
+				return NOTIFY_BAD;
+			return NOTIFY_OK;
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+int nfp_flower_lag_reset(struct nfp_fl_lag *lag)
+{
+	enum nfp_fl_lag_batch batch = NFP_FL_LAG_BATCH_FIRST;
+
+	lag->rst_cfg = true;
+	return nfp_fl_lag_config_group(lag, NULL, NULL, 0, &batch);
+}
+
+void nfp_flower_lag_init(struct nfp_fl_lag *lag)
+{
+	INIT_DELAYED_WORK(&lag->work, nfp_fl_lag_do_work);
+	INIT_LIST_HEAD(&lag->group_list);
+	mutex_init(&lag->lock);
+	ida_init(&lag->ida_handle);
+
+	/* 0 is a reserved batch version so increment to first valid value. */
+	nfp_fl_increment_version(lag);
+
+	lag->lag_nb.notifier_call = nfp_fl_lag_netdev_event;
+}
+
+void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag)
+{
+	struct nfp_fl_lag_group *entry, *storage;
+
+	cancel_delayed_work_sync(&lag->work);
+
+	/* Remove all groups. */
+	mutex_lock(&lag->lock);
+	list_for_each_entry_safe(entry, storage, &lag->group_list, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+	mutex_unlock(&lag->lock);
+	mutex_destroy(&lag->lock);
+	ida_destroy(&lag->ida_handle);
+}
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 202284b42fd9..19cfa162ac65 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -575,12 +575,14 @@ static int nfp_flower_init(struct nfp_app *app)
 	/* Tell the firmware that the driver supports lag. */
 	err = nfp_rtsym_write_le(app->pf->rtbl,
 				 "_abi_flower_balance_sync_enable", 1);
-	if (!err)
+	if (!err) {
 		app_priv->flower_ext_feats |= NFP_FL_FEATS_LAG;
-	else if (err == -ENOENT)
+		nfp_flower_lag_init(&app_priv->nfp_lag);
+	} else if (err == -ENOENT) {
 		nfp_warn(app->cpp, "LAG not supported by FW.\n");
-	else
+	} else {
 		goto err_cleanup_metadata;
+	}
 
 	return 0;
 
@@ -599,6 +601,9 @@ static void nfp_flower_clean(struct nfp_app *app)
 	skb_queue_purge(&app_priv->cmsg_skbs_low);
 	flush_work(&app_priv->cmsg_work);
 
+	if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG)
+		nfp_flower_lag_cleanup(&app_priv->nfp_lag);
+
 	nfp_flower_metadata_cleanup(app);
 	vfree(app->priv);
 	app->priv = NULL;
@@ -665,11 +670,29 @@ nfp_flower_repr_change_mtu(struct nfp_app *app, struct net_device *netdev,
 
 static int nfp_flower_start(struct nfp_app *app)
 {
+	struct nfp_flower_priv *app_priv = app->priv;
+	int err;
+
+	if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG) {
+		err = nfp_flower_lag_reset(&app_priv->nfp_lag);
+		if (err)
+			return err;
+
+		err = register_netdevice_notifier(&app_priv->nfp_lag.lag_nb);
+		if (err)
+			return err;
+	}
+
 	return nfp_tunnel_config_start(app);
 }
 
 static void nfp_flower_stop(struct nfp_app *app)
 {
+	struct nfp_flower_priv *app_priv = app->priv;
+
+	if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG)
+		unregister_netdevice_notifier(&app_priv->nfp_lag.lag_nb);
+
 	nfp_tunnel_config_stop(app);
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 7ce255705446..e03efb034948 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -43,6 +43,7 @@
 #include <net/pkt_cls.h>
 #include <net/tcp.h>
 #include <linux/workqueue.h>
+#include <linux/idr.h>
 
 struct net_device;
 struct nfp_app;
@@ -97,6 +98,30 @@ struct nfp_mtu_conf {
 	spinlock_t lock;
 };
 
+/**
+ * struct nfp_fl_lag - Flower APP priv data for link aggregation
+ * @lag_nb:		Notifier to track master/slave events
+ * @work:		Work queue for writing configs to the HW
+ * @lock:		Lock to protect lag_group_list
+ * @group_list:		List of all master/slave groups offloaded
+ * @ida_handle:		IDA to handle group ids
+ * @pkt_num:		Incremented for each config packet sent
+ * @batch_ver:		Incremented for each batch of config packets
+ * @global_inst:	Instance allocator for groups
+ * @rst_cfg:		Marker to reset HW LAG config
+ */
+struct nfp_fl_lag {
+	struct notifier_block lag_nb;
+	struct delayed_work work;
+	struct mutex lock;
+	struct list_head group_list;
+	struct ida ida_handle;
+	unsigned int pkt_num;
+	unsigned int batch_ver;
+	u8 global_inst;
+	bool rst_cfg;
+};
+
 /**
  * struct nfp_flower_priv - Flower APP per-vNIC priv data
  * @app:		Back pointer to app
@@ -129,6 +154,7 @@ struct nfp_mtu_conf {
  *			from firmware for repr reify
  * @reify_wait_queue:	wait queue for repr reify response counting
  * @mtu_conf:		Configuration of repr MTU value
+ * @nfp_lag:		Link aggregation data block
  */
 struct nfp_flower_priv {
 	struct nfp_app *app;
@@ -158,6 +184,7 @@ struct nfp_flower_priv {
 	atomic_t reify_replies;
 	wait_queue_head_t reify_wait_queue;
 	struct nfp_mtu_conf mtu_conf;
+	struct nfp_fl_lag nfp_lag;
 };
 
 /**
@@ -250,5 +277,8 @@ void nfp_tunnel_request_route(struct nfp_app *app, struct sk_buff *skb);
 void nfp_tunnel_keep_alive(struct nfp_app *app, struct sk_buff *skb);
 int nfp_flower_setup_tc_egress_cb(enum tc_setup_type type, void *type_data,
 				  void *cb_priv);
+void nfp_flower_lag_init(struct nfp_fl_lag *lag);
+void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag);
+int nfp_flower_lag_reset(struct nfp_fl_lag *lag);
 
 #endif
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 7/8] nfp: flower: implement host cmsg handler for LAG
From: Jakub Kicinski @ 2018-05-24  2:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, John Hurley
In-Reply-To: <20180524022255.18548-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Adds the control message handler to synchronize offloaded group config
with that of the kernel. Such messages are sent from fw to driver and
feature the following 3 flags:

- Data: an attached cmsg could not be processed - store for retransmission
- Xon: FW can accept new messages - retransmit any stored cmsgs
- Sync: full sync requested so retransmit all kernel LAG group info

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/flower/cmsg.c  |  8 +-
 .../ethernet/netronome/nfp/flower/lag_conf.c  | 95 +++++++++++++++++++
 .../net/ethernet/netronome/nfp/flower/main.h  |  4 +
 3 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
index 03aae2ed9983..cb8565222621 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
@@ -242,6 +242,7 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 	struct nfp_flower_priv *app_priv = app->priv;
 	struct nfp_flower_cmsg_hdr *cmsg_hdr;
 	enum nfp_flower_cmsg_type_port type;
+	bool skb_stored = false;
 
 	cmsg_hdr = nfp_flower_cmsg_get_hdr(skb);
 
@@ -260,8 +261,10 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 		nfp_tunnel_keep_alive(app, skb);
 		break;
 	case NFP_FLOWER_CMSG_TYPE_LAG_CONFIG:
-		if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG)
+		if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG) {
+			skb_stored = nfp_flower_lag_unprocessed_msg(app, skb);
 			break;
+		}
 		/* fall through */
 	default:
 		nfp_flower_cmsg_warn(app, "Cannot handle invalid repr control type %u\n",
@@ -269,7 +272,8 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 		goto out;
 	}
 
-	dev_consume_skb_any(skb);
+	if (!skb_stored)
+		dev_consume_skb_any(skb);
 	return;
 out:
 	dev_kfree_skb_any(skb);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
index 35a700b879d7..a09fe2778250 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
@@ -36,6 +36,9 @@
 /* LAG group config flags. */
 #define NFP_FL_LAG_LAST			BIT(1)
 #define NFP_FL_LAG_FIRST		BIT(2)
+#define NFP_FL_LAG_DATA			BIT(3)
+#define NFP_FL_LAG_XON			BIT(4)
+#define NFP_FL_LAG_SYNC			BIT(5)
 #define NFP_FL_LAG_SWITCH		BIT(6)
 #define NFP_FL_LAG_RESET		BIT(7)
 
@@ -108,6 +111,8 @@ struct nfp_fl_lag_group {
 /* wait for more config */
 #define NFP_FL_LAG_DELAY		(msecs_to_jiffies(2))
 
+#define NFP_FL_LAG_RETRANS_LIMIT	100 /* max retrans cmsgs to store */
+
 static unsigned int nfp_fl_get_next_pkt_number(struct nfp_fl_lag *lag)
 {
 	lag->pkt_num++;
@@ -360,6 +365,92 @@ static void nfp_fl_lag_do_work(struct work_struct *work)
 	mutex_unlock(&lag->lock);
 }
 
+static int
+nfp_fl_lag_put_unprocessed(struct nfp_fl_lag *lag, struct sk_buff *skb)
+{
+	struct nfp_flower_cmsg_lag_config *cmsg_payload;
+
+	cmsg_payload = nfp_flower_cmsg_get_data(skb);
+	if (be32_to_cpu(cmsg_payload->group_id) >= NFP_FL_LAG_GROUP_MAX)
+		return -EINVAL;
+
+	/* Drop cmsg retrans if storage limit is exceeded to prevent
+	 * overloading. If the fw notices that expected messages have not been
+	 * received in a given time block, it will request a full resync.
+	 */
+	if (skb_queue_len(&lag->retrans_skbs) >= NFP_FL_LAG_RETRANS_LIMIT)
+		return -ENOSPC;
+
+	__skb_queue_tail(&lag->retrans_skbs, skb);
+
+	return 0;
+}
+
+static void nfp_fl_send_unprocessed(struct nfp_fl_lag *lag)
+{
+	struct nfp_flower_priv *priv;
+	struct sk_buff *skb;
+
+	priv = container_of(lag, struct nfp_flower_priv, nfp_lag);
+
+	while ((skb = __skb_dequeue(&lag->retrans_skbs)))
+		nfp_ctrl_tx(priv->app->ctrl, skb);
+}
+
+bool nfp_flower_lag_unprocessed_msg(struct nfp_app *app, struct sk_buff *skb)
+{
+	struct nfp_flower_cmsg_lag_config *cmsg_payload;
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_fl_lag_group *group_entry;
+	unsigned long int flags;
+	bool store_skb = false;
+	int err;
+
+	cmsg_payload = nfp_flower_cmsg_get_data(skb);
+	flags = cmsg_payload->ctrl_flags;
+
+	/* Note the intentional fall through below. If DATA and XON are both
+	 * set, the message will stored and sent again with the rest of the
+	 * unprocessed messages list.
+	 */
+
+	/* Store */
+	if (flags & NFP_FL_LAG_DATA)
+		if (!nfp_fl_lag_put_unprocessed(&priv->nfp_lag, skb))
+			store_skb = true;
+
+	/* Send stored */
+	if (flags & NFP_FL_LAG_XON)
+		nfp_fl_send_unprocessed(&priv->nfp_lag);
+
+	/* Resend all */
+	if (flags & NFP_FL_LAG_SYNC) {
+		/* To resend all config:
+		 * 1) Clear all unprocessed messages
+		 * 2) Mark all groups dirty
+		 * 3) Reset NFP group config
+		 * 4) Schedule a LAG config update
+		 */
+
+		__skb_queue_purge(&priv->nfp_lag.retrans_skbs);
+
+		mutex_lock(&priv->nfp_lag.lock);
+		list_for_each_entry(group_entry, &priv->nfp_lag.group_list,
+				    list)
+			group_entry->dirty = true;
+
+		err = nfp_flower_lag_reset(&priv->nfp_lag);
+		if (err)
+			nfp_flower_cmsg_warn(priv->app,
+					     "mem err in group reset msg\n");
+		mutex_unlock(&priv->nfp_lag.lock);
+
+		schedule_delayed_work(&priv->nfp_lag.work, 0);
+	}
+
+	return store_skb;
+}
+
 static void
 nfp_fl_lag_schedule_group_remove(struct nfp_fl_lag *lag,
 				 struct nfp_fl_lag_group *group)
@@ -565,6 +656,8 @@ void nfp_flower_lag_init(struct nfp_fl_lag *lag)
 	mutex_init(&lag->lock);
 	ida_init(&lag->ida_handle);
 
+	__skb_queue_head_init(&lag->retrans_skbs);
+
 	/* 0 is a reserved batch version so increment to first valid value. */
 	nfp_fl_increment_version(lag);
 
@@ -577,6 +670,8 @@ void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag)
 
 	cancel_delayed_work_sync(&lag->work);
 
+	__skb_queue_purge(&lag->retrans_skbs);
+
 	/* Remove all groups. */
 	mutex_lock(&lag->lock);
 	list_for_each_entry_safe(entry, storage, &lag->group_list, list) {
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index e03efb034948..2fd75c155ccb 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -109,6 +109,8 @@ struct nfp_mtu_conf {
  * @batch_ver:		Incremented for each batch of config packets
  * @global_inst:	Instance allocator for groups
  * @rst_cfg:		Marker to reset HW LAG config
+ * @retrans_skbs:	Cmsgs that could not be processed by HW and require
+ *			retransmission
  */
 struct nfp_fl_lag {
 	struct notifier_block lag_nb;
@@ -120,6 +122,7 @@ struct nfp_fl_lag {
 	unsigned int batch_ver;
 	u8 global_inst;
 	bool rst_cfg;
+	struct sk_buff_head retrans_skbs;
 };
 
 /**
@@ -280,5 +283,6 @@ int nfp_flower_setup_tc_egress_cb(enum tc_setup_type type, void *type_data,
 void nfp_flower_lag_init(struct nfp_fl_lag *lag);
 void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag);
 int nfp_flower_lag_reset(struct nfp_fl_lag *lag);
+bool nfp_flower_lag_unprocessed_msg(struct nfp_app *app, struct sk_buff *skb);
 
 #endif
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 8/8] nfp: flower: compute link aggregation action
From: Jakub Kicinski @ 2018-05-24  2:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, John Hurley
In-Reply-To: <20180524022255.18548-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

If the egress device of an offloaded rule is a LAG port, then encode the
output port to the NFP with a LAG identifier and the offloaded group ID.

A prelag action is also offloaded which must be the first action of the
series (although may appear after other pre-actions - e.g. tunnels). This
causes the FW to check that it has the necessary information to output to
the requested LAG port. If it does not, the packet is sent to the kernel
before any other actions are applied to it.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../ethernet/netronome/nfp/flower/action.c    | 131 ++++++++++++++----
 .../net/ethernet/netronome/nfp/flower/cmsg.h  |  13 ++
 .../ethernet/netronome/nfp/flower/lag_conf.c  |  42 ++++++
 .../net/ethernet/netronome/nfp/flower/main.h  |   9 +-
 .../ethernet/netronome/nfp/flower/offload.c   |   2 +-
 5 files changed, 169 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 80df9a5d4217..4a6d2db75071 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -72,6 +72,42 @@ nfp_fl_push_vlan(struct nfp_fl_push_vlan *push_vlan,
 	push_vlan->vlan_tci = cpu_to_be16(tmp_push_vlan_tci);
 }
 
+static int
+nfp_fl_pre_lag(struct nfp_app *app, const struct tc_action *action,
+	       struct nfp_fl_payload *nfp_flow, int act_len)
+{
+	size_t act_size = sizeof(struct nfp_fl_pre_lag);
+	struct nfp_fl_pre_lag *pre_lag;
+	struct net_device *out_dev;
+	int err;
+
+	out_dev = tcf_mirred_dev(action);
+	if (!out_dev || !netif_is_lag_master(out_dev))
+		return 0;
+
+	if (act_len + act_size > NFP_FL_MAX_A_SIZ)
+		return -EOPNOTSUPP;
+
+	/* Pre_lag action must be first on action list.
+	 * If other actions already exist they need pushed forward.
+	 */
+	if (act_len)
+		memmove(nfp_flow->action_data + act_size,
+			nfp_flow->action_data, act_len);
+
+	pre_lag = (struct nfp_fl_pre_lag *)nfp_flow->action_data;
+	err = nfp_flower_lag_populate_pre_action(app, out_dev, pre_lag);
+	if (err)
+		return err;
+
+	pre_lag->head.jump_id = NFP_FL_ACTION_OPCODE_PRE_LAG;
+	pre_lag->head.len_lw = act_size >> NFP_FL_LW_SIZ;
+
+	nfp_flow->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_NULL);
+
+	return act_size;
+}
+
 static bool nfp_fl_netdev_is_tunnel_type(struct net_device *out_dev,
 					 enum nfp_flower_tun_type tun_type)
 {
@@ -88,12 +124,13 @@ static bool nfp_fl_netdev_is_tunnel_type(struct net_device *out_dev,
 }
 
 static int
-nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action,
-	      struct nfp_fl_payload *nfp_flow, bool last,
-	      struct net_device *in_dev, enum nfp_flower_tun_type tun_type,
-	      int *tun_out_cnt)
+nfp_fl_output(struct nfp_app *app, struct nfp_fl_output *output,
+	      const struct tc_action *action, struct nfp_fl_payload *nfp_flow,
+	      bool last, struct net_device *in_dev,
+	      enum nfp_flower_tun_type tun_type, int *tun_out_cnt)
 {
 	size_t act_size = sizeof(struct nfp_fl_output);
+	struct nfp_flower_priv *priv = app->priv;
 	struct net_device *out_dev;
 	u16 tmp_flags;
 
@@ -118,6 +155,15 @@ nfp_fl_output(struct nfp_fl_output *output, const struct tc_action *action,
 		output->flags = cpu_to_be16(tmp_flags |
 					    NFP_FL_OUT_FLAGS_USE_TUN);
 		output->port = cpu_to_be32(NFP_FL_PORT_TYPE_TUN | tun_type);
+	} else if (netif_is_lag_master(out_dev) &&
+		   priv->flower_ext_feats & NFP_FL_FEATS_LAG) {
+		int gid;
+
+		output->flags = cpu_to_be16(tmp_flags);
+		gid = nfp_flower_lag_get_output_id(app, out_dev);
+		if (gid < 0)
+			return gid;
+		output->port = cpu_to_be32(NFP_FL_LAG_OUT | gid);
 	} else {
 		/* Set action output parameters. */
 		output->flags = cpu_to_be16(tmp_flags);
@@ -164,7 +210,7 @@ static struct nfp_fl_pre_tunnel *nfp_fl_pre_tunnel(char *act_data, int act_len)
 	struct nfp_fl_pre_tunnel *pre_tun_act;
 
 	/* Pre_tunnel action must be first on action list.
-	 * If other actions already exist they need pushed forward.
+	 * If other actions already exist they need to be pushed forward.
 	 */
 	if (act_len)
 		memmove(act_data + act_size, act_data, act_len);
@@ -443,42 +489,73 @@ nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
 }
 
 static int
-nfp_flower_loop_action(const struct tc_action *a,
+nfp_flower_output_action(struct nfp_app *app, const struct tc_action *a,
+			 struct nfp_fl_payload *nfp_fl, int *a_len,
+			 struct net_device *netdev, bool last,
+			 enum nfp_flower_tun_type *tun_type, int *tun_out_cnt,
+			 int *out_cnt)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_fl_output *output;
+	int err, prelag_size;
+
+	if (*a_len + sizeof(struct nfp_fl_output) > NFP_FL_MAX_A_SIZ)
+		return -EOPNOTSUPP;
+
+	output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len];
+	err = nfp_fl_output(app, output, a, nfp_fl, last, netdev, *tun_type,
+			    tun_out_cnt);
+	if (err)
+		return err;
+
+	*a_len += sizeof(struct nfp_fl_output);
+
+	if (priv->flower_ext_feats & NFP_FL_FEATS_LAG) {
+		/* nfp_fl_pre_lag returns -err or size of prelag action added.
+		 * This will be 0 if it is not egressing to a lag dev.
+		 */
+		prelag_size = nfp_fl_pre_lag(app, a, nfp_fl, *a_len);
+		if (prelag_size < 0)
+			return prelag_size;
+		else if (prelag_size > 0 && (!last || *out_cnt))
+			return -EOPNOTSUPP;
+
+		*a_len += prelag_size;
+	}
+	(*out_cnt)++;
+
+	return 0;
+}
+
+static int
+nfp_flower_loop_action(struct nfp_app *app, const struct tc_action *a,
 		       struct nfp_fl_payload *nfp_fl, int *a_len,
 		       struct net_device *netdev,
-		       enum nfp_flower_tun_type *tun_type, int *tun_out_cnt)
+		       enum nfp_flower_tun_type *tun_type, int *tun_out_cnt,
+		       int *out_cnt)
 {
 	struct nfp_fl_set_ipv4_udp_tun *set_tun;
 	struct nfp_fl_pre_tunnel *pre_tun;
 	struct nfp_fl_push_vlan *psh_v;
 	struct nfp_fl_pop_vlan *pop_v;
-	struct nfp_fl_output *output;
 	int err;
 
 	if (is_tcf_gact_shot(a)) {
 		nfp_fl->meta.shortcut = cpu_to_be32(NFP_FL_SC_ACT_DROP);
 	} else if (is_tcf_mirred_egress_redirect(a)) {
-		if (*a_len + sizeof(struct nfp_fl_output) > NFP_FL_MAX_A_SIZ)
-			return -EOPNOTSUPP;
-
-		output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len];
-		err = nfp_fl_output(output, a, nfp_fl, true, netdev, *tun_type,
-				    tun_out_cnt);
+		err = nfp_flower_output_action(app, a, nfp_fl, a_len, netdev,
+					       true, tun_type, tun_out_cnt,
+					       out_cnt);
 		if (err)
 			return err;
 
-		*a_len += sizeof(struct nfp_fl_output);
 	} else if (is_tcf_mirred_egress_mirror(a)) {
-		if (*a_len + sizeof(struct nfp_fl_output) > NFP_FL_MAX_A_SIZ)
-			return -EOPNOTSUPP;
-
-		output = (struct nfp_fl_output *)&nfp_fl->action_data[*a_len];
-		err = nfp_fl_output(output, a, nfp_fl, false, netdev, *tun_type,
-				    tun_out_cnt);
+		err = nfp_flower_output_action(app, a, nfp_fl, a_len, netdev,
+					       false, tun_type, tun_out_cnt,
+					       out_cnt);
 		if (err)
 			return err;
 
-		*a_len += sizeof(struct nfp_fl_output);
 	} else if (is_tcf_vlan(a) && tcf_vlan_action(a) == TCA_VLAN_ACT_POP) {
 		if (*a_len + sizeof(struct nfp_fl_pop_vlan) > NFP_FL_MAX_A_SIZ)
 			return -EOPNOTSUPP;
@@ -535,11 +612,12 @@ nfp_flower_loop_action(const struct tc_action *a,
 	return 0;
 }
 
-int nfp_flower_compile_action(struct tc_cls_flower_offload *flow,
+int nfp_flower_compile_action(struct nfp_app *app,
+			      struct tc_cls_flower_offload *flow,
 			      struct net_device *netdev,
 			      struct nfp_fl_payload *nfp_flow)
 {
-	int act_len, act_cnt, err, tun_out_cnt;
+	int act_len, act_cnt, err, tun_out_cnt, out_cnt;
 	enum nfp_flower_tun_type tun_type;
 	const struct tc_action *a;
 	LIST_HEAD(actions);
@@ -550,11 +628,12 @@ int nfp_flower_compile_action(struct tc_cls_flower_offload *flow,
 	act_len = 0;
 	act_cnt = 0;
 	tun_out_cnt = 0;
+	out_cnt = 0;
 
 	tcf_exts_to_list(flow->exts, &actions);
 	list_for_each_entry(a, &actions, list) {
-		err = nfp_flower_loop_action(a, nfp_flow, &act_len, netdev,
-					     &tun_type, &tun_out_cnt);
+		err = nfp_flower_loop_action(app, a, nfp_flow, &act_len, netdev,
+					     &tun_type, &tun_out_cnt, &out_cnt);
 		if (err)
 			return err;
 		act_cnt++;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index 3a42a1fc55cb..4a7f3510a296 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -92,6 +92,7 @@
 #define NFP_FL_ACTION_OPCODE_SET_IPV6_DST	12
 #define NFP_FL_ACTION_OPCODE_SET_UDP		14
 #define NFP_FL_ACTION_OPCODE_SET_TCP		15
+#define NFP_FL_ACTION_OPCODE_PRE_LAG		16
 #define NFP_FL_ACTION_OPCODE_PRE_TUNNEL		17
 #define NFP_FL_ACTION_OPCODE_NUM		32
 
@@ -103,6 +104,9 @@
 #define NFP_FL_PUSH_VLAN_CFI		BIT(12)
 #define NFP_FL_PUSH_VLAN_VID		GENMASK(11, 0)
 
+/* LAG ports */
+#define NFP_FL_LAG_OUT			0xC0DE0000
+
 /* Tunnel ports */
 #define NFP_FL_PORT_TYPE_TUN		0x50000000
 #define NFP_FL_IPV4_TUNNEL_TYPE		GENMASK(7, 4)
@@ -177,6 +181,15 @@ struct nfp_fl_pop_vlan {
 	__be16 reserved;
 };
 
+struct nfp_fl_pre_lag {
+	struct nfp_fl_act_head head;
+	__be16 group_id;
+	u8 lag_version[3];
+	u8 instance;
+};
+
+#define NFP_FL_PRE_LAG_VER_OFF	8
+
 struct nfp_fl_pre_tunnel {
 	struct nfp_fl_act_head head;
 	__be16 reserved;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
index a09fe2778250..0c4c957717ea 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
@@ -184,6 +184,48 @@ nfp_fl_lag_find_group_for_master_with_lag(struct nfp_fl_lag *lag,
 	return NULL;
 }
 
+int nfp_flower_lag_populate_pre_action(struct nfp_app *app,
+				       struct net_device *master,
+				       struct nfp_fl_pre_lag *pre_act)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_fl_lag_group *group = NULL;
+	__be32 temp_vers;
+
+	mutex_lock(&priv->nfp_lag.lock);
+	group = nfp_fl_lag_find_group_for_master_with_lag(&priv->nfp_lag,
+							  master);
+	if (!group) {
+		mutex_unlock(&priv->nfp_lag.lock);
+		return -ENOENT;
+	}
+
+	pre_act->group_id = cpu_to_be16(group->group_id);
+	temp_vers = cpu_to_be32(priv->nfp_lag.batch_ver <<
+				NFP_FL_PRE_LAG_VER_OFF);
+	memcpy(pre_act->lag_version, &temp_vers, 3);
+	pre_act->instance = group->group_inst;
+	mutex_unlock(&priv->nfp_lag.lock);
+
+	return 0;
+}
+
+int nfp_flower_lag_get_output_id(struct nfp_app *app, struct net_device *master)
+{
+	struct nfp_flower_priv *priv = app->priv;
+	struct nfp_fl_lag_group *group = NULL;
+	int group_id = -ENOENT;
+
+	mutex_lock(&priv->nfp_lag.lock);
+	group = nfp_fl_lag_find_group_for_master_with_lag(&priv->nfp_lag,
+							  master);
+	if (group)
+		group_id = group->group_id;
+	mutex_unlock(&priv->nfp_lag.lock);
+
+	return group_id;
+}
+
 static int
 nfp_fl_lag_config_group(struct nfp_fl_lag *lag, struct nfp_fl_lag_group *group,
 			struct net_device **active_members,
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 2fd75c155ccb..bbe5764d26cb 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -45,6 +45,7 @@
 #include <linux/workqueue.h>
 #include <linux/idr.h>
 
+struct nfp_fl_pre_lag;
 struct net_device;
 struct nfp_app;
 
@@ -253,7 +254,8 @@ int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow,
 				  struct net_device *netdev,
 				  struct nfp_fl_payload *nfp_flow,
 				  enum nfp_flower_tun_type tun_type);
-int nfp_flower_compile_action(struct tc_cls_flower_offload *flow,
+int nfp_flower_compile_action(struct nfp_app *app,
+			      struct tc_cls_flower_offload *flow,
 			      struct net_device *netdev,
 			      struct nfp_fl_payload *nfp_flow);
 int nfp_compile_flow_metadata(struct nfp_app *app,
@@ -284,5 +286,10 @@ void nfp_flower_lag_init(struct nfp_fl_lag *lag);
 void nfp_flower_lag_cleanup(struct nfp_fl_lag *lag);
 int nfp_flower_lag_reset(struct nfp_fl_lag *lag);
 bool nfp_flower_lag_unprocessed_msg(struct nfp_app *app, struct sk_buff *skb);
+int nfp_flower_lag_populate_pre_action(struct nfp_app *app,
+				       struct net_device *master,
+				       struct nfp_fl_pre_lag *pre_act);
+int nfp_flower_lag_get_output_id(struct nfp_app *app,
+				 struct net_device *master);
 
 #endif
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 70ec9d821b91..c42e64f32333 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -440,7 +440,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
 	if (err)
 		goto err_destroy_flow;
 
-	err = nfp_flower_compile_action(flow, netdev, flow_pay);
+	err = nfp_flower_compile_action(app, flow, netdev, flow_pay);
 	if (err)
 		goto err_destroy_flow;
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next] bpfilter: fix build dependency
From: Alexei Starovoitov @ 2018-05-24  4:29 UTC (permalink / raw)
  To: David S . Miller; +Cc: daniel, jakub.kicinski, netdev, kernel-team

BPFILTER could have been enabled without INET causing this build error:
ERROR: "bpfilter_process_sockopt" [net/bpfilter/bpfilter.ko] undefined!

Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/bpfilter/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
index 60725c5f79db..a948b072c28f 100644
--- a/net/bpfilter/Kconfig
+++ b/net/bpfilter/Kconfig
@@ -1,7 +1,7 @@
 menuconfig BPFILTER
 	bool "BPF based packet filtering framework (BPFILTER)"
 	default n
-	depends on NET && BPF
+	depends on NET && BPF && INET
 	help
 	  This builds experimental bpfilter framework that is aiming to
 	  provide netfilter compatible functionality via BPF
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH bpf-next 0/5] fix test_sockmap
From: Prashant Bhole @ 2018-05-24  4:47 UTC (permalink / raw)
  To: John Fastabend, Alexei Starovoitov, Daniel Borkmann
  Cc: David S . Miller, Shuah Khan, netdev
In-Reply-To: <67df711c-75c0-b674-e394-148645353a5a@lab.ntt.co.jp>



On 5/23/2018 6:44 PM, Prashant Bhole wrote:
> 
> 
> On 5/22/2018 2:08 AM, John Fastabend wrote:
>> On 05/20/2018 10:13 PM, Prashant Bhole wrote:
>>>
>>>
>>> On 5/19/2018 1:42 AM, John Fastabend wrote:
>>>> On 05/18/2018 12:17 AM, Prashant Bhole wrote:
>>>>> This series fixes bugs in test_sockmap code. They weren't caught
>>>>> previously because failure in RX/TX thread was not notified to the
>>>>> main thread.
>>>>>
>>>>> Also fixed data verification logic and slightly improved test output
>>>>> such that parameters values (cork, apply, start, end) of failed test
>>>>> can be easily seen.
>>>>>
>>>>
>>>> Great, this was on my list so thanks for taking care of it.
>>>>
>>>>> Note: Even after fixing above problems there are issues with tests
>>>>> which set cork parameter. Tests fail (RX thread timeout) when cork
>>>>> value is non-zero and overall data sent by TX thread isn't multiples
>>>>> of cork value.
>>>>
>>>>
>>>> This is expected. When 'cork' is set the sender should only xmit
>>>> the data when 'cork' bytes are available. If the user doesn't
>>>> provide the N bytes the data is cork'ed waiting for the bytes and
>>>> if the socket is closed the state is cleaned up. What these tests
>>>> are testing is the cleanup path when a user doesn't provide the
>>>> N bytes. In practice this is used to validate headers and prevent
>>>> users from sending partial headers. We want to keep these tests because
>>>> they verify a tear-down path in the code.
>>>
>>> Ok.
>>>
>>>>
>>>> After your changes do these get reported as failures? If so we
>>>> need to account for the above in the calculations.
>>>
>>> Yes, cork related test are reported as failures because of RX thread
>>> timeout.
>>>
>>> So with your above description, I think we need to differentiate cork
>>> tests with partial data and full data. In partial data test we can have
>>> something like "timeout_expected" flag. Any other way to fix it?
>>>
>>
>> Adding a flag seems reasonable to me. Lets do this for now. Also I
>> plan to add more negative tests so we can either use the same
>> flag or a new one for those cases as well.
>>
> 
> John,
> I worked on this for some time and noticed that the RX-timeout of tests 
> with cork parameter is dependent on various parameters. So we can not 
> set a flag like the way 'drop_expected' flag is set before executing the 
> test.
> 
> So I decided to write a function which judges all parameters before each 
> test and decides whether a test with cork parameter will timeout or not. 
> Then the conditions in the function became complicated. For example some 
> tests fail if opt->rate < 17 (with some other conditions). Here is 17 is 
> related to FRAGS_PER_SKB. Consider following two examples.

I'm sorry. Correction: s/FRAGS_PER_SKB/MAX_SKB_FRAGS/

> 
> ./test_sockmap --cgroup /mnt/cgroup2 -r 16 -i 1 -l 30 -t sendpage 
> --txmsg --txmsg_cork 1024   # RX timeout occurs
> 
> ./test_sockmap --cgroup /mnt/cgroup2 -r 17 -i 1 -l 30 -t sendpage 
> --txmsg --txmsg_cork 1024   # Success!
> 
> Do we need to keep such tests? if yes, then I will continue with adding 
> such conditions in the function.
> 
> 
> -Prashant
> 
> 
> 

^ permalink raw reply

* Re: [Cake] [PATCH net-next v15 4/7] sch_cake: Add NAT awareness to packet classifier
From: Kevin Darbyshire-Bryant @ 2018-05-24  4:52 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Miller, Cake List, Linux Kernel Network Developers,
	netfilter-devel@vger.kernel.org
In-Reply-To: <878t8axafk.fsf@toke.dk>

[-- Attachment #1: Type: text/plain, Size: 1839 bytes --]

> On 23 May 2018, at 23:40, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
> 
<snip>
> 
> Hmm, and we still have an issue with ingress filtering (where cake is
> running on an ifb interface). That runs pre-NAT in the conntrack case,
> and we can't do the RX trick. Here we do the lookup manually in
> conntrack (and this part is actually what brings in most of the
> dependencies). Any neat tricks up your sleeve for this case? :)

I wonder here if our terminology with ‘ingress’ is causing confusion.  For avoidance of doubt:

Typical use case of cake on LAN/WAN router requires two instances.  One instance (the egress) is on the WAN interface itself.  It is post conntrack and hence uses skb->nfct to work out the real pre-nat source address of the LAN hosts.

Since we cannot apply this qdisc to the ingress of our WAN interface we use an IFB to mirror the ingress packets, and then use a cake instance on the ifb interface on its egress path to in essence control the ingress traffic.

Cake has two modes, the normal ‘egress’ mode which is designed to be used when controlling egress traffic output, and shapes post any dropped packets.  ‘ingress’ mode is designed to be used on the egress of our ingress IFB, where the shaper counts all packets used (well they got here!) even if we decide to drop them a bit later.

The ifb positioned cake has the additional fun factor that the conntrack field hasn’t yet been filled in, so the qdisc has to go looking in the conntrack tables itself to see if any NATting has taken place and balance LAN host fairness based on that.

As far as I understand it, the flow dissector doesn’t obviously help with working out the pre-NAT addressing as the flow has already been mangled in the egress case, and is awaiting mangling on the ingress case.

Kevin

[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH bpf-next 0/5] fix test_sockmap
From: John Fastabend @ 2018-05-24  4:58 UTC (permalink / raw)
  To: Prashant Bhole, Alexei Starovoitov, Daniel Borkmann
  Cc: David S . Miller, Shuah Khan, netdev
In-Reply-To: <85c79205-5bb6-f6c8-e4a1-abed059c2619@lab.ntt.co.jp>

On 05/23/2018 09:47 PM, Prashant Bhole wrote:
> 
> 
> On 5/23/2018 6:44 PM, Prashant Bhole wrote:
>>
>>
>> On 5/22/2018 2:08 AM, John Fastabend wrote:
>>> On 05/20/2018 10:13 PM, Prashant Bhole wrote:
>>>>
>>>>
>>>> On 5/19/2018 1:42 AM, John Fastabend wrote:
>>>>> On 05/18/2018 12:17 AM, Prashant Bhole wrote:
>>>>>> This series fixes bugs in test_sockmap code. They weren't caught
>>>>>> previously because failure in RX/TX thread was not notified to the
>>>>>> main thread.
>>>>>>
>>>>>> Also fixed data verification logic and slightly improved test output
>>>>>> such that parameters values (cork, apply, start, end) of failed test
>>>>>> can be easily seen.
>>>>>>
>>>>>
>>>>> Great, this was on my list so thanks for taking care of it.
>>>>>
>>>>>> Note: Even after fixing above problems there are issues with tests
>>>>>> which set cork parameter. Tests fail (RX thread timeout) when cork
>>>>>> value is non-zero and overall data sent by TX thread isn't multiples
>>>>>> of cork value.
>>>>>
>>>>>
>>>>> This is expected. When 'cork' is set the sender should only xmit
>>>>> the data when 'cork' bytes are available. If the user doesn't
>>>>> provide the N bytes the data is cork'ed waiting for the bytes and
>>>>> if the socket is closed the state is cleaned up. What these tests
>>>>> are testing is the cleanup path when a user doesn't provide the
>>>>> N bytes. In practice this is used to validate headers and prevent
>>>>> users from sending partial headers. We want to keep these tests because
>>>>> they verify a tear-down path in the code.
>>>>
>>>> Ok.
>>>>
>>>>>
>>>>> After your changes do these get reported as failures? If so we
>>>>> need to account for the above in the calculations.
>>>>
>>>> Yes, cork related test are reported as failures because of RX thread
>>>> timeout.
>>>>
>>>> So with your above description, I think we need to differentiate cork
>>>> tests with partial data and full data. In partial data test we can have
>>>> something like "timeout_expected" flag. Any other way to fix it?
>>>>
>>>
>>> Adding a flag seems reasonable to me. Lets do this for now. Also I
>>> plan to add more negative tests so we can either use the same
>>> flag or a new one for those cases as well.
>>>
>>
>> John,
>> I worked on this for some time and noticed that the RX-timeout of
>> tests with cork parameter is dependent on various parameters. So we
>> can not set a flag like the way 'drop_expected' flag is set before
>> executing the test.
>> 
>> So I decided to write a function which judges all parameters before
>> each test and decides whether a test with cork parameter will
>> timeout or not. Then the conditions in the function became
>> complicated. For example some tests fail if opt->rate < 17 (with
>> some other conditions). Here is 17 is related to FRAGS_PER_SKB.
>> Consider following two examples.
> I'm sorry. Correction: s/FRAGS_PER_SKB/MAX_SKB_FRAGS/
> 
>>
>> ./test_sockmap --cgroup /mnt/cgroup2 -r 16 -i 1 -l 30 -t sendpage
>> --txmsg --txmsg_cork 1024   # RX timeout occurs
>> 
>> ./test_sockmap --cgroup /mnt/cgroup2 -r 17 -i 1 -l 30 -t sendpage
>> --txmsg --txmsg_cork 1024   # Success!
>> 

Ah yes this hits the buffer limit and flushes the queue. The kernel
side doesn't know how to merge those specific sendpage requests so
it gives each request its own buffer and when the limit is reached
we flush it.

>> Do we need to keep such tests? if yes, then I will continue with
>> adding such conditions in the function.
>> 

Yes, these tests are needed because they are testing the edge cases.
These are probably the most important tests because my normal usage
will catch any issues in the "good" cases its these types of things
that can go unnoticed (at least for a short while) if we don't have
specific tests for them.

Thanks for doing this.
John

>> -Prashant
>>
>>
>>
> 

^ permalink raw reply

* Re: [PATCH bpf-next v4 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
From: Martin KaFai Lau @ 2018-05-24  5:07 UTC (permalink / raw)
  To: Yonghong Song; +Cc: peterz, ast, daniel, netdev, kernel-team
In-Reply-To: <20180524001844.1175727-3-yhs@fb.com>

On Wed, May 23, 2018 at 05:18:42PM -0700, Yonghong Song wrote:
> Currently, suppose a userspace application has loaded a bpf program
> and attached it to a tracepoint/kprobe/uprobe, and a bpf
> introspection tool, e.g., bpftool, wants to show which bpf program
> is attached to which tracepoint/kprobe/uprobe. Such attachment
> information will be really useful to understand the overall bpf
> deployment in the system.
> 
> There is a name field (16 bytes) for each program, which could
> be used to encode the attachment point. There are some drawbacks
> for this approaches. First, bpftool user (e.g., an admin) may not
> really understand the association between the name and the
> attachment point. Second, if one program is attached to multiple
> places, encoding a proper name which can imply all these
> attachments becomes difficult.
> 
> This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
> Given a pid and fd, if the <pid, fd> is associated with a
> tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
>    . prog_id
>    . tracepoint name, or
>    . k[ret]probe funcname + offset or kernel addr, or
>    . u[ret]probe filename + offset
> to the userspace.
> The user can use "bpftool prog" to find more information about
> bpf program itself with prog_id.
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/trace_events.h |  17 +++++++
>  include/uapi/linux/bpf.h     |  26 ++++++++++
>  kernel/bpf/syscall.c         | 115 +++++++++++++++++++++++++++++++++++++++++++
>  kernel/trace/bpf_trace.c     |  48 ++++++++++++++++++
>  kernel/trace/trace_kprobe.c  |  29 +++++++++++
>  kernel/trace/trace_uprobe.c  |  22 +++++++++
>  6 files changed, 257 insertions(+)
> 
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 2bde3ef..d34144a 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
>  int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>  int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
>  struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> +			    u32 *fd_type, const char **buf,
> +			    u64 *probe_offset, u64 *probe_addr);
>  #else
>  static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
>  {
> @@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
>  {
>  	return NULL;
>  }
> +static inline int bpf_get_perf_event_info(const struct perf_event *event,
> +					  u32 *prog_id, u32 *fd_type,
> +					  const char **buf, u64 *probe_offset,
> +					  u64 *probe_addr)
> +{
> +	return -EOPNOTSUPP;
> +}
>  #endif
>  
>  enum {
> @@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
>  #ifdef CONFIG_KPROBE_EVENTS
>  extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
>  extern void perf_kprobe_destroy(struct perf_event *event);
> +extern int bpf_get_kprobe_info(const struct perf_event *event,
> +			       u32 *fd_type, const char **symbol,
> +			       u64 *probe_offset, u64 *probe_addr,
> +			       bool perf_type_tracepoint);
>  #endif
>  #ifdef CONFIG_UPROBE_EVENTS
>  extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
>  extern void perf_uprobe_destroy(struct perf_event *event);
> +extern int bpf_get_uprobe_info(const struct perf_event *event,
> +			       u32 *fd_type, const char **filename,
> +			       u64 *probe_offset, bool perf_type_tracepoint);
>  #endif
>  extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
>  				     char *filter_str);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c3e502d..0d51946 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -97,6 +97,7 @@ enum bpf_cmd {
>  	BPF_RAW_TRACEPOINT_OPEN,
>  	BPF_BTF_LOAD,
>  	BPF_BTF_GET_FD_BY_ID,
> +	BPF_TASK_FD_QUERY,
>  };
>  
>  enum bpf_map_type {
> @@ -379,6 +380,22 @@ union bpf_attr {
>  		__u32		btf_log_size;
>  		__u32		btf_log_level;
>  	};
> +
> +	struct {
> +		__u32		pid;		/* input: pid */
> +		__u32		fd;		/* input: fd */
> +		__u32		flags;		/* input: flags */
> +		__u32		buf_len;	/* input/output: buf len */
> +		__aligned_u64	buf;		/* input/output:
> +						 *   tp_name for tracepoint
> +						 *   symbol for kprobe
> +						 *   filename for uprobe
> +						 */
> +		__u32		prog_id;	/* output: prod_id */
> +		__u32		fd_type;	/* output: BPF_FD_TYPE_* */
> +		__u64		probe_offset;	/* output: probe_offset */
> +		__u64		probe_addr;	/* output: probe_addr */
> +	} task_fd_query;
>  } __attribute__((aligned(8)));
>  
>  /* The description below is an attempt at providing documentation to eBPF
> @@ -2458,4 +2475,13 @@ struct bpf_fib_lookup {
>  	__u8	dmac[6];     /* ETH_ALEN */
>  };
>  
> +enum bpf_task_fd_type {
> +	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
> +	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
> +	BPF_FD_TYPE_KPROBE,		/* (symbol + offset) or addr */
> +	BPF_FD_TYPE_KRETPROBE,		/* (symbol + offset) or addr */
> +	BPF_FD_TYPE_UPROBE,		/* filename + offset */
> +	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
> +};
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0b4c945..7dd8c86 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -18,7 +18,9 @@
>  #include <linux/vmalloc.h>
>  #include <linux/mmzone.h>
>  #include <linux/anon_inodes.h>
> +#include <linux/fdtable.h>
>  #include <linux/file.h>
> +#include <linux/fs.h>
>  #include <linux/license.h>
>  #include <linux/filter.h>
>  #include <linux/version.h>
> @@ -2102,6 +2104,116 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
>  	return btf_get_fd_by_id(attr->btf_id);
>  }
>  
> +static int bpf_task_fd_query_copy(const union bpf_attr *attr,
> +				    union bpf_attr __user *uattr,
> +				    u32 prog_id, u32 fd_type,
> +				    const char *buf, u64 probe_offset,
> +				    u64 probe_addr)
> +{
> +	void __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
> +	u32 len = buf ? strlen(buf) + 1 : 0, input_len;
> +	int err = 0;
> +
> +	if (put_user(len, &uattr->task_fd_query.buf_len))
> +		return -EFAULT;
> +	input_len = attr->task_fd_query.buf_len;
> +	if (input_len && len && ubuf) {
When len is 0 and input_len > 0, ubuf will not be touched (and
so not null terminated).

It may be helpful to note in uapi bpf.h that !output_buf_len has to be
checked on top of checking the syscall return value.  It is reasonable for
the userspace to assume that ubuf can be directly used with
strlen()/printf()... as long as the syscall does not return -1/ENOSPC.
I think the comment change could be done in a follow up patch.

or

always null terminate ubuf as long as input_len > 0
and the output_buf_len should be strlen(buf) instead of
strlen(buf) + 1 (i.e. exclude the null char in output_buf_len)
such that the !buf case will have output_buf_len == 0.
The user can depend on ENOSPC or input_buf_len <= output_buf_len
to decide the truncated condition.  This convention should be
closer to the snprintf() situation.

Other than that,

Acked-by: Martin KaFai Lau <kafai@fb.com>

> +		if (input_len < len) {
> +			err = -ENOSPC;
> +			len = input_len;
> +		}
> +		if (copy_to_user(ubuf, buf, len))
> +			return -EFAULT;
> +	}
> +
> +	if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
> +	    put_user(fd_type, &uattr->task_fd_query.fd_type) ||
> +	    put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
> +	    put_user(probe_addr, &uattr->task_fd_query.probe_addr))
> +		return -EFAULT;
> +
> +	return err;
> +}
> +
> +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
> +
> +static int bpf_task_fd_query(const union bpf_attr *attr,
> +			     union bpf_attr __user *uattr)
> +{
> +	pid_t pid = attr->task_fd_query.pid;
> +	u32 fd = attr->task_fd_query.fd;
> +	const struct perf_event *event;
> +	struct files_struct *files;
> +	struct task_struct *task;
> +	struct file *file;
> +	int err;
> +
> +	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
> +		return -EINVAL;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	if (attr->task_fd_query.flags != 0)
> +		return -EINVAL;
> +
> +	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
> +	if (!task)
> +		return -ENOENT;
> +
> +	files = get_files_struct(task);
> +	put_task_struct(task);
> +	if (!files)
> +		return -ENOENT;
> +
> +	err = 0;
> +	spin_lock(&files->file_lock);
> +	file = fcheck_files(files, fd);
> +	if (!file)
> +		err = -EBADF;
> +	else
> +		get_file(file);
> +	spin_unlock(&files->file_lock);
> +	put_files_struct(files);
> +
> +	if (err)
> +		goto out;
> +
> +	if (file->f_op == &bpf_raw_tp_fops) {
> +		struct bpf_raw_tracepoint *raw_tp = file->private_data;
> +		struct bpf_raw_event_map *btp = raw_tp->btp;
> +
> +		err = bpf_task_fd_query_copy(attr, uattr,
> +					     raw_tp->prog->aux->id,
> +					     BPF_FD_TYPE_RAW_TRACEPOINT,
> +					     btp->tp->name, 0, 0);
> +		goto put_file;
> +	}
> +
> +	event = perf_get_event(file);
> +	if (!IS_ERR(event)) {
> +		u64 probe_offset, probe_addr;
> +		u32 prog_id, fd_type;
> +		const char *buf;
> +
> +		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
> +					      &buf, &probe_offset,
> +					      &probe_addr);
> +		if (!err)
> +			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
> +						     fd_type, buf,
> +						     probe_offset,
> +						     probe_addr);
> +		goto put_file;
> +	}
> +
> +	err = -ENOTSUPP;
> +put_file:
> +	fput(file);
> +out:
> +	return err;
> +}
> +
>  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
>  {
>  	union bpf_attr attr = {};
> @@ -2188,6 +2300,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
>  	case BPF_BTF_GET_FD_BY_ID:
>  		err = bpf_btf_get_fd_by_id(&attr);
>  		break;
> +	case BPF_TASK_FD_QUERY:
> +		err = bpf_task_fd_query(&attr, uattr);
> +		break;
>  	default:
>  		err = -EINVAL;
>  		break;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ce2cbbf..81fdf2f 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -14,6 +14,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/ctype.h>
>  #include <linux/kprobes.h>
> +#include <linux/syscalls.h>
>  #include <linux/error-injection.h>
>  
>  #include "trace_probe.h"
> @@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
>  	mutex_unlock(&bpf_event_mutex);
>  	return err;
>  }
> +
> +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
> +			    u32 *fd_type, const char **buf,
> +			    u64 *probe_offset, u64 *probe_addr)
> +{
> +	bool is_tracepoint, is_syscall_tp;
> +	struct bpf_prog *prog;
> +	int flags, err = 0;
> +
> +	prog = event->prog;
> +	if (!prog)
> +		return -ENOENT;
> +
> +	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
> +	if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
> +		return -EOPNOTSUPP;
> +
> +	*prog_id = prog->aux->id;
> +	flags = event->tp_event->flags;
> +	is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
> +	is_syscall_tp = is_syscall_trace_event(event->tp_event);
> +
> +	if (is_tracepoint || is_syscall_tp) {
> +		*buf = is_tracepoint ? event->tp_event->tp->name
> +				     : event->tp_event->name;
> +		*fd_type = BPF_FD_TYPE_TRACEPOINT;
> +		*probe_offset = 0x0;
> +		*probe_addr = 0x0;
> +	} else {
> +		/* kprobe/uprobe */
> +		err = -EOPNOTSUPP;
> +#ifdef CONFIG_KPROBE_EVENTS
> +		if (flags & TRACE_EVENT_FL_KPROBE)
> +			err = bpf_get_kprobe_info(event, fd_type, buf,
> +						  probe_offset, probe_addr,
> +						  event->attr.type == PERF_TYPE_TRACEPOINT);
> +#endif
> +#ifdef CONFIG_UPROBE_EVENTS
> +		if (flags & TRACE_EVENT_FL_UPROBE)
> +			err = bpf_get_uprobe_info(event, fd_type, buf,
> +						  probe_offset,
> +						  event->attr.type == PERF_TYPE_TRACEPOINT);
> +#endif
> +	}
> +
> +	return err;
> +}
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 02aed76..daa8157 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
>  			      head, NULL);
>  }
>  NOKPROBE_SYMBOL(kretprobe_perf_func);
> +
> +int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
> +			const char **symbol, u64 *probe_offset,
> +			u64 *probe_addr, bool perf_type_tracepoint)
> +{
> +	const char *pevent = trace_event_name(event->tp_event);
> +	const char *group = event->tp_event->class->system;
> +	struct trace_kprobe *tk;
> +
> +	if (perf_type_tracepoint)
> +		tk = find_trace_kprobe(pevent, group);
> +	else
> +		tk = event->tp_event->data;
> +	if (!tk)
> +		return -EINVAL;
> +
> +	*fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
> +					      : BPF_FD_TYPE_KPROBE;
> +	if (tk->symbol) {
> +		*symbol = tk->symbol;
> +		*probe_offset = tk->rp.kp.offset;
> +		*probe_addr = 0;
> +	} else {
> +		*symbol = NULL;
> +		*probe_offset = 0;
> +		*probe_addr = (unsigned long)tk->rp.kp.addr;
> +	}
> +	return 0;
> +}
>  #endif	/* CONFIG_PERF_EVENTS */
>  
>  /*
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index ac89287..bf89a51 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
>  {
>  	__uprobe_perf_func(tu, func, regs, ucb, dsize);
>  }
> +
> +int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
> +			const char **filename, u64 *probe_offset,
> +			bool perf_type_tracepoint)
> +{
> +	const char *pevent = trace_event_name(event->tp_event);
> +	const char *group = event->tp_event->class->system;
> +	struct trace_uprobe *tu;
> +
> +	if (perf_type_tracepoint)
> +		tu = find_probe_event(pevent, group);
> +	else
> +		tu = event->tp_event->data;
> +	if (!tu)
> +		return -EINVAL;
> +
> +	*fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
> +				    : BPF_FD_TYPE_UPROBE;
> +	*filename = tu->filename;
> +	*probe_offset = tu->offset;
> +	return 0;
> +}
>  #endif	/* CONFIG_PERF_EVENTS */
>  
>  static int
> -- 
> 2.9.5
> 

^ permalink raw reply

* Re: [PATCH bpf-next v4 3/7] tools/bpf: sync kernel header bpf.h and add bpf_task_fd_query in libbpf
From: Martin KaFai Lau @ 2018-05-24  5:12 UTC (permalink / raw)
  To: Yonghong Song; +Cc: peterz, ast, daniel, netdev, kernel-team
In-Reply-To: <20180524001844.1175727-4-yhs@fb.com>

On Wed, May 23, 2018 at 05:18:43PM -0700, Yonghong Song wrote:
> Sync kernel header bpf.h to tools/include/uapi/linux/bpf.h and
> implement bpf_task_fd_query() in libbpf. The test programs
> in samples/bpf and tools/testing/selftests/bpf, and later bpftool
> will use this libbpf function to query kernel.
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>

^ permalink raw reply

* Re: [PATCH net-next v2 0/2] net: phy: improve PHY suspend/resume
From: Heiner Kallweit @ 2018-05-24  5:52 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: Florian Fainelli, David Miller, netdev@vger.kernel.org
In-Reply-To: <20180523220418.GB5128@lunn.ch>

Am 24.05.2018 um 00:04 schrieb Andrew Lunn:
> On Wed, May 23, 2018 at 10:15:29PM +0200, Heiner Kallweit wrote:
>> I have the issue that suspending the MAC-integrated PHY gives an
>> error during system suspend. The sequence is:
>>
>> 1. unconnected PHY/MAC are runtime-suspended already
>> 2. system suspend commences
>> 3. mdio_bus_phy_suspend is called
>> 4. suspend callback of the network driver is called (implicitly
>>    MAC/PHY are runtime-resumed before)
>> 5. suspend callback suspends MAC/PHY
>>
>> The problem occurs in step 3. phy_suspend() fails because the MDIO
>> bus isn't accessible due to the chip being runtime-suspended.
> 
> I think you are fixing the wrong problem. I've had the same with the
> FEC driver. I fixed it by making the MDIO operations runtime-suspend
> aware:
> 
Interesting, didn't see it from that angle yet. Sounds plausible.
Thanks a lot for the feedback and I'll have a look at the FEC driver.

Heiner

> commit 8fff755e9f8d0f70a595e79f248695ce6aef5cc3
> Author: Andrew Lunn <andrew@lunn.ch>
> Date:   Sat Jul 25 22:38:02 2015 +0200
> 
>     net: fec: Ensure clocks are enabled while using mdio bus
>     
>     When a switch is attached to the mdio bus, the mdio bus can be used
>     while the interface is not open. If the IPG clock is not enabled, MDIO
>     reads/writes will simply time out.
>     
>     Add support for runtime PM to control this clock. Enable/disable this
>     clock using runtime PM, with open()/close() and mdio read()/write()
>     function triggering runtime PM operations. Since PM is optional, the
>     IPG clock is enabled at probe and is no longer modified by
>     fec_enet_clk_enable(), thus if PM is not enabled in the kernel, it is
>     guaranteed the clock is running when MDIO operations are performed.
> 
> Don't copy this patch 1:1. I introduced a few bugs which took a while
> to be shaken out :-(
> 
>    Andrew
> 

^ permalink raw reply

* Re: STMMAC driver with TSO enabled issue
From: Bhadram Varka @ 2018-05-24  5:58 UTC (permalink / raw)
  To: Jose Abreu, netdev@vger.kernel.org, Joao Pinto
In-Reply-To: <a7c17a56-dc40-2bd2-b621-cf73db50cd6e@synopsys.com>

Hi Jose,

On 5/17/2018 7:43 PM, Jose Abreu wrote:
> Hi Bhadram,
> 
> On 15-05-2018 07:44, Bhadram Varka wrote:
>> Hi Jose,
>>
>> On 5/10/2018 9:15 PM, Jose Abreu wrote:
>>>
>>>
>>> On 10-05-2018 16:08, Bhadram Varka wrote:
>>>> Hi Jose,
>>>>
>>>> On 5/10/2018 7:59 PM, Jose Abreu wrote:
>>>>> Hi Bhadram,
>>>>>
>>>>> On 10-05-2018 09:55, Jose Abreu wrote:
>>>>>> ++net-dev
>>>>>>
>>>>>> Hi Bhadram,
>>>>>>
>>>>>> On 09-05-2018 12:03, Bhadram Varka wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> Thanks for responding.
>>>>>>>
>>>>>>> Tried below suggested way. Still observing the issue -
>>>>>> It seems stmmac has a bug in the RX side when using TSO
>>>>>> which is
>>>>>> causing all the RX descriptors to be consumed. The stmmac_rx()
>>>>>> function will need to be refactored. I will send a fix ASAP.
>>>>>
>>>>> Are you using this patch [1] ? Because there is a problem with
>>>>> the patch. By adding the previously removed call to
>>>>> stmmac_init_rx_desc() TSO works okay in my setup.
>>>>>
>>>>
>>>> No. I don't have this change in my code base. I am using
>>>> net-next tree.
>>>>
>>>> Can you please post the change for which TSO works ? I can help
>>>> you with the testing.
>>>
>>> It should work with net-next because patch was not merged yet ...
>>> Please send me the output of "dmesg | grep -i stmmac", since boot
>>> and your full register values (from 0x0 to 0x12E4).
>>>
>>
>> [root@alarm ~]# dmesg | grep -i dwc
>> [    6.925005] dwc-eth-dwmac 2490000.ethernet: Cannot get CSR
>> clock
>> [    6.933657] dwc-eth-dwmac 2490000.ethernet: no reset control
>> found
>> [    6.955325] dwc-eth-dwmac 2490000.ethernet: User ID: 0x10,
>> Synopsys ID: 0x41
>> [    6.962379] dwc-eth-dwmac 2490000.ethernet:  DWMAC4/5
>> [    6.967434] dwc-eth-dwmac 2490000.ethernet: DMA HW
>> capability register supported
>> [    6.974827] dwc-eth-dwmac 2490000.ethernet: RX Checksum
>> Offload Engine supported
>> [    6.982915] dwc-eth-dwmac 2490000.ethernet: TX Checksum
>> insertion supported
>> [    6.991235] dwc-eth-dwmac 2490000.ethernet: Wake-Up On Lan
>> supported
>> [    6.998974] dwc-eth-dwmac 2490000.ethernet: TSO supported
>> [    7.006422] dwc-eth-dwmac 2490000.ethernet: TSO feature enabled
>> [    7.012581] dwc-eth-dwmac 2490000.ethernet: Enable RX
>> Mitigation via HW Watchdog Timer
>> [    7.236391] dwc-eth-dwmac 2490000.ethernet eth0: device MAC
>> address 4a:d1:e3:58:cb:7a
>> [    7.333414] dwc-eth-dwmac 2490000.ethernet eth0: IEEE
>> 1588-2008 Advanced Timestamp supported
>> [    7.342441] dwc-eth-dwmac 2490000.ethernet eth0: registered
>> PTP clock
>> [   10.157066] dwc-eth-dwmac 2490000.ethernet eth0: Link is Up
>> - 1Gbps/Full - flow control off
>> [root@alarm ~]# dmesg | grep -i stmma
>> [    7.020567] libphy: stmmac: probed
>> [    7.316295] Broadcom BCM89610 stmmac-0:00: attached PHY
>> driver [Broadcom BCM89610] (mii_bus:phy_addr=stmmac-0:00, irq=64)
>>
>> I will get the register details -
>>
>> FYI - TSO works fine with single channel. I see the issue only
>> if multi channel enabled (supports 4 Tx/Rx channels).
>>
> 
> And normal data transfer works okay with multi channel, right? I
> will need the register details to proceed ... You could also try
> git bisect ...
> 

Yes - normal data transfers works fine. Issue observed only driver gets 
TSO packet. Looks like TX DMA channel hang.

After adding few debug logs - observed that while processing second or 
third descriptor TX DMA hangs.

[85788.137498] stmmac_tso_xmit: tcphdrlen 32, hdr_len 66, pay_len 1392, 
mss 1448
[85788.144634]  skb->len 7306, skb->data_len 5848
[..]
[85788.274876] 025 [0x82795190]: 0x0 0x0 0x5a8 0xc4000000
[85788.280020] 026 [0x827951a0]: 0xf854e000 0xf854e042 0x5700042 0xa0441c48
[85788.286730] 027 [0x827951b0]: 0xf854f000 0x0 0x16d8 0x90000000
[...]


After some time if check Tx descriptor status - then I see only below

[..]
[85788.286730] 027 [0x827951b0]: 0xf854f000 0x0 0x16d8 0x90000000

index 025 and 026 descriptors processed but not index 027.

At this stage Tx DMA is always in below state -

■ 3'b011: Running (Reading Data from system memory
buffer and queuing it to the Tx buffer (Tx FIFO))

Thanks,
Bhadram.

^ permalink raw reply

* Re: [PATCH v3] powerpc: Implement csum_ipv6_magic in assembly
From: Christophe LEROY @ 2018-05-24  6:20 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	linux-kernel, linuxppc-dev, netdev
In-Reply-To: <20180523183447.GV17342@gate.crashing.org>



Le 23/05/2018 à 20:34, Segher Boessenkool a écrit :
> On Tue, May 22, 2018 at 08:57:01AM +0200, Christophe Leroy wrote:
>> The generic csum_ipv6_magic() generates a pretty bad result
> 
> <snip>
> 
> Please try with a more recent compiler, what you used is pretty ancient.
> It's not like recent compilers do great on this either, but it's not
> *that* bad anymore ;-)
> 
>> --- a/arch/powerpc/lib/checksum_32.S
>> +++ b/arch/powerpc/lib/checksum_32.S
>> @@ -293,3 +293,36 @@ dst_error:
>>   	EX_TABLE(51b, dst_error);
>>   
>>   EXPORT_SYMBOL(csum_partial_copy_generic)
>> +
>> +/*
>> + * static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
>> + *				      const struct in6_addr *daddr,
>> + *				      __u32 len, __u8 proto, __wsum sum)
>> + */
>> +
>> +_GLOBAL(csum_ipv6_magic)
>> +	lwz	r8, 0(r3)
>> +	lwz	r9, 4(r3)
>> +	lwz	r10, 8(r3)
>> +	lwz	r11, 12(r3)
>> +	addc	r0, r5, r6
>> +	adde	r0, r0, r7
>> +	adde	r0, r0, r8
>> +	adde	r0, r0, r9
>> +	adde	r0, r0, r10
>> +	adde	r0, r0, r11
>> +	lwz	r8, 0(r4)
>> +	lwz	r9, 4(r4)
>> +	lwz	r10, 8(r4)
>> +	lwz	r11, 12(r4)
>> +	adde	r0, r0, r8
>> +	adde	r0, r0, r9
>> +	adde	r0, r0, r10
>> +	adde	r0, r0, r11
>> +	addze	r0, r0
>> +	rotlwi	r3, r0, 16
>> +	add	r3, r0, r3
>> +	not	r3, r3
>> +	rlwinm	r3, r3, 16, 16, 31
>> +	blr
>> +EXPORT_SYMBOL(csum_ipv6_magic)
> 
> Clustering the loads and carry insns together is pretty much the worst you
> can do on most 32-bit CPUs.

Oh, really ? __csum_partial is written that way too.

Right, now I tried interleaving the lwz and adde. I get no improvment at 
all on a 885, but I get a 15% improvment on a 8321.

Christophe

> 
> 
> Segher
> 

^ permalink raw reply

* [PATCH net-next 1/1] bnx2x: Collect the device debug information during Tx timeout.
From: Sudarsana Reddy Kalluru @ 2018-05-24  6:21 UTC (permalink / raw)
  To: davem; +Cc: netdev

Tx-timeout mostly happens due to some issue in the device. In such cases,
debug dump would be helpful for identifying the cause of the issue.
This patch adds support to spill debug data during the Tx timeout. Here
bnx2x_panic_dump() API is used instead of bnx2x_panic(), since we still
want to allow the Tx-timeout recovery a chance to succeed.

Please consider applying this to "net-next".

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 95871576..182d5e1 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -4962,8 +4962,13 @@ void bnx2x_tx_timeout(struct net_device *dev)
 {
 	struct bnx2x *bp = netdev_priv(dev);

-#ifdef BNX2X_STOP_ON_ERROR
+	/* We want the information of the dump logged,
+	 * but calling bnx2x_panic() would kill all chances of recovery.
+	 */
 	if (!bp->panic)
+#ifdef BNX2X_STOP_ON_ERROR
+		bnx2x_panic_dump(bp, false);
+#else
 		bnx2x_panic();
 #endif

-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH bpf-next v3 01/15] net: initial AF_XDP skeleton
From: Björn Töpel @ 2018-05-24  6:38 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Karlsson, Magnus, Duyck, Alexander H, Alexander Duyck,
	John Fastabend, Alexei Starovoitov, Jesper Dangaard Brouer,
	Willem de Bruijn, Daniel Borkmann, Michael S. Tsirkin, Netdev,
	Björn Töpel, michael.lundkvist, Brandeburg, Jesse,
	Singhai, Anjali, Zhang, Qi Z
In-Reply-To: <20180523155047.6c136279@xeon-e3>

2018-05-24 0:50 GMT+02:00 Stephen Hemminger <stephen@networkplumber.org>:
> On Wed,  2 May 2018 13:01:22 +0200
> Björn Töpel <bjorn.topel@gmail.com> wrote:
>
>> diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig
>> new file mode 100644
>> index 000000000000..90e4a7152854
>> --- /dev/null
>> +++ b/net/xdp/Kconfig
>> @@ -0,0 +1,7 @@
>> +config XDP_SOCKETS
>> +     bool "XDP sockets"
>> +     depends on BPF_SYSCALL
>> +     default n
>> +     help
>> +       XDP sockets allows a channel between XDP programs and
>> +       userspace applications.
>
> Why is XDP not supported as a module?
> Most distributions will want it to be a module so that it is not loaded
> unless used, and AF_XDP could be also be disabled by blacklisting the module.

Yes, all good points, and The Grand Plan is adding module support.
Unfortunately, it's not there yet.

^ permalink raw reply

* Re: [patch iproute2/net-next 2/2] devlink: introduce support for showing port number and split subport number
From: Jiri Pirko @ 2018-05-24  6:39 UTC (permalink / raw)
  To: David Ahern
  Cc: netdev, idosch, jakub.kicinski, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, simon.horman,
	pieter.jansenvanvuuren, john.hurley, dirk.vandermerwe,
	alexander.h.duyck, ogerlitz, vijaya.guvva, satananda.burla,
	raghu.vatsavayi, felix.manlunas, gospo, sathya.perla,
	vasundhara-v.volam, tariqt, eranbe, jeffrey.t.kirsher, roopa
In-Reply-To: <fdd0bb05-b5c1-7805-e8e8-30b6580ca4fd@gmail.com>

Wed, May 23, 2018 at 10:05:49PM CEST, dsahern@gmail.com wrote:
>On 5/20/18 2:15 AM, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>> 
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>  devlink/devlink.c            | 6 ++++++
>>  include/uapi/linux/devlink.h | 2 ++
>>  2 files changed, 8 insertions(+)
>> 
>> diff --git a/devlink/devlink.c b/devlink/devlink.c
>> index df2c66dac1c7..b0ae17767dab 100644
>> --- a/devlink/devlink.c
>> +++ b/devlink/devlink.c
>> @@ -1737,9 +1737,15 @@ static void pr_out_port(struct dl *dl, struct nlattr **tb)
>>  
>>  		pr_out_str(dl, "flavour", port_flavour_name(port_flavour));
>>  	}
>> +	if (tb[DEVLINK_ATTR_PORT_NUMBER])
>> +		pr_out_uint(dl, "number",
>> +			    mnl_attr_get_u32(tb[DEVLINK_ATTR_PORT_NUMBER]));
>
>"number" is a label means nothing. "port" is more descriptive.

That attribute name is "port_number". As the other attributes are
named "port_something", and the "something" is printed out here, the
"number" is consistent with it. Each line represents a port with a list
of attributes.

>
># ./devlink port
>pci/0000:03:00.0/1: type eth netdev swp17 flavour physical number 17
>pci/0000:03:00.0/3: type eth netdev swp18 flavour physical number 18
>pci/0000:03:00.0/5: type eth netdev swp19 flavour physical number 19
>pci/0000:03:00.0/7: type eth netdev swp20 flavour physical number 20
>pci/0000:03:00.0/9: type eth netdev swp21 flavour physical number 21
>...
>pci/0000:03:00.0/61: type eth netdev swp1s0 flavour physical number 1
>split_group 1 subport 0
>pci/0000:03:00.0/62: type eth netdev swp1s1 flavour physical number 1
>split_group 1 subport 1
>

^ permalink raw reply

* Re: [PATCH net-next v4 2/2] openvswitch: Support conntrack zone limit
From: Pravin Shelar @ 2018-05-24  6:49 UTC (permalink / raw)
  To: Yi-Hung Wei; +Cc: Linux Kernel Network Developers
In-Reply-To: <1526948165-32443-3-git-send-email-yihung.wei@gmail.com>

On Mon, May 21, 2018 at 5:16 PM, Yi-Hung Wei <yihung.wei@gmail.com> wrote:
> Currently, nf_conntrack_max is used to limit the maximum number of
> conntrack entries in the conntrack table for every network namespace.
> For the VMs and containers that reside in the same namespace,
> they share the same conntrack table, and the total # of conntrack entries
> for all the VMs and containers are limited by nf_conntrack_max.  In this
> case, if one of the VM/container abuses the usage the conntrack entries,
> it blocks the others from committing valid conntrack entries into the
> conntrack table.  Even if we can possibly put the VM in different network
> namespace, the current nf_conntrack_max configuration is kind of rigid
> that we cannot limit different VM/container to have different # conntrack
> entries.
>
> To address the aforementioned issue, this patch proposes to have a
> fine-grained mechanism that could further limit the # of conntrack entries
> per-zone.  For example, we can designate different zone to different VM,
> and set conntrack limit to each zone.  By providing this isolation, a
> mis-behaved VM only consumes the conntrack entries in its own zone, and
> it will not influence other well-behaved VMs.  Moreover, the users can
> set various conntrack limit to different zone based on their preference.
>
> The proposed implementation utilizes Netfilter's nf_conncount backend
> to count the number of connections in a particular zone.  If the number of
> connection is above a configured limitation, ovs will return ENOMEM to the
> userspace.  If userspace does not configure the zone limit, the limit
> defaults to zero that is no limitation, which is backward compatible to
> the behavior without this patch.
>
> The following high leve APIs are provided to the userspace:
>   - OVS_CT_LIMIT_CMD_SET:
>     * set default connection limit for all zones
>     * set the connection limit for a particular zone
>   - OVS_CT_LIMIT_CMD_DEL:
>     * remove the connection limit for a particular zone
>   - OVS_CT_LIMIT_CMD_GET:
>     * get the default connection limit for all zones
>     * get the connection limit for a particular zone
>
> Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>

I have few comments, but otherwise patch looks good.
> ---
>  net/openvswitch/Kconfig     |   3 +-
>  net/openvswitch/conntrack.c | 541 +++++++++++++++++++++++++++++++++++++++++++-
>  net/openvswitch/conntrack.h |   9 +-
>  net/openvswitch/datapath.c  |   7 +-
>  net/openvswitch/datapath.h  |   3 +
>  5 files changed, 557 insertions(+), 6 deletions(-)
>
> diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
> index 2650205cdaf9..89da9512ec1e 100644
> --- a/net/openvswitch/Kconfig
> +++ b/net/openvswitch/Kconfig
> @@ -9,7 +9,8 @@ config OPENVSWITCH
>                    (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
>                                      (!NF_NAT || NF_NAT) && \
>                                      (!NF_NAT_IPV4 || NF_NAT_IPV4) && \
> -                                    (!NF_NAT_IPV6 || NF_NAT_IPV6)))
> +                                    (!NF_NAT_IPV6 || NF_NAT_IPV6) && \
> +                                    (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT)))
>         select LIBCRC32C
>         select MPLS
>         select NET_MPLS_GSO
> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
> index 02fc343feb66..e8bb91420ca9 100644
> --- a/net/openvswitch/conntrack.c
> +++ b/net/openvswitch/conntrack.c
> @@ -16,8 +16,11 @@
>  #include <linux/tcp.h>
>  #include <linux/udp.h>
>  #include <linux/sctp.h>
> +#include <linux/static_key.h>
>  #include <net/ip.h>
> +#include <net/genetlink.h>
>  #include <net/netfilter/nf_conntrack_core.h>
> +#include <net/netfilter/nf_conntrack_count.h>
>  #include <net/netfilter/nf_conntrack_helper.h>
>  #include <net/netfilter/nf_conntrack_labels.h>
>  #include <net/netfilter/nf_conntrack_seqadj.h>
> @@ -76,6 +79,31 @@ struct ovs_conntrack_info {
>  #endif
>  };
>
> +#if    IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
> +#define OVS_CT_LIMIT_UNLIMITED 0
> +#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED
> +#define CT_LIMIT_HASH_BUCKETS 512
> +static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled);
> +
> +struct ovs_ct_limit {
> +       /* Elements in ovs_ct_limit_info->limits hash table */
> +       struct hlist_node hlist_node;
> +       struct rcu_head rcu;
> +       u16 zone;
> +       u32 limit;
> +};
> +
> +struct ovs_ct_limit_info {
> +       u32 default_limit;
> +       struct hlist_head *limits;
> +       struct nf_conncount_data *data __aligned(8);

Why does it need explicit alignment attribute?

> +};
> +
> +static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = {
> +       [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, },
> +};
> +#endif
> +
>  static bool labels_nonzero(const struct ovs_key_ct_labels *labels);

...

> +static int ovs_ct_check_limit(struct net *net,
> +                             const struct ovs_conntrack_info *info,
> +                             const struct nf_conntrack_tuple *tuple)
> +{
> +       struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
> +       const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
> +       u32 per_zone_limit, connections;
> +       u32 conncount_key[5];

If the key size of single u32, why the array of 5 is defined for the key?

> +
> +       conncount_key[0] = info->zone.id;
> +
> +       per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id);
> +       if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED)
> +               return 0;
> +
> +       connections = nf_conncount_count(net, ct_limit_info->data,
> +                                        conncount_key, tuple, &info->zone);
> +       if (connections > per_zone_limit)
> +               return -ENOMEM;
> +
> +       return 0;
...

> -void ovs_ct_init(struct net *net)
> +#if    IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
> +static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
> +{
> +       int i, err;
> +
> +       ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info),
> +                                        GFP_KERNEL);
> +       if (!ovs_net->ct_limit_info)
> +               return -ENOMEM;
> +
> +       ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT;
> +       ovs_net->ct_limit_info->limits =
> +               kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head),
> +                             GFP_KERNEL);
> +       if (!ovs_net->ct_limit_info->limits) {
> +               kfree(ovs_net->ct_limit_info);
> +               return -ENOMEM;
> +       }
> +
> +       for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
> +               INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);
> +
> +       ovs_net->ct_limit_info->data =
> +               nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
> +
> +       if (IS_ERR(ovs_net->ct_limit_info->data)) {

Can you print error msg, other wise it would be really hard to debug a
namespace launch failure due to this issue.

> +               err = PTR_ERR(ovs_net->ct_limit_info->data);
> +               kfree(ovs_net->ct_limit_info->limits);
> +               kfree(ovs_net->ct_limit_info);
> +               return err;
> +       }
> +       return 0;
> +}
> +
....
> +static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
> +                                      struct ovs_ct_limit_info *info)
> +{
> +       struct ovs_zone_limit *zone_limit;
> +       int rem;
> +       u16 zone;
> +
> +       rem = NLA_ALIGN(nla_len(nla_zone_limit));
> +       zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
> +
> +       while (rem >= sizeof(*zone_limit)) {
> +               if (unlikely(!check_zone_id(zone_limit->zone_id, &zone))) {
> +                       OVS_NLERR(true, "zone id is out of range");

There is no need to check if the port is out of range when we are
deleting it. since hash table lookup would fail anyways.

> +               } else {
> +                       ovs_lock();
> +                       ct_limit_del(info, zone);
> +                       ovs_unlock();
> +               }
> +               rem -= NLA_ALIGN(sizeof(*zone_limit));
> +               zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
> +                               NLA_ALIGN(sizeof(*zone_limit)));
> +       }
> +

This API does not handle delete of default limit.

> +       if (rem)
> +               OVS_NLERR(true, "del zone limit has %d unknown bytes", rem);
> +
> +       return 0;
> +}
> +
> +static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
> +                                         struct sk_buff *reply)
> +{
> +       struct ovs_zone_limit zone_limit;
> +       int err;
> +
> +       zone_limit.zone_id = -1;

This is part of UAPI, Can you define constant in openvswitch.h for
default zone id to be -1.

> +       zone_limit.limit = info->default_limit;
> +       err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
> +       if (err)
> +               return err;
> +
> +       return 0;
> +}
> +
> +static int ovs_ct_limit_get_zone_limit(struct net *net,
> +                                      struct nlattr *nla_zone_limit,
> +                                      struct ovs_ct_limit_info *info,
> +                                      struct sk_buff *reply)
> +{
> +       struct nf_conntrack_zone ct_zone;
> +       struct ovs_zone_limit *zone_limit;
> +       int rem, err;
> +       u32 conncount_key[5];
> +       u16 zone;
> +
> +       rem = NLA_ALIGN(nla_len(nla_zone_limit));
> +       zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit);
> +
> +       while (rem >= sizeof(*zone_limit)) {
> +               if (unlikely(zone_limit->zone_id == -1)) {
> +                       err = ovs_ct_limit_get_default_limit(info, reply);
> +                       if (err)
> +                               return err;
> +               } else if (unlikely(!check_zone_id(zone_limit->zone_id,
> +                                                       &zone))) {
> +                       OVS_NLERR(true, "zone id is out of range");
> +               } else {
> +                       rcu_read_lock();
> +                       zone_limit->limit = ct_limit_get(info, zone);
> +                       rcu_read_unlock();
> +
> +                       nf_ct_zone_init(&ct_zone, zone, NF_CT_DEFAULT_ZONE_DIR,
> +                                       0);
> +                       conncount_key[0] = zone;
> +                       zone_limit->count = nf_conncount_count(
> +                               net, info->data, conncount_key, NULL, &ct_zone);
> +                       err = nla_put_nohdr(reply, sizeof(*zone_limit),
> +                                           zone_limit);
> +                       if (err)
> +                               return err;
> +               }
> +               rem -= NLA_ALIGN(sizeof(*zone_limit));
> +               zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit +
> +                               NLA_ALIGN(sizeof(*zone_limit)));
> +       }
> +
> +       if (rem)
> +               OVS_NLERR(true, "get zone limit has %d unknown bytes", rem);
> +
> +       return 0;
> +}
> +
> +static int ovs_ct_limit_get_all_zone_limit(struct net *net,
> +                                          struct ovs_ct_limit_info *info,
> +                                          struct sk_buff *reply)
> +{
> +       struct nf_conntrack_zone ct_zone;
> +       struct ovs_zone_limit zone_limit;
> +       struct ovs_ct_limit *ct_limit;
> +       struct hlist_head *head;
> +       u32 conncount_key[5];
> +       int i, err = 0;
> +
> +       err = ovs_ct_limit_get_default_limit(info, reply);
> +       if (err)
> +               return err;
> +
> +       rcu_read_lock();
> +       for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
> +               head = &info->limits[i];
> +               hlist_for_each_entry_rcu(ct_limit, head, hlist_node) {
> +                       zone_limit.zone_id = ct_limit->zone;
> +                       zone_limit.limit = ct_limit->limit;
> +                       nf_ct_zone_init(&ct_zone, ct_limit->zone,
> +                                       NF_CT_DEFAULT_ZONE_DIR, 0);
> +
> +                       conncount_key[0] = ct_limit->zone;
> +                       zone_limit.count = nf_conncount_count(net, info->data,
> +                                       conncount_key, NULL, &ct_zone);
> +                       err = nla_put_nohdr(reply, sizeof(zone_limit),
> +                                       &zone_limit);
> +                       if (err)
> +                               goto exit_err;
Can you write a single helper function to build reply zone_limit
object that can be used in ovs_ct_limit_get_zone_limit() and
ovs_ct_limit_get_all_zone_limit()?

> +               }
> +       }
> +
> +exit_err:
> +       rcu_read_unlock();
> +       return err;
> +}
> +
...

^ permalink raw reply

* [PATCH 0/4] RFC CPSW switchdev mode
From: Ilias Apalodimas @ 2018-05-24  6:56 UTC (permalink / raw)
  To: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera
  Cc: francois.ozog, yogeshs, spatton, Ilias Apalodimas

Hello, 

This is adding a new mode on the cpsw driver based around switchdev.
In order to enable this you need to enable CONFIG_NET_SWITCHDEV, 
CONFIG_BRIDGE_VLAN_FILTERING, CONFIG_TI_CPSW_SWITCHDEV
and add to udev config: 

SUBSYSTEM=="net", ACTION=="add", ATTR{phys_switch_id}=="0f011900", \
        ATTR{phys_port_name}!="", NAME="sw0$attr{phys_port_name}"
Since the phys_switch_id is based on cpsw version, users with different 
version will need to do 'ip -d link show dev sw0p0 | grep switchid' and 
replace with the correct value.

This patch creates 3 ports, sw0p0, sw0p1 and sw0p2.
sw0p1 and sw0p2 are the netdev interfaces connected to PHY devices
while sw0p0 is the switch 'cpu facing port'.
sw0p0 will be unable to receive and transmit traffic and it's not 100% within
switchdev scope but, it's used to configure switch cpu port individually as 
this is needed for various switch features and configuration scenarios.

Bridge setup:
ip link add name br0 type bridge
ip link set dev br0 type bridge ageing_time 1000
ip link set dev br0 type bridge vlan_filtering 1

ip link set dev sw0p1 up
ip link set dev sw0p2 up
ip link set dev sw0p0 up
ip link set dev sw0p0 master br0
ip link set dev sw0p2 master br0
ip link set dev sw0p1 master br0

ip link set br0 address $(cat /sys/class/net/sw0p1/address)
ifconfig br0 up

VLAN config:
untagged:
bridge vlan add dev sw0p1 vid 100 pvid untagged master
bridge vlan add dev sw0p2 vid 100 pvid untagged master

tagged:
bridge vlan add dev sw0p1 vid 100 master
bridge vlan add dev sw0p2 vid 100 master

IP address on br0:
bridge vlan add dev br0 vid 100 pvid untagged self
bridge vlan add dev sw0p0 vid 100 pvid untagged master
udhcpc -i br0

FDBs:
bridge fdb add aa:bb:cc:dd:ee:ff dev sw0p1 master vlan 100
bridge fdb add aa:bb:cc:dd:ee:fe dev sw0p2 master

MDBs:
single vlan:
bridge mdb add dev br0 port sw0p1 grp 239.1.1.1 permanent vid 100

all vlans:
bridge mdb add dev br0 port sw0p2 grp 239.1.1.1 permanent
bridge mdb add dev br0 port sw0p0 grp 239.1.1.1 permanent

Multicast:
setting multicast on and off will affect registered multicast
setting allmulti on and off will affect unregistered multicast
This muct occur before adding VLANs on the interfaces. If you change the
flag after the VLAN configuration you need to re-issue the VLAN config 
commands.

Promiscuous mode:
Adding/removing sw0p0 on the bridge will enable/disable ALE_P0_UNI_FLOOD

NFS:
The only way for NFS to work is by chrooting to a minimal environment when 
switch configuration that will affect connectivity is needed.
Assuming you are booting NFS with eth1 interface(the script is hacky and 
it's just there to prove NFS is doable).

setup.sh:
#!/bin/sh
mkdir proc
mount -t proc none /proc

ifconfig br0  > /dev/null
if [ $? -ne 0 ]; then
        echo "Setting up bridge"
        ip link add name br0 type bridge
        ip link set dev br0 type bridge ageing_time 1000
        ip link set dev br0 type bridge vlan_filtering 1

        ip link set eth1 down 
        ip link set eth1 name sw0p1 
        ip link set dev sw0p1 up
        ip link set dev sw0p2 up
        ip link set dev sw0p0 up
        ip link set dev sw0p0 master br0 
        ip link set dev sw0p2 master br0
        ip link set dev sw0p1 master br0
        ifconfig sw0p1 0.0.0.0
        udhchc -i br0
fi
umount /proc

run_nfs.sh:
#!/bin/sh
mkdir /tmp/root/bin -p
mkdir /tmp/root/lib -p

cp -r /lib/ /tmp/root/
cp -r /bin/ /tmp/root/
cp /sbin/ip /tmp/root/bin
cp /sbin/bridge /tmp/root/bin
cp /sbin/ifconfig /tmp/root/bin
cp /sbin/udhcpc /tmp/root/bin
cp /path/to/setup.sh /tmp/root/bin
chroot /tmp/root/ busybox sh /bin/run_nfs.sh

run ./run_nfs.sh

This is on top of 4.17-rc2 tree.

P.S: I am not 100% sure that the promiscuity handling is correct.
Please let me know if i should change anything on that

Ilias Apalodimas (4):
  cpsw: move common headers definitions to cpsw_priv.h
  cpsw_ale: add support functions for switchdev
  cpsw_switchdev: add switchdev support files
  cpsw: add switchdev support

 drivers/net/ethernet/ti/Kconfig          |   9 +
 drivers/net/ethernet/ti/Makefile         |   1 +
 drivers/net/ethernet/ti/cpsw.c           | 610 ++++++++++++++++++++++---------
 drivers/net/ethernet/ti/cpsw_ale.c       | 129 +++++++
 drivers/net/ethernet/ti/cpsw_ale.h       |   8 +
 drivers/net/ethernet/ti/cpsw_priv.h      | 148 ++++++++
 drivers/net/ethernet/ti/cpsw_switchdev.c | 299 +++++++++++++++
 drivers/net/ethernet/ti/cpsw_switchdev.h |   4 +
 8 files changed, 1039 insertions(+), 169 deletions(-)
 create mode 100644 drivers/net/ethernet/ti/cpsw_priv.h
 create mode 100644 drivers/net/ethernet/ti/cpsw_switchdev.c
 create mode 100644 drivers/net/ethernet/ti/cpsw_switchdev.h

-- 
2.7.4

^ permalink raw reply

* [PATCH 1/4] cpsw: move common headers definitions to cpsw_priv.h
From: Ilias Apalodimas @ 2018-05-24  6:56 UTC (permalink / raw)
  To: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera
  Cc: francois.ozog, yogeshs, spatton, Ilias Apalodimas
In-Reply-To: <1527144984-31236-1-git-send-email-ilias.apalodimas@linaro.org>

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
---
 drivers/net/ethernet/ti/cpsw.c      | 111 +---------------------------
 drivers/net/ethernet/ti/cpsw_priv.h | 141 ++++++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+), 110 deletions(-)
 create mode 100644 drivers/net/ethernet/ti/cpsw_priv.h

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 3037127..b16e7cf 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -41,6 +41,7 @@
 
 #include "cpsw.h"
 #include "cpsw_ale.h"
+#include "cpsw_priv.h"
 #include "cpts.h"
 #include "davinci_cpdma.h"
 
@@ -88,7 +89,6 @@ do {								\
 #define CPSW_VERSION_3		0x19010f
 #define CPSW_VERSION_4		0x190112
 
-#define HOST_PORT_NUM		0
 #define CPSW_ALE_PORTS_NUM	3
 #define SLIVER_SIZE		0x40
 
@@ -309,16 +309,6 @@ struct cpsw_ss_regs {
 #define CPSW_MAX_BLKS_TX_SHIFT		4
 #define CPSW_MAX_BLKS_RX		5
 
-struct cpsw_host_regs {
-	u32	max_blks;
-	u32	blk_cnt;
-	u32	tx_in_ctl;
-	u32	port_vlan;
-	u32	tx_pri_map;
-	u32	cpdma_tx_pri_map;
-	u32	cpdma_rx_chan_map;
-};
-
 struct cpsw_sliver_regs {
 	u32	id_ver;
 	u32	mac_control;
@@ -370,105 +360,6 @@ struct cpsw_hw_stats {
 	u32	rxdmaoverruns;
 };
 
-struct cpsw_slave_data {
-	struct device_node *phy_node;
-	char		phy_id[MII_BUS_ID_SIZE];
-	int		phy_if;
-	u8		mac_addr[ETH_ALEN];
-	u16		dual_emac_res_vlan;	/* Reserved VLAN for DualEMAC */
-};
-
-struct cpsw_platform_data {
-	struct cpsw_slave_data	*slave_data;
-	u32	ss_reg_ofs;	/* Subsystem control register offset */
-	u32	channels;	/* number of cpdma channels (symmetric) */
-	u32	slaves;		/* number of slave cpgmac ports */
-	u32	active_slave; /* time stamping, ethtool and SIOCGMIIPHY slave */
-	u32	ale_entries;	/* ale table size */
-	u32	bd_ram_size;  /*buffer descriptor ram size */
-	u32	mac_control;	/* Mac control register */
-	u16	default_vlan;	/* Def VLAN for ALE lookup in VLAN aware mode*/
-	bool	dual_emac;	/* Enable Dual EMAC mode */
-};
-
-struct cpsw_slave {
-	void __iomem			*regs;
-	struct cpsw_sliver_regs __iomem	*sliver;
-	int				slave_num;
-	u32				mac_control;
-	struct cpsw_slave_data		*data;
-	struct phy_device		*phy;
-	struct net_device		*ndev;
-	u32				port_vlan;
-};
-
-static inline u32 slave_read(struct cpsw_slave *slave, u32 offset)
-{
-	return readl_relaxed(slave->regs + offset);
-}
-
-static inline void slave_write(struct cpsw_slave *slave, u32 val, u32 offset)
-{
-	writel_relaxed(val, slave->regs + offset);
-}
-
-struct cpsw_vector {
-	struct cpdma_chan *ch;
-	int budget;
-};
-
-struct cpsw_common {
-	struct device			*dev;
-	struct cpsw_platform_data	data;
-	struct napi_struct		napi_rx;
-	struct napi_struct		napi_tx;
-	struct cpsw_ss_regs __iomem	*regs;
-	struct cpsw_wr_regs __iomem	*wr_regs;
-	u8 __iomem			*hw_stats;
-	struct cpsw_host_regs __iomem	*host_port_regs;
-	u32				version;
-	u32				coal_intvl;
-	u32				bus_freq_mhz;
-	int				rx_packet_max;
-	struct cpsw_slave		*slaves;
-	struct cpdma_ctlr		*dma;
-	struct cpsw_vector		txv[CPSW_MAX_QUEUES];
-	struct cpsw_vector		rxv[CPSW_MAX_QUEUES];
-	struct cpsw_ale			*ale;
-	bool				quirk_irq;
-	bool				rx_irq_disabled;
-	bool				tx_irq_disabled;
-	u32 irqs_table[IRQ_NUM];
-	struct cpts			*cpts;
-	int				rx_ch_num, tx_ch_num;
-	int				speed;
-	int				usage_count;
-};
-
-struct cpsw_priv {
-	struct net_device		*ndev;
-	struct device			*dev;
-	u32				msg_enable;
-	u8				mac_addr[ETH_ALEN];
-	bool				rx_pause;
-	bool				tx_pause;
-	u32 emac_port;
-	struct cpsw_common *cpsw;
-};
-
-struct cpsw_stats {
-	char stat_string[ETH_GSTRING_LEN];
-	int type;
-	int sizeof_stat;
-	int stat_offset;
-};
-
-enum {
-	CPSW_STATS,
-	CPDMA_RX_STATS,
-	CPDMA_TX_STATS,
-};
-
 #define CPSW_STAT(m)		CPSW_STATS,				\
 				sizeof(((struct cpsw_hw_stats *)0)->m), \
 				offsetof(struct cpsw_hw_stats, m)
diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
new file mode 100644
index 0000000..3b02a83
--- /dev/null
+++ b/drivers/net/ethernet/ti/cpsw_priv.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+
+#define HOST_PORT_NUM		0
+#define IRQ_NUM			2
+#define CPSW_MAX_QUEUES		8
+
+#define CPSW_VERSION_1		0x19010a
+#define CPSW_VERSION_2		0x19010c
+#define CPSW_VERSION_3		0x19010f
+#define CPSW_VERSION_4		0x190112
+
+/* CPSW_PORT_V1 */
+#define CPSW1_MAX_BLKS      0x00 /* Maximum FIFO Blocks */
+#define CPSW1_BLK_CNT       0x04 /* FIFO Block Usage Count (Read Only) */
+#define CPSW1_TX_IN_CTL     0x08 /* Transmit FIFO Control */
+#define CPSW1_PORT_VLAN     0x0c /* VLAN Register */
+#define CPSW1_TX_PRI_MAP    0x10 /* Tx Header Priority to Switch Pri Mapping */
+#define CPSW1_TS_CTL        0x14 /* Time Sync Control */
+#define CPSW1_TS_SEQ_LTYPE  0x18 /* Time Sync Sequence ID Offset and Msg Type */
+#define CPSW1_TS_VLAN       0x1c /* Time Sync VLAN1 and VLAN2 */
+
+/* CPSW_PORT_V2 */
+#define CPSW2_CONTROL       0x00 /* Control Register */
+#define CPSW2_MAX_BLKS      0x08 /* Maximum FIFO Blocks */
+#define CPSW2_BLK_CNT       0x0c /* FIFO Block Usage Count (Read Only) */
+#define CPSW2_TX_IN_CTL     0x10 /* Transmit FIFO Control */
+#define CPSW2_PORT_VLAN     0x14 /* VLAN Register */
+#define CPSW2_TX_PRI_MAP    0x18 /* Tx Header Priority to Switch Pri Mapping */
+#define CPSW2_TS_SEQ_MTYPE  0x1c /* Time Sync Sequence ID Offset and Msg Type */
+
+struct cpsw_slave_data {
+	struct	device_node *phy_node;
+	char	phy_id[MII_BUS_ID_SIZE];
+	int	phy_if;
+	u8	mac_addr[ETH_ALEN];
+	u16	dual_emac_res_vlan;	/* Reserved VLAN for DualEMAC */
+};
+
+struct cpsw_platform_data {
+	struct cpsw_slave_data	*slave_data;
+	u32	ss_reg_ofs;	/* Subsystem control register offset */
+	u32	channels;	/* number of cpdma channels (symmetric) */
+	u32	slaves;		/* number of slave cpgmac ports */
+	u32	active_slave; /* time stamping, ethtool and SIOCGMIIPHY slave */
+	u32	ale_entries;	/* ale table size */
+	u32	bd_ram_size;  /*buffer descriptor ram size */
+	u32	mac_control;	/* Mac control register */
+	u16	default_vlan;	/* Def VLAN for ALE lookup in VLAN aware mode*/
+	bool	dual_emac;	/* Enable Dual EMAC mode */
+};
+
+struct cpsw_slave {
+	void __iomem			*regs;
+	struct cpsw_sliver_regs __iomem	*sliver;
+	int				slave_num;
+	u32				mac_control;
+	struct cpsw_slave_data		*data;
+	struct phy_device		*phy;
+	struct net_device		*ndev;
+	u32				port_vlan;
+};
+
+struct cpsw_vector {
+	struct cpdma_chan *ch;
+	int budget;
+};
+
+struct cpsw_common {
+	struct device			*dev;
+	struct cpsw_platform_data	data;
+	struct napi_struct		napi_rx;
+	struct napi_struct		napi_tx;
+	struct cpsw_ss_regs __iomem	*regs;
+	struct cpsw_wr_regs __iomem	*wr_regs;
+	u8 __iomem			*hw_stats;
+	struct cpsw_host_regs __iomem	*host_port_regs;
+	u32				version;
+	u32				coal_intvl;
+	u32				bus_freq_mhz;
+	int				rx_packet_max;
+	struct cpsw_slave		*slaves;
+	struct cpdma_ctlr		*dma;
+	struct cpsw_vector		txv[CPSW_MAX_QUEUES];
+	struct cpsw_vector		rxv[CPSW_MAX_QUEUES];
+	struct cpsw_ale			*ale;
+	bool				quirk_irq;
+	bool				rx_irq_disabled;
+	bool				tx_irq_disabled;
+	u32				irqs_table[IRQ_NUM];
+	struct cpts			*cpts;
+	int				rx_ch_num, tx_ch_num;
+	int				speed;
+	int				usage_count;
+};
+
+struct cpsw_priv {
+	struct net_device	*ndev;
+	struct device		*dev;
+	u32			msg_enable;
+	u8			mac_addr[ETH_ALEN];
+	bool			rx_pause;
+	bool			tx_pause;
+	u8			port_state[3];
+	u32			emac_port;
+	struct cpsw_common	*cpsw;
+};
+
+struct cpsw_stats {
+	char stat_string[ETH_GSTRING_LEN];
+	int type;
+	int sizeof_stat;
+	int stat_offset;
+};
+
+enum {
+	CPSW_STATS,
+	CPDMA_RX_STATS,
+	CPDMA_TX_STATS,
+};
+
+struct cpsw_host_regs {
+	u32	max_blks;
+	u32	blk_cnt;
+	u32	tx_in_ctl;
+	u32	port_vlan;
+	u32	tx_pri_map;
+	u32	cpdma_tx_pri_map;
+	u32	cpdma_rx_chan_map;
+};
+
+static inline u32 slave_read(struct cpsw_slave *slave, u32 offset)
+{
+	return readl_relaxed(slave->regs + offset);
+}
+
+static inline void slave_write(struct cpsw_slave *slave, u32 val, u32 offset)
+{
+	writel_relaxed(val, slave->regs + offset);
+}
-- 
2.7.4

^ permalink raw reply related

* [PATCH 2/4] cpsw_ale: add support functions for switchdev
From: Ilias Apalodimas @ 2018-05-24  6:56 UTC (permalink / raw)
  To: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera
  Cc: francois.ozog, yogeshs, spatton, Ilias Apalodimas
In-Reply-To: <1527144984-31236-1-git-send-email-ilias.apalodimas@linaro.org>

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
---
 drivers/net/ethernet/ti/cpsw_ale.c | 129 +++++++++++++++++++++++++++++++++++++
 drivers/net/ethernet/ti/cpsw_ale.h |   8 +++
 2 files changed, 137 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c
index 93dc05c..0b7383f 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -409,6 +409,41 @@ int cpsw_ale_del_mcast(struct cpsw_ale *ale, u8 *addr, int port_mask,
 }
 EXPORT_SYMBOL_GPL(cpsw_ale_del_mcast);
 
+static int cpsw_ale_read_mc(struct cpsw_ale *ale, u8 *addr, int flags, u16 vid)
+{
+	u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0};
+	int idx;
+
+	idx = cpsw_ale_match_addr(ale, addr, (flags & ALE_VLAN) ? vid : 0);
+	if (idx >= 0)
+		cpsw_ale_read(ale, idx, ale_entry);
+
+	return cpsw_ale_get_port_mask(ale_entry, ale->port_mask_bits);
+}
+
+int cpsw_ale_mcast_add_modify(struct cpsw_ale *ale, u8 *addr, int port_mask,
+			      int flags, u16 vid, int mcast_state)
+{
+	int mcast_members, ret;
+
+	mcast_members = cpsw_ale_read_mc(ale, addr, flags, vid) | port_mask;
+	ret = cpsw_ale_add_mcast(ale, addr, mcast_members, flags, vid, 0);
+
+	return ret;
+}
+
+int cpsw_ale_mcast_del_modify(struct cpsw_ale *ale, u8 *addr, int port_mask,
+			      int flags, u16 vid)
+{
+	int mcast_members, ret;
+
+	mcast_members = cpsw_ale_read_mc(ale, addr, flags, vid) & ~port_mask;
+	ret = cpsw_ale_del_mcast(ale, addr, mcast_members, flags, vid);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpsw_ale_mcast_del_modify);
+
 /* ALE NetCP NU switch specific vlan functions */
 static void cpsw_ale_set_vlan_mcast(struct cpsw_ale *ale, u32 *ale_entry,
 				    int reg_mcast, int unreg_mcast)
@@ -424,6 +459,52 @@ static void cpsw_ale_set_vlan_mcast(struct cpsw_ale *ale, u32 *ale_entry,
 	writel(unreg_mcast, ale->params.ale_regs + ALE_VLAN_MASK_MUX(idx));
 }
 
+static int cpsw_ale_read_untagged(struct cpsw_ale *ale, u16 vid)
+{
+	u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0};
+	int idx;
+
+	idx = cpsw_ale_match_vlan(ale, vid);
+	if (idx >= 0)
+		cpsw_ale_read(ale, idx, ale_entry);
+
+	return cpsw_ale_get_vlan_untag_force(ale_entry, ale->vlan_field_bits);
+}
+
+/* returns mask of current members for specificed vlan */
+static int cpsw_ale_read_vlan_members(struct cpsw_ale *ale, u16 vid)
+{
+	u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0};
+	int idx;
+
+	idx = cpsw_ale_match_vlan(ale, vid);
+	if (idx >= 0)
+		cpsw_ale_read(ale, idx, ale_entry);
+
+	return cpsw_ale_get_vlan_member_list(ale_entry, ale->vlan_field_bits);
+}
+
+/* returns mask of registered/unregistered multicast registration */
+static int cpsw_ale_read_reg_unreg_mc(struct cpsw_ale *ale, u16 vid, bool unreg)
+{
+	u32 ale_entry[ALE_ENTRY_WORDS] = {0, 0, 0};
+	int idx;
+	int ret;
+
+	idx = cpsw_ale_match_vlan(ale, vid);
+	if (idx >= 0)
+		cpsw_ale_read(ale, idx, ale_entry);
+
+	if (unreg)
+		ret = cpsw_ale_get_vlan_unreg_mcast(ale_entry,
+						    ale->vlan_field_bits);
+	else
+		ret = cpsw_ale_get_vlan_reg_mcast(ale_entry,
+						  ale->vlan_field_bits);
+
+	return ret;
+}
+
 int cpsw_ale_add_vlan(struct cpsw_ale *ale, u16 vid, int port, int untag,
 		      int reg_mcast, int unreg_mcast)
 {
@@ -482,6 +563,54 @@ int cpsw_ale_del_vlan(struct cpsw_ale *ale, u16 vid, int port_mask)
 }
 EXPORT_SYMBOL_GPL(cpsw_ale_del_vlan);
 
+int cpsw_ale_vlan_add_modify(struct cpsw_ale *ale, u16 vid, int port_mask,
+			     int untag_mask, int reg_mask, int unreg_mask)
+{
+	int ret = 0;
+	int vlan_members = cpsw_ale_read_vlan_members(ale, vid) & ~port_mask;
+	int reg_mcast_members =
+		cpsw_ale_read_reg_unreg_mc(ale, vid, 0) & ~port_mask;
+	int unreg_mcast_members =
+		cpsw_ale_read_reg_unreg_mc(ale, vid, 1) & ~port_mask;
+	int untag_members = cpsw_ale_read_untagged(ale, vid) & ~port_mask;
+
+	vlan_members |= port_mask;
+	untag_members |= untag_mask;
+	reg_mcast_members |= reg_mask;
+	unreg_mcast_members |= unreg_mask;
+
+	ret = cpsw_ale_add_vlan(ale, vid, vlan_members, untag_members,
+				reg_mcast_members, unreg_mcast_members);
+	if (ret) {
+		dev_err(ale->params.dev, "Unable to add vlan\n");
+		return ret;
+	}
+	dev_dbg(ale->params.dev,  "port mask 0x%x untag 0x%x\n", vlan_members,
+		untag_mask);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpsw_ale_vlan_add_modify);
+
+int cpsw_ale_vlan_del_modify(struct cpsw_ale *ale, u16 vid, int port_mask)
+{
+	int ret = 0;
+	int vlan_members;
+
+	vlan_members = cpsw_ale_read_vlan_members(ale, vid);
+	vlan_members &= ~port_mask;
+
+	ret = cpsw_ale_del_vlan(ale, vid, vlan_members);
+	if (ret) {
+		dev_err(ale->params.dev, "Unable to del vlan\n");
+		return ret;
+	}
+	dev_dbg(ale->params.dev, "port mask 0x%x\n", port_mask);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cpsw_ale_vlan_del_modify);
+
 void cpsw_ale_set_allmulti(struct cpsw_ale *ale, int allmulti)
 {
 	u32 ale_entry[ALE_ENTRY_WORDS];
diff --git a/drivers/net/ethernet/ti/cpsw_ale.h b/drivers/net/ethernet/ti/cpsw_ale.h
index d4fe901..bc29616 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.h
+++ b/drivers/net/ethernet/ti/cpsw_ale.h
@@ -123,4 +123,12 @@ int cpsw_ale_control_set(struct cpsw_ale *ale, int port,
 			 int control, int value);
 void cpsw_ale_dump(struct cpsw_ale *ale, u32 *data);
 
+int cpsw_ale_vlan_add_modify(struct cpsw_ale *ale, u16 vid, int port_mask,
+			     int untag_mask, int reg_mcast, int unreg_mcast);
+int cpsw_ale_vlan_del_modify(struct cpsw_ale *ale, u16 vid, int port_mask);
+int cpsw_ale_mcast_add_modify(struct cpsw_ale *ale, u8 *addr, int port_mask,
+			      int flags, u16 vid, int mcast_state);
+int cpsw_ale_mcast_del_modify(struct cpsw_ale *ale, u8 *addr, int port,
+			      int flags, u16 vid);
+
 #endif
-- 
2.7.4

^ permalink raw reply related

* [PATCH 3/4] cpsw_switchdev: add switchdev support files
From: Ilias Apalodimas @ 2018-05-24  6:56 UTC (permalink / raw)
  To: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera
  Cc: francois.ozog, yogeshs, spatton, Ilias Apalodimas
In-Reply-To: <1527144984-31236-1-git-send-email-ilias.apalodimas@linaro.org>

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
---
 drivers/net/ethernet/ti/Kconfig          |   9 +
 drivers/net/ethernet/ti/Makefile         |   1 +
 drivers/net/ethernet/ti/cpsw_switchdev.c | 299 +++++++++++++++++++++++++++++++
 drivers/net/ethernet/ti/cpsw_switchdev.h |   4 +
 4 files changed, 313 insertions(+)
 create mode 100644 drivers/net/ethernet/ti/cpsw_switchdev.c
 create mode 100644 drivers/net/ethernet/ti/cpsw_switchdev.h

diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
index 48a541e..b22ae7d 100644
--- a/drivers/net/ethernet/ti/Kconfig
+++ b/drivers/net/ethernet/ti/Kconfig
@@ -73,6 +73,15 @@ config TI_CPSW
 	  To compile this driver as a module, choose M here: the module
 	  will be called cpsw.
 
+config TI_CPSW_SWITCHDEV
+	bool "TI CPSW switchdev support"
+	depends on TI_CPSW
+	depends on NET_SWITCHDEV
+	help
+	  Enable switchdev support on TI's CPSW Ethernet Switch.
+
+	  This will allow you to configure the switch using standard tools.
+
 config TI_CPTS
 	bool "TI Common Platform Time Sync (CPTS) Support"
 	depends on TI_CPSW || TI_KEYSTONE_NETCP
diff --git a/drivers/net/ethernet/ti/Makefile b/drivers/net/ethernet/ti/Makefile
index 0be551d..3926c6a 100644
--- a/drivers/net/ethernet/ti/Makefile
+++ b/drivers/net/ethernet/ti/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_TI_CPSW_PHY_SEL) += cpsw-phy-sel.o
 obj-$(CONFIG_TI_CPSW_ALE) += cpsw_ale.o
 obj-$(CONFIG_TI_CPTS_MOD) += cpts.o
 obj-$(CONFIG_TI_CPSW) += ti_cpsw.o
+ti_cpsw-objs:= cpsw_switchdev.o
 ti_cpsw-y := cpsw.o
 
 obj-$(CONFIG_TI_KEYSTONE_NETCP) += keystone_netcp.o
diff --git a/drivers/net/ethernet/ti/cpsw_switchdev.c b/drivers/net/ethernet/ti/cpsw_switchdev.c
new file mode 100644
index 0000000..bf8c1bf
--- /dev/null
+++ b/drivers/net/ethernet/ti/cpsw_switchdev.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Texas Instruments switchdev Driver
+ *
+ * Copyright (C) 2018 Texas Instruments
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
+#include <net/switchdev.h>
+#include "cpsw.h"
+#include "cpsw_priv.h"
+#include "cpsw_ale.h"
+
+static u32 cpsw_switchdev_get_ver(struct net_device *ndev)
+{
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	struct cpsw_common *cpsw = priv->cpsw;
+
+	return cpsw->version;
+}
+
+static int cpsw_port_attr_set(struct net_device *dev,
+			      const struct switchdev_attr *attr,
+			      struct switchdev_trans *trans)
+{
+	return -EOPNOTSUPP;
+}
+
+static int cpsw_port_attr_get(struct net_device *dev,
+			      struct switchdev_attr *attr)
+{
+	u32 cpsw_ver;
+	int err = 0;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
+		cpsw_ver = cpsw_switchdev_get_ver(dev);
+		attr->u.ppid.id_len = sizeof(cpsw_ver);
+		memcpy(&attr->u.ppid.id, &cpsw_ver, attr->u.ppid.id_len);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return err;
+}
+
+static u16 cpsw_get_pvid(struct cpsw_priv *priv)
+{
+	struct cpsw_common *cpsw = priv->cpsw;
+	u32 __iomem *port_vlan_reg;
+	u32 pvid;
+
+	if (priv->emac_port) {
+		int reg = CPSW2_PORT_VLAN;
+
+		if (cpsw->version == CPSW_VERSION_1)
+			reg = CPSW1_PORT_VLAN;
+		pvid = slave_read(cpsw->slaves + (priv->emac_port - 1), reg);
+	} else {
+		port_vlan_reg = &cpsw->host_port_regs->port_vlan;
+		pvid = readl(port_vlan_reg);
+	}
+
+	pvid = pvid & 0xfff;
+
+	return pvid;
+}
+
+static void cpsw_set_pvid(struct cpsw_priv *priv, u16 vid, bool cfi, u32 cos)
+{
+	struct cpsw_common *cpsw = priv->cpsw;
+	void __iomem *port_vlan_reg;
+	u32 pvid;
+
+	pvid = vid;
+	pvid |= cfi ? BIT(12) : 0;
+	pvid |= (cos & 0x7) << 13;
+
+	if (priv->emac_port) {
+		int reg = CPSW2_PORT_VLAN;
+
+		if (cpsw->version == CPSW_VERSION_1)
+			reg = CPSW1_PORT_VLAN;
+		/* no barrier */
+		slave_write(cpsw->slaves + (priv->emac_port - 1), pvid, reg);
+	} else {
+		/* CPU port */
+		port_vlan_reg = &cpsw->host_port_regs->port_vlan;
+		writel(pvid, port_vlan_reg);
+	}
+}
+
+static int cpsw_port_vlan_add(struct cpsw_priv *priv, bool untag, bool pvid,
+			      u16 vid)
+{
+	struct cpsw_common *cpsw = priv->cpsw;
+	int port_mask = BIT(priv->emac_port);
+	int unreg_mcast_mask = 0;
+	int reg_mcast_mask = 0;
+	int untag_mask = 0;
+	int ret = 0;
+
+	if (priv->ndev->flags & IFF_ALLMULTI)
+		unreg_mcast_mask = port_mask;
+
+	if (priv->ndev->flags & IFF_MULTICAST)
+		reg_mcast_mask = port_mask;
+
+	if (untag)
+		untag_mask = port_mask;
+
+	ret = cpsw_ale_vlan_add_modify(cpsw->ale, vid, port_mask, untag_mask,
+				       reg_mcast_mask, unreg_mcast_mask);
+	if (ret) {
+		dev_err(priv->dev, "Unable to add vlan\n");
+		return ret;
+	}
+
+	if (!pvid)
+		return ret;
+
+	cpsw_set_pvid(priv, vid, 0, 0);
+
+	dev_dbg(priv->dev, "VID: %u dev: %s port: %u\n", vid,
+		priv->ndev->name, priv->emac_port);
+
+	return ret;
+}
+
+static int cpsw_port_vlan_del(struct cpsw_priv *priv, u16 vid)
+{
+	struct cpsw_common *cpsw = priv->cpsw;
+	int port_mask = BIT(priv->emac_port);
+	int ret = 0;
+
+	ret = cpsw_ale_vlan_del_modify(cpsw->ale, vid, port_mask);
+	if (ret != 0)
+		return ret;
+
+	ret = cpsw_ale_del_ucast(cpsw->ale, priv->mac_addr,
+				 HOST_PORT_NUM, ALE_VLAN, vid);
+
+	if (vid == cpsw_get_pvid(priv))
+		cpsw_set_pvid(priv, 0, 0, 0);
+
+	if (ret != 0) {
+		dev_dbg(priv->dev, "Failed to delete unicast entry\n");
+		ret = 0;
+	}
+
+	ret = cpsw_ale_del_mcast(cpsw->ale, priv->ndev->broadcast,
+				 0, ALE_VLAN, vid);
+	if (ret != 0) {
+		dev_dbg(priv->dev, "Failed to delete multicast entry\n");
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static int cpsw_port_vlans_add(struct cpsw_priv *priv,
+			       const struct switchdev_obj_port_vlan *vlan,
+			       struct switchdev_trans *trans)
+{
+	bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
+	bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID;
+	u16 vid;
+
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) {
+		int err;
+
+		err = cpsw_port_vlan_add(priv, untagged, pvid, vid);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int cpsw_port_vlans_del(struct cpsw_priv *priv,
+			       const struct switchdev_obj_port_vlan *vlan)
+
+{
+	u16 vid;
+
+	for (vid = vlan->vid_begin; vid <= vlan->vid_end; vid++) {
+		int err;
+
+		err = cpsw_port_vlan_del(priv, vid);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int cpsw_port_mdb_add(struct cpsw_priv *priv,
+			     struct switchdev_obj_port_mdb *mdb,
+			     struct switchdev_trans *trans)
+{
+	struct cpsw_common *cpsw = priv->cpsw;
+	int port_mask;
+	int err;
+
+	if (switchdev_trans_ph_prepare(trans))
+		return 0;
+
+	port_mask = BIT(priv->emac_port);
+	err = cpsw_ale_mcast_add_modify(cpsw->ale, mdb->addr, port_mask,
+					ALE_VLAN, mdb->vid, 0);
+
+	return err;
+}
+
+static int cpsw_port_mdb_del(struct cpsw_priv *priv,
+			     struct switchdev_obj_port_mdb *mdb)
+
+{
+	struct cpsw_common *cpsw = priv->cpsw;
+	int del_mask;
+	int err;
+
+	del_mask = BIT(priv->emac_port);
+	err = cpsw_ale_mcast_del_modify(cpsw->ale, mdb->addr, del_mask,
+					ALE_VLAN, mdb->vid);
+
+	return err;
+}
+
+static int cpsw_port_obj_add(struct net_device *ndev,
+			     const struct switchdev_obj *obj,
+			     struct switchdev_trans *trans)
+{
+	struct switchdev_obj_port_vlan *vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+	struct switchdev_obj_port_mdb *mdb = SWITCHDEV_OBJ_PORT_MDB(obj);
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	int err = 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		err = cpsw_port_vlans_add(priv, vlan, trans);
+		break;
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		err = cpsw_port_mdb_add(priv, mdb, trans);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static int cpsw_port_obj_del(struct net_device *ndev,
+			     const struct switchdev_obj *obj)
+{
+	struct switchdev_obj_port_vlan *vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	int err = 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		err = cpsw_port_vlans_del(priv, vlan);
+		break;
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		err = cpsw_port_mdb_del(priv, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static const struct switchdev_ops cpsw_port_switchdev_ops = {
+	.switchdev_port_attr_set	= cpsw_port_attr_set,
+	.switchdev_port_attr_get	= cpsw_port_attr_get,
+	.switchdev_port_obj_add		= cpsw_port_obj_add,
+	.switchdev_port_obj_del		= cpsw_port_obj_del,
+};
+
+void cpsw_port_switchdev_init(struct net_device *ndev)
+{
+	ndev->switchdev_ops = &cpsw_port_switchdev_ops;
+}
diff --git a/drivers/net/ethernet/ti/cpsw_switchdev.h b/drivers/net/ethernet/ti/cpsw_switchdev.h
new file mode 100644
index 0000000..4940462
--- /dev/null
+++ b/drivers/net/ethernet/ti/cpsw_switchdev.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <net/switchdev.h>
+
+void cpsw_port_switchdev_init(struct net_device *ndev);
-- 
2.7.4

^ permalink raw reply related

* [PATCH 4/4] cpsw: add switchdev support
From: Ilias Apalodimas @ 2018-05-24  6:56 UTC (permalink / raw)
  To: netdev, grygorii.strashko, ivan.khoronzhuk, nsekhar, jiri,
	ivecera
  Cc: francois.ozog, yogeshs, spatton, Ilias Apalodimas
In-Reply-To: <1527144984-31236-1-git-send-email-ilias.apalodimas@linaro.org>

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
---
 drivers/net/ethernet/ti/cpsw.c      | 503 +++++++++++++++++++++++++++++++-----
 drivers/net/ethernet/ti/cpsw_priv.h |   9 +-
 2 files changed, 450 insertions(+), 62 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index b16e7cf..8f8ebd8 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -18,12 +18,10 @@
 #include <linux/clk.h>
 #include <linux/timer.h>
 #include <linux/module.h>
-#include <linux/platform_device.h>
 #include <linux/irqreturn.h>
 #include <linux/interrupt.h>
 #include <linux/if_ether.h>
 #include <linux/etherdevice.h>
-#include <linux/netdevice.h>
 #include <linux/net_tstamp.h>
 #include <linux/phy.h>
 #include <linux/workqueue.h>
@@ -42,6 +40,7 @@
 #include "cpsw.h"
 #include "cpsw_ale.h"
 #include "cpsw_priv.h"
+#include "cpsw_switchdev.h"
 #include "cpts.h"
 #include "davinci_cpdma.h"
 
@@ -146,9 +145,6 @@ do {								\
 #define CPSW_CMINTMAX_INTVL	(1000 / CPSW_CMINTMIN_CNT)
 #define CPSW_CMINTMIN_INTVL	((1000 / CPSW_CMINTMAX_CNT) + 1)
 
-#define cpsw_slave_index(cpsw, priv)				\
-		((cpsw->data.dual_emac) ? priv->emac_port :	\
-		cpsw->data.active_slave)
 #define IRQ_NUM			2
 #define CPSW_MAX_QUEUES		8
 #define CPSW_CPDMA_DESCS_POOL_SIZE_DEFAULT 256
@@ -360,6 +356,13 @@ struct cpsw_hw_stats {
 	u32	rxdmaoverruns;
 };
 
+struct cpsw_switchdev_event_work {
+	struct work_struct work;
+	struct switchdev_notifier_fdb_info fdb_info;
+	struct cpsw_priv *priv;
+	unsigned long event;
+};
+
 #define CPSW_STAT(m)		CPSW_STATS,				\
 				sizeof(((struct cpsw_hw_stats *)0)->m), \
 				offsetof(struct cpsw_hw_stats, m)
@@ -433,18 +436,22 @@ static const struct cpsw_stats cpsw_gstrings_ch_stats[] = {
 		struct cpsw_slave *slave;				\
 		struct cpsw_common *cpsw = (priv)->cpsw;		\
 		int n;							\
-		if (cpsw->data.dual_emac)				\
-			(func)((cpsw)->slaves + priv->emac_port, ##arg);\
-		else							\
+		if (cpsw->data.switch_mode) {				\
+			if (priv->emac_port == HOST_PORT_NUM)		\
+				break;					\
+			(func)((cpsw)->slaves + (priv->emac_port - 1),  \
+			       ##arg);\
+		} else {						\
 			for (n = cpsw->data.slaves,			\
 					slave = cpsw->slaves;		\
 					n; n--)				\
 				(func)(slave++, ##arg);			\
+		}							\
 	} while (0)
 
 #define cpsw_dual_emac_src_port_detect(cpsw, status, ndev, skb)		\
 	do {								\
-		if (!cpsw->data.dual_emac)				\
+		if (!cpsw->data.switch_mode)				\
 			break;						\
 		if (CPDMA_RX_SOURCE_PORT(status) == 1) {		\
 			ndev = cpsw->slaves[0].ndev;			\
@@ -456,11 +463,13 @@ static const struct cpsw_stats cpsw_gstrings_ch_stats[] = {
 	} while (0)
 #define cpsw_add_mcast(cpsw, priv, addr)				\
 	do {								\
-		if (cpsw->data.dual_emac) {				\
+		if (cpsw->data.switch_mode) {				\
 			struct cpsw_slave *slave = cpsw->slaves +	\
-						priv->emac_port;	\
+						(priv->emac_port - 1);	\
 			int slave_port = cpsw_get_slave_port(		\
 						slave->slave_num);	\
+			if (priv->emac_port == HOST_PORT_NUM)		\
+				break;					\
 			cpsw_ale_add_mcast(cpsw->ale, addr,		\
 				1 << slave_port | ALE_PORT_HOST,	\
 				ALE_VLAN, slave->port_vlan, 0);		\
@@ -476,13 +485,41 @@ static inline int cpsw_get_slave_port(u32 slave_num)
 	return slave_num + 1;
 }
 
+static int cpsw_is_dual_mac(u8 switch_mode)
+{
+	return switch_mode == CPSW_DUAL_EMAC;
+}
+
+static int cpsw_is_switchdev(u8 switch_mode)
+{
+	return switch_mode == CPSW_SWITCHDEV;
+}
+
+static int cpsw_is_switch(u8 switch_mode)
+{
+	return switch_mode == CPSW_TI_SWITCH;
+}
+
+static int cpsw_slave_index(struct cpsw_priv *priv)
+{
+	struct cpsw_common *cpsw = priv->cpsw;
+
+#if IS_ENABLED(CONFIG_TI_CPSW_SWITCHDEV)
+	if (priv->emac_port == HOST_PORT_NUM)
+		return -1;
+#endif
+
+	return cpsw->data.switch_mode ? priv->emac_port - 1 :
+		cpsw->data.active_slave;
+}
+
 static void cpsw_set_promiscious(struct net_device *ndev, bool enable)
 {
 	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
 	struct cpsw_ale *ale = cpsw->ale;
 	int i;
 
-	if (cpsw->data.dual_emac) {
+	if (cpsw_is_dual_mac(cpsw->data.switch_mode)) {
 		bool flag = false;
 
 		/* Enabling promiscuous mode for one interface will be
@@ -508,7 +545,7 @@ static void cpsw_set_promiscious(struct net_device *ndev, bool enable)
 			cpsw_ale_control_set(ale, 0, ALE_BYPASS, 0);
 			dev_dbg(&ndev->dev, "promiscuity disabled\n");
 		}
-	} else {
+	} else if (cpsw_is_switch(cpsw->data.switch_mode)) {
 		if (enable) {
 			unsigned long timeout = jiffies + HZ;
 
@@ -548,6 +585,18 @@ static void cpsw_set_promiscious(struct net_device *ndev, bool enable)
 			}
 			dev_dbg(&ndev->dev, "promiscuity disabled\n");
 		}
+	} else if (cpsw_is_switchdev(cpsw->data.switch_mode)) {
+		/* When interfaces are placed into a bridge they'll switch to
+		 * promiscuous mode. In switchdev case ALE_P0_UNI_FLOOD is
+		 * changed whether the cpu port participates in the bridge
+		 */
+		struct cpsw_priv *priv = netdev_priv(ndev);
+		int slave_idx = cpsw_slave_index(priv);
+		int slave_num;
+
+		slave_num = cpsw_get_slave_port(slave_idx);
+		cpsw_ale_control_set(ale, slave_num, ALE_PORT_NOLEARN, 0);
+		cpsw_ale_control_set(ale, slave_num, ALE_PORT_NO_SA_UPDATE, 0);
 	}
 }
 
@@ -555,10 +604,11 @@ static void cpsw_ndo_set_rx_mode(struct net_device *ndev)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
+	int slave_no = cpsw_slave_index(priv);
 	int vid;
 
-	if (cpsw->data.dual_emac)
-		vid = cpsw->slaves[priv->emac_port].port_vlan;
+	if (cpsw_is_dual_mac(cpsw->data.switch_mode))
+		vid = cpsw->slaves[slave_no].port_vlan;
 	else
 		vid = cpsw->data.default_vlan;
 
@@ -629,8 +679,9 @@ static void cpsw_tx_handler(void *token, int len, int status)
 static void cpsw_rx_vlan_encap(struct sk_buff *skb)
 {
 	struct cpsw_priv *priv = netdev_priv(skb->dev);
-	struct cpsw_common *cpsw = priv->cpsw;
 	u32 rx_vlan_encap_hdr = *((u32 *)skb->data);
+	struct cpsw_common *cpsw = priv->cpsw;
+	int slave_no = cpsw_slave_index(priv);
 	u16 vtag, vid, prio, pkt_type;
 
 	/* Remove VLAN header encapsulation word */
@@ -651,8 +702,8 @@ static void cpsw_rx_vlan_encap(struct sk_buff *skb)
 	if (!vid)
 		return;
 	/* Ignore default vlans in dual mac mode */
-	if (cpsw->data.dual_emac &&
-	    vid == cpsw->slaves[priv->emac_port].port_vlan)
+	if (cpsw_is_dual_mac(cpsw->data.switch_mode) &&
+	    vid == cpsw->slaves[slave_no].port_vlan)
 		return;
 
 	prio = (rx_vlan_encap_hdr >>
@@ -681,9 +732,9 @@ static void cpsw_rx_handler(void *token, int len, int status)
 	cpsw_dual_emac_src_port_detect(cpsw, status, ndev, skb);
 
 	if (unlikely(status < 0) || unlikely(!netif_running(ndev))) {
-		/* In dual emac mode check for all interfaces */
-		if (cpsw->data.dual_emac && cpsw->usage_count &&
-		    (status >= 0)) {
+		/* In any other that switch mode check for all interfaces */
+		if (!cpsw_is_switch(cpsw->data.switch_mode) &&
+		    cpsw->usage_count && status >= 0) {
 			/* The packet received is for the interface which
 			 * is already down and the other interface is up
 			 * and running, instead of freeing which results
@@ -699,6 +750,11 @@ static void cpsw_rx_handler(void *token, int len, int status)
 		return;
 	}
 
+#if IS_ENABLED(CONFIG_TI_CPSW_SWITCHDEV)
+	if (cpsw_is_switchdev(cpsw->data.switch_mode))
+		skb->offload_fwd_mark = 1;
+#endif
+
 	new_skb = netdev_alloc_skb_ip_align(ndev, cpsw->rx_packet_max);
 	if (new_skb) {
 		skb_copy_queue_mapping(new_skb, skb);
@@ -1206,11 +1262,10 @@ static inline int cpsw_tx_packet_submit(struct cpsw_priv *priv,
 					struct sk_buff *skb,
 					struct cpdma_chan *txch)
 {
-	struct cpsw_common *cpsw = priv->cpsw;
-
 	skb_tx_timestamp(skb);
+
 	return cpdma_chan_submit(txch, skb, skb->data, skb->len,
-				 priv->emac_port + cpsw->data.dual_emac);
+				 priv->emac_port);
 }
 
 static inline void cpsw_add_dual_emac_def_ale_entries(
@@ -1283,7 +1338,7 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv)
 
 	slave_port = cpsw_get_slave_port(slave->slave_num);
 
-	if (cpsw->data.dual_emac)
+	if (cpsw_is_dual_mac(cpsw->data.switch_mode))
 		cpsw_add_dual_emac_def_ale_entries(priv, slave, slave_port);
 	else
 		cpsw_ale_add_mcast(cpsw->ale, priv->ndev->broadcast,
@@ -1362,8 +1417,8 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
 	control_reg = readl(&cpsw->regs->control);
 	control_reg |= CPSW_VLAN_AWARE | CPSW_RX_VLAN_ENCAP;
 	writel(control_reg, &cpsw->regs->control);
-	fifo_mode = (cpsw->data.dual_emac) ? CPSW_FIFO_DUAL_MAC_MODE :
-		     CPSW_FIFO_NORMAL_MODE;
+	fifo_mode = cpsw_is_dual_mac(cpsw->data.switch_mode) ?
+		CPSW_FIFO_DUAL_MAC_MODE : CPSW_FIFO_NORMAL_MODE;
 	writel(fifo_mode, &cpsw->host_port_regs->tx_in_ctl);
 
 	/* setup host port priority mapping */
@@ -1374,7 +1429,7 @@ static void cpsw_init_host_port(struct cpsw_priv *priv)
 	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM,
 			     ALE_PORT_STATE, ALE_PORT_STATE_FORWARD);
 
-	if (!cpsw->data.dual_emac) {
+	if (!cpsw_is_dual_mac(cpsw->data.switch_mode)) {
 		cpsw_ale_add_ucast(cpsw->ale, priv->mac_addr, HOST_PORT_NUM,
 				   0, 0);
 		cpsw_ale_add_mcast(cpsw->ale, priv->ndev->broadcast,
@@ -1474,14 +1529,19 @@ static int cpsw_ndo_open(struct net_device *ndev)
 	/* Initialize host and slave ports */
 	if (!cpsw->usage_count)
 		cpsw_init_host_port(priv);
-	for_each_slave(priv, cpsw_slave_open, priv);
 
-	/* Add default VLAN */
-	if (!cpsw->data.dual_emac)
-		cpsw_add_default_vlan(priv);
-	else
-		cpsw_ale_add_vlan(cpsw->ale, cpsw->data.default_vlan,
-				  ALE_ALL_PORTS, ALE_ALL_PORTS, 0, 0);
+	if (!IS_ENABLED(CONFIG_TI_CPSW_SWITCHDEV) ||
+	    (IS_ENABLED(CONFIG_TI_CPSW_SWITCHDEV) &&
+	     priv->emac_port != HOST_PORT_NUM)) {
+		for_each_slave(priv, cpsw_slave_open, priv);
+
+		/* Add default VLAN */
+		if (cpsw_is_dual_mac(cpsw->data.switch_mode))
+			cpsw_ale_add_vlan(cpsw->ale, cpsw->data.default_vlan,
+					  ALE_ALL_PORTS, ALE_ALL_PORTS, 0, 0);
+		else
+			cpsw_add_default_vlan(priv);
+	}
 
 	/* initialize shared resources for every ndev */
 	if (!cpsw->usage_count) {
@@ -1575,6 +1635,13 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb,
 	struct cpdma_chan *txch;
 	int ret, q_idx;
 
+#if IS_ENABLED(CONFIG_TI_CPSW_SWITCHDEV)
+	if (priv->emac_port == HOST_PORT_NUM) {
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+#endif
+
 	if (skb_padto(skb, CPSW_MIN_PACKET_SIZE)) {
 		cpsw_err(priv, tx_err, "packet pad failed\n");
 		ndev->stats.tx_dropped++;
@@ -1655,8 +1722,12 @@ static void cpsw_hwtstamp_v2(struct cpsw_priv *priv)
 	struct cpsw_slave *slave;
 	struct cpsw_common *cpsw = priv->cpsw;
 	u32 ctrl, mtype;
+	int slave_no = cpsw_slave_index(priv);
+
+	if (slave_no < 0)
+		return;
 
-	slave = &cpsw->slaves[cpsw_slave_index(cpsw, priv)];
+	slave = &cpsw->slaves[slave_no];
 
 	ctrl = slave_read(slave, CPSW2_CONTROL);
 	switch (cpsw->version) {
@@ -1791,11 +1862,14 @@ static int cpsw_ndo_ioctl(struct net_device *dev, struct ifreq *req, int cmd)
 {
 	struct cpsw_priv *priv = netdev_priv(dev);
 	struct cpsw_common *cpsw = priv->cpsw;
-	int slave_no = cpsw_slave_index(cpsw, priv);
+	int slave_no = cpsw_slave_index(priv);
 
 	if (!netif_running(dev))
 		return -EINVAL;
 
+	if (slave_no < 0)
+		return -EOPNOTSUPP;
+
 	switch (cmd) {
 	case SIOCSHWTSTAMP:
 		return cpsw_hwtstamp_set(dev, req);
@@ -1832,6 +1906,7 @@ static int cpsw_ndo_set_mac_address(struct net_device *ndev, void *p)
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct sockaddr *addr = (struct sockaddr *)p;
 	struct cpsw_common *cpsw = priv->cpsw;
+	int slave_no = cpsw_slave_index(priv);
 	int flags = 0;
 	u16 vid = 0;
 	int ret;
@@ -1845,8 +1920,8 @@ static int cpsw_ndo_set_mac_address(struct net_device *ndev, void *p)
 		return ret;
 	}
 
-	if (cpsw->data.dual_emac) {
-		vid = cpsw->slaves[priv->emac_port].port_vlan;
+	if (cpsw_is_dual_mac(cpsw->data.switch_mode)) {
+		vid = cpsw->slaves[slave_no].port_vlan;
 		flags = ALE_VLAN;
 	}
 
@@ -1884,8 +1959,11 @@ static inline int cpsw_add_vlan_ale_entry(struct cpsw_priv *priv,
 	u32 port_mask;
 	struct cpsw_common *cpsw = priv->cpsw;
 
-	if (cpsw->data.dual_emac) {
-		port_mask = (1 << (priv->emac_port + 1)) | ALE_PORT_HOST;
+	if (cpsw_is_switchdev(cpsw->data.switch_mode))
+		return -EOPNOTSUPP;
+
+	if (cpsw_is_dual_mac(cpsw->data.switch_mode)) {
+		port_mask = (1 << priv->emac_port) | ALE_PORT_HOST;
 
 		if (priv->ndev->flags & IFF_ALLMULTI)
 			unreg_mcast_mask = port_mask;
@@ -1929,6 +2007,9 @@ static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev,
 	struct cpsw_common *cpsw = priv->cpsw;
 	int ret;
 
+	if (cpsw_is_switchdev(cpsw->data.switch_mode))
+		return 0;
+
 	if (vid == cpsw->data.default_vlan)
 		return 0;
 
@@ -1938,7 +2019,7 @@ static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev,
 		return ret;
 	}
 
-	if (cpsw->data.dual_emac) {
+	if (cpsw_is_dual_mac(cpsw->data.switch_mode)) {
 		/* In dual EMAC, reserved VLAN id should not be used for
 		 * creating VLAN interfaces as this can break the dual
 		 * EMAC port separation
@@ -1965,6 +2046,9 @@ static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev,
 	struct cpsw_common *cpsw = priv->cpsw;
 	int ret;
 
+	if (cpsw_is_switchdev(cpsw->data.switch_mode))
+		return 0;
+
 	if (vid == cpsw->data.default_vlan)
 		return 0;
 
@@ -1974,7 +2058,7 @@ static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev,
 		return ret;
 	}
 
-	if (cpsw->data.dual_emac) {
+	if (cpsw_is_dual_mac(cpsw->data.switch_mode)) {
 		int i;
 
 		for (i = 0; i < cpsw->data.slaves; i++) {
@@ -1999,6 +2083,24 @@ static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev,
 	return ret;
 }
 
+static int cpsw_ndo_get_phys_port_name(struct net_device *ndev, char *name,
+				       size_t len)
+{
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	struct cpsw_common *cpsw = priv->cpsw;
+	int err;
+
+	if (!cpsw_is_switchdev(cpsw->data.switch_mode))
+		return -EOPNOTSUPP;
+
+	err = snprintf(name, len, "p%d", priv->emac_port);
+
+	if (err >= len)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int cpsw_ndo_set_tx_maxrate(struct net_device *ndev, int queue, u32 rate)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
@@ -2065,6 +2167,7 @@ static const struct net_device_ops cpsw_netdev_ops = {
 #endif
 	.ndo_vlan_rx_add_vid	= cpsw_ndo_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= cpsw_ndo_vlan_rx_kill_vid,
+	.ndo_get_phys_port_name = cpsw_ndo_get_phys_port_name,
 };
 
 static int cpsw_get_regs_len(struct net_device *ndev)
@@ -2152,7 +2255,10 @@ static int cpsw_get_link_ksettings(struct net_device *ndev,
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
-	int slave_no = cpsw_slave_index(cpsw, priv);
+	int slave_no = cpsw_slave_index(priv);
+
+	if (slave_no < 0)
+		return -EOPNOTSUPP;
 
 	if (!cpsw->slaves[slave_no].phy)
 		return -EOPNOTSUPP;
@@ -2166,7 +2272,10 @@ static int cpsw_set_link_ksettings(struct net_device *ndev,
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
-	int slave_no = cpsw_slave_index(cpsw, priv);
+	int slave_no = cpsw_slave_index(priv);
+
+	if (slave_no < 0)
+		return -EOPNOTSUPP;
 
 	if (cpsw->slaves[slave_no].phy)
 		return phy_ethtool_ksettings_set(cpsw->slaves[slave_no].phy,
@@ -2179,7 +2288,10 @@ static void cpsw_get_wol(struct net_device *ndev, struct ethtool_wolinfo *wol)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
-	int slave_no = cpsw_slave_index(cpsw, priv);
+	int slave_no = cpsw_slave_index(priv);
+
+	if (slave_no < 0)
+		return;
 
 	wol->supported = 0;
 	wol->wolopts = 0;
@@ -2192,7 +2304,10 @@ static int cpsw_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
-	int slave_no = cpsw_slave_index(cpsw, priv);
+	int slave_no = cpsw_slave_index(priv);
+
+	if (slave_no < 0)
+		return -EOPNOTSUPP;
 
 	if (cpsw->slaves[slave_no].phy)
 		return phy_ethtool_set_wol(cpsw->slaves[slave_no].phy, wol);
@@ -2451,7 +2566,10 @@ static int cpsw_get_eee(struct net_device *ndev, struct ethtool_eee *edata)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
-	int slave_no = cpsw_slave_index(cpsw, priv);
+	int slave_no = cpsw_slave_index(priv);
+
+	if (slave_no < 0)
+		return -EOPNOTSUPP;
 
 	if (cpsw->slaves[slave_no].phy)
 		return phy_ethtool_get_eee(cpsw->slaves[slave_no].phy, edata);
@@ -2463,7 +2581,10 @@ static int cpsw_set_eee(struct net_device *ndev, struct ethtool_eee *edata)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
-	int slave_no = cpsw_slave_index(cpsw, priv);
+	int slave_no = cpsw_slave_index(priv);
+
+	if (slave_no < 0)
+		return -EOPNOTSUPP;
 
 	if (cpsw->slaves[slave_no].phy)
 		return phy_ethtool_set_eee(cpsw->slaves[slave_no].phy, edata);
@@ -2475,7 +2596,10 @@ static int cpsw_nway_reset(struct net_device *ndev)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
 	struct cpsw_common *cpsw = priv->cpsw;
-	int slave_no = cpsw_slave_index(cpsw, priv);
+	int slave_no = cpsw_slave_index(priv);
+
+	if (slave_no < 0)
+		return -EOPNOTSUPP;
 
 	if (cpsw->slaves[slave_no].phy)
 		return genphy_restart_aneg(cpsw->slaves[slave_no].phy);
@@ -2626,7 +2750,11 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 	data->mac_control = prop;
 
 	if (of_property_read_bool(node, "dual_emac"))
-		data->dual_emac = 1;
+		data->switch_mode = CPSW_DUAL_EMAC;
+
+	/* switchdev overrides DTS */
+	if (IS_ENABLED(CONFIG_TI_CPSW_SWITCHDEV))
+		data->switch_mode = CPSW_SWITCHDEV;
 
 	/*
 	 * Populate all the child nodes here...
@@ -2707,7 +2835,7 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
 			if (ret)
 				return ret;
 		}
-		if (data->dual_emac) {
+		if (cpsw_is_dual_mac(data->switch_mode)) {
 			if (of_property_read_u32(slave_node, "dual_emac_res_vlan",
 						 &prop)) {
 				dev_err(&pdev->dev, "Missing dual_emac_res_vlan in DT.\n");
@@ -2787,9 +2915,13 @@ static int cpsw_probe_dual_emac(struct cpsw_priv *priv)
 	}
 	memcpy(ndev->dev_addr, priv_sl2->mac_addr, ETH_ALEN);
 
-	priv_sl2->emac_port = 1;
+	priv_sl2->emac_port = 2;
 	cpsw->slaves[1].ndev = ndev;
 	ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+	if (cpsw_is_switchdev(cpsw->data.switch_mode)) {
+		ndev->features |= NETIF_F_NETNS_LOCAL;
+		cpsw_port_switchdev_init(ndev);
+	}
 
 	ndev->netdev_ops = &cpsw_netdev_ops;
 	ndev->ethtool_ops = &cpsw_ethtool_ops;
@@ -2806,6 +2938,49 @@ static int cpsw_probe_dual_emac(struct cpsw_priv *priv)
 	return ret;
 }
 
+static int cpsw_probe_cpu_port(struct cpsw_common *cpsw)
+{
+	struct cpsw_priv *priv_sl2;
+	struct net_device *ndev;
+	int ret = 0;
+
+	ndev = alloc_etherdev_mq(sizeof(struct cpsw_priv), CPSW_MAX_QUEUES);
+	if (!ndev) {
+		dev_err(cpsw->dev, "cpsw: error allocating net_device\n");
+		return -ENOMEM;
+	}
+
+	priv_sl2 = netdev_priv(ndev);
+	priv_sl2->cpsw = cpsw;
+	priv_sl2->ndev = ndev;
+	priv_sl2->dev  = &ndev->dev;
+	priv_sl2->msg_enable = netif_msg_init(debug_level, CPSW_DEBUG);
+
+	random_ether_addr(priv_sl2->mac_addr);
+	dev_info(cpsw->dev, "cpu port: Random MACID = %pM\n",
+		 priv_sl2->mac_addr);
+
+	memcpy(ndev->dev_addr, priv_sl2->mac_addr, ETH_ALEN);
+
+	priv_sl2->emac_port = HOST_PORT_NUM;
+	ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL;
+
+	ndev->netdev_ops = &cpsw_netdev_ops;
+
+	/* register the network device */
+	SET_NETDEV_DEV(ndev, cpsw->dev);
+	cpsw_port_switchdev_init(ndev);
+	ret = register_netdev(ndev);
+	if (ret) {
+		dev_err(cpsw->dev, "cpsw: error registering net device\n");
+		free_netdev(ndev);
+		ret = -ENODEV;
+	}
+	cpsw->master = ndev;
+
+	return ret;
+}
+
 #define CPSW_QUIRK_IRQ		BIT(0)
 
 static const struct platform_device_id cpsw_devtype[] = {
@@ -2844,6 +3019,187 @@ static const struct of_device_id cpsw_of_mtable[] = {
 };
 MODULE_DEVICE_TABLE(of, cpsw_of_mtable);
 
+static bool cpsw_port_dev_check(const struct net_device *dev)
+{
+	return dev->netdev_ops == &cpsw_netdev_ops;
+}
+
+static void cpsw_fdb_offload_notify(struct net_device *ndev,
+				    struct switchdev_notifier_fdb_info *rcv)
+{
+	struct switchdev_notifier_fdb_info info;
+
+	info.addr = rcv->addr;
+	info.vid = rcv->vid;
+	call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED,
+				 ndev, &info.info);
+}
+
+static void cpsw_switchdev_event_work(struct work_struct *work)
+{
+	struct cpsw_switchdev_event_work *switchdev_work =
+		container_of(work, struct cpsw_switchdev_event_work, work);
+	struct cpsw_priv *priv = switchdev_work->priv;
+	struct switchdev_notifier_fdb_info *fdb;
+	struct cpsw_common *cpsw = priv->cpsw;
+
+	rtnl_lock();
+	switch (switchdev_work->event) {
+	case SWITCHDEV_FDB_ADD_TO_DEVICE:
+		fdb = &switchdev_work->fdb_info;
+		cpsw_ale_add_ucast(cpsw->ale, (u8 *)fdb->addr, priv->emac_port,
+				   ALE_VLAN | ALE_SECURE, fdb->vid);
+		cpsw_fdb_offload_notify(priv->ndev, fdb);
+		break;
+	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		fdb = &switchdev_work->fdb_info;
+		cpsw_ale_del_ucast(cpsw->ale, (u8 *)fdb->addr, priv->emac_port,
+				   ALE_VLAN | ALE_SECURE, fdb->vid);
+		break;
+	default:
+		break;
+	}
+	rtnl_unlock();
+
+	kfree(switchdev_work->fdb_info.addr);
+	kfree(switchdev_work);
+	dev_put(priv->ndev);
+}
+
+/* called under rcu_read_lock() */
+static int cpsw_switchdev_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *ndev = switchdev_notifier_info_to_dev(ptr);
+	struct switchdev_notifier_fdb_info *fdb_info = ptr;
+	struct cpsw_switchdev_event_work *switchdev_work;
+	struct cpsw_priv *priv = netdev_priv(ndev);
+
+	if (!cpsw_port_dev_check(ndev))
+		return NOTIFY_DONE;
+
+	switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
+	if (WARN_ON(!switchdev_work))
+		return NOTIFY_BAD;
+
+	INIT_WORK(&switchdev_work->work, cpsw_switchdev_event_work);
+	switchdev_work->priv = priv;
+	switchdev_work->event = event;
+
+	switch (event) {
+	case SWITCHDEV_FDB_ADD_TO_DEVICE:
+	case SWITCHDEV_FDB_DEL_TO_DEVICE:
+		memcpy(&switchdev_work->fdb_info, ptr,
+		       sizeof(switchdev_work->fdb_info));
+		switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
+		ether_addr_copy((u8 *)switchdev_work->fdb_info.addr,
+				fdb_info->addr);
+		dev_hold(ndev);
+		break;
+	default:
+		kfree(switchdev_work);
+		return NOTIFY_DONE;
+	}
+
+	queue_work(system_long_wq, &switchdev_work->work);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block cpsw_switchdev_notifier = {
+	.notifier_call = cpsw_switchdev_event,
+};
+
+static void cpsw_netdevice_port_link(struct net_device *ndev)
+{
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	struct cpsw_common *cpsw = priv->cpsw;
+
+	if (priv->emac_port != HOST_PORT_NUM)
+		return;
+
+	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_P0_UNI_FLOOD, 1);
+}
+
+static void cpsw_netdevice_port_unlink(struct net_device *ndev)
+{
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	struct cpsw_common *cpsw = priv->cpsw;
+
+	if (priv->emac_port != HOST_PORT_NUM)
+		return;
+
+	cpsw_ale_control_set(cpsw->ale, HOST_PORT_NUM, ALE_P0_UNI_FLOOD, 0);
+}
+
+/* netdev notifier */
+static int cpsw_netdevice_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_notifier_changeupper_info *info;
+
+	switch (event) {
+	case NETDEV_CHANGEUPPER:
+		info = ptr;
+		if (!info->master)
+			goto out;
+		if (info->linking)
+			cpsw_netdevice_port_link(ndev);
+		else
+			cpsw_netdevice_port_unlink(ndev);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+out:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block cpsw_netdevice_nb __read_mostly = {
+	.notifier_call = cpsw_netdevice_event,
+};
+
+static int cpsw_register_notifiers(struct cpsw_priv *priv)
+{
+	int ret;
+
+	ret = register_netdevice_notifier(&cpsw_netdevice_nb);
+	if (ret) {
+		cpsw_err(priv, probe, "can't register netdevice notifier\n");
+		return ret;
+	}
+
+	ret = register_switchdev_notifier(&cpsw_switchdev_notifier);
+	if (ret) {
+		cpsw_err(priv, probe, "can't register switchdev notifier\n");
+		goto unreg_netdevice;
+	}
+
+	return ret;
+
+unreg_netdevice:
+	ret = unregister_netdevice_notifier(&cpsw_netdevice_nb);
+
+	return ret;
+}
+
+static int cpsw_unregister_notifiers(struct cpsw_priv *priv)
+{
+	int ret;
+
+	ret = unregister_switchdev_notifier(&cpsw_switchdev_notifier);
+	if (ret)
+		dev_err(priv->dev, "can't unregister switchdev notifier\n");
+
+	ret += unregister_netdevice_notifier(&cpsw_netdevice_nb);
+	if (ret)
+		dev_err(priv->dev, "can't unregister netdevice notifier\n");
+
+	return ret;
+}
+
 static int cpsw_probe(struct platform_device *pdev)
 {
 	struct clk			*clk;
@@ -2935,7 +3291,11 @@ static int cpsw_probe(struct platform_device *pdev)
 		cpsw->slaves[i].slave_num = i;
 
 	cpsw->slaves[0].ndev = ndev;
-	priv->emac_port = 0;
+
+	if (cpsw_is_switch(cpsw->data.switch_mode))
+		priv->emac_port = HOST_PORT_NUM;
+	else
+		priv->emac_port = 1;
 
 	clk = devm_clk_get(&pdev->dev, "fck");
 	if (IS_ERR(clk)) {
@@ -3076,8 +3436,17 @@ static int cpsw_probe(struct platform_device *pdev)
 			cpsw->quirk_irq = true;
 	}
 
-	ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_RX;
+	if (cpsw_is_switchdev(cpsw->data.switch_mode)) {
+		ret = cpsw_probe_cpu_port(cpsw);
+		if (ret) {
+			cpsw_err(priv, probe, "error probe cpu interface\n");
+			goto clean_dma_ret;
+		}
+		cpsw_port_switchdev_init(ndev);
+		ndev->features |= NETIF_F_NETNS_LOCAL;
+	}
 
+	ndev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_CTAG_RX;
 	ndev->netdev_ops = &cpsw_netdev_ops;
 	ndev->ethtool_ops = &cpsw_ethtool_ops;
 	netif_napi_add(ndev, &cpsw->napi_rx, cpsw_rx_poll, CPSW_POLL_WEIGHT);
@@ -3093,7 +3462,7 @@ static int cpsw_probe(struct platform_device *pdev)
 		goto clean_dma_ret;
 	}
 
-	if (cpsw->data.dual_emac) {
+	if (!cpsw_is_switch(cpsw->data.switch_mode)) {
 		ret = cpsw_probe_dual_emac(priv);
 		if (ret) {
 			cpsw_err(priv, probe, "error probe slave 2 emac interface\n");
@@ -3139,6 +3508,12 @@ static int cpsw_probe(struct platform_device *pdev)
 		goto clean_dma_ret;
 	}
 
+	if (cpsw_is_switchdev(cpsw->data.switch_mode)) {
+		ret = cpsw_register_notifiers(priv);
+		if (ret)
+			goto clean_dma_ret;
+	}
+
 	cpsw_notice(priv, probe,
 		    "initialized device (regs %pa, irq %d, pool size %d)\n",
 		    &ss_res->start, ndev->irq, dma_params.descs_pool_size);
@@ -3164,7 +3539,8 @@ static int cpsw_probe(struct platform_device *pdev)
 static int cpsw_remove(struct platform_device *pdev)
 {
 	struct net_device *ndev = platform_get_drvdata(pdev);
-	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
+	struct cpsw_priv *priv = netdev_priv(ndev);
+	struct cpsw_common *cpsw = priv->cpsw;
 	int ret;
 
 	ret = pm_runtime_get_sync(&pdev->dev);
@@ -3173,7 +3549,10 @@ static int cpsw_remove(struct platform_device *pdev)
 		return ret;
 	}
 
-	if (cpsw->data.dual_emac)
+	if (cpsw_is_switchdev(cpsw->data.switch_mode))
+		ret = cpsw_unregister_notifiers(priv);
+
+	if (cpsw->data.switch_mode)
 		unregister_netdev(cpsw->slaves[1].ndev);
 	unregister_netdev(ndev);
 
@@ -3182,8 +3561,10 @@ static int cpsw_remove(struct platform_device *pdev)
 	cpsw_remove_dt(pdev);
 	pm_runtime_put_sync(&pdev->dev);
 	pm_runtime_disable(&pdev->dev);
-	if (cpsw->data.dual_emac)
+	if (cpsw->data.switch_mode)
 		free_netdev(cpsw->slaves[1].ndev);
+	if (cpsw->master)
+		free_netdev(cpsw->master);
 	free_netdev(ndev);
 	return 0;
 }
@@ -3195,7 +3576,7 @@ static int cpsw_suspend(struct device *dev)
 	struct net_device	*ndev = platform_get_drvdata(pdev);
 	struct cpsw_common	*cpsw = ndev_to_cpsw(ndev);
 
-	if (cpsw->data.dual_emac) {
+	if (cpsw->data.switch_mode) {
 		int i;
 
 		for (i = 0; i < cpsw->data.slaves; i++) {
@@ -3224,7 +3605,7 @@ static int cpsw_resume(struct device *dev)
 
 	/* shut up ASSERT_RTNL() warning in netif_set_real_num_tx/rx_queues */
 	rtnl_lock();
-	if (cpsw->data.dual_emac) {
+	if (cpsw->data.switch_mode) {
 		int i;
 
 		for (i = 0; i < cpsw->data.slaves; i++) {
diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h
index 3b02a83..4be5ffc 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.h
+++ b/drivers/net/ethernet/ti/cpsw_priv.h
@@ -30,6 +30,12 @@
 #define CPSW2_TX_PRI_MAP    0x18 /* Tx Header Priority to Switch Pri Mapping */
 #define CPSW2_TS_SEQ_MTYPE  0x1c /* Time Sync Sequence ID Offset and Msg Type */
 
+enum {
+	CPSW_TI_SWITCH,
+	CPSW_DUAL_EMAC,
+	CPSW_SWITCHDEV,
+};
+
 struct cpsw_slave_data {
 	struct	device_node *phy_node;
 	char	phy_id[MII_BUS_ID_SIZE];
@@ -48,7 +54,7 @@ struct cpsw_platform_data {
 	u32	bd_ram_size;  /*buffer descriptor ram size */
 	u32	mac_control;	/* Mac control register */
 	u16	default_vlan;	/* Def VLAN for ALE lookup in VLAN aware mode*/
-	bool	dual_emac;	/* Enable Dual EMAC mode */
+	u8	switch_mode;    /* Enable Dual EMAC/switchdev mode */
 };
 
 struct cpsw_slave {
@@ -80,6 +86,7 @@ struct cpsw_common {
 	u32				coal_intvl;
 	u32				bus_freq_mhz;
 	int				rx_packet_max;
+	struct net_device		*master; /* used for switchdev */
 	struct cpsw_slave		*slaves;
 	struct cpdma_ctlr		*dma;
 	struct cpsw_vector		txv[CPSW_MAX_QUEUES];
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next v2 0/3] net: sfp: small improvements
From: Antoine Tenart @ 2018-05-24  6:56 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Antoine Tenart, davem, linux, netdev, linux-kernel,
	thomas.petazzoni, maxime.chevallier, gregory.clement,
	miquel.raynal, nadavh, stefanc, ymarkman, mw
In-Reply-To: <decfdf6b-6047-4338-5b81-2b8ef9bc8e48@gmail.com>

Hi Florian,

On Wed, May 23, 2018 at 11:40:50AM -0700, Florian Fainelli wrote:
> 
> Antoine, can you please do CC the people who worked on that code before,
> arguably, send an update to MAINTAINERS file to create a specific
> section for PHYLINK.

My bad, sorry for that, I'll make sure to Cc everyone involved next
time. As for an update to MAINTAINERS, I believe the one listed for a
particular file should do it himself.

Thanks!
Antoine

-- 
Antoine Ténart, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* [PATCH bpf-next v4 00/10] bpf: enhancements for multi-function programs
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski

[1] Support for bpf-to-bpf function calls in the powerpc64 JIT compiler.

[2] Provide a way for resolving function calls because of the way JITed
    images are allocated in powerpc64.

[3] Fix to get JITed instruction dumps for multi-function programs from
    the bpf system call.

[4] Fix for bpftool to show delimited multi-function JITed image dumps.

v4:
 - Incorporate review comments from Jakub.
 - Fix JSON output for bpftool.

v3:
 - Change base tree tag to bpf-next.
 - Incorporate review comments from Alexei, Daniel and Jakub.
 - Make sure that the JITed image does not grow or shrink after
   the last pass due to the way the instruction sequence used
   to load a callee's address maybe optimized.
 - Make additional changes to the bpf system call and bpftool to
   make multi-function JITed dumps easier to correlate.

v2:
 - Incorporate review comments from Jakub.

Sandipan Das (10):
  bpf: support 64-bit offsets for bpf function calls
  bpf: powerpc64: pad function address loads with NOPs
  bpf: powerpc64: add JIT support for multi-function programs
  bpf: get kernel symbol addresses via syscall
  tools: bpf: sync bpf uapi header
  tools: bpftool: resolve calls without using imm field
  bpf: fix multi-function JITed dump obtained via syscall
  bpf: get JITed image lengths of functions via syscall
  tools: bpf: sync bpf uapi header
  tools: bpftool: add delimiters to multi-function JITed dumps

 arch/powerpc/net/bpf_jit_comp64.c | 110 ++++++++++++++++++++++++++++++--------
 include/uapi/linux/bpf.h          |   4 ++
 kernel/bpf/syscall.c              |  82 ++++++++++++++++++++++++++--
 kernel/bpf/verifier.c             |  22 +++++---
 tools/bpf/bpftool/prog.c          |  97 ++++++++++++++++++++++++++++++++-
 tools/bpf/bpftool/xlated_dumper.c |  14 +++--
 tools/bpf/bpftool/xlated_dumper.h |   3 ++
 tools/include/uapi/linux/bpf.h    |   4 ++
 8 files changed, 301 insertions(+), 35 deletions(-)

-- 
2.14.3

^ permalink raw reply

* [PATCH bpf-next v4 01/10] bpf: support 64-bit offsets for bpf function calls
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

The imm field of a bpf instruction is a signed 32-bit integer.
For JITed bpf-to-bpf function calls, it holds the offset of the
start address of the callee's JITed image from __bpf_call_base.

For some architectures, such as powerpc64, this offset may be
as large as 64 bits and cannot be accomodated in the imm field
without truncation.

We resolve this by:

[1] Additionally using the auxiliary data of each function to
    keep a list of start addresses of the JITed images for all
    functions determined by the verifier.

[2] Retaining the subprog id inside the off field of the call
    instructions and using it to index into the list mentioned
    above and lookup the callee's address.

To make sure that the existing JIT compilers continue to work
without requiring changes, we keep the imm field as it is.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 kernel/bpf/verifier.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a9e4b1372da6..559cb74ba29e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5383,11 +5383,24 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			    insn->src_reg != BPF_PSEUDO_CALL)
 				continue;
 			subprog = insn->off;
-			insn->off = 0;
 			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 				func[subprog]->bpf_func -
 				__bpf_call_base;
 		}
+
+		/* we use the aux data to keep a list of the start addresses
+		 * of the JITed images for each function in the program
+		 *
+		 * for some architectures, such as powerpc64, the imm field
+		 * might not be large enough to hold the offset of the start
+		 * address of the callee's JITed image from __bpf_call_base
+		 *
+		 * in such cases, we can lookup the start address of a callee
+		 * by using its subprog id, available from the off field of
+		 * the call instruction, as an index for this list
+		 */
+		func[i]->aux->func = func;
+		func[i]->aux->func_cnt = env->subprog_cnt;
 	}
 	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
-- 
2.14.3

^ permalink raw reply related

* [PATCH bpf-next v4 02/10] bpf: powerpc64: pad function address loads with NOPs
From: Sandipan Das @ 2018-05-24  6:56 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev, linuxppc-dev, mpe, naveen.n.rao, jakub.kicinski
In-Reply-To: <cover.1527143877.git.sandipan@linux.vnet.ibm.com>

For multi-function programs, loading the address of a callee
function to a register requires emitting instructions whose
count varies from one to five depending on the nature of the
address.

Since we come to know of the callee's address only before the
extra pass, the number of instructions required to load this
address may vary from what was previously generated. This can
make the JITed image grow or shrink.

To avoid this, we should generate a constant five-instruction
when loading function addresses by padding the optimized load
sequence with NOPs.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
---
 arch/powerpc/net/bpf_jit_comp64.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 1bdb1aff0619..e4582744a31d 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
 
 static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
 {
+	unsigned int i, ctx_idx = ctx->idx;
+
+	/* Load function address into r12 */
+	PPC_LI64(12, func);
+
+	/* For bpf-to-bpf function calls, the callee's address is unknown
+	 * until the last extra pass. As seen above, we use PPC_LI64() to
+	 * load the callee's address, but this may optimize the number of
+	 * instructions required based on the nature of the address.
+	 *
+	 * Since we don't want the number of instructions emitted to change,
+	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
+	 * we always have a five-instruction sequence, which is the maximum
+	 * that PPC_LI64() can emit.
+	 */
+	for (i = ctx->idx - ctx_idx; i < 5; i++)
+		PPC_NOP();
+
 #ifdef PPC64_ELF_ABI_v1
-	/* func points to the function descriptor */
-	PPC_LI64(b2p[TMP_REG_2], func);
-	/* Load actual entry point from function descriptor */
-	PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
-	/* ... and move it to LR */
-	PPC_MTLR(b2p[TMP_REG_1]);
 	/*
 	 * Load TOC from function descriptor at offset 8.
 	 * We can clobber r2 since we get called through a
 	 * function pointer (so caller will save/restore r2)
 	 * and since we don't use a TOC ourself.
 	 */
-	PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
-#else
-	/* We can clobber r12 */
-	PPC_FUNC_ADDR(12, func);
-	PPC_MTLR(12);
+	PPC_BPF_LL(2, 12, 8);
+	/* Load actual entry point from function descriptor */
+	PPC_BPF_LL(12, 12, 0);
 #endif
+
+	PPC_MTLR(12);
 	PPC_BLRL();
 }
 
-- 
2.14.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox