Netdev List
 help / color / mirror / Atom feed
* [patch net-next 16/18] mlxsw: core: Add mlxsw specific workqueue and use it for FDB notif. processing
From: Jiri Pirko @ 2016-04-14 16:19 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, ogerlitz, roopa, nikolay, jhs,
	john.fastabend, rami.rosen, gospo, stephen, sfeldma
In-Reply-To: <1460650770-19382-1-git-send-email-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Follow-up patch is going to need to use delayed work as well and
frequently. The FDB notification processing is already using that and
also quite frequently. It makes sense to create separate workqueue just
for mlxsw driver in this case and do not pollute system_wq.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c         | 25 ++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlxsw/core.h         |  3 +++
 .../ethernet/mellanox/mlxsw/spectrum_switchdev.c   |  4 ++--
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 63a9777..a14e422 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -55,6 +55,7 @@
 #include <linux/mutex.h>
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
+#include <linux/workqueue.h>
 #include <asm/byteorder.h>
 #include <net/devlink.h>
 
@@ -73,6 +74,8 @@ static const char mlxsw_core_driver_name[] = "mlxsw_core";
 
 static struct dentry *mlxsw_core_dbg_root;
 
+static struct workqueue_struct *mlxsw_wq;
+
 struct mlxsw_core_pcpu_stats {
 	u64			trap_rx_packets[MLXSW_TRAP_ID_MAX];
 	u64			trap_rx_bytes[MLXSW_TRAP_ID_MAX];
@@ -1575,17 +1578,35 @@ int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod,
 }
 EXPORT_SYMBOL(mlxsw_cmd_exec);
 
+int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay)
+{
+	return queue_delayed_work(mlxsw_wq, dwork, delay);
+}
+EXPORT_SYMBOL(mlxsw_core_schedule_dw);
+
 static int __init mlxsw_core_module_init(void)
 {
-	mlxsw_core_dbg_root = debugfs_create_dir(mlxsw_core_driver_name, NULL);
-	if (!mlxsw_core_dbg_root)
+	int err;
+
+	mlxsw_wq = create_workqueue(mlxsw_core_driver_name);
+	if (!mlxsw_wq)
 		return -ENOMEM;
+	mlxsw_core_dbg_root = debugfs_create_dir(mlxsw_core_driver_name, NULL);
+	if (!mlxsw_core_dbg_root) {
+		err = -ENOMEM;
+		goto err_debugfs_create_dir;
+	}
 	return 0;
+
+err_debugfs_create_dir:
+	destroy_workqueue(mlxsw_wq);
+	return err;
 }
 
 static void __exit mlxsw_core_module_exit(void)
 {
 	debugfs_remove_recursive(mlxsw_core_dbg_root);
+	destroy_workqueue(mlxsw_wq);
 }
 
 module_init(mlxsw_core_module_init);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 377dacc..b41ebf8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -43,6 +43,7 @@
 #include <linux/gfp.h>
 #include <linux/types.h>
 #include <linux/skbuff.h>
+#include <linux/workqueue.h>
 #include <net/devlink.h>
 
 #include "trap.h"
@@ -151,6 +152,8 @@ int mlxsw_core_port_init(struct mlxsw_core *mlxsw_core,
 			 struct net_device *dev, bool split, u32 split_group);
 void mlxsw_core_port_fini(struct mlxsw_core_port *mlxsw_core_port);
 
+int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay);
+
 #define MLXSW_CONFIG_PROFILE_SWID_COUNT 8
 
 struct mlxsw_swid_config {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index e1c74ef..fb9efb8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -1430,8 +1430,8 @@ static void mlxsw_sp_fdb_notify_rec_process(struct mlxsw_sp *mlxsw_sp,
 
 static void mlxsw_sp_fdb_notify_work_schedule(struct mlxsw_sp *mlxsw_sp)
 {
-	schedule_delayed_work(&mlxsw_sp->fdb_notify.dw,
-			      msecs_to_jiffies(mlxsw_sp->fdb_notify.interval));
+	mlxsw_core_schedule_dw(&mlxsw_sp->fdb_notify.dw,
+			       msecs_to_jiffies(mlxsw_sp->fdb_notify.interval));
 }
 
 static void mlxsw_sp_fdb_notify_work(struct work_struct *work)
-- 
2.5.5

^ permalink raw reply related

* [patch net-next 17/18] mlxsw: core: Introduce support for asynchronous EMAD register access
From: Jiri Pirko @ 2016-04-14 16:19 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, ogerlitz, roopa, nikolay, jhs,
	john.fastabend, rami.rosen, gospo, stephen, sfeldma
In-Reply-To: <1460650770-19382-1-git-send-email-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

So far it was possible to have one EMAD register access at a time,
locked by mutex. This patch extends this interface to allow multiple
EMAD register accesses to be in fly at once. That allows faster
processing on firmware side avoiding unused time in between EMADs.
Measured speedup is ~30% for shared occupancy snapshot operation.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c | 494 +++++++++++++++++++----------
 drivers/net/ethernet/mellanox/mlxsw/core.h |  13 +
 2 files changed, 334 insertions(+), 173 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index a14e422..b0a0b01 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -44,7 +44,7 @@
 #include <linux/seq_file.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/netdevice.h>
-#include <linux/wait.h>
+#include <linux/completion.h>
 #include <linux/skbuff.h>
 #include <linux/etherdevice.h>
 #include <linux/types.h>
@@ -96,11 +96,9 @@ struct mlxsw_core {
 	struct list_head rx_listener_list;
 	struct list_head event_listener_list;
 	struct {
-		struct sk_buff *resp_skb;
-		u64 tid;
-		wait_queue_head_t wait;
-		bool trans_active;
-		struct mutex lock; /* One EMAD transaction at a time. */
+		atomic64_t tid;
+		struct list_head trans_list;
+		spinlock_t trans_list_lock; /* protects trans_list writes */
 		bool use_emad;
 	} emad;
 	struct mlxsw_core_pcpu_stats __percpu *pcpu_stats;
@@ -293,7 +291,7 @@ static void mlxsw_emad_pack_reg_tlv(char *reg_tlv,
 static void mlxsw_emad_pack_op_tlv(char *op_tlv,
 				   const struct mlxsw_reg_info *reg,
 				   enum mlxsw_core_reg_access_type type,
-				   struct mlxsw_core *mlxsw_core)
+				   u64 tid)
 {
 	mlxsw_emad_op_tlv_type_set(op_tlv, MLXSW_EMAD_TLV_TYPE_OP);
 	mlxsw_emad_op_tlv_len_set(op_tlv, MLXSW_EMAD_OP_TLV_LEN);
@@ -309,7 +307,7 @@ static void mlxsw_emad_pack_op_tlv(char *op_tlv,
 					     MLXSW_EMAD_OP_TLV_METHOD_WRITE);
 	mlxsw_emad_op_tlv_class_set(op_tlv,
 				    MLXSW_EMAD_OP_TLV_CLASS_REG_ACCESS);
-	mlxsw_emad_op_tlv_tid_set(op_tlv, mlxsw_core->emad.tid);
+	mlxsw_emad_op_tlv_tid_set(op_tlv, tid);
 }
 
 static int mlxsw_emad_construct_eth_hdr(struct sk_buff *skb)
@@ -331,7 +329,7 @@ static void mlxsw_emad_construct(struct sk_buff *skb,
 				 const struct mlxsw_reg_info *reg,
 				 char *payload,
 				 enum mlxsw_core_reg_access_type type,
-				 struct mlxsw_core *mlxsw_core)
+				 u64 tid)
 {
 	char *buf;
 
@@ -342,7 +340,7 @@ static void mlxsw_emad_construct(struct sk_buff *skb,
 	mlxsw_emad_pack_reg_tlv(buf, reg, payload);
 
 	buf = skb_push(skb, MLXSW_EMAD_OP_TLV_LEN * sizeof(u32));
-	mlxsw_emad_pack_op_tlv(buf, reg, type, mlxsw_core);
+	mlxsw_emad_pack_op_tlv(buf, reg, type, tid);
 
 	mlxsw_emad_construct_eth_hdr(skb);
 }
@@ -379,58 +377,16 @@ static bool mlxsw_emad_is_resp(const struct sk_buff *skb)
 	return (mlxsw_emad_op_tlv_r_get(op_tlv) == MLXSW_EMAD_OP_TLV_RESPONSE);
 }
 
-#define MLXSW_EMAD_TIMEOUT_MS 200
-
-static int __mlxsw_emad_transmit(struct mlxsw_core *mlxsw_core,
-				 struct sk_buff *skb,
-				 const struct mlxsw_tx_info *tx_info)
-{
-	int err;
-	int ret;
-
-	mlxsw_core->emad.trans_active = true;
-
-	err = mlxsw_core_skb_transmit(mlxsw_core, skb, tx_info);
-	if (err) {
-		dev_err(mlxsw_core->bus_info->dev, "Failed to transmit EMAD (tid=%llx)\n",
-			mlxsw_core->emad.tid);
-		dev_kfree_skb(skb);
-		goto trans_inactive_out;
-	}
-
-	ret = wait_event_timeout(mlxsw_core->emad.wait,
-				 !(mlxsw_core->emad.trans_active),
-				 msecs_to_jiffies(MLXSW_EMAD_TIMEOUT_MS));
-	if (!ret) {
-		dev_warn(mlxsw_core->bus_info->dev, "EMAD timed-out (tid=%llx)\n",
-			 mlxsw_core->emad.tid);
-		err = -EIO;
-		goto trans_inactive_out;
-	}
-
-	return 0;
-
-trans_inactive_out:
-	mlxsw_core->emad.trans_active = false;
-	return err;
-}
-
-static int mlxsw_emad_process_status(struct mlxsw_core *mlxsw_core,
-				     char *op_tlv)
+static int mlxsw_emad_process_status(char *op_tlv,
+				     enum mlxsw_emad_op_tlv_status *p_status)
 {
-	enum mlxsw_emad_op_tlv_status status;
-	u64 tid;
-
-	status = mlxsw_emad_op_tlv_status_get(op_tlv);
-	tid = mlxsw_emad_op_tlv_tid_get(op_tlv);
+	*p_status = mlxsw_emad_op_tlv_status_get(op_tlv);
 
-	switch (status) {
+	switch (*p_status) {
 	case MLXSW_EMAD_OP_TLV_STATUS_SUCCESS:
 		return 0;
 	case MLXSW_EMAD_OP_TLV_STATUS_BUSY:
 	case MLXSW_EMAD_OP_TLV_STATUS_MESSAGE_RECEIPT_ACK:
-		dev_warn(mlxsw_core->bus_info->dev, "Reg access status again (tid=%llx,status=%x(%s))\n",
-			 tid, status, mlxsw_emad_op_tlv_status_str(status));
 		return -EAGAIN;
 	case MLXSW_EMAD_OP_TLV_STATUS_VERSION_NOT_SUPPORTED:
 	case MLXSW_EMAD_OP_TLV_STATUS_UNKNOWN_TLV:
@@ -441,70 +397,150 @@ static int mlxsw_emad_process_status(struct mlxsw_core *mlxsw_core,
 	case MLXSW_EMAD_OP_TLV_STATUS_RESOURCE_NOT_AVAILABLE:
 	case MLXSW_EMAD_OP_TLV_STATUS_INTERNAL_ERROR:
 	default:
-		dev_err(mlxsw_core->bus_info->dev, "Reg access status failed (tid=%llx,status=%x(%s))\n",
-			tid, status, mlxsw_emad_op_tlv_status_str(status));
 		return -EIO;
 	}
 }
 
-static int mlxsw_emad_process_status_skb(struct mlxsw_core *mlxsw_core,
-					 struct sk_buff *skb)
+static int
+mlxsw_emad_process_status_skb(struct sk_buff *skb,
+			      enum mlxsw_emad_op_tlv_status *p_status)
 {
-	return mlxsw_emad_process_status(mlxsw_core, mlxsw_emad_op_tlv(skb));
+	return mlxsw_emad_process_status(mlxsw_emad_op_tlv(skb), p_status);
+}
+
+struct mlxsw_reg_trans {
+	struct list_head list;
+	struct list_head bulk_list;
+	struct mlxsw_core *core;
+	struct sk_buff *tx_skb;
+	struct mlxsw_tx_info tx_info;
+	struct delayed_work timeout_dw;
+	unsigned int retries;
+	u64 tid;
+	struct completion completion;
+	atomic_t active;
+	mlxsw_reg_trans_cb_t *cb;
+	unsigned long cb_priv;
+	const struct mlxsw_reg_info *reg;
+	enum mlxsw_core_reg_access_type type;
+	int err;
+	enum mlxsw_emad_op_tlv_status emad_status;
+	struct rcu_head rcu;
+};
+
+#define MLXSW_EMAD_TIMEOUT_MS 200
+
+static void mlxsw_emad_trans_timeout_schedule(struct mlxsw_reg_trans *trans)
+{
+	unsigned long timeout = msecs_to_jiffies(MLXSW_EMAD_TIMEOUT_MS);
+
+	mlxsw_core_schedule_dw(&trans->timeout_dw, timeout);
 }
 
 static int mlxsw_emad_transmit(struct mlxsw_core *mlxsw_core,
-			       struct sk_buff *skb,
-			       const struct mlxsw_tx_info *tx_info)
+			       struct mlxsw_reg_trans *trans)
 {
-	struct sk_buff *trans_skb;
-	int n_retry;
+	struct sk_buff *skb;
 	int err;
 
-	n_retry = 0;
-retry:
-	/* We copy the EMAD to a new skb, since we might need
-	 * to retransmit it in case of failure.
-	 */
-	trans_skb = skb_copy(skb, GFP_KERNEL);
-	if (!trans_skb) {
-		err = -ENOMEM;
-		goto out;
+	skb = skb_copy(trans->tx_skb, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	atomic_set(&trans->active, 1);
+	err = mlxsw_core_skb_transmit(mlxsw_core, skb, &trans->tx_info);
+	if (err) {
+		dev_kfree_skb(skb);
+		return err;
 	}
+	mlxsw_emad_trans_timeout_schedule(trans);
+	return 0;
+}
 
-	err = __mlxsw_emad_transmit(mlxsw_core, trans_skb, tx_info);
-	if (!err) {
-		struct sk_buff *resp_skb = mlxsw_core->emad.resp_skb;
+static void mlxsw_emad_trans_finish(struct mlxsw_reg_trans *trans, int err)
+{
+	struct mlxsw_core *mlxsw_core = trans->core;
+
+	dev_kfree_skb(trans->tx_skb);
+	spin_lock_bh(&mlxsw_core->emad.trans_list_lock);
+	list_del_rcu(&trans->list);
+	spin_unlock_bh(&mlxsw_core->emad.trans_list_lock);
+	trans->err = err;
+	complete(&trans->completion);
+}
+
+static void mlxsw_emad_transmit_retry(struct mlxsw_core *mlxsw_core,
+				      struct mlxsw_reg_trans *trans)
+{
+	int err;
 
-		err = mlxsw_emad_process_status_skb(mlxsw_core, resp_skb);
-		if (err)
-			dev_kfree_skb(resp_skb);
-		if (!err || err != -EAGAIN)
-			goto out;
+	if (trans->retries < MLXSW_EMAD_MAX_RETRY) {
+		trans->retries++;
+		err = mlxsw_emad_transmit(trans->core, trans);
+		if (err == 0)
+			return;
+	} else {
+		err = -EIO;
 	}
-	if (n_retry++ < MLXSW_EMAD_MAX_RETRY)
-		goto retry;
+	mlxsw_emad_trans_finish(trans, err);
+}
 
-out:
-	dev_kfree_skb(skb);
-	mlxsw_core->emad.tid++;
-	return err;
+static void mlxsw_emad_trans_timeout_work(struct work_struct *work)
+{
+	struct mlxsw_reg_trans *trans = container_of(work,
+						     struct mlxsw_reg_trans,
+						     timeout_dw.work);
+
+	if (!atomic_dec_and_test(&trans->active))
+		return;
+
+	mlxsw_emad_transmit_retry(trans->core, trans);
 }
 
+static void mlxsw_emad_process_response(struct mlxsw_core *mlxsw_core,
+					struct mlxsw_reg_trans *trans,
+					struct sk_buff *skb)
+{
+	int err;
+
+	if (!atomic_dec_and_test(&trans->active))
+		return;
+
+	err = mlxsw_emad_process_status_skb(skb, &trans->emad_status);
+	if (err == -EAGAIN) {
+		mlxsw_emad_transmit_retry(mlxsw_core, trans);
+	} else {
+		if (err == 0) {
+			char *op_tlv = mlxsw_emad_op_tlv(skb);
+
+			if (trans->cb)
+				trans->cb(mlxsw_core,
+					  mlxsw_emad_reg_payload(op_tlv),
+					  trans->reg->len, trans->cb_priv);
+		}
+		mlxsw_emad_trans_finish(trans, err);
+	}
+}
+
+/* called with rcu read lock held */
 static void mlxsw_emad_rx_listener_func(struct sk_buff *skb, u8 local_port,
 					void *priv)
 {
 	struct mlxsw_core *mlxsw_core = priv;
+	struct mlxsw_reg_trans *trans;
 
-	if (mlxsw_emad_is_resp(skb) &&
-	    mlxsw_core->emad.trans_active &&
-	    mlxsw_emad_get_tid(skb) == mlxsw_core->emad.tid) {
-		mlxsw_core->emad.resp_skb = skb;
-		mlxsw_core->emad.trans_active = false;
-		wake_up(&mlxsw_core->emad.wait);
-	} else {
-		dev_kfree_skb(skb);
+	if (!mlxsw_emad_is_resp(skb))
+		goto free_skb;
+
+	list_for_each_entry_rcu(trans, &mlxsw_core->emad.trans_list, list) {
+		if (mlxsw_emad_get_tid(skb) == trans->tid) {
+			mlxsw_emad_process_response(mlxsw_core, trans, skb);
+			break;
+		}
 	}
+
+free_skb:
+	dev_kfree_skb(skb);
 }
 
 static const struct mlxsw_rx_listener mlxsw_emad_rx_listener = {
@@ -531,18 +567,19 @@ static int mlxsw_emad_traps_set(struct mlxsw_core *mlxsw_core)
 
 static int mlxsw_emad_init(struct mlxsw_core *mlxsw_core)
 {
+	u64 tid;
 	int err;
 
 	/* Set the upper 32 bits of the transaction ID field to a random
 	 * number. This allows us to discard EMADs addressed to other
 	 * devices.
 	 */
-	get_random_bytes(&mlxsw_core->emad.tid, 4);
-	mlxsw_core->emad.tid = mlxsw_core->emad.tid << 32;
+	get_random_bytes(&tid, 4);
+	tid <<= 32;
+	atomic64_set(&mlxsw_core->emad.tid, tid);
 
-	init_waitqueue_head(&mlxsw_core->emad.wait);
-	mlxsw_core->emad.trans_active = false;
-	mutex_init(&mlxsw_core->emad.lock);
+	INIT_LIST_HEAD(&mlxsw_core->emad.trans_list);
+	spin_lock_init(&mlxsw_core->emad.trans_list_lock);
 
 	err = mlxsw_core_rx_listener_register(mlxsw_core,
 					      &mlxsw_emad_rx_listener,
@@ -600,6 +637,59 @@ static struct sk_buff *mlxsw_emad_alloc(const struct mlxsw_core *mlxsw_core,
 	return skb;
 }
 
+static int mlxsw_emad_reg_access(struct mlxsw_core *mlxsw_core,
+				 const struct mlxsw_reg_info *reg,
+				 char *payload,
+				 enum mlxsw_core_reg_access_type type,
+				 struct mlxsw_reg_trans *trans,
+				 struct list_head *bulk_list,
+				 mlxsw_reg_trans_cb_t *cb,
+				 unsigned long cb_priv, u64 tid)
+{
+	struct sk_buff *skb;
+	int err;
+
+	dev_dbg(mlxsw_core->bus_info->dev, "EMAD reg access (tid=%llx,reg_id=%x(%s),type=%s)\n",
+		trans->tid, reg->id, mlxsw_reg_id_str(reg->id),
+		mlxsw_core_reg_access_type_str(type));
+
+	skb = mlxsw_emad_alloc(mlxsw_core, reg->len);
+	if (!skb)
+		return -ENOMEM;
+
+	list_add_tail(&trans->bulk_list, bulk_list);
+	trans->core = mlxsw_core;
+	trans->tx_skb = skb;
+	trans->tx_info.local_port = MLXSW_PORT_CPU_PORT;
+	trans->tx_info.is_emad = true;
+	INIT_DELAYED_WORK(&trans->timeout_dw, mlxsw_emad_trans_timeout_work);
+	trans->tid = tid;
+	init_completion(&trans->completion);
+	trans->cb = cb;
+	trans->cb_priv = cb_priv;
+	trans->reg = reg;
+	trans->type = type;
+
+	mlxsw_emad_construct(skb, reg, payload, type, trans->tid);
+	mlxsw_core->driver->txhdr_construct(skb, &trans->tx_info);
+
+	spin_lock_bh(&mlxsw_core->emad.trans_list_lock);
+	list_add_tail_rcu(&trans->list, &mlxsw_core->emad.trans_list);
+	spin_unlock_bh(&mlxsw_core->emad.trans_list_lock);
+	err = mlxsw_emad_transmit(mlxsw_core, trans);
+	if (err)
+		goto err_out;
+	return 0;
+
+err_out:
+	spin_lock_bh(&mlxsw_core->emad.trans_list_lock);
+	list_del_rcu(&trans->list);
+	spin_unlock_bh(&mlxsw_core->emad.trans_list_lock);
+	list_del(&trans->bulk_list);
+	dev_kfree_skb(trans->tx_skb);
+	return err;
+}
+
 /*****************
  * Core functions
  *****************/
@@ -689,24 +779,6 @@ static const struct file_operations mlxsw_core_rx_stats_dbg_ops = {
 	.llseek = seq_lseek
 };
 
-static void mlxsw_core_buf_dump_dbg(struct mlxsw_core *mlxsw_core,
-				    const char *buf, size_t size)
-{
-	__be32 *m = (__be32 *) buf;
-	int i;
-	int count = size / sizeof(__be32);
-
-	for (i = count - 1; i >= 0; i--)
-		if (m[i])
-			break;
-	i++;
-	count = i ? i : 1;
-	for (i = 0; i < count; i += 4)
-		dev_dbg(mlxsw_core->bus_info->dev, "%04x - %08x %08x %08x %08x\n",
-			i * 4, be32_to_cpu(m[i]), be32_to_cpu(m[i + 1]),
-			be32_to_cpu(m[i + 2]), be32_to_cpu(m[i + 3]));
-}
-
 int mlxsw_core_driver_register(struct mlxsw_driver *mlxsw_driver)
 {
 	spin_lock(&mlxsw_core_driver_list_lock);
@@ -1264,56 +1336,112 @@ void mlxsw_core_event_listener_unregister(struct mlxsw_core *mlxsw_core,
 }
 EXPORT_SYMBOL(mlxsw_core_event_listener_unregister);
 
+static u64 mlxsw_core_tid_get(struct mlxsw_core *mlxsw_core)
+{
+	return atomic64_inc_return(&mlxsw_core->emad.tid);
+}
+
 static int mlxsw_core_reg_access_emad(struct mlxsw_core *mlxsw_core,
 				      const struct mlxsw_reg_info *reg,
 				      char *payload,
-				      enum mlxsw_core_reg_access_type type)
+				      enum mlxsw_core_reg_access_type type,
+				      struct list_head *bulk_list,
+				      mlxsw_reg_trans_cb_t *cb,
+				      unsigned long cb_priv)
 {
+	u64 tid = mlxsw_core_tid_get(mlxsw_core);
+	struct mlxsw_reg_trans *trans;
 	int err;
-	char *op_tlv;
-	struct sk_buff *skb;
-	struct mlxsw_tx_info tx_info = {
-		.local_port = MLXSW_PORT_CPU_PORT,
-		.is_emad = true,
-	};
 
-	skb = mlxsw_emad_alloc(mlxsw_core, reg->len);
-	if (!skb)
+	trans = kzalloc(sizeof(*trans), GFP_KERNEL);
+	if (!trans)
 		return -ENOMEM;
 
-	mlxsw_emad_construct(skb, reg, payload, type, mlxsw_core);
-	mlxsw_core->driver->txhdr_construct(skb, &tx_info);
+	err = mlxsw_emad_reg_access(mlxsw_core, reg, payload, type, trans,
+				    bulk_list, cb, cb_priv, tid);
+	if (err) {
+		kfree(trans);
+		return err;
+	}
+	return 0;
+}
 
-	dev_dbg(mlxsw_core->bus_info->dev, "EMAD send (tid=%llx)\n",
-		mlxsw_core->emad.tid);
-	mlxsw_core_buf_dump_dbg(mlxsw_core, skb->data, skb->len);
+int mlxsw_reg_trans_query(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_reg_info *reg, char *payload,
+			  struct list_head *bulk_list,
+			  mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv)
+{
+	return mlxsw_core_reg_access_emad(mlxsw_core, reg, payload,
+					  MLXSW_CORE_REG_ACCESS_TYPE_QUERY,
+					  bulk_list, cb, cb_priv);
+}
+EXPORT_SYMBOL(mlxsw_reg_trans_query);
 
-	err = mlxsw_emad_transmit(mlxsw_core, skb, &tx_info);
-	if (!err) {
-		op_tlv = mlxsw_emad_op_tlv(mlxsw_core->emad.resp_skb);
-		memcpy(payload, mlxsw_emad_reg_payload(op_tlv),
-		       reg->len);
+int mlxsw_reg_trans_write(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_reg_info *reg, char *payload,
+			  struct list_head *bulk_list,
+			  mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv)
+{
+	return mlxsw_core_reg_access_emad(mlxsw_core, reg, payload,
+					  MLXSW_CORE_REG_ACCESS_TYPE_WRITE,
+					  bulk_list, cb, cb_priv);
+}
+EXPORT_SYMBOL(mlxsw_reg_trans_write);
 
-		dev_dbg(mlxsw_core->bus_info->dev, "EMAD recv (tid=%llx)\n",
-			mlxsw_core->emad.tid - 1);
-		mlxsw_core_buf_dump_dbg(mlxsw_core,
-					mlxsw_core->emad.resp_skb->data,
-					mlxsw_core->emad.resp_skb->len);
+static int mlxsw_reg_trans_wait(struct mlxsw_reg_trans *trans)
+{
+	struct mlxsw_core *mlxsw_core = trans->core;
+	int err;
 
-		dev_kfree_skb(mlxsw_core->emad.resp_skb);
-	}
+	wait_for_completion(&trans->completion);
+	cancel_delayed_work_sync(&trans->timeout_dw);
+	err = trans->err;
 
+	if (trans->retries)
+		dev_warn(mlxsw_core->bus_info->dev, "EMAD retries (%d/%d) (tid=%llx)\n",
+			 trans->retries, MLXSW_EMAD_MAX_RETRY, trans->tid);
+	if (err)
+		dev_err(mlxsw_core->bus_info->dev, "EMAD reg access failed (tid=%llx,reg_id=%x(%s),type=%s,status=%x(%s))\n",
+			trans->tid, trans->reg->id,
+			mlxsw_reg_id_str(trans->reg->id),
+			mlxsw_core_reg_access_type_str(trans->type),
+			trans->emad_status,
+			mlxsw_emad_op_tlv_status_str(trans->emad_status));
+
+	list_del(&trans->bulk_list);
+	kfree_rcu(trans, rcu);
 	return err;
 }
 
+int mlxsw_reg_trans_bulk_wait(struct list_head *bulk_list)
+{
+	struct mlxsw_reg_trans *trans;
+	struct mlxsw_reg_trans *tmp;
+	int sum_err = 0;
+	int err;
+
+	list_for_each_entry_safe(trans, tmp, bulk_list, bulk_list) {
+		err = mlxsw_reg_trans_wait(trans);
+		if (err && sum_err == 0)
+			sum_err = err; /* first error to be returned */
+	}
+	return sum_err;
+}
+EXPORT_SYMBOL(mlxsw_reg_trans_bulk_wait);
+
 static int mlxsw_core_reg_access_cmd(struct mlxsw_core *mlxsw_core,
 				     const struct mlxsw_reg_info *reg,
 				     char *payload,
 				     enum mlxsw_core_reg_access_type type)
 {
+	enum mlxsw_emad_op_tlv_status status;
 	int err, n_retry;
 	char *in_mbox, *out_mbox, *tmp;
 
+	dev_dbg(mlxsw_core->bus_info->dev, "Reg cmd access (reg_id=%x(%s),type=%s)\n",
+		reg->id, mlxsw_reg_id_str(reg->id),
+		mlxsw_core_reg_access_type_str(type));
+
 	in_mbox = mlxsw_cmd_mbox_alloc();
 	if (!in_mbox)
 		return -ENOMEM;
@@ -1324,7 +1452,8 @@ static int mlxsw_core_reg_access_cmd(struct mlxsw_core *mlxsw_core,
 		goto free_in_mbox;
 	}
 
-	mlxsw_emad_pack_op_tlv(in_mbox, reg, type, mlxsw_core);
+	mlxsw_emad_pack_op_tlv(in_mbox, reg, type,
+			       mlxsw_core_tid_get(mlxsw_core));
 	tmp = in_mbox + MLXSW_EMAD_OP_TLV_LEN * sizeof(u32);
 	mlxsw_emad_pack_reg_tlv(tmp, reg, payload);
 
@@ -1332,60 +1461,61 @@ static int mlxsw_core_reg_access_cmd(struct mlxsw_core *mlxsw_core,
 retry:
 	err = mlxsw_cmd_access_reg(mlxsw_core, in_mbox, out_mbox);
 	if (!err) {
-		err = mlxsw_emad_process_status(mlxsw_core, out_mbox);
-		if (err == -EAGAIN && n_retry++ < MLXSW_EMAD_MAX_RETRY)
-			goto retry;
+		err = mlxsw_emad_process_status(out_mbox, &status);
+		if (err) {
+			if (err == -EAGAIN && n_retry++ < MLXSW_EMAD_MAX_RETRY)
+				goto retry;
+			dev_err(mlxsw_core->bus_info->dev, "Reg cmd access status failed (status=%x(%s))\n",
+				status, mlxsw_emad_op_tlv_status_str(status));
+		}
 	}
 
 	if (!err)
 		memcpy(payload, mlxsw_emad_reg_payload(out_mbox),
 		       reg->len);
 
-	mlxsw_core->emad.tid++;
 	mlxsw_cmd_mbox_free(out_mbox);
 free_in_mbox:
 	mlxsw_cmd_mbox_free(in_mbox);
+	if (err)
+		dev_err(mlxsw_core->bus_info->dev, "Reg cmd access failed (reg_id=%x(%s),type=%s)\n",
+			reg->id, mlxsw_reg_id_str(reg->id),
+			mlxsw_core_reg_access_type_str(type));
 	return err;
 }
 
+static void mlxsw_core_reg_access_cb(struct mlxsw_core *mlxsw_core,
+				     char *payload, size_t payload_len,
+				     unsigned long cb_priv)
+{
+	char *orig_payload = (char *) cb_priv;
+
+	memcpy(orig_payload, payload, payload_len);
+}
+
 static int mlxsw_core_reg_access(struct mlxsw_core *mlxsw_core,
 				 const struct mlxsw_reg_info *reg,
 				 char *payload,
 				 enum mlxsw_core_reg_access_type type)
 {
-	u64 cur_tid;
+	LIST_HEAD(bulk_list);
 	int err;
 
-	if (mutex_lock_interruptible(&mlxsw_core->emad.lock)) {
-		dev_err(mlxsw_core->bus_info->dev, "Reg access interrupted (reg_id=%x(%s),type=%s)\n",
-			reg->id, mlxsw_reg_id_str(reg->id),
-			mlxsw_core_reg_access_type_str(type));
-		return -EINTR;
-	}
-
-	cur_tid = mlxsw_core->emad.tid;
-	dev_dbg(mlxsw_core->bus_info->dev, "Reg access (tid=%llx,reg_id=%x(%s),type=%s)\n",
-		cur_tid, reg->id, mlxsw_reg_id_str(reg->id),
-		mlxsw_core_reg_access_type_str(type));
-
 	/* During initialization EMAD interface is not available to us,
 	 * so we default to command interface. We switch to EMAD interface
 	 * after setting the appropriate traps.
 	 */
 	if (!mlxsw_core->emad.use_emad)
-		err = mlxsw_core_reg_access_cmd(mlxsw_core, reg,
-						payload, type);
-	else
-		err = mlxsw_core_reg_access_emad(mlxsw_core, reg,
+		return mlxsw_core_reg_access_cmd(mlxsw_core, reg,
 						 payload, type);
 
+	err = mlxsw_core_reg_access_emad(mlxsw_core, reg,
+					 payload, type, &bulk_list,
+					 mlxsw_core_reg_access_cb,
+					 (unsigned long) payload);
 	if (err)
-		dev_err(mlxsw_core->bus_info->dev, "Reg access failed (tid=%llx,reg_id=%x(%s),type=%s)\n",
-			cur_tid, reg->id, mlxsw_reg_id_str(reg->id),
-			mlxsw_core_reg_access_type_str(type));
-
-	mutex_unlock(&mlxsw_core->emad.lock);
-	return err;
+		return err;
+	return mlxsw_reg_trans_bulk_wait(&bulk_list);
 }
 
 int mlxsw_reg_query(struct mlxsw_core *mlxsw_core,
@@ -1536,6 +1666,24 @@ void mlxsw_core_port_fini(struct mlxsw_core_port *mlxsw_core_port)
 }
 EXPORT_SYMBOL(mlxsw_core_port_fini);
 
+static void mlxsw_core_buf_dump_dbg(struct mlxsw_core *mlxsw_core,
+				    const char *buf, size_t size)
+{
+	__be32 *m = (__be32 *) buf;
+	int i;
+	int count = size / sizeof(__be32);
+
+	for (i = count - 1; i >= 0; i--)
+		if (m[i])
+			break;
+	i++;
+	count = i ? i : 1;
+	for (i = 0; i < count; i += 4)
+		dev_dbg(mlxsw_core->bus_info->dev, "%04x - %08x %08x %08x %08x\n",
+			i * 4, be32_to_cpu(m[i]), be32_to_cpu(m[i + 1]),
+			be32_to_cpu(m[i + 2]), be32_to_cpu(m[i + 3]));
+}
+
 int mlxsw_cmd_exec(struct mlxsw_core *mlxsw_core, u16 opcode, u8 opcode_mod,
 		   u32 in_mod, bool out_mbox_direct,
 		   char *in_mbox, size_t in_mbox_size,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index b41ebf8..436bc49 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -109,6 +109,19 @@ void mlxsw_core_event_listener_unregister(struct mlxsw_core *mlxsw_core,
 					  const struct mlxsw_event_listener *el,
 					  void *priv);
 
+typedef void mlxsw_reg_trans_cb_t(struct mlxsw_core *mlxsw_core, char *payload,
+				  size_t payload_len, unsigned long cb_priv);
+
+int mlxsw_reg_trans_query(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_reg_info *reg, char *payload,
+			  struct list_head *bulk_list,
+			  mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv);
+int mlxsw_reg_trans_write(struct mlxsw_core *mlxsw_core,
+			  const struct mlxsw_reg_info *reg, char *payload,
+			  struct list_head *bulk_list,
+			  mlxsw_reg_trans_cb_t *cb, unsigned long cb_priv);
+int mlxsw_reg_trans_bulk_wait(struct list_head *bulk_list);
+
 int mlxsw_reg_query(struct mlxsw_core *mlxsw_core,
 		    const struct mlxsw_reg_info *reg, char *payload);
 int mlxsw_reg_write(struct mlxsw_core *mlxsw_core,
-- 
2.5.5

^ permalink raw reply related

* [patch net-next 18/18] mlxsw: spectrum_buffers: Implement occupancy monitoring
From: Jiri Pirko @ 2016-04-14 16:19 UTC (permalink / raw)
  To: netdev
  Cc: davem, idosch, eladr, yotamg, ogerlitz, roopa, nikolay, jhs,
	john.fastabend, rami.rosen, gospo, stephen, sfeldma
In-Reply-To: <1460650770-19382-1-git-send-email-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Implement occupancy API introduced in devlink and mlxsw core. This is
done by accessing SBPM register for Port-Pool and SBSR for Port-TC
current and max occupancy values. Max clear is implemented using the
same registers.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     |  36 +--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |  18 ++
 .../net/ethernet/mellanox/mlxsw/spectrum_buffers.c | 255 +++++++++++++++++++++
 3 files changed, 293 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index ecadb15..681afe1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -2493,22 +2493,26 @@ static struct mlxsw_config_profile mlxsw_sp_config_profile = {
 };
 
 static struct mlxsw_driver mlxsw_sp_driver = {
-	.kind			= MLXSW_DEVICE_KIND_SPECTRUM,
-	.owner			= THIS_MODULE,
-	.priv_size		= sizeof(struct mlxsw_sp),
-	.init			= mlxsw_sp_init,
-	.fini			= mlxsw_sp_fini,
-	.port_split		= mlxsw_sp_port_split,
-	.port_unsplit		= mlxsw_sp_port_unsplit,
-	.sb_pool_get		= mlxsw_sp_sb_pool_get,
-	.sb_pool_set		= mlxsw_sp_sb_pool_set,
-	.sb_port_pool_get	= mlxsw_sp_sb_port_pool_get,
-	.sb_port_pool_set	= mlxsw_sp_sb_port_pool_set,
-	.sb_tc_pool_bind_get	= mlxsw_sp_sb_tc_pool_bind_get,
-	.sb_tc_pool_bind_set	= mlxsw_sp_sb_tc_pool_bind_set,
-	.txhdr_construct	= mlxsw_sp_txhdr_construct,
-	.txhdr_len		= MLXSW_TXHDR_LEN,
-	.profile		= &mlxsw_sp_config_profile,
+	.kind				= MLXSW_DEVICE_KIND_SPECTRUM,
+	.owner				= THIS_MODULE,
+	.priv_size			= sizeof(struct mlxsw_sp),
+	.init				= mlxsw_sp_init,
+	.fini				= mlxsw_sp_fini,
+	.port_split			= mlxsw_sp_port_split,
+	.port_unsplit			= mlxsw_sp_port_unsplit,
+	.sb_pool_get			= mlxsw_sp_sb_pool_get,
+	.sb_pool_set			= mlxsw_sp_sb_pool_set,
+	.sb_port_pool_get		= mlxsw_sp_sb_port_pool_get,
+	.sb_port_pool_set		= mlxsw_sp_sb_port_pool_set,
+	.sb_tc_pool_bind_get		= mlxsw_sp_sb_tc_pool_bind_get,
+	.sb_tc_pool_bind_set		= mlxsw_sp_sb_tc_pool_bind_set,
+	.sb_occ_snapshot		= mlxsw_sp_sb_occ_snapshot,
+	.sb_occ_max_clear		= mlxsw_sp_sb_occ_max_clear,
+	.sb_occ_port_pool_get		= mlxsw_sp_sb_occ_port_pool_get,
+	.sb_occ_tc_port_bind_get	= mlxsw_sp_sb_occ_tc_port_bind_get,
+	.txhdr_construct		= mlxsw_sp_txhdr_construct,
+	.txhdr_len			= MLXSW_TXHDR_LEN,
+	.profile			= &mlxsw_sp_config_profile,
 };
 
 static int
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 6458efa..e2c022d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -123,15 +123,22 @@ struct mlxsw_sp_sb_pr {
 	u32 size;
 };
 
+struct mlxsw_cp_sb_occ {
+	u32 cur;
+	u32 max;
+};
+
 struct mlxsw_sp_sb_cm {
 	u32 min_buff;
 	u32 max_buff;
 	u8 pool;
+	struct mlxsw_cp_sb_occ occ;
 };
 
 struct mlxsw_sp_sb_pm {
 	u32 min_buff;
 	u32 max_buff;
+	struct mlxsw_cp_sb_occ occ;
 };
 
 #define MLXSW_SP_SB_POOL_COUNT	4
@@ -328,6 +335,17 @@ int mlxsw_sp_sb_tc_pool_bind_set(struct mlxsw_core_port *mlxsw_core_port,
 				 unsigned int sb_index, u16 tc_index,
 				 enum devlink_sb_pool_type pool_type,
 				 u16 pool_index, u32 threshold);
+int mlxsw_sp_sb_occ_snapshot(struct mlxsw_core *mlxsw_core,
+			     unsigned int sb_index);
+int mlxsw_sp_sb_occ_max_clear(struct mlxsw_core *mlxsw_core,
+			      unsigned int sb_index);
+int mlxsw_sp_sb_occ_port_pool_get(struct mlxsw_core_port *mlxsw_core_port,
+				  unsigned int sb_index, u16 pool_index,
+				  u32 *p_cur, u32 *p_max);
+int mlxsw_sp_sb_occ_tc_port_bind_get(struct mlxsw_core_port *mlxsw_core_port,
+				     unsigned int sb_index, u16 tc_index,
+				     enum devlink_sb_pool_type pool_type,
+				     u32 *p_cur, u32 *p_max);
 
 int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp);
 void mlxsw_sp_switchdev_fini(struct mlxsw_sp *mlxsw_sp);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
index 639ba5a..f2e073a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
@@ -36,6 +36,7 @@
 #include <linux/types.h>
 #include <linux/dcbnl.h>
 #include <linux/if_ether.h>
+#include <linux/list.h>
 
 #include "spectrum.h"
 #include "core.h"
@@ -125,6 +126,41 @@ static int mlxsw_sp_sb_pm_write(struct mlxsw_sp *mlxsw_sp, u8 local_port,
 	return 0;
 }
 
+static int mlxsw_sp_sb_pm_occ_clear(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				    u8 pool, enum mlxsw_reg_sbxx_dir dir,
+				    struct list_head *bulk_list)
+{
+	char sbpm_pl[MLXSW_REG_SBPM_LEN];
+
+	mlxsw_reg_sbpm_pack(sbpm_pl, local_port, pool, dir, true, 0, 0);
+	return mlxsw_reg_trans_query(mlxsw_sp->core, MLXSW_REG(sbpm), sbpm_pl,
+				     bulk_list, NULL, 0);
+}
+
+static void mlxsw_sp_sb_pm_occ_query_cb(struct mlxsw_core *mlxsw_core,
+					char *sbpm_pl, size_t sbpm_pl_len,
+					unsigned long cb_priv)
+{
+	struct mlxsw_sp_sb_pm *pm = (struct mlxsw_sp_sb_pm *) cb_priv;
+
+	mlxsw_reg_sbpm_unpack(sbpm_pl, &pm->occ.cur, &pm->occ.max);
+}
+
+static int mlxsw_sp_sb_pm_occ_query(struct mlxsw_sp *mlxsw_sp, u8 local_port,
+				    u8 pool, enum mlxsw_reg_sbxx_dir dir,
+				    struct list_head *bulk_list)
+{
+	char sbpm_pl[MLXSW_REG_SBPM_LEN];
+	struct mlxsw_sp_sb_pm *pm;
+
+	pm = mlxsw_sp_sb_pm_get(mlxsw_sp, local_port, pool, dir);
+	mlxsw_reg_sbpm_pack(sbpm_pl, local_port, pool, dir, false, 0, 0);
+	return mlxsw_reg_trans_query(mlxsw_sp->core, MLXSW_REG(sbpm), sbpm_pl,
+				     bulk_list,
+				     mlxsw_sp_sb_pm_occ_query_cb,
+				     (unsigned long) pm);
+}
+
 static const u16 mlxsw_sp_pbs[] = {
 	2 * MLXSW_SP_BYTES_TO_CELLS(ETH_FRAME_LEN),
 	0,
@@ -707,3 +743,222 @@ int mlxsw_sp_sb_tc_pool_bind_set(struct mlxsw_core_port *mlxsw_core_port,
 	return mlxsw_sp_sb_cm_write(mlxsw_sp, local_port, pg_buff, dir,
 				    0, max_buff, pool);
 }
+
+#define MASKED_COUNT_MAX \
+	(MLXSW_REG_SBSR_REC_MAX_COUNT / (MLXSW_SP_SB_TC_COUNT * 2))
+
+struct mlxsw_sp_sb_sr_occ_query_cb_ctx {
+	u8 masked_count;
+	u8 local_port_1;
+};
+
+static void mlxsw_sp_sb_sr_occ_query_cb(struct mlxsw_core *mlxsw_core,
+					char *sbsr_pl, size_t sbsr_pl_len,
+					unsigned long cb_priv)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	struct mlxsw_sp_sb_sr_occ_query_cb_ctx cb_ctx;
+	u8 masked_count;
+	u8 local_port;
+	int rec_index = 0;
+	struct mlxsw_sp_sb_cm *cm;
+	int i;
+
+	memcpy(&cb_ctx, &cb_priv, sizeof(cb_ctx));
+
+	masked_count = 0;
+	for (local_port = cb_ctx.local_port_1;
+	     local_port < MLXSW_PORT_MAX_PORTS; local_port++) {
+		if (!mlxsw_sp->ports[local_port])
+			continue;
+		for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) {
+			cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port, i,
+						MLXSW_REG_SBXX_DIR_INGRESS);
+			mlxsw_reg_sbsr_rec_unpack(sbsr_pl, rec_index++,
+						  &cm->occ.cur, &cm->occ.max);
+		}
+		if (++masked_count == cb_ctx.masked_count)
+			break;
+	}
+	masked_count = 0;
+	for (local_port = cb_ctx.local_port_1;
+	     local_port < MLXSW_PORT_MAX_PORTS; local_port++) {
+		if (!mlxsw_sp->ports[local_port])
+			continue;
+		for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) {
+			cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port, i,
+						MLXSW_REG_SBXX_DIR_EGRESS);
+			mlxsw_reg_sbsr_rec_unpack(sbsr_pl, rec_index++,
+						  &cm->occ.cur, &cm->occ.max);
+		}
+		if (++masked_count == cb_ctx.masked_count)
+			break;
+	}
+}
+
+int mlxsw_sp_sb_occ_snapshot(struct mlxsw_core *mlxsw_core,
+			     unsigned int sb_index)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	struct mlxsw_sp_sb_sr_occ_query_cb_ctx cb_ctx;
+	unsigned long cb_priv;
+	LIST_HEAD(bulk_list);
+	char *sbsr_pl;
+	u8 masked_count;
+	u8 local_port_1;
+	u8 local_port = 0;
+	int i;
+	int err;
+	int err2;
+
+	sbsr_pl = kmalloc(MLXSW_REG_SBSR_LEN, GFP_KERNEL);
+	if (!sbsr_pl)
+		return -ENOMEM;
+
+next_batch:
+	local_port++;
+	local_port_1 = local_port;
+	masked_count = 0;
+	mlxsw_reg_sbsr_pack(sbsr_pl, false);
+	for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) {
+		mlxsw_reg_sbsr_pg_buff_mask_set(sbsr_pl, i, 1);
+		mlxsw_reg_sbsr_tclass_mask_set(sbsr_pl, i, 1);
+	}
+	for (; local_port < MLXSW_PORT_MAX_PORTS; local_port++) {
+		if (!mlxsw_sp->ports[local_port])
+			continue;
+		mlxsw_reg_sbsr_ingress_port_mask_set(sbsr_pl, local_port, 1);
+		mlxsw_reg_sbsr_egress_port_mask_set(sbsr_pl, local_port, 1);
+		for (i = 0; i < MLXSW_SP_SB_POOL_COUNT; i++) {
+			err = mlxsw_sp_sb_pm_occ_query(mlxsw_sp, local_port, i,
+						       MLXSW_REG_SBXX_DIR_INGRESS,
+						       &bulk_list);
+			if (err)
+				goto out;
+			err = mlxsw_sp_sb_pm_occ_query(mlxsw_sp, local_port, i,
+						       MLXSW_REG_SBXX_DIR_EGRESS,
+						       &bulk_list);
+			if (err)
+				goto out;
+		}
+		if (++masked_count == MASKED_COUNT_MAX)
+			goto do_query;
+	}
+
+do_query:
+	cb_ctx.masked_count = masked_count;
+	cb_ctx.local_port_1 = local_port_1;
+	memcpy(&cb_priv, &cb_ctx, sizeof(cb_ctx));
+	err = mlxsw_reg_trans_query(mlxsw_core, MLXSW_REG(sbsr), sbsr_pl,
+				    &bulk_list, mlxsw_sp_sb_sr_occ_query_cb,
+				    cb_priv);
+	if (err)
+		goto out;
+	if (local_port < MLXSW_PORT_MAX_PORTS)
+		goto next_batch;
+
+out:
+	err2 = mlxsw_reg_trans_bulk_wait(&bulk_list);
+	if (!err)
+		err = err2;
+	kfree(sbsr_pl);
+	return err;
+}
+
+int mlxsw_sp_sb_occ_max_clear(struct mlxsw_core *mlxsw_core,
+			      unsigned int sb_index)
+{
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+	LIST_HEAD(bulk_list);
+	char *sbsr_pl;
+	unsigned int masked_count;
+	u8 local_port = 0;
+	int i;
+	int err;
+	int err2;
+
+	sbsr_pl = kmalloc(MLXSW_REG_SBSR_LEN, GFP_KERNEL);
+	if (!sbsr_pl)
+		return -ENOMEM;
+
+next_batch:
+	local_port++;
+	masked_count = 0;
+	mlxsw_reg_sbsr_pack(sbsr_pl, true);
+	for (i = 0; i < MLXSW_SP_SB_TC_COUNT; i++) {
+		mlxsw_reg_sbsr_pg_buff_mask_set(sbsr_pl, i, 1);
+		mlxsw_reg_sbsr_tclass_mask_set(sbsr_pl, i, 1);
+	}
+	for (; local_port < MLXSW_PORT_MAX_PORTS; local_port++) {
+		if (!mlxsw_sp->ports[local_port])
+			continue;
+		mlxsw_reg_sbsr_ingress_port_mask_set(sbsr_pl, local_port, 1);
+		mlxsw_reg_sbsr_egress_port_mask_set(sbsr_pl, local_port, 1);
+		for (i = 0; i < MLXSW_SP_SB_POOL_COUNT; i++) {
+			err = mlxsw_sp_sb_pm_occ_clear(mlxsw_sp, local_port, i,
+						       MLXSW_REG_SBXX_DIR_INGRESS,
+						       &bulk_list);
+			if (err)
+				goto out;
+			err = mlxsw_sp_sb_pm_occ_clear(mlxsw_sp, local_port, i,
+						       MLXSW_REG_SBXX_DIR_EGRESS,
+						       &bulk_list);
+			if (err)
+				goto out;
+		}
+		if (++masked_count == MASKED_COUNT_MAX)
+			goto do_query;
+	}
+
+do_query:
+	err = mlxsw_reg_trans_query(mlxsw_core, MLXSW_REG(sbsr), sbsr_pl,
+				    &bulk_list, NULL, 0);
+	if (err)
+		goto out;
+	if (local_port < MLXSW_PORT_MAX_PORTS)
+		goto next_batch;
+
+out:
+	err2 = mlxsw_reg_trans_bulk_wait(&bulk_list);
+	if (!err)
+		err = err2;
+	kfree(sbsr_pl);
+	return err;
+}
+
+int mlxsw_sp_sb_occ_port_pool_get(struct mlxsw_core_port *mlxsw_core_port,
+				  unsigned int sb_index, u16 pool_index,
+				  u32 *p_cur, u32 *p_max)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port =
+			mlxsw_core_port_driver_priv(mlxsw_core_port);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+	u8 pool = pool_get(pool_index);
+	enum mlxsw_reg_sbxx_dir dir = dir_get(pool_index);
+	struct mlxsw_sp_sb_pm *pm = mlxsw_sp_sb_pm_get(mlxsw_sp, local_port,
+						       pool, dir);
+
+	*p_cur = MLXSW_SP_CELLS_TO_BYTES(pm->occ.cur);
+	*p_max = MLXSW_SP_CELLS_TO_BYTES(pm->occ.max);
+	return 0;
+}
+
+int mlxsw_sp_sb_occ_tc_port_bind_get(struct mlxsw_core_port *mlxsw_core_port,
+				     unsigned int sb_index, u16 tc_index,
+				     enum devlink_sb_pool_type pool_type,
+				     u32 *p_cur, u32 *p_max)
+{
+	struct mlxsw_sp_port *mlxsw_sp_port =
+			mlxsw_core_port_driver_priv(mlxsw_core_port);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+	u8 local_port = mlxsw_sp_port->local_port;
+	u8 pg_buff = tc_index;
+	enum mlxsw_reg_sbxx_dir dir = pool_type;
+	struct mlxsw_sp_sb_cm *cm = mlxsw_sp_sb_cm_get(mlxsw_sp, local_port,
+						       pg_buff, dir);
+
+	*p_cur = MLXSW_SP_CELLS_TO_BYTES(cm->occ.cur);
+	*p_max = MLXSW_SP_CELLS_TO_BYTES(cm->occ.max);
+	return 0;
+}
-- 
2.5.5

^ permalink raw reply related

* Re: Deleting child qdisc doesn't reset parent to default qdisc?
From: Phil Sutter @ 2016-04-14 16:22 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jiri Kosina, Jamal Hadi Salim, netdev, linux-kernel
In-Reply-To: <1460648680.10638.47.camel@edumazet-glaptop3.roam.corp.google.com>

On Thu, Apr 14, 2016 at 08:44:40AM -0700, Eric Dumazet wrote:
> On Thu, 2016-04-14 at 17:34 +0200, Jiri Kosina wrote:
> > On Thu, 14 Apr 2016, Phil Sutter wrote:
> > 
> > > OTOH some qdiscs (CBQ, DRR, DSMARK, HFSC, HTB, QFQ) assign the default
> > > one upon deletion instead of noop_qdisc, hence I would describe
> > > the situation using the words 'inconsistent' and 'accident' rather than
> > > 'expected'. :)
> > 
> > Exactly. I'd again like to stress the fact that this configuration works:
> > 
> > 	jikos:~ # tc qdisc show
> > 	qdisc tbf 10: dev eth0 root refcnt 2 rate 800Mbit burst 131000b lat 1.0ms 
> > 
> > and this (after performing add/delete operation) doesn't:
> > 
> > 	jikos:~ # tc qdisc show
> > 	qdisc tbf 10: dev eth0 root refcnt 2 rate 800Mbit burst 131000b lat 1.0ms 
> > 
> > It's hard to spot a difference (hint: there is none).
> 
> This is because some qdisc are not visible in the dump.

And those being invisible can be overridden using 'tc qd add', right?
AFAIR they're not listed because they don't properly register, so the
system doesn't care to override them. In this case we could change all
classful qdiscs to restore the default qdisc if a leaf qdisc is being
deleted instead of noop (which is probably not what the user wants
anyway).

Cheers, Phil

^ permalink raw reply

* Re: [PATCH 1/2] [v4] net: emac: emac gigabit ethernet controller driver
From: Rob Herring @ 2016-04-14 16:24 UTC (permalink / raw)
  To: Timur Tabi
  Cc: kbuild test robot, kbuild-all, netdev, linux-kernel, devicetree,
	linux-arm-msm, sdharia, Shanker Donthineni, Greg Kroah-Hartman,
	vikrams, cov, gavidov, andrew, bjorn.andersson, Mark Langsdorf,
	Jon Masters, Andy Gross, David S. Miller
In-Reply-To: <570E9E8D.50007@codeaurora.org>

On Wed, Apr 13, 2016 at 02:31:25PM -0500, Timur Tabi wrote:
> kbuild test robot wrote:
> >
> >    drivers/net/ethernet/qualcomm/emac/emac-mac.c: In function 'emac_mac_up':
> >>>>>drivers/net/ethernet/qualcomm/emac/emac-mac.c:1076:9: warning: large integer implicitly truncated to unsigned type [-Woverflow]
> >      writel(~DIS_INT, adpt->base + EMAC_INT_STATUS);
> 
> This doesn't happen on arm64, and I don't know how to fix it.  DIS_INT is
> defined as:

Probably depends on the compiler version. BTW, clang seems to throw 
errors for this type of thing.

> 
> 	#define DIS_INT          BIT(31)
> 
> It seems silly to add a typecast to DIS_INT.

BIT() should use 1U instead of 1.

Rob

^ permalink raw reply

* Re: [PATCH 1/2] [v4] net: emac: emac gigabit ethernet controller driver
From: Rob Herring @ 2016-04-14 16:32 UTC (permalink / raw)
  To: Timur Tabi
  Cc: netdev, linux-kernel, devicetree, linux-arm-msm, sdharia,
	Shanker Donthineni, Greg Kroah-Hartman, vikrams, cov, gavidov,
	andrew, bjorn.andersson, Mark Langsdorf, Jon Masters, Andy Gross,
	David S. Miller
In-Reply-To: <1460570393-19838-1-git-send-email-timur@codeaurora.org>

On Wed, Apr 13, 2016 at 12:59:52PM -0500, Timur Tabi wrote:
> From: Gilad Avidov <gavidov@codeaurora.org>
> 
> Add supports for ethernet controller HW on Qualcomm Technologies, Inc. SoC.
> This driver supports the following features:
> 1) Checksum offload.
> 2) Runtime power management support.
> 3) Interrupt coalescing support.
> 4) SGMII phy.
> 5) SGMII direct connection without external phy.
> 
> Based on a driver by Niranjana Vishwanathapura
> <nvishwan@codeaurora.org>.
> 
> Signed-off-by: Gilad Avidov <gavidov@codeaurora.org>
> Signed-off-by: Timur Tabi <timur@codeaurora.org>
> ---
> 
> v4:
>  - add missing ipv6 header file
>  - correct compatible string
>  - fix spacing in emac_reg_write arrays
>  - drop unnecessary cell-index property
>  - remove unsupported DT properties from docs
>  - remove GPIO initialization and update docs
> 
> v3:
>  - remove most of the memory barriers by using the non xxx_relaxed() api.
>  - remove RSS and WOL support.
>  - correct comments from physical address to dma address.
>  - rearrange structs to make them packed.
>  - replace polling loops with readl_poll_timeout().
>  - remove unnecessary wrapper functions from phy layer.
>  - add blank line before return statements.
>  - set to null clocks after clk_put().
>  - use module_platform_driver() and dma_set_mask_and_coherent()
>  - replace long hex bitmasks with BIT() macro.
> 
> v2:
>  - replace hw bit fields to macros with bitwise operations.
>  - change all iterators to unsized types (int)
>  - some minor code flow improvements.
>  - change return type to void for functions which return value is never
>    used.
>  - replace instance of xxxxl_relaxed() io followed by mb() with a
>    readl()/writel().
> 
> ---
>  .../devicetree/bindings/net/qcom-emac.txt          |   65 +
>  drivers/net/ethernet/qualcomm/Kconfig              |   11 +
>  drivers/net/ethernet/qualcomm/Makefile             |    2 +
>  drivers/net/ethernet/qualcomm/emac/Makefile        |    7 +
>  drivers/net/ethernet/qualcomm/emac/emac-mac.c      | 1782 ++++++++++++++++++++
>  drivers/net/ethernet/qualcomm/emac/emac-mac.h      |  286 ++++
>  drivers/net/ethernet/qualcomm/emac/emac-phy.c      |  484 ++++++
>  drivers/net/ethernet/qualcomm/emac/emac-phy.h      |   68 +
>  drivers/net/ethernet/qualcomm/emac/emac-sgmii.c    |  683 ++++++++
>  drivers/net/ethernet/qualcomm/emac/emac-sgmii.h    |   30 +
>  drivers/net/ethernet/qualcomm/emac/emac.c          | 1206 +++++++++++++
>  drivers/net/ethernet/qualcomm/emac/emac.h          |  382 +++++
>  12 files changed, 5006 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/qcom-emac.txt
>  create mode 100644 drivers/net/ethernet/qualcomm/emac/Makefile
>  create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-mac.c
>  create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-mac.h
>  create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-phy.c
>  create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-phy.h
>  create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
>  create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-sgmii.h
>  create mode 100644 drivers/net/ethernet/qualcomm/emac/emac.c
>  create mode 100644 drivers/net/ethernet/qualcomm/emac/emac.h
> 
> diff --git a/Documentation/devicetree/bindings/net/qcom-emac.txt b/Documentation/devicetree/bindings/net/qcom-emac.txt
> new file mode 100644
> index 0000000..df5e7c0
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/qcom-emac.txt
> @@ -0,0 +1,65 @@
> +Qualcomm EMAC Gigabit Ethernet Controller
> +
> +Required properties:
> +- compatible : Should be "qcom,emac".

Come on... Can you guess what I'm going to say here.

> +- reg : Offset and length of the register regions for the device
> +- reg-names : Register region names referenced in 'reg' above.
> +	Required register resource entries are:
> +	"base"   : EMAC controller base register block.
> +	"csr"    : EMAC wrapper register block.
> +	Optional register resource entries are:
> +	"ptp"    : EMAC PTP (1588) register block.
> +		   Required if 'qcom,emac-tstamp-en' is present.
> +	"sgmii"  : EMAC SGMII PHY register block.
> +- interrupts : Interrupt numbers used by this controller
> +- interrupt-names : Interrupt resource names referenced in 'interrupts' above.
> +	Required interrupt resource entries are:
> +	"emac_core0"   : EMAC core0 interrupt.
> +	"sgmii_irq"   : EMAC SGMII interrupt.
> +- phy-addr            : Specifies phy address on MDIO bus.
> +			Required if the optional property "qcom,no-external-phy"
> +			is not specified.

As I mentioned in the last version, you should still describe this with 
a standard MDIO bus binding even if you can't use the generic code.

> +
> +Optional properties:
> +- qcom,emac-tstamp-en       : Enables the PTP (1588) timestamping feature.
> +			      Include this only if PTP (1588) timestamping
> +			      feature is needed. If included, "ptp" register
> +			      base should be specified.
> +- mac-address               : The 6-byte MAC address. If present, it is the
> +			      default MAC address.
> +- qcom,no-external-phy      : Indicates there is no external PHY connected to
> +			      EMAC. Include this only if the EMAC is directly
> +			      connected to the peer end without EPHY.
> +Example:
> +	emac0: qcom,emac@feb20000 {

ethernet@

> +		compatible = "qcom,fsm9900-emac";

Ah, I see you fixed it here...

> +		reg-names = "base", "csr", "ptp", "sgmii";
> +		reg =   <0xfeb20000 0x10000>,
> +			<0xfeb36000 0x1000>,
> +			<0xfeb3c000 0x4000>,
> +			<0xfeb38000 0x400>;
> +		#address-cells = <0>;
> +		interrupt-parent = <&emac0>;
> +		#interrupt-cells = <1>;
> +		interrupts = <0 1>;
> +		interrupt-map-mask = <0xffffffff>;
> +		interrupt-map = <0 &intc 0 76 0
> +				 1 &intc 0 80 0>;
> +		interrupt-names = "emac_core0", "sgmii_irq";
> +		qcom,emac-tstamp-en;
> +		phy-addr = <0>;
> +
> +		pinctrl-names = "default";
> +		pinctrl-0 = <&mdio_pins_a>;
> +	};
> +
> +	tlmm: pinctrl@fd510000 {
> +		compatible = "qcom,fsm9900-pinctrl";
> +
> +		mdio_pins_a: mdio {
> +			state {
> +				pins = "gpio123", "gpio124";
> +				function = "mdio";
> +			};
> +		};
> +	};

^ permalink raw reply

* [PATCH net-next] tun: don't require serialization lock on tx
From: Paolo Abeni @ 2016-04-14 16:39 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Michael S. Tsirkin, Hannes Frederic Sowa,
	Eric W. Biederman, Greg Kurz, Jason Wang, Eric Dumazet

The current tun_net_xmit() implementation don't need any external
lock since it relies on rcu protection for the tun data structure
and on socket queue lock for skb queuing.

This patch set the NETIF_F_LLTX feature bit in the tun device, so
that on xmit, in absence of qdisc, no serialization lock is acquired
by the caller.

The user space can remove the default tun qdisc with:

tc qdisc replace dev <tun device name> root noqueue

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Eric Dumazet <edumazet@google.com>

---
RFC -> v1
 - fixed a commit message typo, extended the comment with a 
   configuration hint
---
 drivers/net/tun.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index faf9297..42992dc 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1796,7 +1796,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
 				   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
 				   NETIF_F_HW_VLAN_STAG_TX;
-		dev->features = dev->hw_features;
+		dev->features = dev->hw_features | NETIF_F_LLTX;
 		dev->vlan_features = dev->features &
 				     ~(NETIF_F_HW_VLAN_CTAG_TX |
 				       NETIF_F_HW_VLAN_STAG_TX);
-- 
1.8.3.1

^ permalink raw reply related

* Re: Deleting child qdisc doesn't reset parent to default qdisc?
From: Eric Dumazet @ 2016-04-14 16:40 UTC (permalink / raw)
  To: Phil Sutter; +Cc: Jiri Kosina, Jamal Hadi Salim, netdev, linux-kernel
In-Reply-To: <20160414162229.GF3715@orbyte.nwl.cc>

On Thu, 2016-04-14 at 18:22 +0200, Phil Sutter wrote:

> And those being invisible can be overridden using 'tc qd add', right?
> AFAIR they're not listed because they don't properly register, so the
> system doesn't care to override them. In this case we could change all
> classful qdiscs to restore the default qdisc if a leaf qdisc is being
> deleted instead of noop (which is probably not what the user wants
> anyway).

Even if they properly register, they are not visible.

Take a look at
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=95dc19299f741c986227ec33e23cbf9b3321f812

for some context.

When a default pfifo is created on say a HTB class, you do not see it by
default in a dump.

If you have 100 HTB classes, HTB created 100 pfifo just fine, and it
works, unless an admin tries to delete them maybe ;)

^ permalink raw reply

* Re: [PATCH 1/2] [v4] net: emac: emac gigabit ethernet controller driver
From: Timur Tabi @ 2016-04-14 16:47 UTC (permalink / raw)
  To: Rob Herring
  Cc: netdev, linux-kernel, devicetree, linux-arm-msm, sdharia,
	Shanker Donthineni, Greg Kroah-Hartman, vikrams, cov, gavidov,
	andrew, bjorn.andersson, Mark Langsdorf, Jon Masters, Andy Gross,
	David S. Miller
In-Reply-To: <20160414163240.GB15303@rob-hp-laptop>

Rob Herring wrote:

>> @@ -0,0 +1,65 @@
>> +Qualcomm EMAC Gigabit Ethernet Controller
>> +
>> +Required properties:
>> +- compatible : Should be "qcom,emac".
>
> Come on... Can you guess what I'm going to say here.

Ooops, I missed that one.

>
>> +- reg : Offset and length of the register regions for the device
>> +- reg-names : Register region names referenced in 'reg' above.
>> +	Required register resource entries are:
>> +	"base"   : EMAC controller base register block.
>> +	"csr"    : EMAC wrapper register block.
>> +	Optional register resource entries are:
>> +	"ptp"    : EMAC PTP (1588) register block.
>> +		   Required if 'qcom,emac-tstamp-en' is present.
>> +	"sgmii"  : EMAC SGMII PHY register block.
>> +- interrupts : Interrupt numbers used by this controller
>> +- interrupt-names : Interrupt resource names referenced in 'interrupts' above.
>> +	Required interrupt resource entries are:
>> +	"emac_core0"   : EMAC core0 interrupt.
>> +	"sgmii_irq"   : EMAC SGMII interrupt.
>> +- phy-addr            : Specifies phy address on MDIO bus.
>> +			Required if the optional property "qcom,no-external-phy"
>> +			is not specified.
>
> As I mentioned in the last version, you should still describe this with
> a standard MDIO bus binding even if you can't use the generic code.

You mean like this?

	phy0: ethernet-phy@0 {
		compatible = "qcom,fsm9900-emac-phy";
		reg = <4>;
	};

>> +Optional properties:
>> +- qcom,emac-tstamp-en       : Enables the PTP (1588) timestamping feature.
>> +			      Include this only if PTP (1588) timestamping
>> +			      feature is needed. If included, "ptp" register
>> +			      base should be specified.
>> +- mac-address               : The 6-byte MAC address. If present, it is the
>> +			      default MAC address.
>> +- qcom,no-external-phy      : Indicates there is no external PHY connected to
>> +			      EMAC. Include this only if the EMAC is directly
>> +			      connected to the peer end without EPHY.
>> +Example:
>> +	emac0: qcom,emac@feb20000 {
>
> ethernet@
>
>> +		compatible = "qcom,fsm9900-emac";
>
> Ah, I see you fixed it here...

and in the code, I just missed it in the top of the file.  I'll fix it 
everywhere in v5.

-- 
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
Forum, a Linux Foundation collaborative project.

^ permalink raw reply

* Re: [PATCH v3 0/2] sctp: delay calls to sk_data_ready() as much as possible
From: Marcelo Ricardo Leitner @ 2016-04-14 17:00 UTC (permalink / raw)
  To: Neil Horman, David Miller
  Cc: netdev, vyasevich, linux-sctp, David.Laight, jkbs
In-Reply-To: <20160414130324.GA6806@hmsreliant.think-freely.org>

Em 14-04-2016 10:03, Neil Horman escreveu:
> On Wed, Apr 13, 2016 at 11:05:32PM -0400, David Miller wrote:
>> From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
>> Date: Fri,  8 Apr 2016 16:41:26 -0300
>>
>>> 1st patch is a preparation for the 2nd. The idea is to not call
>>> ->sk_data_ready() for every data chunk processed while processing
>>> packets but only once before releasing the socket.
>>>
>>> v2: patchset re-checked, small changelog fixes
>>> v3: on patch 2, make use of local vars to make it more readable
>>
>> Applied to net-next, but isn't this reduced overhead coming at the
>> expense of latency?  What if that lower latency is important to the
>> application and/or consumer?
> Thats a fair point, but I'd make the counter argument that, as it currently
> stands, any latency introduced (or removed), is an artifact of our
> implementation rather than a designed feature of it.  That is to say, we make no
> guarantees at the application level regarding how long it takes to signal data
> readines from the time we get data off the wire, so I would rather see our
> throughput raised if we can, as thats been sctp's more pressing achilles heel.
>
>
> Thats not to say I'd like to enable lower latency, but I'd rather have this now,
> and start pondering how to design that in.  Perhaps we can convert the pending
> flag to a counter to count the number of events we enqueue, and call
> sk_data_ready every  time we reach a sysctl defined threshold.

That and also that there is no chance of the application reading the 
first chunks before all current ToDo's are performed by either the bh or 
backlog handlers for that packet. Socket lock won't be cycled in between 
chunks so the application is going to wait all the processing one way or 
another.

Thanks,
Marcelo

^ permalink raw reply

* Re: [PATCH 1/2] [v4] net: emac: emac gigabit ethernet controller driver
From: Rob Herring @ 2016-04-14 17:18 UTC (permalink / raw)
  To: Timur Tabi
  Cc: netdev, linux-kernel@vger.kernel.org, devicetree@vger.kernel.org,
	linux-arm-msm, Sagar Dharia, Shanker Donthineni,
	Greg Kroah-Hartman, vikrams, Christopher Covington, Gilad Avidov,
	Andrew Lunn, Bjorn Andersson, Mark Langsdorf, Jon Masters,
	Andy Gross, David S. Miller
In-Reply-To: <570FC987.80304@codeaurora.org>

On Thu, Apr 14, 2016 at 11:47 AM, Timur Tabi <timur@codeaurora.org> wrote:
> Rob Herring wrote:
>
>>> @@ -0,0 +1,65 @@
>>> +Qualcomm EMAC Gigabit Ethernet Controller
>>> +
>>> +Required properties:
>>> +- compatible : Should be "qcom,emac".
>>
>>
>> Come on... Can you guess what I'm going to say here.
>
>
> Ooops, I missed that one.
>
>>
>>> +- reg : Offset and length of the register regions for the device
>>> +- reg-names : Register region names referenced in 'reg' above.
>>> +       Required register resource entries are:
>>> +       "base"   : EMAC controller base register block.
>>> +       "csr"    : EMAC wrapper register block.
>>> +       Optional register resource entries are:
>>> +       "ptp"    : EMAC PTP (1588) register block.
>>> +                  Required if 'qcom,emac-tstamp-en' is present.
>>> +       "sgmii"  : EMAC SGMII PHY register block.
>>> +- interrupts : Interrupt numbers used by this controller
>>> +- interrupt-names : Interrupt resource names referenced in 'interrupts'
>>> above.
>>> +       Required interrupt resource entries are:
>>> +       "emac_core0"   : EMAC core0 interrupt.
>>> +       "sgmii_irq"   : EMAC SGMII interrupt.
>>> +- phy-addr            : Specifies phy address on MDIO bus.
>>> +                       Required if the optional property
>>> "qcom,no-external-phy"
>>> +                       is not specified.
>>
>>
>> As I mentioned in the last version, you should still describe this with
>> a standard MDIO bus binding even if you can't use the generic code.
>
>
> You mean like this?
>
>         phy0: ethernet-phy@0 {
>                 compatible = "qcom,fsm9900-emac-phy";
>                 reg = <4>;

Yes, but you mean 0 here or 4 for unit address.

^ permalink raw reply

* [net][PATCH v2 1/2] RDS: fix endianness for dp_ack_seq
From: Santosh Shilimkar @ 2016-04-14 17:43 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1460655807-26236-1-git-send-email-santosh.shilimkar@oracle.com>

From: Qing Huang <qing.huang@oracle.com>

dp->dp_ack_seq is used in big endian format. We need to do the
big endianness conversion when we assign a value in host format
to it.

Signed-off-by: Qing Huang <qing.huang@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/ib_cm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 8764970..310cabc 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -194,7 +194,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
 		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
 		dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
-		dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
+		dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic));
 
 		/* Advertise flow control */
 		if (ic->i_flowctl) {
-- 
1.9.1

^ permalink raw reply related

* [net][PATCH v2 2/2] RDS: Fix the atomicity for congestion map update
From: Santosh Shilimkar @ 2016-04-14 17:43 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar
In-Reply-To: <1460655807-26236-1-git-send-email-santosh.shilimkar@oracle.com>

Two different threads with different rds sockets may be in
rds_recv_rcvbuf_delta() via receive path. If their ports
both map to the same word in the congestion map, then
using non-atomic ops to update it could cause the map to
be incorrect. Lets use atomics to avoid such an issue.

Full credit to Wengang <wen.gang.wang@oracle.com> for
finding the issue, analysing it and also pointing out
to offending code with spin lock based fix.

Reviewed-by: Leon Romanovsky <leon@leon.nu>
Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/cong.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/rds/cong.c b/net/rds/cong.c
index e6144b8..6641bcf 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -299,7 +299,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
 	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
 	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
 
-	__set_bit_le(off, (void *)map->m_page_addrs[i]);
+	set_bit_le(off, (void *)map->m_page_addrs[i]);
 }
 
 void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
@@ -313,7 +313,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
 	i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
 	off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
 
-	__clear_bit_le(off, (void *)map->m_page_addrs[i]);
+	clear_bit_le(off, (void *)map->m_page_addrs[i]);
 }
 
 static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
-- 
1.9.1

^ permalink raw reply related

* [net][PATCH v2 0/2] RDS: couple of fixes for 4.6
From: Santosh Shilimkar @ 2016-04-14 17:43 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel

v2:
Rebased fixes against 'net' instead of 'net-next' Patches are also
available at below git tree.

The following changes since commit e013b7780c41b471c4269ac9ccafb65ba7c9ec86:

  Merge branch 'dsa-voidify-ops' (2016-04-08 16:51:15 -0400)

are available in the git repository at:


  git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux.git for_4.6/net/rds-fixes

for you to fetch changes up to e9155afb1902380938ca83ba8504aaa2d7ee5210:

  RDS: Fix the atomicity for congestion map update (2016-04-08 15:08:13 -0700)

----------------------------------------------------------------
Qing Huang (1):
      RDS: fix endianness for dp_ack_seq

Santosh Shilimkar (1):
      RDS: Fix the atomicity for congestion map update

 net/rds/cong.c  | 4 ++--
 net/rds/ib_cm.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

Regards,
Santosh

^ permalink raw reply

* Re: [net-next][PATCH 0/2] RDS: couple of fixes for 4.6
From: santosh.shilimkar @ 2016-04-14 17:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel
In-Reply-To: <20160413.233637.2047659923049596108.davem@davemloft.net>

On 4/13/16 8:36 PM, David Miller wrote:
> From: Santosh Shilimkar <santosh.shilimkar@oracle.com>
> Date: Fri,  8 Apr 2016 15:26:38 -0700
>
>> Patches are also available at below git tree.
>>
>> git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux.git for_4.6/net-next/rds-fixes
>
> "Bug fixes for 4.6" do not get targetted at the net-next tree, that's
> for 4.7 development.
>
Sorry. Should have based it against 'net'. Just posted re-based version.
Thanks !!

Regards,
Santosh

^ permalink raw reply

* Re: Deleting child qdisc doesn't reset parent to default qdisc?
From: Eric Dumazet @ 2016-04-14 17:49 UTC (permalink / raw)
  To: Jiri Kosina; +Cc: Phil Sutter, Jamal Hadi Salim, netdev, linux-kernel
In-Reply-To: <alpine.LNX.2.00.1604141807350.27368@cbobk.fhfr.pm>

On Thu, 2016-04-14 at 18:08 +0200, Jiri Kosina wrote:
> On Thu, 14 Apr 2016, Phil Sutter wrote:
> 
> > > > I've came across the behavior where adding a child qdisc and then deleting 
> > > > it again makes the networking dysfunctional (I guess that's because all of 
> > > > a sudden there is absolutely no working qdisc on the device, although 
> > > > there originally was a default one in the parent).
> > > > 
> > > > In a nutshell, is this expected behavior or bug?
> > > 
> > > This is the expected behavior.
> > 
> > OTOH some qdiscs (CBQ, DRR, DSMARK, HFSC, HTB, QFQ) assign the default
> > one upon deletion instead of noop_qdisc, hence I would describe
> > the situation using the words 'inconsistent' and 'accident' rather than
> > 'expected'. :)
> 
> Would a patch that'd unify this in a sense that all qdiscs would assign 
> the default one upon deletion acceptable?
> 

And what would be the chosen behavior ?

Relying on TBF installing a bfifo for you at delete would be hazardous.

For example CBQ got it differently than HFSC

If qdisc_create_dflt() fails in CBQ, we fail the 'delete', while HFSC
falls back to noop_qdisc, without warning the user :(

At least always using noop_qdisc is consistent. No magic there.

Doing 'unification' right now would break existing scripts.

This is too late, I am afraid.

^ permalink raw reply

* [PATCH pci] pci: Add helper function to set VPD size
From: Hariprasad Shenai @ 2016-04-14 18:12 UTC (permalink / raw)
  To: bhelgaas, davem
  Cc: linux-pci, netdev, leedom, swise, nirranjan, santosh,
	Hariprasad Shenai

commit 104daa71b396 ("PCI: Determine actual VPD size on first access")
introduced a regression in cxgb4 driver and used to fail in pci probe.

The problem is stemming from the fact that the Chelsio adapters actually
have two VPD structures stored in the VPD. An abbreviated on at Offset 0x0
and the complete VPD at Offset 0x400. The abbreviated one only contains
the PN, SN and EC Keywords, while the complete VPD contains those plus
various adapter constants contained in V0, V1, etc. And it also contains
the Base Ethernet MAC Address in the "NA" Keyword which the cxgb4 driver
needs when it can't contact the adapter firmware. (We don't have the "NA"
Keywork in the VPD Structure at Offset 0x0 because that's not an allowed
VPD Keyword in the PCI-E 3.0 specification.)

With the new code, the computed size of the VPD is 0x200 and so our efforts
to read the VPD at Offset 0x400 silently fails. We check the result of the
read looking for a signature 0x82 byte but we're checking against random
stack garbage.

The fix is to add a PCI helper function to set the VPD size, so the
driver can expicitly set the exact size of the VPD.

Fixes commit 104daa71b396 ("PCI: Determine actual VPD size on first access")

Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Hariprasad Shenai <hariprasad@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 10 +++++++
 drivers/pci/access.c                       | 42 ++++++++++++++++++++++++++++++
 drivers/pci/pci.h                          |  1 +
 include/linux/pci.h                        |  1 +
 4 files changed, 54 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index cc1736bece0f..2033159e26a5 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -2557,6 +2557,7 @@ void t4_get_regs(struct adapter *adap, void *buf, size_t buf_size)
 }
 
 #define EEPROM_STAT_ADDR   0x7bfc
+#define VPD_SIZE           0x800
 #define VPD_BASE           0x400
 #define VPD_BASE_OLD       0
 #define VPD_LEN            1024
@@ -2594,6 +2595,15 @@ int t4_get_raw_vpd_params(struct adapter *adapter, struct vpd_params *p)
 	if (!vpd)
 		return -ENOMEM;
 
+	/* We have two VPD data structures stored in the adapter VPD area.
+	 * By default, Linux calculates the size of the VPD area by traversing
+	 * the first VPD area at offset 0x0, so we need to tell the OS what
+	 * our real VPD size is.
+	 */
+	ret = pci_set_size_vpd(adapter->pdev, VPD_SIZE);
+	if (ret < 0)
+		goto out;
+
 	/* Card information normally starts at VPD_BASE but early cards had
 	 * it at 0.
 	 */
diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 01b9d0a00abc..e69b3877bd37 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -275,6 +275,19 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void
 }
 EXPORT_SYMBOL(pci_write_vpd);
 
+/**
+ * pci_set_size_vpd - Set size of Vital Product Data space
+ * @dev:	pci device struct
+ * @len:	size of vpd space
+ */
+ssize_t pci_set_size_vpd(struct pci_dev *dev, size_t len)
+{
+	if (!dev->vpd || !dev->vpd->ops)
+		return -ENODEV;
+	return dev->vpd->ops->set_size(dev, len);
+}
+EXPORT_SYMBOL(pci_set_size_vpd);
+
 #define PCI_VPD_MAX_SIZE (PCI_VPD_ADDR_MASK + 1)
 
 /**
@@ -498,9 +511,23 @@ out:
 	return ret ? ret : count;
 }
 
+static ssize_t pci_vpd_set_size(struct pci_dev *dev, size_t len)
+{
+	struct pci_vpd *vpd = dev->vpd;
+
+	if (len == 0 || len > PCI_VPD_MAX_SIZE)
+		return -EIO;
+
+	vpd->valid = 1;
+	vpd->len = len;
+
+	return 0;
+}
+
 static const struct pci_vpd_ops pci_vpd_ops = {
 	.read = pci_vpd_read,
 	.write = pci_vpd_write,
+	.set_size = pci_vpd_set_size,
 };
 
 static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count,
@@ -533,9 +560,24 @@ static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count,
 	return ret;
 }
 
+static ssize_t pci_vpd_f0_set_size(struct pci_dev *dev, size_t len)
+{
+	struct pci_dev *tdev = pci_get_slot(dev->bus,
+					    PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
+	ssize_t ret;
+
+	if (!tdev)
+		return -ENODEV;
+
+	ret = pci_set_size_vpd(tdev, len);
+	pci_dev_put(tdev);
+	return ret;
+}
+
 static const struct pci_vpd_ops pci_vpd_f0_ops = {
 	.read = pci_vpd_f0_read,
 	.write = pci_vpd_f0_write,
+	.set_size = pci_vpd_f0_set_size,
 };
 
 int pci_vpd_init(struct pci_dev *dev)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index d0fb93481573..8239d186f1ed 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -97,6 +97,7 @@ static inline bool pci_has_subordinate(struct pci_dev *pci_dev)
 struct pci_vpd_ops {
 	ssize_t (*read)(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
 	ssize_t (*write)(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
+	ssize_t (*set_size)(struct pci_dev *dev, size_t len);
 };
 
 struct pci_vpd {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 004b8133417d..1ab1b7458a8b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1111,6 +1111,7 @@ void pci_unlock_rescan_remove(void);
 /* Vital product data routines */
 ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
 ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
+ssize_t pci_set_size_vpd(struct pci_dev *dev, size_t len);
 
 /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
 resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx);
-- 
2.3.4

^ permalink raw reply related

* [PATCH iproute2] ip: neigh: Fix leftover attributes message during flush
From: Jeff Harris @ 2016-04-14 18:15 UTC (permalink / raw)
  To: netdev; +Cc: Jeff Harris

Use the same rtnl_dump_request_n call as the show.  The rtnl_wilddump_request
assumes the type uses an ifinfomsg which is not the case for the neighbor
table.

Signed-off-by: Jeff Harris <jefftharris@gmail.com>
---
 ip/ipneigh.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ip/ipneigh.c b/ip/ipneigh.c
index c49fb4e..4ddb747 100644
--- a/ip/ipneigh.c
+++ b/ip/ipneigh.c
@@ -430,6 +430,8 @@ static int do_show_or_flush(int argc, char **argv, int flush)
 		addattr32(&req.n, sizeof(req), NDA_IFINDEX, filter.index);
 	}
 
+	req.ndm.ndm_family = filter.family;
+
 	if (flush) {
 		int round = 0;
 		char flushb[4096-512];
@@ -440,7 +442,7 @@ static int do_show_or_flush(int argc, char **argv, int flush)
 		filter.state &= ~NUD_FAILED;
 
 		while (round < MAX_ROUNDS) {
-			if (rtnl_wilddump_request(&rth, filter.family, RTM_GETNEIGH) < 0) {
+			if (rtnl_dump_request_n(&rth, &req.n) < 0) {
 				perror("Cannot send dump request");
 				exit(1);
 			}
@@ -472,8 +474,6 @@ static int do_show_or_flush(int argc, char **argv, int flush)
 		return 1;
 	}
 
-	req.ndm.ndm_family = filter.family;
-
 	if (rtnl_dump_request_n(&rth, &req.n) < 0) {
 		perror("Cannot send dump request");
 		exit(1);
-- 
1.7.9.5

^ permalink raw reply related

* RE: [PATCH pci] pci: Add helper function to set VPD size
From: Steve Wise @ 2016-04-14 18:35 UTC (permalink / raw)
  To: 'Hariprasad Shenai', bhelgaas, davem
  Cc: linux-pci, netdev, leedom, nirranjan, santosh
In-Reply-To: <1460657525-17551-1-git-send-email-hariprasad@chelsio.com>

> The fix is to add a PCI helper function to set the VPD size, so the
> driver can expicitly set the exact size of the VPD.
> 
> Fixes commit 104daa71b396 ("PCI: Determine actual VPD size on first access")
> 
> Signed-off-by: Casey Leedom <leedom@chelsio.com>
> Signed-off-by: Hariprasad Shenai <hariprasad@chelsio.com>

Looks good!
  
Tested-by: Steve Wise <swise@opengridcomputing.com>

^ permalink raw reply

* [PATCH net 0/3] net: dsa: mv88e6xxx: fix hardware cross-chip bridging
From: Vivien Didelot @ 2016-04-14 18:42 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot

In order to accelerate cross-chip switching of frames with the hardware,
the DSA Tag ports, used to interconnect switch devices, must learn SA
and DA addresses, and share the same FDB with the user ports.

The two first patches restore address learning on DSA links. This fixes
hardware cross-chip bridging in a VLAN filtering enabled system, which
implements a bridge group as a 802.1Q VLAN and thus share an isolated
address database between DSA and user ports.

The third patch changes the distinct default databases used for each
port, to the same address database. This fixes the hardware cross-chip
bridging in a VLAN filtering disabled system, where a bridge group gets
implemented only as a port-based VLAN.

Vivien Didelot (3):
  net: dsa: mv88e6xxx: unlock DSA and CPU ports
  net: dsa: mv88e6xxx: enable SA learning on DSA ports
  net: dsa: mv88e6xxx: share a default FDB

 drivers/net/dsa/mv88e6xxx.c | 34 +++++-----------------------------
 1 file changed, 5 insertions(+), 29 deletions(-)

-- 
2.8.0

^ permalink raw reply

* [PATCH net 1/3] net: dsa: mv88e6xxx: unlock DSA and CPU ports
From: Vivien Didelot @ 2016-04-14 18:42 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot
In-Reply-To: <1460659329-11473-1-git-send-email-vivien.didelot@savoirfairelinux.com>

Locking a port generates an hardware interrupt when a new SA address is
received. This enables CPU directed learning, which is needed for 802.1X
MAC authentication.

To disable automatic learning on a port, the only configuration needed
is to set its Port Association Vector to all zero.

Clear PAV when SA learning should be disabled instead of locking a port.

Fixes: 4c7ea3c0791e ("net: dsa: mv88e6xxx: disable SA learning for DSA and CPU ports")
Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 9985a0c..7725e29 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2544,7 +2544,7 @@ static int mv88e6xxx_setup_port(struct dsa_switch *ds, int port)
 	reg = 1 << port;
 	/* Disable learning for DSA and CPU ports */
 	if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
-		reg = PORT_ASSOC_VECTOR_LOCKED_PORT;
+		reg = 0;
 
 	ret = _mv88e6xxx_reg_write(ds, REG_PORT(port), PORT_ASSOC_VECTOR, reg);
 	if (ret)
-- 
2.8.0

^ permalink raw reply related

* [PATCH net 2/3] net: dsa: mv88e6xxx: enable SA learning on DSA ports
From: Vivien Didelot @ 2016-04-14 18:42 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot
In-Reply-To: <1460659329-11473-1-git-send-email-vivien.didelot@savoirfairelinux.com>

In multi-chip systems, DSA Tag ports must learn SA addresses in order to
correctly switch frames between interconnected chips.

This fixes cross-chip hardware bridging in a VLAN filtering aware
system, because a bridge group gets implemented as an hardware 802.1Q
VLAN and thus DSA and user ports share the same FDB.

Fixes: 4c7ea3c0791e ("net: dsa: mv88e6xxx: disable SA learning for DSA and CPU ports")
Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 7725e29..46c9b4c 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2542,8 +2542,8 @@ static int mv88e6xxx_setup_port(struct dsa_switch *ds, int port)
 	 * the other bits clear.
 	 */
 	reg = 1 << port;
-	/* Disable learning for DSA and CPU ports */
-	if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
+	/* Disable learning for CPU port */
+	if (dsa_is_cpu_port(ds, port))
 		reg = 0;
 
 	ret = _mv88e6xxx_reg_write(ds, REG_PORT(port), PORT_ASSOC_VECTOR, reg);
-- 
2.8.0

^ permalink raw reply related

* [PATCH net 3/3] net: dsa: mv88e6xxx: share the same default FDB
From: Vivien Didelot @ 2016-04-14 18:42 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, kernel, David S. Miller, Florian Fainelli,
	Andrew Lunn, Vivien Didelot
In-Reply-To: <1460659329-11473-1-git-send-email-vivien.didelot@savoirfairelinux.com>

For hardware cross-chip bridging to work, user ports *and* DSA ports
need to share a common address database, in order to switch a frame to
the correct interconnected device.

This is currently working for VLAN filtering aware systems, since Linux
will implement a bridge group as a 802.1Q VLAN, which has its own FDB,
including DSA and CPU links as members.

However when the system doesn't support VLAN filtering, Linux only
relies on the port-based VLAN to implement a bridge group.

To fix hardware cross-chip bridging for such systems, set the same
default address database 0 for user and DSA ports, instead of giving
them all a different default database.

Note that the bridging code prevents frames to egress between unbridged
ports, and flushes FDB entries of a port when changing its STP state.

Also note that the FID 0 is special and means "all" for ATU operations,
but it's OK since it is used as a default forwarding address database.

Fixes: 2db9ce1fd9a3 ("net: dsa: mv88e6xxx: assign default FDB to ports")
Fixes: 466dfa077022 ("net: dsa: mv88e6xxx: assign dynamic FDB to bridges")
Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
---
 drivers/net/dsa/mv88e6xxx.c | 28 ++--------------------------
 1 file changed, 2 insertions(+), 26 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 46c9b4c..b76f870 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -2246,27 +2246,10 @@ int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
 			       struct net_device *bridge)
 {
 	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
-	u16 fid;
 	int i, err;
 
 	mutex_lock(&ps->smi_mutex);
 
-	/* Get or create the bridge FID and assign it to the port */
-	for (i = 0; i < ps->num_ports; ++i)
-		if (ps->ports[i].bridge_dev == bridge)
-			break;
-
-	if (i < ps->num_ports)
-		err = _mv88e6xxx_port_fid_get(ds, i, &fid);
-	else
-		err = _mv88e6xxx_fid_new(ds, &fid);
-	if (err)
-		goto unlock;
-
-	err = _mv88e6xxx_port_fid_set(ds, port, fid);
-	if (err)
-		goto unlock;
-
 	/* Assign the bridge and remap each port's VLANTable */
 	ps->ports[port].bridge_dev = bridge;
 
@@ -2278,7 +2261,6 @@ int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port,
 		}
 	}
 
-unlock:
 	mutex_unlock(&ps->smi_mutex);
 
 	return err;
@@ -2288,16 +2270,10 @@ void mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port)
 {
 	struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
 	struct net_device *bridge = ps->ports[port].bridge_dev;
-	u16 fid;
 	int i;
 
 	mutex_lock(&ps->smi_mutex);
 
-	/* Give the port a fresh Filtering Information Database */
-	if (_mv88e6xxx_fid_new(ds, &fid) ||
-	    _mv88e6xxx_port_fid_set(ds, port, fid))
-		netdev_warn(ds->ports[port], "failed to assign a new FID\n");
-
 	/* Unassign the bridge and remap each port's VLANTable */
 	ps->ports[port].bridge_dev = NULL;
 
@@ -2624,11 +2600,11 @@ static int mv88e6xxx_setup_port(struct dsa_switch *ds, int port)
 	if (ret)
 		goto abort;
 
-	/* Port based VLAN map: give each port its own address
+	/* Port based VLAN map: give each port the same default address
 	 * database, and allow bidirectional communication between the
 	 * CPU and DSA port(s), and the other ports.
 	 */
-	ret = _mv88e6xxx_port_fid_set(ds, port, port + 1);
+	ret = _mv88e6xxx_port_fid_set(ds, port, 0);
 	if (ret)
 		goto abort;
 
-- 
2.8.0

^ permalink raw reply related

* Re: [PATCH v3 0/2] sctp: delay calls to sk_data_ready() as much as possible
From: David Miller @ 2016-04-14 18:59 UTC (permalink / raw)
  To: marcelo.leitner
  Cc: nhorman, netdev, vyasevich, linux-sctp, David.Laight, jkbs
In-Reply-To: <570FCCC1.6090504@gmail.com>

From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 14 Apr 2016 14:00:49 -0300

> Em 14-04-2016 10:03, Neil Horman escreveu:
>> On Wed, Apr 13, 2016 at 11:05:32PM -0400, David Miller wrote:
>>> From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
>>> Date: Fri,  8 Apr 2016 16:41:26 -0300
>>>
>>>> 1st patch is a preparation for the 2nd. The idea is to not call
>>>> ->sk_data_ready() for every data chunk processed while processing
>>>> packets but only once before releasing the socket.
>>>>
>>>> v2: patchset re-checked, small changelog fixes
>>>> v3: on patch 2, make use of local vars to make it more readable
>>>
>>> Applied to net-next, but isn't this reduced overhead coming at the
>>> expense of latency?  What if that lower latency is important to the
>>> application and/or consumer?
>> Thats a fair point, but I'd make the counter argument that, as it
>> currently
>> stands, any latency introduced (or removed), is an artifact of our
>> implementation rather than a designed feature of it.  That is to say,
>> we make no
>> guarantees at the application level regarding how long it takes to
>> signal data
>> readines from the time we get data off the wire, so I would rather see
>> our
>> throughput raised if we can, as thats been sctp's more pressing
>> achilles heel.
>>
>>
>> Thats not to say I'd like to enable lower latency, but I'd rather have
>> this now,
>> and start pondering how to design that in.  Perhaps we can convert the
>> pending
>> flag to a counter to count the number of events we enqueue, and call
>> sk_data_ready every  time we reach a sysctl defined threshold.
> 
> That and also that there is no chance of the application reading the
> first chunks before all current ToDo's are performed by either the bh
> or backlog handlers for that packet. Socket lock won't be cycled in
> between chunks so the application is going to wait all the processing
> one way or another.

But it takes time to signal the wakeup to the remote cpu the process
was running on, schedule out the current process on that cpu (if it
has in fact lost it's timeslice), and then finally look at the socket
queue.

Of course this is all assuming the process was sleeping in the first
place, either in recv or more likely poll.

I really think signalling early helps performance.

^ permalink raw reply

* RE: [PATCH 1/1] hv_netvsc: Implement support for VF drivers on Hyper-V
From: KY Srinivasan @ 2016-04-14 19:08 UTC (permalink / raw)
  To: KY Srinivasan, davem@davemloft.net, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, devel@linuxdriverproject.org,
	olaf@aepfle.de, apw@canonical.com, jasowang@redhat.com
In-Reply-To: <1460591887-24671-1-git-send-email-kys@microsoft.com>



> -----Original Message-----
> From: K. Y. Srinivasan [mailto:kys@microsoft.com]
> Sent: Wednesday, April 13, 2016 4:58 PM
> To: davem@davemloft.net; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; devel@linuxdriverproject.org; olaf@aepfle.de;
> apw@canonical.com; jasowang@redhat.com
> Cc: KY Srinivasan <kys@microsoft.com>
> Subject: [PATCH 1/1] hv_netvsc: Implement support for VF drivers on Hyper-
> V
> 
> Support VF drivers on Hyper-V. On Hyper-V, each VF instance presented to
> the guest has an associated synthetic interface that shares the MAC address
> with the VF instance. Typically these are bonded together to support
> live migration. By default, the host delivers all the incoming packets
> on the synthetic interface. Once the VF is up, we need to explicitly switch
> the data path on the host to divert traffic onto the VF interface. Even after
> switching the data path, broadcast and multicast packets are always
> delivered
> on the synthetic interface and these will have to be injected back onto the
> VF interface (if VF is up).
> This patch implements the necessary support in netvsc to support Linux
> VF drivers.

David,

Please drop this patch. I just discovered a merge issue and I am going to resubmit this
patch.

Regards,

K. Y

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox