Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next,v3, 5/6] net/mlx5: Add HV VHCA control agent
From: Haiyang Zhang @ 2019-08-21  0:23 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566346948-69497-1-git-send-email-haiyangz@microsoft.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

Control agent is responsible over of the control block (ID 0). It should
update the PF via this block about every capability change. In addition,
upon block 0 invalidate, it should activate all other supported agents
with data requests from the PF.

Upon agent create/destroy, the invalidate callback of the control agent
is being called in order to update the PF driver about this change.

The control agent is an integral part of HV VHCA and will be created
and destroy as part of the HV VHCA init/cleanup flow.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 122 ++++++++++++++++++++-
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 2 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
index 84d1d75..4047629 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -109,22 +109,131 @@ void mlx5_hv_vhca_invalidate(void *context, u64 block_mask)
 	queue_work(hv_vhca->work_queue, &work->invalidate_work);
 }
 
+#define AGENT_MASK(type) (type ? BIT(type - 1) : 0 /* control */)
+
+static void mlx5_hv_vhca_agents_control(struct mlx5_hv_vhca *hv_vhca,
+					struct mlx5_hv_vhca_control_block *block)
+{
+	int i;
+
+	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+		struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+		if (!agent || !agent->control)
+			continue;
+
+		if (!(AGENT_MASK(agent->type) & block->control))
+			continue;
+
+		agent->control(agent, block);
+	}
+}
+
+static void mlx5_hv_vhca_capabilities(struct mlx5_hv_vhca *hv_vhca,
+				      u32 *capabilities)
+{
+	int i;
+
+	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+		struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+		if (agent)
+			*capabilities |= AGENT_MASK(agent->type);
+	}
+}
+
+static void
+mlx5_hv_vhca_control_agent_invalidate(struct mlx5_hv_vhca_agent *agent,
+				      u64 block_mask)
+{
+	struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
+	struct mlx5_core_dev *dev = hv_vhca->dev;
+	struct mlx5_hv_vhca_control_block *block;
+	u32 capabilities = 0;
+	int err;
+
+	block = kzalloc(sizeof(*block), GFP_KERNEL);
+	if (!block)
+		return;
+
+	err = mlx5_hv_read_config(dev, block, sizeof(*block), 0);
+	if (err)
+		goto free_block;
+
+	mlx5_hv_vhca_capabilities(hv_vhca, &capabilities);
+
+	/* In case no capabilities, send empty block in return */
+	if (!capabilities) {
+		memset(block, 0, sizeof(*block));
+		goto write;
+	}
+
+	if (block->capabilities != capabilities)
+		block->capabilities = capabilities;
+
+	if (block->control & ~capabilities)
+		goto free_block;
+
+	mlx5_hv_vhca_agents_control(hv_vhca, block);
+	block->command_ack = block->command;
+
+write:
+	mlx5_hv_write_config(dev, block, sizeof(*block), 0);
+
+free_block:
+	kfree(block);
+}
+
+static struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_control_agent_create(struct mlx5_hv_vhca *hv_vhca)
+{
+	return mlx5_hv_vhca_agent_create(hv_vhca, MLX5_HV_VHCA_AGENT_CONTROL,
+					 NULL,
+					 mlx5_hv_vhca_control_agent_invalidate,
+					 NULL, NULL);
+}
+
+static void mlx5_hv_vhca_control_agent_destroy(struct mlx5_hv_vhca_agent *agent)
+{
+	mlx5_hv_vhca_agent_destroy(agent);
+}
+
 int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
 {
+	struct mlx5_hv_vhca_agent *agent;
+	int err;
+
 	if (IS_ERR_OR_NULL(hv_vhca))
 		return IS_ERR_OR_NULL(hv_vhca);
 
-	return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
-					   mlx5_hv_vhca_invalidate);
+	err = mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
+					  mlx5_hv_vhca_invalidate);
+	if (err)
+		return err;
+
+	agent = mlx5_hv_vhca_control_agent_create(hv_vhca);
+	if (IS_ERR_OR_NULL(agent)) {
+		mlx5_hv_unregister_invalidate(hv_vhca->dev);
+		return IS_ERR_OR_NULL(agent);
+	}
+
+	hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL] = agent;
+
+	return 0;
 }
 
 void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
 {
+	struct mlx5_hv_vhca_agent *agent;
 	int i;
 
 	if (IS_ERR_OR_NULL(hv_vhca))
 		return;
 
+	agent = hv_vhca->agents[MLX5_HV_VHCA_AGENT_CONTROL];
+	if (agent)
+		mlx5_hv_vhca_control_agent_destroy(agent);
+
 	mutex_lock(&hv_vhca->agents_lock);
 	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++)
 		WARN_ON(hv_vhca->agents[i]);
@@ -134,6 +243,11 @@ void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
 	mlx5_hv_unregister_invalidate(hv_vhca->dev);
 }
 
+static void mlx5_hv_vhca_agents_update(struct mlx5_hv_vhca *hv_vhca)
+{
+	mlx5_hv_vhca_invalidate(hv_vhca, BIT(MLX5_HV_VHCA_AGENT_CONTROL));
+}
+
 struct mlx5_hv_vhca_agent *
 mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
 			  enum mlx5_hv_vhca_agent_type type,
@@ -174,6 +288,8 @@ struct mlx5_hv_vhca_agent *
 	hv_vhca->agents[type] = agent;
 	mutex_unlock(&hv_vhca->agents_lock);
 
+	mlx5_hv_vhca_agents_update(hv_vhca);
+
 	return agent;
 }
 
@@ -195,6 +311,8 @@ void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
 		agent->cleanup(agent);
 
 	kfree(agent);
+
+	mlx5_hv_vhca_agents_update(hv_vhca);
 }
 
 static int mlx5_hv_vhca_data_block_prepare(struct mlx5_hv_vhca_agent *agent,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
index cdf1303..984e7ad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
@@ -12,6 +12,7 @@
 struct mlx5_hv_vhca_control_block;
 
 enum mlx5_hv_vhca_agent_type {
+	MLX5_HV_VHCA_AGENT_CONTROL = 0,
 	MLX5_HV_VHCA_AGENT_MAX = 32,
 };
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next,v3, 6/6] net/mlx5e: Add mlx5e HV VHCA stats agent
From: Haiyang Zhang @ 2019-08-21  0:23 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566346948-69497-1-git-send-email-haiyangz@microsoft.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

HV VHCA stats agent is responsible on running a preiodic rx/tx
packets/bytes stats update. Currently the supported format is version
MLX5_HV_VHCA_STATS_VERSION. Block ID 1 is dedicated for statistics data
transfer from the VF to the PF.

The reporter fetch the statistics data from all opened channels, fill it
in a buffer and send it to mlx5_hv_vhca_write_agent.

As the stats layer should include some metadata per block (sequence and
offset), the HV VHCA layer shall modify the buffer before actually send it
over block 1.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  13 ++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 162 +++++++++++++++++++++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h |  25 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   3 +
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  |   1 +
 6 files changed, 205 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index fc59a40..a889a38 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -36,6 +36,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o en/port_buffer.o
 mlx5_core-$(CONFIG_MLX5_ESWITCH)     += en_rep.o en_tc.o en/tc_tun.o lib/port_tun.o lag_mp.o \
 					lib/geneve.o en/tc_tun_vxlan.o en/tc_tun_gre.o \
 					en/tc_tun_geneve.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += en/hv_vhca_stats.o
 
 #
 # Core extra
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 8fc5107..4a8008b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -54,6 +54,7 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 #include "en/fs.h"
+#include "lib/hv_vhca.h"
 
 extern const struct net_device_ops mlx5e_netdev_ops;
 struct page_pool;
@@ -777,6 +778,15 @@ struct mlx5e_modify_sq_param {
 	int rl_index;
 };
 
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+struct mlx5e_hv_vhca_stats_agent {
+	struct mlx5_hv_vhca_agent *agent;
+	struct delayed_work        work;
+	u16                        delay;
+	void                      *buf;
+};
+#endif
+
 struct mlx5e_xsk {
 	/* UMEMs are stored separately from channels, because we don't want to
 	 * lose them when channels are recreated. The kernel also stores UMEMs,
@@ -848,6 +858,9 @@ struct mlx5e_priv {
 	struct devlink_health_reporter *tx_reporter;
 	struct devlink_health_reporter *rx_reporter;
 	struct mlx5e_xsk           xsk;
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+	struct mlx5e_hv_vhca_stats_agent stats_agent;
+#endif
 };
 
 struct mlx5e_profile {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
new file mode 100644
index 0000000..c37b4ac
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include "en.h"
+#include "en/hv_vhca_stats.h"
+#include "lib/hv_vhca.h"
+#include "lib/hv.h"
+
+struct mlx5e_hv_vhca_per_ring_stats {
+	u64     rx_packets;
+	u64     rx_bytes;
+	u64     tx_packets;
+	u64     tx_bytes;
+};
+
+static void
+mlx5e_hv_vhca_fill_ring_stats(struct mlx5e_priv *priv, int ch,
+			      struct mlx5e_hv_vhca_per_ring_stats *data)
+{
+	struct mlx5e_channel_stats *stats;
+	int tc;
+
+	stats = &priv->channel_stats[ch];
+	data->rx_packets = stats->rq.packets;
+	data->rx_bytes   = stats->rq.bytes;
+
+	for (tc = 0; tc < priv->max_opened_tc; tc++) {
+		data->tx_packets += stats->sq[tc].packets;
+		data->tx_bytes   += stats->sq[tc].bytes;
+	}
+}
+
+static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, u64 *data,
+				     int buf_len)
+{
+	int ch, i = 0;
+
+	for (ch = 0; ch < priv->max_nch; ch++) {
+		u64 *buf = data + i;
+
+		if (WARN_ON_ONCE(buf +
+				 sizeof(struct mlx5e_hv_vhca_per_ring_stats) >
+				 data + buf_len))
+			return;
+
+		mlx5e_hv_vhca_fill_ring_stats(priv, ch,
+					      (struct mlx5e_hv_vhca_per_ring_stats *)buf);
+		i += sizeof(struct mlx5e_hv_vhca_per_ring_stats) / sizeof(u64);
+	}
+}
+
+static int mlx5e_hv_vhca_stats_buf_size(struct mlx5e_priv *priv)
+{
+	return (sizeof(struct mlx5e_hv_vhca_per_ring_stats) *
+		priv->max_nch);
+}
+
+static void mlx5e_hv_vhca_stats_work(struct work_struct *work)
+{
+	struct mlx5e_hv_vhca_stats_agent *sagent;
+	struct mlx5_hv_vhca_agent *agent;
+	struct delayed_work *dwork;
+	struct mlx5e_priv *priv;
+	int buf_len, rc;
+	void *buf;
+
+	dwork = to_delayed_work(work);
+	sagent = container_of(dwork, struct mlx5e_hv_vhca_stats_agent, work);
+	priv = container_of(sagent, struct mlx5e_priv, stats_agent);
+	buf_len = mlx5e_hv_vhca_stats_buf_size(priv);
+	agent = sagent->agent;
+	buf = sagent->buf;
+
+	memset(buf, 0, buf_len);
+	mlx5e_hv_vhca_fill_stats(priv, buf, buf_len);
+
+	rc = mlx5_hv_vhca_agent_write(agent, buf, buf_len);
+	if (rc) {
+		mlx5_core_err(priv->mdev,
+			      "%s: Failed to write stats, err = %d\n",
+			      __func__, rc);
+		return;
+	}
+
+	if (sagent->delay)
+		queue_delayed_work(priv->wq, &sagent->work, sagent->delay);
+}
+
+enum {
+	MLX5_HV_VHCA_STATS_VERSION     = 1,
+	MLX5_HV_VHCA_STATS_UPDATE_ONCE = 0xFFFF,
+};
+
+static void mlx5e_hv_vhca_stats_control(struct mlx5_hv_vhca_agent *agent,
+					struct mlx5_hv_vhca_control_block *block)
+{
+	struct mlx5e_hv_vhca_stats_agent *sagent;
+	struct mlx5e_priv *priv;
+
+	priv = mlx5_hv_vhca_agent_priv(agent);
+	sagent = &priv->stats_agent;
+
+	block->version = MLX5_HV_VHCA_STATS_VERSION;
+	block->rings   = priv->max_nch;
+
+	if (!block->command) {
+		cancel_delayed_work_sync(&priv->stats_agent.work);
+		return;
+	}
+
+	sagent->delay = block->command == MLX5_HV_VHCA_STATS_UPDATE_ONCE ? 0 :
+			msecs_to_jiffies(block->command * 100);
+
+	queue_delayed_work(priv->wq, &sagent->work, sagent->delay);
+}
+
+static void mlx5e_hv_vhca_stats_cleanup(struct mlx5_hv_vhca_agent *agent)
+{
+	struct mlx5e_priv *priv = mlx5_hv_vhca_agent_priv(agent);
+
+	cancel_delayed_work_sync(&priv->stats_agent.work);
+}
+
+int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
+{
+	int buf_len = mlx5e_hv_vhca_stats_buf_size(priv);
+	struct mlx5_hv_vhca_agent *agent;
+
+	priv->stats_agent.buf = kvzalloc(buf_len, GFP_KERNEL);
+	if (!priv->stats_agent.buf)
+		return -ENOMEM;
+
+	agent = mlx5_hv_vhca_agent_create(priv->mdev->hv_vhca,
+					  MLX5_HV_VHCA_AGENT_STATS,
+					  mlx5e_hv_vhca_stats_control, NULL,
+					  mlx5e_hv_vhca_stats_cleanup,
+					  priv);
+
+	if (IS_ERR_OR_NULL(agent)) {
+		if (IS_ERR(agent))
+			netdev_warn(priv->netdev,
+				    "Failed to create hv vhca stats agent, err = %ld\n",
+				    PTR_ERR(agent));
+
+		kfree(priv->stats_agent.buf);
+		return IS_ERR_OR_NULL(agent);
+	}
+
+	priv->stats_agent.agent = agent;
+	INIT_DELAYED_WORK(&priv->stats_agent.work, mlx5e_hv_vhca_stats_work);
+
+	return 0;
+}
+
+void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv)
+{
+	if (IS_ERR_OR_NULL(priv->stats_agent.agent))
+		return;
+
+	mlx5_hv_vhca_agent_destroy(priv->stats_agent.agent);
+	kfree(priv->stats_agent.buf);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
new file mode 100644
index 0000000..664463f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_EN_STATS_VHCA_H__
+#define __MLX5_EN_STATS_VHCA_H__
+#include "en.h"
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+
+int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv);
+void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv);
+
+#else
+
+static inline int mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
+{
+	return 0;
+}
+
+static inline void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv)
+{
+}
+#endif
+
+#endif /* __MLX5_EN_STATS_VHCA_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 5721d3d..fac8455 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -64,6 +64,7 @@
 #include "en/xsk/setup.h"
 #include "en/xsk/rx.h"
 #include "en/xsk/tx.h"
+#include "en/hv_vhca_stats.h"
 
 
 bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
@@ -5103,6 +5104,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
 	if (mlx5e_monitor_counter_supported(priv))
 		mlx5e_monitor_counter_init(priv);
 
+	mlx5e_hv_vhca_stats_create(priv);
 	if (netdev->reg_state != NETREG_REGISTERED)
 		return;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
@@ -5135,6 +5137,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv)
 
 	queue_work(priv->wq, &priv->set_rx_mode_work);
 
+	mlx5e_hv_vhca_stats_destroy(priv);
 	if (mlx5e_monitor_counter_supported(priv))
 		mlx5e_monitor_counter_cleanup(priv);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
index 984e7ad..4bad6a5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
@@ -13,6 +13,7 @@
 
 enum mlx5_hv_vhca_agent_type {
 	MLX5_HV_VHCA_AGENT_CONTROL = 0,
+	MLX5_HV_VHCA_AGENT_STATS   = 1,
 	MLX5_HV_VHCA_AGENT_MAX = 32,
 };
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next,v3, 4/6] net/mlx5: Add HV VHCA infrastructure
From: Haiyang Zhang @ 2019-08-21  0:23 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566346948-69497-1-git-send-email-haiyangz@microsoft.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

HV VHCA is a layer which provides PF to VF communication channel based on
HyperV PCI config channel. It implements Mellanox's Inter VHCA control
communication protocol. The protocol contains control block in order to
pass messages between the PF and VF drivers, and data blocks in order to
pass actual data.

The infrastructure is agent based. Each agent will be responsible of
contiguous buffer blocks in the VHCA config space. This infrastructure will
bind agents to their blocks, and those agents can only access read/write
the buffer blocks assigned to them. Each agent will provide three
callbacks (control, invalidate, cleanup). Control will be invoked when
block-0 is invalidated with a command that concerns this agent. Invalidate
callback will be invoked if one of the blocks assigned to this agent was
invalidated. Cleanup will be invoked before the agent is being freed in
order to clean all of its open resources or deferred works.

Block-0 serves as the control block. All execution commands from the PF
will be written by the PF over this block. VF will ack on those by
writing on block-0 as well. Its format is described by struct
mlx5_hv_vhca_control_block layout.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c  | 253 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h  | 102 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   7 +
 include/linux/mlx5/driver.h                        |   2 +
 5 files changed, 365 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 247295b..fc59a40 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,7 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
-mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o lib/hv_vhca.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
new file mode 100644
index 0000000..84d1d75
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include <linux/hyperv.h>
+#include "mlx5_core.h"
+#include "lib/hv.h"
+#include "lib/hv_vhca.h"
+
+struct mlx5_hv_vhca {
+	struct mlx5_core_dev       *dev;
+	struct workqueue_struct    *work_queue;
+	struct mlx5_hv_vhca_agent  *agents[MLX5_HV_VHCA_AGENT_MAX];
+	struct mutex                agents_lock; /* Protect agents array */
+};
+
+struct mlx5_hv_vhca_work {
+	struct work_struct     invalidate_work;
+	struct mlx5_hv_vhca   *hv_vhca;
+	u64                    block_mask;
+};
+
+struct mlx5_hv_vhca_data_block {
+	u16     sequence;
+	u16     offset;
+	u8      reserved[4];
+	u64     data[15];
+};
+
+struct mlx5_hv_vhca_agent {
+	enum mlx5_hv_vhca_agent_type	 type;
+	struct mlx5_hv_vhca		*hv_vhca;
+	void				*priv;
+	u16                              seq;
+	void (*control)(struct mlx5_hv_vhca_agent *agent,
+			struct mlx5_hv_vhca_control_block *block);
+	void (*invalidate)(struct mlx5_hv_vhca_agent *agent,
+			   u64 block_mask);
+	void (*cleanup)(struct mlx5_hv_vhca_agent *agent);
+};
+
+struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_hv_vhca *hv_vhca = NULL;
+
+	hv_vhca = kzalloc(sizeof(*hv_vhca), GFP_KERNEL);
+	if (!hv_vhca)
+		return ERR_PTR(-ENOMEM);
+
+	hv_vhca->work_queue = create_singlethread_workqueue("mlx5_hv_vhca");
+	if (!hv_vhca->work_queue) {
+		kfree(hv_vhca);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	hv_vhca->dev = dev;
+	mutex_init(&hv_vhca->agents_lock);
+
+	return hv_vhca;
+}
+
+void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
+{
+	if (IS_ERR_OR_NULL(hv_vhca))
+		return;
+
+	destroy_workqueue(hv_vhca->work_queue);
+	kfree(hv_vhca);
+}
+
+static void mlx5_hv_vhca_invalidate_work(struct work_struct *work)
+{
+	struct mlx5_hv_vhca_work *hwork;
+	struct mlx5_hv_vhca *hv_vhca;
+	int i;
+
+	hwork = container_of(work, struct mlx5_hv_vhca_work, invalidate_work);
+	hv_vhca = hwork->hv_vhca;
+
+	mutex_lock(&hv_vhca->agents_lock);
+	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++) {
+		struct mlx5_hv_vhca_agent *agent = hv_vhca->agents[i];
+
+		if (!agent || !agent->invalidate)
+			continue;
+
+		if (!(BIT(agent->type) & hwork->block_mask))
+			continue;
+
+		agent->invalidate(agent, hwork->block_mask);
+	}
+	mutex_unlock(&hv_vhca->agents_lock);
+
+	kfree(hwork);
+}
+
+void mlx5_hv_vhca_invalidate(void *context, u64 block_mask)
+{
+	struct mlx5_hv_vhca *hv_vhca = (struct mlx5_hv_vhca *)context;
+	struct mlx5_hv_vhca_work *work;
+
+	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work)
+		return;
+
+	INIT_WORK(&work->invalidate_work, mlx5_hv_vhca_invalidate_work);
+	work->hv_vhca    = hv_vhca;
+	work->block_mask = block_mask;
+
+	queue_work(hv_vhca->work_queue, &work->invalidate_work);
+}
+
+int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
+{
+	if (IS_ERR_OR_NULL(hv_vhca))
+		return IS_ERR_OR_NULL(hv_vhca);
+
+	return mlx5_hv_register_invalidate(hv_vhca->dev, hv_vhca,
+					   mlx5_hv_vhca_invalidate);
+}
+
+void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
+{
+	int i;
+
+	if (IS_ERR_OR_NULL(hv_vhca))
+		return;
+
+	mutex_lock(&hv_vhca->agents_lock);
+	for (i = 0; i < MLX5_HV_VHCA_AGENT_MAX; i++)
+		WARN_ON(hv_vhca->agents[i]);
+
+	mutex_unlock(&hv_vhca->agents_lock);
+
+	mlx5_hv_unregister_invalidate(hv_vhca->dev);
+}
+
+struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
+			  enum mlx5_hv_vhca_agent_type type,
+			  void (*control)(struct mlx5_hv_vhca_agent*,
+					  struct mlx5_hv_vhca_control_block *block),
+			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
+					     u64 block_mask),
+			  void (*cleaup)(struct mlx5_hv_vhca_agent *agent),
+			  void *priv)
+{
+	struct mlx5_hv_vhca_agent *agent;
+
+	if (IS_ERR_OR_NULL(hv_vhca))
+		return ERR_PTR(-ENOMEM);
+
+	if (type >= MLX5_HV_VHCA_AGENT_MAX)
+		return ERR_PTR(-EINVAL);
+
+	mutex_lock(&hv_vhca->agents_lock);
+	if (hv_vhca->agents[type]) {
+		mutex_unlock(&hv_vhca->agents_lock);
+		return ERR_PTR(-EINVAL);
+	}
+	mutex_unlock(&hv_vhca->agents_lock);
+
+	agent = kzalloc(sizeof(*agent), GFP_KERNEL);
+	if (!agent)
+		return ERR_PTR(-ENOMEM);
+
+	agent->type      = type;
+	agent->hv_vhca   = hv_vhca;
+	agent->priv      = priv;
+	agent->control   = control;
+	agent->invalidate = invalidate;
+	agent->cleanup   = cleaup;
+
+	mutex_lock(&hv_vhca->agents_lock);
+	hv_vhca->agents[type] = agent;
+	mutex_unlock(&hv_vhca->agents_lock);
+
+	return agent;
+}
+
+void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
+{
+	struct mlx5_hv_vhca *hv_vhca = agent->hv_vhca;
+
+	mutex_lock(&hv_vhca->agents_lock);
+
+	if (WARN_ON(agent != hv_vhca->agents[agent->type])) {
+		mutex_unlock(&hv_vhca->agents_lock);
+		return;
+	}
+
+	hv_vhca->agents[agent->type] = NULL;
+	mutex_unlock(&hv_vhca->agents_lock);
+
+	if (agent->cleanup)
+		agent->cleanup(agent);
+
+	kfree(agent);
+}
+
+static int mlx5_hv_vhca_data_block_prepare(struct mlx5_hv_vhca_agent *agent,
+					   struct mlx5_hv_vhca_data_block *data_block,
+					   void *src, int len, int *offset)
+{
+	int bytes = min_t(int, (int)sizeof(data_block->data), len);
+
+	data_block->sequence = agent->seq;
+	data_block->offset   = (*offset)++;
+	memcpy(data_block->data, src, bytes);
+
+	return bytes;
+}
+
+static void mlx5_hv_vhca_agent_seq_update(struct mlx5_hv_vhca_agent *agent)
+{
+	agent->seq++;
+}
+
+int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent,
+			     void *buf, int len)
+{
+	int offset = agent->type * HV_CONFIG_BLOCK_SIZE_MAX;
+	int block_offset = 0;
+	int total = 0;
+	int err;
+
+	while (len) {
+		struct mlx5_hv_vhca_data_block data_block = {0};
+		int bytes;
+
+		bytes = mlx5_hv_vhca_data_block_prepare(agent, &data_block,
+							buf + total,
+							len, &block_offset);
+		if (!bytes)
+			return -ENOMEM;
+
+		err = mlx5_hv_write_config(agent->hv_vhca->dev, &data_block,
+					   sizeof(data_block), offset);
+		if (err)
+			return err;
+
+		total += bytes;
+		len   -= bytes;
+	}
+
+	mlx5_hv_vhca_agent_seq_update(agent);
+
+	return 0;
+}
+
+void *mlx5_hv_vhca_agent_priv(struct mlx5_hv_vhca_agent *agent)
+{
+	return agent->priv;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
new file mode 100644
index 0000000..cdf1303
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __LIB_HV_VHCA_H__
+#define __LIB_HV_VHCA_H__
+
+#include "en.h"
+#include "lib/hv.h"
+
+struct mlx5_hv_vhca_agent;
+struct mlx5_hv_vhca;
+struct mlx5_hv_vhca_control_block;
+
+enum mlx5_hv_vhca_agent_type {
+	MLX5_HV_VHCA_AGENT_MAX = 32,
+};
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+
+struct mlx5_hv_vhca_control_block {
+	u32     capabilities;
+	u32     control;
+	u16     command;
+	u16     command_ack;
+	u16     version;
+	u16     rings;
+	u32     reserved1[28];
+};
+
+struct mlx5_hv_vhca *mlx5_hv_vhca_create(struct mlx5_core_dev *dev);
+void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca);
+int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca);
+void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca);
+void mlx5_hv_vhca_invalidate(void *context, u64 block_mask);
+
+struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
+			  enum mlx5_hv_vhca_agent_type type,
+			  void (*control)(struct mlx5_hv_vhca_agent*,
+					  struct mlx5_hv_vhca_control_block *block),
+			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
+					     u64 block_mask),
+			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
+			  void *context);
+
+void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent);
+int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent,
+			     void *buf, int len);
+void *mlx5_hv_vhca_agent_priv(struct mlx5_hv_vhca_agent *agent);
+
+#else
+
+static inline struct mlx5_hv_vhca *
+mlx5_hv_vhca_create(struct mlx5_core_dev *dev)
+{
+	return NULL;
+}
+
+static inline void mlx5_hv_vhca_destroy(struct mlx5_hv_vhca *hv_vhca)
+{
+}
+
+static inline int mlx5_hv_vhca_init(struct mlx5_hv_vhca *hv_vhca)
+{
+	return 0;
+}
+
+static inline void mlx5_hv_vhca_cleanup(struct mlx5_hv_vhca *hv_vhca)
+{
+}
+
+static inline void mlx5_hv_vhca_invalidate(void *context,
+					   u64 block_mask)
+{
+}
+
+static inline struct mlx5_hv_vhca_agent *
+mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
+			  enum mlx5_hv_vhca_agent_type type,
+			  void (*control)(struct mlx5_hv_vhca_agent*,
+					  struct mlx5_hv_vhca_control_block *block),
+			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
+					     u64 block_mask),
+			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
+			  void *context)
+{
+	return NULL;
+}
+
+static inline void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent)
+{
+}
+
+static inline int
+mlx5_hv_vhca_write_agent(struct mlx5_hv_vhca_agent *agent,
+			 void *buf, int len)
+{
+	return 0;
+}
+#endif
+
+#endif /* __LIB_HV_VHCA_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 80437d4..4b74315 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -69,6 +69,7 @@
 #include "lib/pci_vsc.h"
 #include "diag/fw_tracer.h"
 #include "ecpf.h"
+#include "lib/hv_vhca.h"
 
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver");
@@ -868,6 +869,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 	}
 
 	dev->tracer = mlx5_fw_tracer_create(dev);
+	dev->hv_vhca = mlx5_hv_vhca_create(dev);
 
 	return 0;
 
@@ -897,6 +899,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 
 static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 {
+	mlx5_hv_vhca_destroy(dev->hv_vhca);
 	mlx5_fw_tracer_destroy(dev->tracer);
 	mlx5_fpga_cleanup(dev);
 	mlx5_eswitch_cleanup(dev->priv.eswitch);
@@ -1063,6 +1066,8 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 		goto err_fw_tracer;
 	}
 
+	mlx5_hv_vhca_init(dev->hv_vhca);
+
 	err = mlx5_fpga_device_start(dev);
 	if (err) {
 		mlx5_core_err(dev, "fpga device start failed %d\n", err);
@@ -1118,6 +1123,7 @@ static int mlx5_load(struct mlx5_core_dev *dev)
 err_ipsec_start:
 	mlx5_fpga_device_stop(dev);
 err_fpga_start:
+	mlx5_hv_vhca_cleanup(dev->hv_vhca);
 	mlx5_fw_tracer_cleanup(dev->tracer);
 err_fw_tracer:
 	mlx5_eq_table_destroy(dev);
@@ -1138,6 +1144,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
 	mlx5_accel_ipsec_cleanup(dev);
 	mlx5_accel_tls_cleanup(dev);
 	mlx5_fpga_device_stop(dev);
+	mlx5_hv_vhca_cleanup(dev->hv_vhca);
 	mlx5_fw_tracer_cleanup(dev->tracer);
 	mlx5_eq_table_destroy(dev);
 	mlx5_irq_table_destroy(dev);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 467de34..0e00cbc 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -638,6 +638,7 @@ struct mlx5_clock {
 struct mlx5_fw_tracer;
 struct mlx5_vxlan;
 struct mlx5_geneve;
+struct mlx5_hv_vhca;
 
 struct mlx5_core_dev {
 	struct device *device;
@@ -685,6 +686,7 @@ struct mlx5_core_dev {
 	struct mlx5_ib_clock_info  *clock_info;
 	struct mlx5_fw_tracer   *tracer;
 	u32                      vsc_addr;
+	struct mlx5_hv_vhca	*hv_vhca;
 };
 
 struct mlx5_db {
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next,v3, 3/6] net/mlx5: Add wrappers for HyperV PCIe operations
From: Haiyang Zhang @ 2019-08-21  0:23 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566346948-69497-1-git-send-email-haiyangz@microsoft.com>

From: Eran Ben Elisha <eranbe@mellanox.com>

Add wrapper functions for HyperV PCIe read / write /
block_invalidate_register operations.  This will be used as an
infrastructure in the downstream patch for software communication.

This will be enabled by default if CONFIG_PCI_HYPERV_INTERFACE is set.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c | 64 ++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h | 22 ++++++++
 3 files changed, 87 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8b7edaa..247295b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -45,6 +45,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += eswitch.o eswitch_offloads.o eswitch_offlo
 mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
+mlx5_core-$(CONFIG_PCI_HYPERV_INTERFACE) += lib/hv.o
 
 #
 # Ipoib netdev
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
new file mode 100644
index 0000000..cf08d02
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2018 Mellanox Technologies
+
+#include <linux/hyperv.h>
+#include "mlx5_core.h"
+#include "lib/hv.h"
+
+static int mlx5_hv_config_common(struct mlx5_core_dev *dev, void *buf, int len,
+				 int offset, bool read)
+{
+	int rc = -EOPNOTSUPP;
+	int bytes_returned;
+	int block_id;
+
+	if (offset % HV_CONFIG_BLOCK_SIZE_MAX || len % HV_CONFIG_BLOCK_SIZE_MAX)
+		return -EINVAL;
+
+	block_id = offset / HV_CONFIG_BLOCK_SIZE_MAX;
+
+	rc = read ?
+	     hyperv_read_cfg_blk(dev->pdev, buf,
+				 HV_CONFIG_BLOCK_SIZE_MAX, block_id,
+				 &bytes_returned) :
+	     hyperv_write_cfg_blk(dev->pdev, buf,
+				  HV_CONFIG_BLOCK_SIZE_MAX, block_id);
+
+	/* Make sure len bytes were read successfully  */
+	if (read)
+		rc |= !(len == bytes_returned);
+
+	if (rc) {
+		mlx5_core_err(dev, "Failed to %s hv config, err = %d, len = %d, offset = %d\n",
+			      read ? "read" : "write", rc, len,
+			      offset);
+		return rc;
+	}
+
+	return 0;
+}
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+			int offset)
+{
+	return mlx5_hv_config_common(dev, buf, len, offset, true);
+}
+
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+			 int offset)
+{
+	return mlx5_hv_config_common(dev, buf, len, offset, false);
+}
+
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+				void (*block_invalidate)(void *context,
+							 u64 block_mask))
+{
+	return hyperv_reg_block_invalidate(dev->pdev, context,
+					   block_invalidate);
+}
+
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev)
+{
+	hyperv_reg_block_invalidate(dev->pdev, NULL, NULL);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
new file mode 100644
index 0000000..f9a4557
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __LIB_HV_H__
+#define __LIB_HV_H__
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
+
+#include <linux/hyperv.h>
+#include <linux/mlx5/driver.h>
+
+int mlx5_hv_read_config(struct mlx5_core_dev *dev, void *buf, int len,
+			int offset);
+int mlx5_hv_write_config(struct mlx5_core_dev *dev, void *buf, int len,
+			 int offset);
+int mlx5_hv_register_invalidate(struct mlx5_core_dev *dev, void *context,
+				void (*block_invalidate)(void *context,
+							 u64 block_mask));
+void mlx5_hv_unregister_invalidate(struct mlx5_core_dev *dev);
+#endif
+
+#endif /* __LIB_HV_H__ */
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next,v3, 2/6] PCI: hv: Add a Hyper-V PCI interface driver for software backchannel interface
From: Haiyang Zhang @ 2019-08-21  0:23 UTC (permalink / raw)
  To: sashal@kernel.org, davem@davemloft.net, saeedm@mellanox.com,
	leon@kernel.org, eranbe@mellanox.com, lorenzo.pieralisi@arm.com,
	bhelgaas@google.com, linux-pci@vger.kernel.org,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
  Cc: Haiyang Zhang, KY Srinivasan, Stephen Hemminger,
	linux-kernel@vger.kernel.org
In-Reply-To: <1566346948-69497-1-git-send-email-haiyangz@microsoft.com>

This interface driver is a helper driver allows other drivers to
have a common interface with the Hyper-V PCI frontend driver.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 MAINTAINERS                              |  1 +
 drivers/pci/Kconfig                      |  1 +
 drivers/pci/controller/Kconfig           |  7 ++++
 drivers/pci/controller/Makefile          |  1 +
 drivers/pci/controller/pci-hyperv-intf.c | 67 ++++++++++++++++++++++++++++++++
 drivers/pci/controller/pci-hyperv.c      | 12 ++++--
 include/linux/hyperv.h                   | 30 ++++++++++----
 7 files changed, 108 insertions(+), 11 deletions(-)
 create mode 100644 drivers/pci/controller/pci-hyperv-intf.c

diff --git a/MAINTAINERS b/MAINTAINERS
index e352550..866ae88 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7453,6 +7453,7 @@ F:	drivers/hid/hid-hyperv.c
 F:	drivers/hv/
 F:	drivers/input/serio/hyperv-keyboard.c
 F:	drivers/pci/controller/pci-hyperv.c
+F:	drivers/pci/controller/pci-hyperv-intf.c
 F:	drivers/net/hyperv/
 F:	drivers/scsi/storvsc_drv.c
 F:	drivers/uio/uio_hv_generic.c
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 2ab9240..c313de9 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -182,6 +182,7 @@ config PCI_LABEL
 config PCI_HYPERV
         tristate "Hyper-V PCI Frontend"
         depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+	select PCI_HYPERV_INTERFACE
         help
           The PCI device frontend driver allows the kernel to import arbitrary
           PCI devices from a PCI backend to support PCI driver domains.
diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig
index fe9f9f1..70e0782 100644
--- a/drivers/pci/controller/Kconfig
+++ b/drivers/pci/controller/Kconfig
@@ -281,5 +281,12 @@ config VMD
 	  To compile this driver as a module, choose M here: the
 	  module will be called vmd.
 
+config PCI_HYPERV_INTERFACE
+	tristate "Hyper-V PCI Interface"
+	depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64
+	help
+	  The Hyper-V PCI Interface is a helper driver allows other drivers to
+	  have a common interface with the Hyper-V PCI frontend driver.
+
 source "drivers/pci/controller/dwc/Kconfig"
 endmenu
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index d56a507..a2a22c9 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_PCIE_CADENCE_HOST) += pcie-cadence-host.o
 obj-$(CONFIG_PCIE_CADENCE_EP) += pcie-cadence-ep.o
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_HYPERV) += pci-hyperv.o
+obj-$(CONFIG_PCI_HYPERV_INTERFACE) += pci-hyperv-intf.o
 obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o
 obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o
 obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
diff --git a/drivers/pci/controller/pci-hyperv-intf.c b/drivers/pci/controller/pci-hyperv-intf.c
new file mode 100644
index 0000000..cc96be4
--- /dev/null
+++ b/drivers/pci/controller/pci-hyperv-intf.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) Microsoft Corporation.
+ *
+ * Author:
+ *   Haiyang Zhang <haiyangz@microsoft.com>
+ *
+ * This small module is a helper driver allows other drivers to
+ * have a common interface with the Hyper-V PCI frontend driver.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/hyperv.h>
+
+struct hyperv_pci_block_ops hvpci_block_ops;
+EXPORT_SYMBOL_GPL(hvpci_block_ops);
+
+int hyperv_read_cfg_blk(struct pci_dev *dev, void *buf, unsigned int buf_len,
+			unsigned int block_id, unsigned int *bytes_returned)
+{
+	if (!hvpci_block_ops.read_block)
+		return -EOPNOTSUPP;
+
+	return hvpci_block_ops.read_block(dev, buf, buf_len, block_id,
+					  bytes_returned);
+}
+EXPORT_SYMBOL_GPL(hyperv_read_cfg_blk);
+
+int hyperv_write_cfg_blk(struct pci_dev *dev, void *buf, unsigned int len,
+			 unsigned int block_id)
+{
+	if (!hvpci_block_ops.write_block)
+		return -EOPNOTSUPP;
+
+	return hvpci_block_ops.write_block(dev, buf, len, block_id);
+}
+EXPORT_SYMBOL_GPL(hyperv_write_cfg_blk);
+
+int hyperv_reg_block_invalidate(struct pci_dev *dev, void *context,
+				void (*block_invalidate)(void *context,
+							 u64 block_mask))
+{
+	if (!hvpci_block_ops.reg_blk_invalidate)
+		return -EOPNOTSUPP;
+
+	return hvpci_block_ops.reg_blk_invalidate(dev, context,
+						  block_invalidate);
+}
+EXPORT_SYMBOL_GPL(hyperv_reg_block_invalidate);
+
+static void __exit exit_hv_pci_intf(void)
+{
+}
+
+static int __init init_hv_pci_intf(void)
+{
+	return 0;
+}
+
+module_init(init_hv_pci_intf);
+module_exit(exit_hv_pci_intf);
+
+MODULE_DESCRIPTION("Hyper-V PCI Interface");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 57adeca..9c93ac2 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -983,7 +983,6 @@ int hv_read_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
 	*bytes_returned = comp_pkt.bytes_returned;
 	return 0;
 }
-EXPORT_SYMBOL(hv_read_config_block);
 
 /**
  * hv_pci_write_config_compl() - Invoked when a response packet for a write
@@ -1070,7 +1069,6 @@ int hv_write_config_block(struct pci_dev *pdev, void *buf, unsigned int len,
 
 	return 0;
 }
-EXPORT_SYMBOL(hv_write_config_block);
 
 /**
  * hv_register_block_invalidate() - Invoked when a config block invalidation
@@ -1101,7 +1099,6 @@ int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
 	return 0;
 
 }
-EXPORT_SYMBOL(hv_register_block_invalidate);
 
 /* Interrupt management hooks */
 static void hv_int_desc_free(struct hv_pci_dev *hpdev,
@@ -3045,10 +3042,19 @@ static int hv_pci_remove(struct hv_device *hdev)
 static void __exit exit_hv_pci_drv(void)
 {
 	vmbus_driver_unregister(&hv_pci_drv);
+
+	hvpci_block_ops.read_block = NULL;
+	hvpci_block_ops.write_block = NULL;
+	hvpci_block_ops.reg_blk_invalidate = NULL;
 }
 
 static int __init init_hv_pci_drv(void)
 {
+	/* Initialize PCI block r/w interface */
+	hvpci_block_ops.read_block = hv_read_config_block;
+	hvpci_block_ops.write_block = hv_write_config_block;
+	hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;
+
 	return vmbus_driver_register(&hv_pci_drv);
 }
 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 9d37f8c..2afe6fd 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1579,18 +1579,32 @@ struct vmpacket_descriptor *
 	    pkt = hv_pkt_iter_next(channel, pkt))
 
 /*
- * Functions for passing data between SR-IOV PF and VF drivers.  The VF driver
+ * Interface for passing data between SR-IOV PF and VF drivers. The VF driver
  * sends requests to read and write blocks. Each block must be 128 bytes or
  * smaller. Optionally, the VF driver can register a callback function which
  * will be invoked when the host says that one or more of the first 64 block
  * IDs is "invalid" which means that the VF driver should reread them.
  */
 #define HV_CONFIG_BLOCK_SIZE_MAX 128
-int hv_read_config_block(struct pci_dev *dev, void *buf, unsigned int buf_len,
-			 unsigned int block_id, unsigned int *bytes_returned);
-int hv_write_config_block(struct pci_dev *dev, void *buf, unsigned int len,
-			  unsigned int block_id);
-int hv_register_block_invalidate(struct pci_dev *dev, void *context,
-				 void (*block_invalidate)(void *context,
-							  u64 block_mask));
+
+int hyperv_read_cfg_blk(struct pci_dev *dev, void *buf, unsigned int buf_len,
+			unsigned int block_id, unsigned int *bytes_returned);
+int hyperv_write_cfg_blk(struct pci_dev *dev, void *buf, unsigned int len,
+			 unsigned int block_id);
+int hyperv_reg_block_invalidate(struct pci_dev *dev, void *context,
+				void (*block_invalidate)(void *context,
+							 u64 block_mask));
+
+struct hyperv_pci_block_ops {
+	int (*read_block)(struct pci_dev *dev, void *buf, unsigned int buf_len,
+			  unsigned int block_id, unsigned int *bytes_returned);
+	int (*write_block)(struct pci_dev *dev, void *buf, unsigned int len,
+			   unsigned int block_id);
+	int (*reg_blk_invalidate)(struct pci_dev *dev, void *context,
+				  void (*block_invalidate)(void *context,
+							   u64 block_mask));
+};
+
+extern struct hyperv_pci_block_ops hvpci_block_ops;
+
 #endif /* _HYPERV_H */
-- 
1.8.3.1


^ permalink raw reply related

* Re: various TLS bug fixes...
From: Jakub Kicinski @ 2019-08-21  0:24 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20190820.160517.617004656524634921.davem@davemloft.net>

On Tue, 20 Aug 2019 16:05:17 -0700 (PDT), David Miller wrote:
> Jakub,
> 
> I just did a batch of networking -stable submissions, however I ran
> into some troubles with the various TLS backports.

Yes, the TLS back ports are a little messy :S

> I was able to backport commit 414776621d10 ("net/tls: prevent
> skb_orphan() from leaking TLS plain text with offload") to v5.2
> but not to v4.19

We can probably leave that out of v4.19. The only practical scenario
where the issue occurs to my knowledge is if admin configured TC
redirect, or netem in the setup. Let me know if you'd rather we did the
backport, I'm not 100% sure the risk/benefit ratio is favourable.

> I was not able to backport neither d85f01775850 ("net: tls, fix
> sk_write_space NULL write when tx disabled") nor commit 57c722e932cf
> ("net/tls: swap sk_write_space on close") to any release.  It seems
> like there are a bunch of dependencies and perhaps other fixes.

Right, those should still be applicable but John and I rejigged 
the close path quite a bit. I think the back port would be this:

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 4c0ac79f82d4..3288bdff9889 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -301,6 +301,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 #else
        {
 #endif
+               if (sk->sk_write_space == tls_write_space)
+                       sk->sk_write_space = ctx->sk_write_space;
                tls_ctx_free(ctx);
                ctx = NULL;
        }

> I suspect you've triaged through this already on your side for other
> reasons, so perhaps you could help come up with a sane set of TLS
> bug fix backports that would be appropriate for -stable?

I'm planning to spend tomorrow working exactly on v4.19 backport. 
I have internal reports of openssl failing on v4.19 while v4.20 
works fine.. Hopefully I'll be able to figure that one out, test the
above and see if there are any other missing fixes.

Is it okay if I come back to this tomorrow?

^ permalink raw reply related

* [PATCH v2] sock: fix potential memory leak in proto_register()
From: zhanglin @ 2019-08-21  0:42 UTC (permalink / raw)
  To: davem
  Cc: ast, daniel, kafai, songliubraving, yhs, willemb, edumazet,
	deepa.kernel, arnd, dh.herrmann, gnault, netdev, linux-kernel,
	bpf, xue.zhihong, wang.yi59, jiang.xuexin, zhanglin

If protocols registered exceeded PROTO_INUSE_NR, prot will be
added to proto_list, but no available bit left for prot in
proto_inuse_idx.

Signed-off-by: zhanglin <zhang.lin16@zte.com.cn>
---
 net/core/sock.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index bc3512f230a3..c7ae32705705 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3139,16 +3139,17 @@ static __init int net_inuse_init(void)
 
 core_initcall(net_inuse_init);
 
-static void assign_proto_idx(struct proto *prot)
+static int assign_proto_idx(struct proto *prot)
 {
 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
 
 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
 		pr_err("PROTO_INUSE_NR exhausted\n");
-		return;
+		return -ENOSPC;
 	}
 
 	set_bit(prot->inuse_idx, proto_inuse_idx);
+	return 0;
 }
 
 static void release_proto_idx(struct proto *prot)
@@ -3157,8 +3158,9 @@ static void release_proto_idx(struct proto *prot)
 		clear_bit(prot->inuse_idx, proto_inuse_idx);
 }
 #else
-static inline void assign_proto_idx(struct proto *prot)
+static inline int assign_proto_idx(struct proto *prot)
 {
+	return 0;
 }
 
 static inline void release_proto_idx(struct proto *prot)
@@ -3243,18 +3245,24 @@ int proto_register(struct proto *prot, int alloc_slab)
 	}
 
 	mutex_lock(&proto_list_mutex);
+	if (assign_proto_idx(prot)) {
+		mutex_unlock(&proto_list_mutex);
+		goto out_free_timewait_sock_slab_name;
+	}
 	list_add(&prot->node, &proto_list);
-	assign_proto_idx(prot);
 	mutex_unlock(&proto_list_mutex);
 	return 0;
 
 out_free_timewait_sock_slab_name:
-	kfree(prot->twsk_prot->twsk_slab_name);
+	if (alloc_slab && prot->twsk_prot)
+		kfree(prot->twsk_prot->twsk_slab_name);
 out_free_request_sock_slab:
-	req_prot_cleanup(prot->rsk_prot);
+	if (alloc_slab) {
+		req_prot_cleanup(prot->rsk_prot);
 
-	kmem_cache_destroy(prot->slab);
-	prot->slab = NULL;
+		kmem_cache_destroy(prot->slab);
+		prot->slab = NULL;
+	}
 out:
 	return -ENOBUFS;
 }
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH 29/38] cls_flower: Convert handle_idr to XArray
From: Matthew Wilcox @ 2019-08-21  0:50 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20190820.165834.1420751898749952901.davem@davemloft.net>

On Tue, Aug 20, 2019 at 04:58:34PM -0700, David Miller wrote:
> From: Matthew Wilcox <willy@infradead.org>
> Date: Tue, 20 Aug 2019 15:32:50 -0700
> 
> > -		idr_replace(&head->handle_idr, fnew, fnew->handle);
> > +		xa_store(&head->filters, fnew->handle, fnew, 0);
> 
> Passing a gfp_t of zero? :-)

Yes!  We know we'll never do an allocation here because we're replacing
an entry that already exists.  It wouldn't harm us to pass a real
GFP flag, so I'll probably just change that.  It might help a future
implementation, and it will definitely save confusion.

^ permalink raw reply

* Re: [PATCH 23/38] cls_api: Convert tcf_net to XArray
From: Matthew Wilcox @ 2019-08-21  0:52 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20190820.165728.2062957580528299761.davem@davemloft.net>

On Tue, Aug 20, 2019 at 04:57:28PM -0700, David Miller wrote:
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> > 
> > This module doesn't use the allocating functionality; convert it to a
> > plain XArray instead of an allocating one.  I've left struct tcf_net
> > in place in case more objects are added to it in future, although
> > it now only contains an XArray.  We don't need to call xa_destroy()
> > if the array is empty, so I've removed the contents of tcf_net_exit()
> > -- if it can be called with entries still in place, then it shoud call
> > xa_destroy() instead.
> > 
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> 
> I don't know if the net exit can be invoked with entires still in place,
> however if the tcf_net_exit() function is made empty it should be removed
> along with the assignment to the per-netns ops.

Thanks!  I wasn't sure what the rule was for these ops.  I'll fix that up.

^ permalink raw reply

* Re: [PATCH net] ixgbe: fix double clean of tx descriptors with xdp
From: Alexander Duyck @ 2019-08-21  1:17 UTC (permalink / raw)
  To: Ilya Maximets
  Cc: Netdev, LKML, bpf, David S. Miller, Björn Töpel,
	Magnus Karlsson, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jeff Kirsher, intel-wired-lan, Eelco Chaudron,
	William Tu
In-Reply-To: <625791af-c656-1e42-b60e-b3a5cedcb4c4@samsung.com>

On Tue, Aug 20, 2019 at 8:58 AM Ilya Maximets <i.maximets@samsung.com> wrote:
>
> On 20.08.2019 18:35, Alexander Duyck wrote:
> > On Tue, Aug 20, 2019 at 8:18 AM Ilya Maximets <i.maximets@samsung.com> wrote:
> >>
> >> Tx code doesn't clear the descriptor status after cleaning.
> >> So, if the budget is larger than number of used elems in a ring, some
> >> descriptors will be accounted twice and xsk_umem_complete_tx will move
> >> prod_tail far beyond the prod_head breaking the comletion queue ring.
> >>
> >> Fix that by limiting the number of descriptors to clean by the number
> >> of used descriptors in the tx ring.
> >>
> >> Fixes: 8221c5eba8c1 ("ixgbe: add AF_XDP zero-copy Tx support")
> >> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> >
> > I'm not sure this is the best way to go. My preference would be to
> > have something in the ring that would prevent us from racing which I
> > don't think this really addresses. I am pretty sure this code is safe
> > on x86 but I would be worried about weak ordered systems such as
> > PowerPC.
> >
> > It might make sense to look at adding the eop_desc logic like we have
> > in the regular path with a proper barrier before we write it and after
> > we read it. So for example we could hold of on writing the bytecount
> > value until the end of an iteration and call smp_wmb before we write
> > it. Then on the cleanup we could read it and if it is non-zero we take
> > an smp_rmb before proceeding further to process the Tx descriptor and
> > clearing the value. Otherwise this code is going to just keep popping
> > up with issues.
>
> But, unlike regular case, xdp zero-copy xmit and clean for particular
> tx ring always happens in the same NAPI context and even on the same
> CPU core.
>
> I saw the 'eop_desc' manipulations in regular case and yes, we could
> use 'next_to_watch' field just as a flag of descriptor existence,
> but it seems unnecessarily complicated. Am I missing something?
>

So is it always in the same NAPI context?. I forgot, I was thinking
that somehow the socket could possibly make use of XDP for transmit.

As far as the logic to use I would be good with just using a value you
are already setting such as the bytecount value. All that would need
to happen is to guarantee that the value is cleared in the Tx path. So
if you clear the bytecount in ixgbe_clean_xdp_tx_irq you could
theoretically just use that as well to flag that a descriptor has been
populated and is ready to be cleaned. Assuming the logic about this
all being in the same NAPI context anyway you wouldn't need to mess
with the barrier stuff I mentioned before.

- Alex

^ permalink raw reply

* Re: [RFC 1/4] Add usb_get_address and usb_set_address support
From: Andrew Lunn @ 2019-08-21  1:22 UTC (permalink / raw)
  To: Charles.Hyde
  Cc: linux-usb, linux-acpi, gregkh, Mario.Limonciello, oliver, netdev,
	nic_swsd
In-Reply-To: <1566339522507.45056@Dellteam.com>

On Tue, Aug 20, 2019 at 10:18:42PM +0000, Charles.Hyde@dellteam.com wrote:
> The core USB driver message.c is missing get/set address functionality
> that stops ifconfig from being able to push MAC addresses out to USB
> based ethernet devices.  Without this functionality, some USB devices
> stop responding to ethernet packets when using ifconfig to change MAC
> addresses.

Hi Charles

ifconfig has been deprecated for years, maybe a decade. Please
reference the current tools, iproute2.

	  Andrew

^ permalink raw reply

* Re: [PATCH v1] ocfs2/dlm: Move BITS_TO_BYTES() to bitops.h for wider use
From: Joseph Qi @ 2019-08-21  1:29 UTC (permalink / raw)
  To: Andy Shevchenko, Mark Fasheh, Joel Becker, ocfs2-devel,
	Ariel Elior, Sudarsana Kalluru, GR-everest-linux-l2,
	David S. Miller, netdev, Colin Ian King
In-Reply-To: <20190820163112.50818-1-andriy.shevchenko@linux.intel.com>



On 19/8/21 00:31, Andy Shevchenko wrote:
> There are users already and will be more of BITS_TO_BYTES() macro.
> Move it to bitops.h for wider use.
> 
> Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
> ---
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_init.h | 1 -
>  fs/ocfs2/dlm/dlmcommon.h                         | 4 ----
>  include/linux/bitops.h                           | 1 +
>  3 files changed, 1 insertion(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init.h
> index 066765fbef06..0a59a09ef82f 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init.h
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init.h
> @@ -296,7 +296,6 @@ static inline void bnx2x_dcb_config_qm(struct bnx2x *bp, enum cos_mode mode,
>   *    possible, the driver should only write the valid vnics into the internal
>   *    ram according to the appropriate port mode.
>   */
> -#define BITS_TO_BYTES(x) ((x)/8)>
I don't think this is a equivalent replace, or it is in fact
wrong before?

  
>  /* CMNG constants, as derived from system spec calculations */
>  
> diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
> index aaf24548b02a..0463dce65bb2 100644
> --- a/fs/ocfs2/dlm/dlmcommon.h
> +++ b/fs/ocfs2/dlm/dlmcommon.h
> @@ -688,10 +688,6 @@ struct dlm_begin_reco
>  	__be32 pad2;
>  };
>  
> -
> -#define BITS_PER_BYTE 8
> -#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE)
> -
For ocfs2 part, it looks good to me.
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>

>  struct dlm_query_join_request
>  {
>  	u8 node_idx;
> diff --git a/include/linux/bitops.h b/include/linux/bitops.h
> index cf074bce3eb3..79d80f5ddf7b 100644
> --- a/include/linux/bitops.h
> +++ b/include/linux/bitops.h
> @@ -5,6 +5,7 @@
>  #include <linux/bits.h>
>  
>  #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
> +#define BITS_TO_BYTES(nr)	DIV_ROUND_UP(nr, BITS_PER_BYTE)
>  #define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
>  
>  extern unsigned int __sw_hweight8(unsigned int w);
> 

^ permalink raw reply

* Re: [RFC 3/4] Move ACPI functionality out of r8152 driver
From: Andrew Lunn @ 2019-08-21  1:30 UTC (permalink / raw)
  To: Charles.Hyde
  Cc: linux-usb, linux-acpi, gregkh, Mario.Limonciello, oliver, netdev,
	nic_swsd
In-Reply-To: <1566339738195.2913@Dellteam.com>

On Tue, Aug 20, 2019 at 10:22:18PM +0000, Charles.Hyde@dellteam.com wrote:
> This change moves ACPI functionality out of the Realtek r8152 driver to
> its own source and header file, making it available to other drivers as
> needed now and into the future.  At the time this ACPI snippet was
> introduced in 2016, only the Realtek driver made use of it in support of
> Dell's enterprise IT policy efforts.  There comes now a need for this
> same support in a different driver, also in support of Dell's enterprise
> IT policy efforts.
> 
> Signed-off-by: Charles Hyde <charles.hyde@dellteam.com>
> Cc: Mario Limonciello <mario.limonciello@dell.com>
> Cc: Realtek linux nic maintainers <nic_swsd@realtek.com>
> Cc: linux-usb@vger.kernel.org
> Cc: linux-acpi@vger.kernel.org
> ---
>  drivers/net/usb/r8152.c | 44 ++++-------------------------------------
>  lib/Makefile            |  3 ++-

Hi Charles

I think your forgot to add the new files?

  Andrew

^ permalink raw reply

* [v4] ocelot_ace: fix action of trap
From: Yangbo Lu @ 2019-08-21  1:59 UTC (permalink / raw)
  To: netdev, David S . Miller, Allan W . Nielsen, Alexandre Belloni,
	Microchip Linux Driver Support, Andrew Lunn
  Cc: Yangbo Lu

The trap action should be copying the frame to CPU and
dropping it for forwarding, but current setting was just
copying frame to CPU.

Fixes: b596229448dd ("net: mscc: ocelot: Add support for tcam")
Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Acked-by: Allan W. Nielsen <allan.nielsen@microchip.com>
---
Changes for v4:
	- Added ACK and Fixes info in commit message.
Changes for v3:
	- Set MASK_MODE to 1 for dropping forwarding.
	- Dropped other patches of patch-set.
Changes for v2:
	- None.
---
 drivers/net/ethernet/mscc/ocelot_ace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mscc/ocelot_ace.c b/drivers/net/ethernet/mscc/ocelot_ace.c
index 39aca1a..86fc6e6 100644
--- a/drivers/net/ethernet/mscc/ocelot_ace.c
+++ b/drivers/net/ethernet/mscc/ocelot_ace.c
@@ -317,7 +317,7 @@ static void is2_action_set(struct vcap_data *data,
 		break;
 	case OCELOT_ACL_ACTION_TRAP:
 		VCAP_ACT_SET(PORT_MASK, 0x0);
-		VCAP_ACT_SET(MASK_MODE, 0x0);
+		VCAP_ACT_SET(MASK_MODE, 0x1);
 		VCAP_ACT_SET(POLICE_ENA, 0x0);
 		VCAP_ACT_SET(POLICE_IDX, 0x0);
 		VCAP_ACT_SET(CPU_QU_NUM, 0x0);
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH 1/1] net: rds: add service level support in rds-info
From: Zhu Yanjun @ 2019-08-21  2:10 UTC (permalink / raw)
  To: Doug Ledford, davem, netdev, linux-rdma, rds-devel
In-Reply-To: <f3de2e40f1bc2eb219d3056ee954747db90dbbb4.camel@redhat.com>

Hi，Doug

My reply is in line.

On 2019/8/20 23:28, Doug Ledford wrote:
> On Mon, 2019-08-19 at 20:52 -0400, Zhu Yanjun wrote:
>> diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
>> index fd6b5f6..cba368e 100644
>> --- a/include/uapi/linux/rds.h
>> +++ b/include/uapi/linux/rds.h
>> @@ -250,6 +250,7 @@ struct rds_info_rdma_connection {
>>          __u32           rdma_mr_max;
>>          __u32           rdma_mr_size;
>>          __u8            tos;
>> +       __u8            sl;
>>          __u32           cache_allocs;
>>   };
>>   
>> @@ -265,6 +266,7 @@ struct rds6_info_rdma_connection {
>>          __u32           rdma_mr_max;
>>          __u32           rdma_mr_size;
>>          __u8            tos;
>> +       __u8            sl;
>>          __u32           cache_allocs;
>>   };
>>   
> This is a user space API break (as was the prior patch mentioned
> below)...
>
>> The commit fe3475af3bdf ("net: rds: add per rds connection cache
>> statistics") adds cache_allocs in struct rds_info_rdma_connection
>> as below:
>> struct rds_info_rdma_connection {
>> ...
>>          __u32           rdma_mr_max;
>>          __u32           rdma_mr_size;
>>          __u8            tos;
>>          __u32           cache_allocs;
>>   };
>> The peer struct in rds-tools of struct rds_info_rdma_connection is as
>> below:
>> struct rds_info_rdma_connection {
>> ...
>>          uint32_t        rdma_mr_max;
>>          uint32_t        rdma_mr_size;
>>          uint8_t         tos;
>>          uint8_t         sl;
>>          uint32_t        cache_allocs;
>> };
> Why are the user space rds tools not using the kernel provided abi
> files?
Perhaps it is a long story.
>
> In order to know if this ABI breakage is safe, we need to know what
> versions of rds-tools are out in the wild and have their own headers
> that we need to match up with.

 From my works in LAB and in the customer's host, rds-tools 2.0.7 is the 
popular

version. Other versions rds-tools are used less.

>    Are there any versions of rds-tools that
> actually use the kernel provided headers?

"the kernel provided headers", do you mean include/uapi/linux/rds.h?

I checked the rds-tools source code. I do not find any version of 
rds-tools us this header files.

> Are there any other users of
> uapi/linux/rds.h besides rds-tools?

Not sure. But in Oracle, there are some rds applications. I am not sure 
whether these rds applications

will use include/uapi/linux/rds.h file or not.

I will investigate it.

>
> Once the kernel and rds-tools package are in sync,

After this commit is merged into mailine, the kernel and rds-tools 
package are in sync.

I will make investigations about rds-tools using the kernel header 
include/uapi/linux/rds.h.

Thanks a lot for your comments.

Zhu Yanjun

>   rds-tools needs to be
> modified to use the kernel header and proper ABI maintenance needs to be
> started.
>

^ permalink raw reply

* [PATCHv3 0/2] fix dev null pointer dereference when send packets larger than mtu in collect_md mode
From: Hangbin Liu @ 2019-08-21  2:09 UTC (permalink / raw)
  To: netdev
  Cc: Stefano Brivio, wenxu, Alexei Starovoitov, David S . Miller,
	Eric Dumazet, Hangbin Liu
In-Reply-To: <20190815060904.19426-1-liuhangbin@gmail.com>

When we send a packet larger than PMTU, we need to reply with
icmp_send(ICMP_FRAG_NEEDED) or icmpv6_send(ICMPV6_PKT_TOOBIG).

But with collect_md mode, kernel will crash while accessing the dst dev
as __metadata_dst_init() init dst->dev to NULL by default. Here is what
the code path looks like, for GRE:

- ip6gre_tunnel_xmit
  - ip6gre_xmit_ipv4
    - __gre6_xmit
      - ip6_tnl_xmit
        - if skb->len - t->tun_hlen - eth_hlen > mtu; return -EMSGSIZE
    - icmp_send
      - net = dev_net(rt->dst.dev); <-- here
  - ip6gre_xmit_ipv6
    - __gre6_xmit
      - ip6_tnl_xmit
        - if skb->len - t->tun_hlen - eth_hlen > mtu; return -EMSGSIZE
    - icmpv6_send
      ...
      - decode_session4
        - oif = skb_dst(skb)->dev->ifindex; <-- here
      - decode_session6
        - oif = skb_dst(skb)->dev->ifindex; <-- here

We could not fix it in __metadata_dst_init() as there is no dev supplied.
Look in to the __icmp_send()/decode_session{4,6} code we could find the dst
dev is actually not needed. In __icmp_send(), we could get the net by skb->dev.
For decode_session{4,6}, as it was called by xfrm_decode_session_reverse()
in this scenario, the oif is not used by
fl4->flowi4_oif = reverse ? skb->skb_iif : oif;

The reproducer is easy:

ovs-vsctl add-br br0
ip link set br0 up
ovs-vsctl add-port br0 gre0 -- set interface gre0 type=gre options:remote_ip=$dst_addr
ip link set gre0 up
ip addr add ${local_gre6}/64 dev br0
ping6 $remote_gre6 -s 1500

The kernel will crash like
[40595.821651] BUG: kernel NULL pointer dereference, address: 0000000000000108
[40595.822411] #PF: supervisor read access in kernel mode
[40595.822949] #PF: error_code(0x0000) - not-present page
[40595.823492] PGD 0 P4D 0
[40595.823767] Oops: 0000 [#1] SMP PTI
[40595.824139] CPU: 0 PID: 2831 Comm: handler12 Not tainted 5.2.0 #57
[40595.824788] Hardware name: Red Hat KVM, BIOS 1.11.1-3.module+el8.1.0+2983+b2ae9c0a 04/01/2014
[40595.825680] RIP: 0010:__xfrm_decode_session+0x6b/0x930
[40595.826219] Code: b7 c0 00 00 00 b8 06 00 00 00 66 85 d2 0f b7 ca 48 0f 45 c1 44 0f b6 2c 06 48 8b 47 58 48 83 e0 fe 0f 84 f4 04 00 00 48 8b 00 <44> 8b 80 08 01 00 00 41 f6 c4 01 4c 89 e7
ba 58 00 00 00 0f 85 47
[40595.828155] RSP: 0018:ffffc90000a73438 EFLAGS: 00010286
[40595.828705] RAX: 0000000000000000 RBX: ffff8881329d7100 RCX: 0000000000000000
[40595.829450] RDX: 0000000000000000 RSI: ffff8881339e70ce RDI: ffff8881329d7100
[40595.830191] RBP: ffffc90000a73470 R08: 0000000000000000 R09: 000000000000000a
[40595.830936] R10: 0000000000000000 R11: 0000000000000000 R12: ffffc90000a73490
[40595.831682] R13: 000000000000002c R14: ffff888132ff1301 R15: ffff8881329d7100
[40595.832427] FS:  00007f5bfcfd6700(0000) GS:ffff88813ba00000(0000) knlGS:0000000000000000
[40595.833266] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[40595.833883] CR2: 0000000000000108 CR3: 000000013a368000 CR4: 00000000000006f0
[40595.834633] Call Trace:
[40595.835392]  ? rt6_multipath_hash+0x4c/0x390
[40595.835853]  icmpv6_route_lookup+0xcb/0x1d0
[40595.836296]  ? icmpv6_xrlim_allow+0x3e/0x140
[40595.836751]  icmp6_send+0x537/0x840
[40595.837125]  icmpv6_send+0x20/0x30
[40595.837494]  tnl_update_pmtu.isra.27+0x19d/0x2a0 [ip_tunnel]
[40595.838088]  ip_md_tunnel_xmit+0x1b6/0x510 [ip_tunnel]
[40595.838633]  gre_tap_xmit+0x10c/0x160 [ip_gre]
[40595.839103]  dev_hard_start_xmit+0x93/0x200
[40595.839551]  sch_direct_xmit+0x101/0x2d0
[40595.839967]  __dev_queue_xmit+0x69f/0x9c0
[40595.840399]  do_execute_actions+0x1717/0x1910 [openvswitch]
[40595.840987]  ? validate_set.isra.12+0x2f5/0x3d0 [openvswitch]
[40595.841596]  ? reserve_sfa_size+0x31/0x130 [openvswitch]
[40595.842154]  ? __ovs_nla_copy_actions+0x1b4/0xad0 [openvswitch]
[40595.842778]  ? __kmalloc_reserve.isra.50+0x2e/0x80
[40595.843285]  ? should_failslab+0xa/0x20
[40595.843696]  ? __kmalloc+0x188/0x220
[40595.844078]  ? __alloc_skb+0x97/0x270
[40595.844472]  ovs_execute_actions+0x47/0x120 [openvswitch]
[40595.845041]  ovs_packet_cmd_execute+0x27d/0x2b0 [openvswitch]
[40595.845648]  genl_family_rcv_msg+0x3a8/0x430
[40595.846101]  genl_rcv_msg+0x47/0x90
[40595.846476]  ? __alloc_skb+0x83/0x270
[40595.846866]  ? genl_family_rcv_msg+0x430/0x430
[40595.847335]  netlink_rcv_skb+0xcb/0x100
[40595.847777]  genl_rcv+0x24/0x40
[40595.848113]  netlink_unicast+0x17f/0x230
[40595.848535]  netlink_sendmsg+0x2ed/0x3e0
[40595.848951]  sock_sendmsg+0x4f/0x60
[40595.849323]  ___sys_sendmsg+0x2bd/0x2e0
[40595.849733]  ? sock_poll+0x6f/0xb0
[40595.850098]  ? ep_scan_ready_list.isra.14+0x20b/0x240
[40595.850634]  ? _cond_resched+0x15/0x30
[40595.851032]  ? ep_poll+0x11b/0x440
[40595.851401]  ? _copy_to_user+0x22/0x30
[40595.851799]  __sys_sendmsg+0x58/0xa0
[40595.852180]  do_syscall_64+0x5b/0x190
[40595.852574]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[40595.853105] RIP: 0033:0x7f5c00038c7d
[40595.853489] Code: c7 20 00 00 75 10 b8 2e 00 00 00 0f 05 48 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 8e f7 ff ff 48 89 04 24 b8 2e 00 00 00 0f 05 <48> 8b 3c 24 48 89 c2 e8 d7 f7 ff ff 48 89
d0 48 83 c4 08 48 3d 01
[40595.855443] RSP: 002b:00007f5bfcf73c00 EFLAGS: 00003293 ORIG_RAX: 000000000000002e
[40595.856244] RAX: ffffffffffffffda RBX: 00007f5bfcf74a60 RCX: 00007f5c00038c7d
[40595.856990] RDX: 0000000000000000 RSI: 00007f5bfcf73c60 RDI: 0000000000000015
[40595.857736] RBP: 0000000000000004 R08: 0000000000000b7c R09: 0000000000000110
[40595.858613] R10: 0001000800050004 R11: 0000000000003293 R12: 000055c2d8329da0
[40595.859401] R13: 00007f5bfcf74120 R14: 0000000000000347 R15: 00007f5bfcf73c60
[40595.860185] Modules linked in: ip_gre ip_tunnel gre openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 sunrpc bochs_drm ttm drm_kms_helper drm pcspkr joydev i2c_piix4 qemu_fw_cfg xfs libcrc32c virtio_net net_failover serio_raw failover ata_generic virtio_blk pata_acpi floppy
[40595.863155] CR2: 0000000000000108
[40595.863551] ---[ end trace 22209bbcacb4addd ]---

v3: only replace pkg to packets in cover letter. So I didn't update the version
info in the follow up patches.

v2: fix it in __icmp_send() and decode_session{4,6} separately instead of
updating shared dst dev in {ip_md, ip6}_tunnel_xmit.

Hangbin Liu (2):
  ipv4/icmp: fix rt dst dev null pointer dereference
  xfrm/xfrm_policy: fix dst dev null pointer dereference in collect_md
    mode

 net/ipv4/icmp.c        | 5 ++++-
 net/xfrm/xfrm_policy.c | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

-- 
2.19.2


^ permalink raw reply

* [PATCHv3 1/2] ipv4/icmp: fix rt dst dev null pointer dereference
From: Hangbin Liu @ 2019-08-21  2:09 UTC (permalink / raw)
  To: netdev
  Cc: Stefano Brivio, wenxu, Alexei Starovoitov, David S . Miller,
	Eric Dumazet, Hangbin Liu
In-Reply-To: <20190821020954.21165-1-liuhangbin@gmail.com>

In __icmp_send() there is a possibility that the rt->dst.dev is NULL,
e,g, with tunnel collect_md mode, which will cause kernel crash.
Here is what the code path looks like, for GRE:

- ip6gre_tunnel_xmit
  - ip6gre_xmit_ipv4
    - __gre6_xmit
      - ip6_tnl_xmit
        - if skb->len - t->tun_hlen - eth_hlen > mtu; return -EMSGSIZE
    - icmp_send
      - net = dev_net(rt->dst.dev); <-- here

The reason is __metadata_dst_init() init dst->dev to NULL by default.
We could not fix it in __metadata_dst_init() as there is no dev supplied.
On the other hand, the reason we need rt->dst.dev is to get the net.
So we can just get it from skb->dev, just like commit 8d9336704521
("ipv6: make icmp6_send() robust against null skb->dev") did.

Fixes: c8b34e680a09 ("ip_tunnel: Add tnl_update_pmtu in ip_md_tunnel_xmit")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 net/ipv4/icmp.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1510e951f451..5f00c9d18b02 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -582,7 +582,10 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,

 	if (!rt)
 		goto out;
-	net = dev_net(rt->dst.dev);
+
+	if (!skb_in->dev)
+		goto out;
+	net = dev_net(skb_in->dev);

 	/*
 	 *	Find the original header. It is expected to be valid, of course.
-- 
2.19.2

^ permalink raw reply related

* [PATCHv3 2/2] xfrm/xfrm_policy: fix dst dev null pointer dereference in collect_md mode
From: Hangbin Liu @ 2019-08-21  2:09 UTC (permalink / raw)
  To: netdev
  Cc: Stefano Brivio, wenxu, Alexei Starovoitov, David S . Miller,
	Eric Dumazet, Hangbin Liu
In-Reply-To: <20190821020954.21165-1-liuhangbin@gmail.com>

In decode_session{4,6} there is a possibility that the skb dst dev is NULL,
e,g, with tunnel collect_md mode, which will cause kernel crash.
Here is what the code path looks like, for GRE:

- ip6gre_tunnel_xmit
  - ip6gre_xmit_ipv6
    - __gre6_xmit
      - ip6_tnl_xmit
        - if skb->len - t->tun_hlen - eth_hlen > mtu; return -EMSGSIZE
    - icmpv6_send
      - icmpv6_route_lookup
        - xfrm_decode_session_reverse
          - decode_session4
            - oif = skb_dst(skb)->dev->ifindex; <-- here
          - decode_session6
            - oif = skb_dst(skb)->dev->ifindex; <-- here

The reason is __metadata_dst_init() init dst->dev to NULL by default.
We could not fix it in __metadata_dst_init() as there is no dev supplied.
On the other hand, the skb_dst(skb)->dev is actually not needed as we
called decode_session{4,6} via xfrm_decode_session_reverse(), so oif is not
used by: fl4->flowi4_oif = reverse ? skb->skb_iif : oif;

So make a dst dev check here should be clean and safe.

Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 net/xfrm/xfrm_policy.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 8ca637a72697..ec94f5795ea4 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -3269,7 +3269,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse)
 	struct flowi4 *fl4 = &fl->u.ip4;
 	int oif = 0;
 
-	if (skb_dst(skb))
+	if (skb_dst(skb) && skb_dst(skb)->dev)
 		oif = skb_dst(skb)->dev->ifindex;
 
 	memset(fl4, 0, sizeof(struct flowi4));
@@ -3387,7 +3387,7 @@ decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse)
 
 	nexthdr = nh[nhoff];
 
-	if (skb_dst(skb))
+	if (skb_dst(skb) && skb_dst(skb)->dev)
 		oif = skb_dst(skb)->dev->ifindex;
 
 	memset(fl6, 0, sizeof(struct flowi6));
-- 
2.19.2


^ permalink raw reply related

* RE: [PATCH v2 0/2] Simplify mtty driver and mdev core
From: Parav Pandit @ 2019-08-21  2:42 UTC (permalink / raw)
  To: Cornelia Huck
  Cc: Christophe de Dinechin, Alex Williamson, Jiri Pirko,
	David S . Miller, Kirti Wankhede, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, cjia, netdev@vger.kernel.org
In-Reply-To: <20190820183106.1680d0d9.cohuck@redhat.com>



> -----Original Message-----
> From: Cornelia Huck <cohuck@redhat.com>
> Sent: Tuesday, August 20, 2019 10:01 PM
> > > > Option-1: mdev index
> > > > Introduce an optional mdev index/handle as u32 during mdev create
> time.
> > > > User passes mdev index/handle as input.
> > > >
> > > > phys_port_name=mIndex=m%u
> > > > mdev_index will be available in sysfs as mdev attribute for udev
> > > > to name the
> > > mdev's netdev.
> > > >
> > > > example mdev create command:
> > > > UUID=$(uuidgen)
> > > > echo $UUID index=10 >
> > > > /sys/class/net/ens2f0/mdev_supported_types/mlx5_core_mdev/create
> > > > example netdevs:
> > > > repnetdev=ens2f0_m10	/*ens2f0 is parent PF's netdevice */
> > > > mdev_netdev=enm10
> > > >
> > > > Pros:
> > > > 1. mdevctl and any other existing tools are unaffected.
> > > > 2. netdev stack, ovs and other switching platforms are unaffected.
> > > > 3. achieves unique phys_port_name for representor netdev 4.
> > > > achieves unique mdev eth netdev name for the mdev using udev/systemd
> extension.
> > > > 5. Aligns well with mdev and netdev subsystem and similar to
> > > > existing sriov
> > > bdf's.
> > > >
> > > > Option-2: shorter mdev name
> > > > Extend mdev to have shorter mdev device name in addition to UUID.
> > > > such as 'foo', 'bar'.
> > > > Mdev will continue to have UUID.
> 
> I fail to understand how 'uses uuid' and 'allow shorter device name'
> are supposed to play together?
> 
Each mdev will have uuid as today. Instead of naming device based on UUID, name it based on explicit name given by the user.
Again, I want to repeat, this name parameter is optional.

> > > > phys_port_name=mdev_name
> > > >
> > > > Pros:
> > > > 1. All same as option-1, except mdevctl needs upgrade for newer usage.
> > > > It is common practice to upgrade iproute2 package along with the kernel.
> > > > Similar practice to be done with mdevctl.
> > > > 2. Newer users of mdevctl who wants to work with non_UUID names,
> > > > will use
> > > newer mdevctl/tools.
> > > > Cons:
> > > > 1. Dual naming scheme of mdev might affect some of the existing tools.
> > > > It's unclear how/if it actually affects.
> > > > mdevctl [2] is very recently developed and can be enhanced for
> > > > dual naming
> > > scheme.
> 
> The main problem is not tools we know about (i.e. mdevctl), but those we don't
> know about.
> 
Well, if it not part of the distros, there is very little can do about it by kernel.
I tried mdevctl with mdev named using non UUID and it were able to list them.

> IOW, this (and the IFNAMESIZ change, which seems even worse) are the
> options I would not want at all.
> 
Ok.

> > > >
> > > > Option-3: mdev uuid alias
> > > > Instead of shorter mdev name or mdev index, have alpha-numeric
> > > > name
> > > alias.
> > > > Alias is an optional mdev sysfs attribute such as 'foo', 'bar'.
> > > > example mdev create command:
> > > > UUID=$(uuidgen)
> > > > echo $UUID alias=foo >
> > > > /sys/class/net/ens2f0/mdev_supported_types/mlx5_core_mdev/create
> > > > example netdevs:
> > > > examle netdevs:
> > > > repnetdev = ens2f0_mfoo
> > > > mdev_netdev=enmfoo
> > > >
> > > > Pros:
> > > > 1. All same as option-1.
> > > > 2. Doesn't affect existing mdev naming scheme.
> > > > Cons:
> > > > 1. Index scheme of option-1 is better which can number large
> > > > number of
> > > mdevs with fewer characters, simplifying the management tool.
> > >
> > > I believe that Alex pointed out another "Cons" to all three options,
> > > which is that it forces user-space to resolve potential race
> > > conditions when creating an index or short name or alias.
> > >
> > This race condition exists for at least two subsystems that I know of, i.e.
> netdev and rdma.
> > If a device with a given name exists, subsystem returns error.
> > When user space gets error code EEXIST, and it can picks up different
> identifier(s).
> 
> If you decouple device creation and setting the alias/index, you make the issue
> visible and thus much more manageable.
> 
I thought about it. It has two issues.
1. user should be able to set this only once. Repeatedly setting it requires changing/notifying it.
2. setting alias translating in creating devlink port doesn't sound correct.
Because if user attempts to reset to different value, it required unregistration, reregistration.
All of such race conditions handling it not worth it.
So setting the index, I liked Alex's term more 'instance number', at instance creation time is lot more simple.

> >
> > > Also, what happens if `index=10` is not provided on the command-line?
> > > Does that make the device unusable for your purpose?
> > Yes, it is unusable to an extent.
> > Currently we have DEVLINK_PORT_FLAVOUR_PCI_VF in
> > include/uapi/linux/devlink.h Similar to it, we need to have
> DEVLINK_PORT_FLAVOUR_MDEV for mdev eswitch ports.
> > This port flavour needs to generate phys_port_name(). This should be user
> parameter driven.
> > Because representor netdevice name is generated based on this parameter.
> 
> I'm also unsure how the extra parameter is supposed to work; writing it to the
> create attribute does not sound right.
> 
Why? When you create a device it takes multiple mandatory and optional parameters.
This is common for netdev (vxlan, vlan, macvlan, ipvlan, gre and more).

> mdevctl supports setting additional parameters on an already created device
> (see the examples provided for vfio-ap), so going that route would actually
> work out of the box from the tooling side.
> 
I explained that setting and re-setting attributes for instance create time value is not worth.

> What you would need is some kind of synchronization/locking to make sure that
> you only link up to the other device after the extra attribute has been set and
> that you don't allow to change it as long as it is associated with the other side. I
> do not know enough about the actual devices to suggest something here; if you
> need userspace cooperation, maybe uevents would be an option.

^ permalink raw reply

* Re: VRF notes when using ipv6 and flushing tables.
From: David Ahern @ 2019-08-21  3:02 UTC (permalink / raw)
  To: Ben Greear, netdev
In-Reply-To: <8977a25e-29c1-5375-cc97-950dc7c2eb0f@candelatech.com>

On 8/20/19 2:27 PM, Ben Greear wrote:
> I recently spend a few days debugging what in the end was user error on
> my part.
> 
> Here are my notes in hope they help someone else.
> 
> First, 'ip -6 route show vrf vrfX' will not show some of the
> routes (like local routes) that will show up with
> 'ip -6 route show table X', where X == vrfX's table-id
> 
> If you run 'ip -6 route flush table X', then you will loose all of the auto
> generated routes, including anycast, ff00::/8, and local routes.
> 
> ff00::/8 is needed for neigh discovery to work (probably among other
> things)
> 
> local route is needed or packets won't actually be accepted up the stack
> (I think that is the symptom at least)
> 
> Not sure exactly what anycast does, but I'm guessing it is required for
> something useful.
> 
> You must manually re-add those to the table unless you for certain know
> that
> you do not need them for whatever reason.
> 

sorry you went through such a long and painful debugging session.

yes, the kernel doc for VRF needs to be updated that 'ip route show vrf
X' and 'ip route show table X' are different ('show vrf' mimics the main
table in not showing local, broadcast, anycast; 'table vrf' shows all).

A suggestion for others: the documentation and selftests directory have
a lot of VRF examples now. If something basic is not working (e.g., arp
or neigh discovery), see if it works there and if so compare the outputs
of the route table along the way.

^ permalink raw reply

* [PATCH] net: Add the same IP detection for duplicate address.
From: Dongxu Liu @ 2019-08-21  3:20 UTC (permalink / raw)
  To: davem, kuznet, yoshfuji, netdev, linux-kernel

The network sends an ARP REQUEST packet to determine
whether there is a host with the same IP.
Windows and some other hosts may send the source IP
address instead of 0.
When IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP) is enable,
the REQUEST will be dropped.
When IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP) is disable,
The case should be added to the IP conflict handling process.

Signed-off-by: Dongxu Liu <liudongxu3@huawei.com>
---
 net/ipv4/arp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 05eb42f..a51c921 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -801,7 +801,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 						    GFP_ATOMIC);

 	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
-	if (sip == 0) {
+	if (sip == 0 || sip == tip) {
 		if (arp->ar_op == htons(ARPOP_REQUEST) &&
 		    inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
 		    !arp_ignore(in_dev, sip, tip))
-- 
2.12.3

^ permalink raw reply related

* Re: [PATCH net-next 3/6] net: dsa: Delete the VID from the upstream port as well
From: Vivien Didelot @ 2019-08-21  3:30 UTC (permalink / raw)
  To: Vladimir Oltean
  Cc: Florian Fainelli, Andrew Lunn, Ido Schimmel, Roopa Prabhu,
	nikolay, David S. Miller, netdev
In-Reply-To: <CA+h21hodsDTwPHY1pxQA-ucu6FU7rkOQa7Y4HJGZC0fRd8zmDA@mail.gmail.com>

On Wed, 21 Aug 2019 01:09:39 +0300, Vladimir Oltean <olteanv@gmail.com> wrote:
> I mean I made an argument already for the hack in 4/6 ("Don't program
> the VLAN as pvid on the upstream port"). If the hack gets accepted
> like that, I have no further need of any change in the implicit VLAN
> configuration. But it's still a hack, so in that sense it would be
> nicer to not need it and have a better amount of control.

How come you simply cannot ignore the PVID flag for the CPU port in the
driver directly, as mv88e6xxx does in preference of the Marvell specific
"unmodified" mode? What PVID are you programming on the CPU port already?


Thanks,

	Vivien

^ permalink raw reply

* Re: [pull request][net-next v2 00/16] Mellanox, mlx5 devlink RX health reporters
From: Jakub Kicinski @ 2019-08-21  3:36 UTC (permalink / raw)
  To: Saeed Mahameed; +Cc: David S. Miller, netdev@vger.kernel.org
In-Reply-To: <20190820202352.2995-1-saeedm@mellanox.com>

On Tue, 20 Aug 2019 20:24:10 +0000, Saeed Mahameed wrote:
> This patchset introduces changes in mlx5 devlink health reporters.
> The highlight of these changes is adding a new reporter: RX reporter
> 
> mlx5 RX reporter: reports and recovers from timeouts and RX completion
> error.
> 
> 1) Perform TX reporter cleanup. In order to maintain the
> code flow as similar as possible between RX and TX reporters, start the
> set with cleanup.
> 
> 2) Prepare for code sharing, generalize and move shared
> functionality.
> 
> 3) Refactor and extend TX reporter diagnostics information
> to align the TX reporter diagnostics output with the RX reporter's
> diagnostics output.
> 
> 4) Add helper functions Patch 11: Add RX reporter, initially
> supports only the diagnostics call back.
> 
> 5) Change ICOSQ (Internal Operations Send Queue) open/close flow to
> avoid race between interface down and completion error recovery.
> 
> 6) Introduce recovery flows for RX ring population timeout on ICOSQ,
> and for completion errors on ICOSQ and on RQ (Regular receive queues).
> 
> 7) Include RX reporters in mlx5 documentation.
> 
> 8) Last two patches of this series, are trivial fixes for previously
> submitted patches on this release cycle.

Not really something I can competently ack, but FWIW doesn't raise any
red flags for me.

^ permalink raw reply

* RE: [PATCH v2 0/2] Simplify mtty driver and mdev core
From: Parav Pandit @ 2019-08-21  3:42 UTC (permalink / raw)
  To: Alex Williamson
  Cc: Jiri Pirko, David S . Miller, Kirti Wankhede, Cornelia Huck,
	kvm@vger.kernel.org, linux-kernel@vger.kernel.org, cjia,
	netdev@vger.kernel.org
In-Reply-To: <20190820111904.75515f58@x1.home>



> -----Original Message-----
> From: Alex Williamson <alex.williamson@redhat.com>
> Sent: Tuesday, August 20, 2019 10:49 PM
> To: Parav Pandit <parav@mellanox.com>
> Cc: Jiri Pirko <jiri@mellanox.com>; David S . Miller <davem@davemloft.net>;
> Kirti Wankhede <kwankhede@nvidia.com>; Cornelia Huck
> <cohuck@redhat.com>; kvm@vger.kernel.org; linux-kernel@vger.kernel.org;
> cjia <cjia@nvidia.com>; netdev@vger.kernel.org
> Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> 
> On Tue, 20 Aug 2019 08:58:02 +0000
> Parav Pandit <parav@mellanox.com> wrote:
> 
> > + Dave.
> >
> > Hi Jiri, Dave, Alex, Kirti, Cornelia,
> >
> > Please provide your feedback on it, how shall we proceed?
> >
> > Short summary of requirements.
> > For a given mdev (mediated device [1]), there is one representor
> > netdevice and devlink port in switchdev mode (similar to SR-IOV VF),
> > And there is one netdevice for the actual mdev when mdev is probed.
> >
> > (a) representor netdev and devlink port should be able derive
> > phys_port_name(). So that representor netdev name can be built
> > deterministically across reboots.
> >
> > (b) for mdev's netdevice, mdev's device should have an attribute.
> > This attribute can be used by udev rules/systemd or something else to
> > rename netdev name deterministically.
> >
> > (c) IFNAMSIZ of 16 bytes is too small to fit whole UUID.
> > A simple grep IFNAMSIZ in stack hints hundreds of users of IFNAMSIZ in
> > drivers, uapi, netlink, boot config area and more. Changing IFNAMSIZ
> > for a mdev bus doesn't really look reasonable option to me.
> 
> How many characters do we really have to work with?  Your examples below
> prepend various characters, ex. option-1 results in ens2f0_m10 or enm10.  Do
> the extra 8 or 3 characters in these count against IFNAMSIZ?
> 
Maximum 15. Last is null termination.
Some udev rules setting by user prefix the PF netdev interface. I took such example below where ens2f0 netdev named is prefixed.
Some prefer not to prefix.

> > Hence, I would like to discuss below options.
> >
> > Option-1: mdev index
> > Introduce an optional mdev index/handle as u32 during mdev create
> > time. User passes mdev index/handle as input.
> >
> > phys_port_name=mIndex=m%u
> > mdev_index will be available in sysfs as mdev attribute for udev to
> > name the mdev's netdev.
> >
> > example mdev create command:
> > UUID=$(uuidgen)
> > echo $UUID index=10
> > > /sys/class/net/ens2f0/mdev_supported_types/mlx5_core_mdev/create
> 
> Nit, IIRC previous discussions of additional parameters used comma separators,
> ex. echo $UUID,index=10 >...
> 
Yes, ok.

> > > example netdevs:
> > repnetdev=ens2f0_m10	/*ens2f0 is parent PF's netdevice */
> 
> Is the parent really relevant in the name?  
No. I just picked one udev example who prefixed the parent netdev name.
But there are users who do not prefix it.

> Tools like mdevctl are meant to
> provide persistence, creating the same mdev devices on the same parent, but
> that's simply the easiest policy decision.  We can also imagine that multiple
> parent devices might support a specified mdev type and policies factoring in
> proximity, load-balancing, power consumption, etc might be weighed such that
> we really don't want to promote userspace creating dependencies on the
> parent association.
> 
> > mdev_netdev=enm10
> >
> > Pros:
> > 1. mdevctl and any other existing tools are unaffected.
> > 2. netdev stack, ovs and other switching platforms are unaffected.
> > 3. achieves unique phys_port_name for representor netdev 4. achieves
> > unique mdev eth netdev name for the mdev using udev/systemd extension.
> > 5. Aligns well with mdev and netdev subsystem and similar to existing
> > sriov bdf's.
> 
> A user provided index seems strange to me.  It's not really an index, just a user
> specified instance number.  Presumably you have the user providing this
> because if it really were an index, then the value depends on the creation order
> and persistence is lost.  Now the user needs to both avoid uuid collision as well
> as "index" number collision.  The uuid namespace is large enough to mostly
> ignore this, but this is not.  This seems like a burden.
> 
I liked the term 'instance number', which is lot better way to say than index/handle.
Yes, user needs to avoid both the collision.
UUID collision should not occur in most cases, they way UUID are generated.
So practically users needs to pick unique 'instance number', similar to how it picks unique netdev names.

Burden to user comes from the requirement to get uniqueness.

> > Option-2: shorter mdev name
> > Extend mdev to have shorter mdev device name in addition to UUID.
> > such as 'foo', 'bar'.
> > Mdev will continue to have UUID.
> > phys_port_name=mdev_name
> >
> > Pros:
> > 1. All same as option-1, except mdevctl needs upgrade for newer usage.
> > It is common practice to upgrade iproute2 package along with the
> > kernel. Similar practice to be done with mdevctl.
> > 2. Newer users of mdevctl who wants to work with non_UUID names, will
> > use newer mdevctl/tools. Cons:
> > 1. Dual naming scheme of mdev might affect some of the existing tools.
> > It's unclear how/if it actually affects.
> > mdevctl [2] is very recently developed and can be enhanced for dual
> > naming scheme.
> 
> I think we've already nak'ed this one, the device namespace becomes
> meaningless if the name becomes just a string where a uuid might be an
> example string.  mdevs are named by uuid.
> 
> > Option-3: mdev uuid alias
> > Instead of shorter mdev name or mdev index, have alpha-numeric name
> > alias. Alias is an optional mdev sysfs attribute such as 'foo', 'bar'.
> > example mdev create command:
> > UUID=$(uuidgen)
> > echo $UUID alias=foo
> > > /sys/class/net/ens2f0/mdev_supported_types/mlx5_core_mdev/create
> > > example netdevs:
> > examle netdevs:
> > repnetdev = ens2f0_mfoo
> > mdev_netdev=enmfoo
> >
> > Pros:
> > 1. All same as option-1.
> > 2. Doesn't affect existing mdev naming scheme.
> > Cons:
> > 1. Index scheme of option-1 is better which can number large number of
> > mdevs with fewer characters, simplifying the management tool.
> 
> No better than option-1, simply a larger secondary namespace, but still
> requires the user to come up with two independent names for the device.
> 
> > Option-4: extend IFNAMESZ to be 64 bytes Extended IFNAMESZ from 16 to
> > 64 bytes phys_port_name=mdev_UUID_string mdev_netdev_name=enmUUID
> >
> > Pros:
> > 1. Doesn't require mdev extension
> > Cons:
> > 1. netdev stack, driver, uapi, user space, boot config wide changes 2.
> > Possible user space extensions who assumed name size being 16
> > characters 3. Single device type demands namesize change for all
> > netdev types
> 
> What about an alias based on the uuid?  For example, we use 160-bit sha1s
> daily with git (uuids are only 128-bit), but we generally don't reference git
> commits with the full 20 character string.  Generally 12 characters is
> recommended to avoid ambiguity.  Could mdev automatically create an
> abbreviated sha1 alias for the device?  If so, how many characters should we
> use and what do we do on collision?  The colliding device could add enough
> alias characters to disambiguate (we likely couldn't re-alias the existing device
> to disambiguate, but I'm not sure it matters, userspace has sysfs to associate
> aliases).  Ex.
> 
> UUID=$(uuidgen)
> ALIAS=$(echo $UUID | sha1sum | colrm 13)
> 
I explained in previous reply to Cornelia, we should set UUID and ALIAS at the same time.
Setting is via different sysfs attribute is lot code burden with no extra benefit.

> Since there seems to be some prefix overhead, as I ask about above in how
> many characters we actually have to work with in IFNAMESZ, maybe we start
> with 8 characters (matching your "index" namespace) and expand as necessary
> for disambiguation.  If we can eliminate overhead in IFNAMESZ, let's start with
> 12.  Thanks,
> 
If user is going to choose the alias, why does it have to be limited to sha1?
Or you just told it as an example?

It can be an alpha-numeric string.

Instead of mdev imposing number of characters on the alias, it should be best left to the user.
Because in future if netdev improves on the naming scheme, mdev will be limiting it, which is not right.
So not restricting alias size seems right to me.
User configuring mdev for networking devices in a given kernel knows what user is doing.
So user can choose alias name size as it finds suitable.

> Alex

^ permalink raw reply

* RE: [PATCH v2 0/2] Simplify mtty driver and mdev core
From: Parav Pandit @ 2019-08-21  3:57 UTC (permalink / raw)
  To: Cornelia Huck, Alex Williamson
  Cc: Jiri Pirko, David S . Miller, Kirti Wankhede, kvm@vger.kernel.org,
	linux-kernel@vger.kernel.org, cjia, netdev@vger.kernel.org
In-Reply-To: <20190820195519.47d6fd6a.cohuck@redhat.com>



> -----Original Message-----
> From: Cornelia Huck <cohuck@redhat.com>
> Sent: Tuesday, August 20, 2019 11:25 PM
> To: Alex Williamson <alex.williamson@redhat.com>
> Cc: Parav Pandit <parav@mellanox.com>; Jiri Pirko <jiri@mellanox.com>;
> David S . Miller <davem@davemloft.net>; Kirti Wankhede
> <kwankhede@nvidia.com>; kvm@vger.kernel.org; linux-
> kernel@vger.kernel.org; cjia <cjia@nvidia.com>; netdev@vger.kernel.org
> Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> 
> On Tue, 20 Aug 2019 11:19:04 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > What about an alias based on the uuid?  For example, we use 160-bit
> > sha1s daily with git (uuids are only 128-bit), but we generally don't
> > reference git commits with the full 20 character string.  Generally 12
> > characters is recommended to avoid ambiguity.  Could mdev
> > automatically create an abbreviated sha1 alias for the device?  If so,
> > how many characters should we use and what do we do on collision?  The
> > colliding device could add enough alias characters to disambiguate (we
> > likely couldn't re-alias the existing device to disambiguate, but I'm
> > not sure it matters, userspace has sysfs to associate aliases).  Ex.
> >
> > UUID=$(uuidgen)
> > ALIAS=$(echo $UUID | sha1sum | colrm 13)
> >
> > Since there seems to be some prefix overhead, as I ask about above in
> > how many characters we actually have to work with in IFNAMESZ, maybe
> > we start with 8 characters (matching your "index" namespace) and
> > expand as necessary for disambiguation.  If we can eliminate overhead
> > in IFNAMESZ, let's start with 12.  Thanks,
> >
> > Alex
> 
> I really like that idea, and it seems the best option proposed yet, as we don't
> need to create a secondary identifier.
User setting this alias at mdev creation time and exposed via sysfs as read only attribute works.
Exposing that as
const char *mdev_alias(struct mdev_device *dev) to vendor drivers..


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox