Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net 2/9] net/mlx5: E-Switch, Avoid setup attempt if not being e-switch manager
From: Saeed Mahameed @ 2018-06-27  0:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Saeed Mahameed
In-Reply-To: <20180627002118.9856-1-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

In smartnic env, the host (PF) driver might not be an e-switch
manager, hence the FW will err on driver attempts to deal with
setting/unsetting the eswitch and as a result the overall setup
of sriov will fail.

Fix that by avoiding the operation if e-switch management is not
allowed for this driver instance. While here, move to use the
correct name for the esw manager capability name.

Fixes: 81848731ff40 ('net/mlx5: E-Switch, Add SR-IOV (FDB) support')
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reported-by: Guy Kushnir <guyk@mellanox.com>
Reviewed-by: Eli Cohen <eli@mellanox.com>
Tested-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/fw.c      | 5 +++--
 drivers/net/ethernet/mellanox/mlx5/core/sriov.c   | 7 ++++++-
 include/linux/mlx5/eswitch.h                      | 2 ++
 include/linux/mlx5/mlx5_ifc.h                     | 2 +-
 7 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 378ad74518ec..60236f73373c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -839,7 +839,7 @@ static bool mlx5e_is_vf_vport_rep(struct mlx5e_priv *priv)
 	struct mlx5e_rep_priv *rpriv = priv->ppriv;
 	struct mlx5_eswitch_rep *rep;
 
-	if (!MLX5_CAP_GEN(priv->mdev, eswitch_flow_table))
+	if (!MLX5_ESWITCH_MANAGER(priv->mdev))
 		return false;
 
 	rep = rpriv->rep;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index f63dfbcd29fe..103fd6a0cc65 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1604,7 +1604,7 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 	if (!ESW_ALLOWED(esw))
 		return 0;
 
-	if (!MLX5_CAP_GEN(esw->dev, eswitch_flow_table) ||
+	if (!MLX5_ESWITCH_MANAGER(esw->dev) ||
 	    !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) {
 		esw_warn(esw->dev, "E-Switch FDB is not supported, aborting ...\n");
 		return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 49a75d31185e..f1a86cea86a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -32,6 +32,7 @@
 
 #include <linux/mutex.h>
 #include <linux/mlx5/driver.h>
+#include <linux/mlx5/eswitch.h>
 
 #include "mlx5_core.h"
 #include "fs_core.h"
@@ -2652,7 +2653,7 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 			goto err;
 	}
 
-	if (MLX5_CAP_GEN(dev, eswitch_flow_table)) {
+	if (MLX5_ESWITCH_MANAGER(dev)) {
 		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, ft_support)) {
 			err = init_fdb_root_ns(steering);
 			if (err)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index afd9f4fa22f4..41ad24f0de2c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -32,6 +32,7 @@
 
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
+#include <linux/mlx5/eswitch.h>
 #include <linux/module.h>
 #include "mlx5_core.h"
 #include "../../mlxfw/mlxfw.h"
@@ -159,13 +160,13 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
 	}
 
 	if (MLX5_CAP_GEN(dev, vport_group_manager) &&
-	    MLX5_CAP_GEN(dev, eswitch_flow_table)) {
+	    MLX5_ESWITCH_MANAGER(dev)) {
 		err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH_FLOW_TABLE);
 		if (err)
 			return err;
 	}
 
-	if (MLX5_CAP_GEN(dev, eswitch_flow_table)) {
+	if (MLX5_ESWITCH_MANAGER(dev)) {
 		err = mlx5_core_get_caps(dev, MLX5_CAP_ESWITCH);
 		if (err)
 			return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
index 2a8b529ce6dd..a0674962f02c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -88,6 +88,9 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs)
 		return -EBUSY;
 	}
 
+	if (!MLX5_ESWITCH_MANAGER(dev))
+		goto enable_vfs_hca;
+
 	err = mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY);
 	if (err) {
 		mlx5_core_warn(dev,
@@ -95,6 +98,7 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs)
 		return err;
 	}
 
+enable_vfs_hca:
 	for (vf = 0; vf < num_vfs; vf++) {
 		err = mlx5_core_enable_hca(dev, vf + 1);
 		if (err) {
@@ -140,7 +144,8 @@ static void mlx5_device_disable_sriov(struct mlx5_core_dev *dev)
 	}
 
 out:
-	mlx5_eswitch_disable_sriov(dev->priv.eswitch);
+	if (MLX5_ESWITCH_MANAGER(dev))
+		mlx5_eswitch_disable_sriov(dev->priv.eswitch);
 
 	if (mlx5_wait_for_vf_pages(dev))
 		mlx5_core_warn(dev, "timeout reclaiming VFs pages\n");
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index d3c9db492b30..fab5121ffb8f 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -8,6 +8,8 @@
 
 #include <linux/mlx5/driver.h>
 
+#define MLX5_ESWITCH_MANAGER(mdev) MLX5_CAP_GEN(mdev, eswitch_manager)
+
 enum {
 	SRIOV_NONE,
 	SRIOV_LEGACY,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 27134c4fcb76..ac281f5ec9b8 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -922,7 +922,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         vnic_env_queue_counters[0x1];
 	u8         ets[0x1];
 	u8         nic_flow_table[0x1];
-	u8         eswitch_flow_table[0x1];
+	u8         eswitch_manager[0x1];
 	u8         device_memory[0x1];
 	u8         mcam_reg[0x1];
 	u8         pcam_reg[0x1];
-- 
2.17.0

^ permalink raw reply related

* [net 3/9] net/mlx5e: Avoid dealing with vport representors if not being e-switch manager
From: Saeed Mahameed @ 2018-06-27  0:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Saeed Mahameed
In-Reply-To: <20180627002118.9856-1-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

In smartnic env, the host (PF) driver might not be an e-switch
manager, hence the switchdev mode representors are running on
the embedded cpu (EC) and not at the host.

As such, we should avoid dealing with vport representors if
not being esw manager.

While here, make sure to disallow eswitch switchdev related
setups through devlink if we are not esw managers.

Fixes: cb67b832921c ('net/mlx5e: Introduce SRIOV VF representors')
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c    | 12 ++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c     |  2 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c   |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 56c1b6f5593e..dae4156a710d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2846,7 +2846,7 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
 	mlx5e_activate_channels(&priv->channels);
 	netif_tx_start_all_queues(priv->netdev);
 
-	if (MLX5_VPORT_MANAGER(priv->mdev))
+	if (MLX5_ESWITCH_MANAGER(priv->mdev))
 		mlx5e_add_sqs_fwd_rules(priv);
 
 	mlx5e_wait_channels_min_rx_wqes(&priv->channels);
@@ -2857,7 +2857,7 @@ void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv)
 {
 	mlx5e_redirect_rqts_to_drop(priv);
 
-	if (MLX5_VPORT_MANAGER(priv->mdev))
+	if (MLX5_ESWITCH_MANAGER(priv->mdev))
 		mlx5e_remove_sqs_fwd_rules(priv);
 
 	/* FIXME: This is a W/A only for tx timeout watch dog false alarm when
@@ -4597,7 +4597,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	mlx5e_set_netdev_dev_addr(netdev);
 
 #if IS_ENABLED(CONFIG_MLX5_ESWITCH)
-	if (MLX5_VPORT_MANAGER(mdev))
+	if (MLX5_ESWITCH_MANAGER(mdev))
 		netdev->switchdev_ops = &mlx5e_switchdev_ops;
 #endif
 
@@ -4753,7 +4753,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
 
 	mlx5e_enable_async_events(priv);
 
-	if (MLX5_VPORT_MANAGER(priv->mdev))
+	if (MLX5_ESWITCH_MANAGER(priv->mdev))
 		mlx5e_register_vport_reps(priv);
 
 	if (netdev->reg_state != NETREG_REGISTERED)
@@ -4788,7 +4788,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv)
 
 	queue_work(priv->wq, &priv->set_rx_mode_work);
 
-	if (MLX5_VPORT_MANAGER(priv->mdev))
+	if (MLX5_ESWITCH_MANAGER(priv->mdev))
 		mlx5e_unregister_vport_reps(priv);
 
 	mlx5e_disable_async_events(priv);
@@ -4972,7 +4972,7 @@ static void *mlx5e_add(struct mlx5_core_dev *mdev)
 		return NULL;
 
 #ifdef CONFIG_MLX5_ESWITCH
-	if (MLX5_VPORT_MANAGER(mdev)) {
+	if (MLX5_ESWITCH_MANAGER(mdev)) {
 		rpriv = mlx5e_alloc_nic_rep_priv(mdev);
 		if (!rpriv) {
 			mlx5_core_warn(mdev, "Failed to alloc NIC rep priv data\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 60236f73373c..2b8040a3cdbd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -823,7 +823,7 @@ bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv)
 	struct mlx5e_rep_priv *rpriv = priv->ppriv;
 	struct mlx5_eswitch_rep *rep;
 
-	if (!MLX5_CAP_GEN(priv->mdev, vport_group_manager))
+	if (!MLX5_ESWITCH_MANAGER(priv->mdev))
 		return false;
 
 	rep = rpriv->rep;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index cecd201f0b73..91f1209886ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1079,8 +1079,8 @@ static int mlx5_devlink_eswitch_check(struct devlink *devlink)
 	if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
 		return -EOPNOTSUPP;
 
-	if (!MLX5_CAP_GEN(dev, vport_group_manager))
-		return -EOPNOTSUPP;
+	if(!MLX5_ESWITCH_MANAGER(dev))
+		return -EPERM;
 
 	if (dev->priv.eswitch->mode == SRIOV_NONE)
 		return -EOPNOTSUPP;
-- 
2.17.0

^ permalink raw reply related

* [net 5/9] net/mlx5: E-Switch, Disallow vlan/spoofcheck setup if not being esw manager
From: Saeed Mahameed @ 2018-06-27  0:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Eli Cohen, Saeed Mahameed
In-Reply-To: <20180627002118.9856-1-saeedm@mellanox.com>

From: Eli Cohen <eli@mellanox.com>

In smartnic env, if the host (PF) driver is not an e-switch manager, we
are not allowed to apply eswitch ports setups such as vlan (VST),
spoof-checks, min/max rate or state.

Make sure we are eswitch manager when coming to issue these callbacks
and err otherwise.

Also fix the definition of ESW_ALLOWED to rely on eswitch_manager
capability and on the vport_group_manager.

Operations on the VF nic vport context, such as setting a mac or reading
the vport counters are allowed to the PF in this scheme.

The modify nic vport guid code was modified to omit checking the
nic_vport_node_guid_modify eswitch capability.
The reason for doing so is that modifying node guid requires vport group
manager capability, and there's no need to check further capabilities.

1. set_vf_vlan     - disallowed
2. set_vf_spoofchk - disallowed
3. set_vf_mac      - allowed
4. get_vf_config   - allowed
5. set_vf_trust    - disallowed
6. set_vf_rate     - disallowed
7. get_vf_stat     - allowed
8. set_vf_link_state - disallowed

Fixes: f942380c1239 ('net/mlx5: E-Switch, Vport ingress/egress ACLs rules for spoofchk')
Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Tested-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 12 +++++-------
 drivers/net/ethernet/mellanox/mlx5/core/vport.c   |  2 --
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 103fd6a0cc65..b79d74860a30 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1594,17 +1594,15 @@ static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num)
 }
 
 /* Public E-Switch API */
-#define ESW_ALLOWED(esw) ((esw) && MLX5_VPORT_MANAGER((esw)->dev))
+#define ESW_ALLOWED(esw) ((esw) && MLX5_ESWITCH_MANAGER((esw)->dev))
+
 
 int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
 {
 	int err;
 	int i, enabled_events;
 
-	if (!ESW_ALLOWED(esw))
-		return 0;
-
-	if (!MLX5_ESWITCH_MANAGER(esw->dev) ||
+	if (!ESW_ALLOWED(esw) ||
 	    !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) {
 		esw_warn(esw->dev, "E-Switch FDB is not supported, aborting ...\n");
 		return -EOPNOTSUPP;
@@ -1806,7 +1804,7 @@ int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
 	u64 node_guid;
 	int err = 0;
 
-	if (!ESW_ALLOWED(esw))
+	if (!MLX5_CAP_GEN(esw->dev, vport_group_manager))
 		return -EPERM;
 	if (!LEGAL_VPORT(esw, vport) || is_multicast_ether_addr(mac))
 		return -EINVAL;
@@ -1883,7 +1881,7 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
 {
 	struct mlx5_vport *evport;
 
-	if (!ESW_ALLOWED(esw))
+	if (!MLX5_CAP_GEN(esw->dev, vport_group_manager))
 		return -EPERM;
 	if (!LEGAL_VPORT(esw, vport))
 		return -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 719cecb182c6..7eecd5b07bb1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -549,8 +549,6 @@ int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev,
 		return -EINVAL;
 	if (!MLX5_CAP_GEN(mdev, vport_group_manager))
 		return -EACCES;
-	if (!MLX5_CAP_ESW(mdev, nic_vport_node_guid_modify))
-		return -EOPNOTSUPP;
 
 	in = kvzalloc(inlen, GFP_KERNEL);
 	if (!in)
-- 
2.17.0

^ permalink raw reply related

* [net 4/9] IB/mlx5: Avoid dealing with vport representors if not being e-switch manager
From: Saeed Mahameed @ 2018-06-27  0:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Saeed Mahameed
In-Reply-To: <20180627002118.9856-1-saeedm@mellanox.com>

From: Or Gerlitz <ogerlitz@mellanox.com>

In smartnic env, the host (PF) driver might not be an e-switch
manager, hence the switchdev mode representors are running on
the embedded cpu (EC) and not at the host.

As such, we should avoid dealing with vport representors if
not being esw manager.

Fixes: b5ca15ad7e61 ('IB/mlx5: Add proper representors support')
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index e52dd21519b4..634be96dcb86 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6107,7 +6107,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
 			     MLX5_CAP_GEN(mdev, num_vhca_ports));

-	if (MLX5_VPORT_MANAGER(mdev) &&
+	if (MLX5_ESWITCH_MANAGER(mdev) &&
 	    mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) {
 		dev->rep = mlx5_ib_vport_rep(mdev->priv.eswitch, 0);

-- 
2.17.0

^ permalink raw reply related

* [net 6/9] net/mlx5: Fix required capability for manipulating MPFS
From: Saeed Mahameed @ 2018-06-27  0:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Eli Cohen, Saeed Mahameed
In-Reply-To: <20180627002118.9856-1-saeedm@mellanox.com>

From: Eli Cohen <eli@mellanox.com>

Manipulating the MPFS requires eswitch manager capabilities.

Fixes: eeb66cdb6826 ('net/mlx5: Separate between E-Switch and MPFS')
Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
index 7cb67122e8b5..98359559c77e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c
@@ -33,6 +33,7 @@
 #include <linux/etherdevice.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/mlx5_ifc.h>
+#include <linux/mlx5/eswitch.h>
 #include "mlx5_core.h"
 #include "lib/mpfs.h"
 
@@ -98,7 +99,7 @@ int mlx5_mpfs_init(struct mlx5_core_dev *dev)
 	int l2table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table);
 	struct mlx5_mpfs *mpfs;
 
-	if (!MLX5_VPORT_MANAGER(dev))
+	if (!MLX5_ESWITCH_MANAGER(dev))
 		return 0;
 
 	mpfs = kzalloc(sizeof(*mpfs), GFP_KERNEL);
@@ -122,7 +123,7 @@ void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev)
 {
 	struct mlx5_mpfs *mpfs = dev->priv.mpfs;
 
-	if (!MLX5_VPORT_MANAGER(dev))
+	if (!MLX5_ESWITCH_MANAGER(dev))
 		return;
 
 	WARN_ON(!hlist_empty(mpfs->hash));
@@ -137,7 +138,7 @@ int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac)
 	u32 index;
 	int err;
 
-	if (!MLX5_VPORT_MANAGER(dev))
+	if (!MLX5_ESWITCH_MANAGER(dev))
 		return 0;
 
 	mutex_lock(&mpfs->lock);
@@ -179,7 +180,7 @@ int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac)
 	int err = 0;
 	u32 index;
 
-	if (!MLX5_VPORT_MANAGER(dev))
+	if (!MLX5_ESWITCH_MANAGER(dev))
 		return 0;
 
 	mutex_lock(&mpfs->lock);
-- 
2.17.0

^ permalink raw reply related

* [net 7/9] net/mlx5: Fix wrong size allocation for QoS ETC TC register
From: Saeed Mahameed @ 2018-06-27  0:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Shay Agroskin, Saeed Mahameed
In-Reply-To: <20180627002118.9856-1-saeedm@mellanox.com>

From: Shay Agroskin <shayag@mellanox.com>

The driver allocates wrong size (due to wrong struct name) when issuing
a query/set request to NIC's register.

Fixes: d8880795dabf ("net/mlx5e: Implement DCBNL IEEE max rate")
Signed-off-by: Shay Agroskin <shayag@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index fa9d0760dd36..31a9cbd85689 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -701,7 +701,7 @@ EXPORT_SYMBOL_GPL(mlx5_query_port_prio_tc);
 static int mlx5_set_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *in,
 				   int inlen)
 {
-	u32 out[MLX5_ST_SZ_DW(qtct_reg)];
+	u32 out[MLX5_ST_SZ_DW(qetc_reg)];
 
 	if (!MLX5_CAP_GEN(mdev, ets))
 		return -EOPNOTSUPP;
@@ -713,7 +713,7 @@ static int mlx5_set_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *in,
 static int mlx5_query_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *out,
 				     int outlen)
 {
-	u32 in[MLX5_ST_SZ_DW(qtct_reg)];
+	u32 in[MLX5_ST_SZ_DW(qetc_reg)];
 
 	if (!MLX5_CAP_GEN(mdev, ets))
 		return -EOPNOTSUPP;
-- 
2.17.0

^ permalink raw reply related

* [net 8/9] net/mlx5: Fix incorrect raw command length parsing
From: Saeed Mahameed @ 2018-06-27  0:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Alex Vesker, Saeed Mahameed
In-Reply-To: <20180627002118.9856-1-saeedm@mellanox.com>

From: Alex Vesker <valex@mellanox.com>

The NULL character was not set correctly for the string containing
the command length, this caused failures reading the output of the
command due to a random length. The fix is to initialize the output
length string.

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 487388aed98f..2db5fe571594 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -1276,7 +1276,7 @@ static ssize_t outlen_write(struct file *filp, const char __user *buf,
 {
 	struct mlx5_core_dev *dev = filp->private_data;
 	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
-	char outlen_str[8];
+	char outlen_str[8] = {0};
 	int outlen;
 	void *ptr;
 	int err;
@@ -1291,8 +1291,6 @@ static ssize_t outlen_write(struct file *filp, const char __user *buf,
 	if (copy_from_user(outlen_str, buf, count))
 		return -EFAULT;
 
-	outlen_str[7] = 0;
-
 	err = sscanf(outlen_str, "%d", &outlen);
 	if (err < 0)
 		return err;
-- 
2.17.0

^ permalink raw reply related

* [net 9/9] net/mlx5: Fix command interface race in polling mode
From: Saeed Mahameed @ 2018-06-27  0:21 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Or Gerlitz, Alex Vesker, Saeed Mahameed
In-Reply-To: <20180627002118.9856-1-saeedm@mellanox.com>

From: Alex Vesker <valex@mellanox.com>

The command interface can work in two modes: Events and Polling.
In the general case, each time we invoke a command, a work is
queued to handle it.

When working in events, the interrupt handler completes the
command execution. On the other hand, when working in polling
mode, the work itself completes it.

Due to a bug in the work handler, a command could have been
completed by the interrupt handler, while the work handler
hasn't finished yet, causing it to complete once again
if the command interface mode was changed from Events to
polling after the interrupt handler was called.

mlx5_unload_one()
        mlx5_stop_eqs()
                // Destroy the EQ before cmd EQ
                ...cmd_work_handler()
                        write_doorbell()
                        --> EVENT_TYPE_CMD
                                mlx5_cmd_comp_handler() // First free
                                        free_ent(cmd, ent->idx)
                                        complete(&ent->done)

        <-- mlx5_stop_eqs //cmd was complete
                // move to polling before destroying the last cmd EQ
                mlx5_cmd_use_polling()
                        cmd->mode = POLL;

                --> cmd_work_handler (continues)
                        if (cmd->mode == POLL)
                                mlx5_cmd_comp_handler() // Double free

The solution is to store the cmd->mode before writing the doorbell.

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 2db5fe571594..384c1fa49081 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -807,6 +807,7 @@ static void cmd_work_handler(struct work_struct *work)
 	unsigned long flags;
 	bool poll_cmd = ent->polling;
 	int alloc_ret;
+	int cmd_mode;
 
 	sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem;
 	down(sem);
@@ -853,6 +854,7 @@ static void cmd_work_handler(struct work_struct *work)
 	set_signature(ent, !cmd->checksum_disabled);
 	dump_command(dev, ent, 1);
 	ent->ts1 = ktime_get_ns();
+	cmd_mode = cmd->mode;
 
 	if (ent->callback)
 		schedule_delayed_work(&ent->cb_timeout_work, cb_timeout);
@@ -877,7 +879,7 @@ static void cmd_work_handler(struct work_struct *work)
 	iowrite32be(1 << ent->idx, &dev->iseg->cmd_dbell);
 	mmiowb();
 	/* if not in polling don't use ent after this point */
-	if (cmd->mode == CMD_MODE_POLLING || poll_cmd) {
+	if (cmd_mode == CMD_MODE_POLLING || poll_cmd) {
 		poll_timeout(ent);
 		/* make sure we read the descriptor after ownership is SW */
 		rmb();
-- 
2.17.0

^ permalink raw reply related

* Re: set_memory_* (was: Re: BUG: unable to handle kernel paging request in bpf_int_jit_compile)
From: Kees Cook @ 2018-06-27  0:26 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Ingo Molnar, David Miller, Thomas Gleixner,
	syzbot+a4eb8c7766952a1ca872, Alexei Starovoitov, H. Peter Anvin,
	Alexey Kuznetsov, LKML, Ingo Molnar, Network Development,
	syzkaller-bugs, X86 ML, Hideaki YOSHIFUJI, Peter Zijlstra,
	Laura Abbott, Linus Torvalds, Eric Dumazet, Rik van Riel,
	Ard Biesheuvel
In-Reply-To: <b4841a84-cf65-f3fb-13b3-ea0a15f47bde@iogearbox.net>

On Tue, Jun 26, 2018 at 3:53 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> In any case, for pairs like set_memory_ro() + set_memory_rw() that are also used
> outside of bpf e.g. STRICT_MODULE_RWX and friends which are mostly default these
> days for some archs, is the choice to not check errors from there by design or from
> historical context that it originated from 'debugging code' in that sense (DEBUG_RODATA /
> DEBUG_SET_MODULE_RONX) earlier? Also if no-one checks for errors (and if that would
> in fact be the recommendation it is agreed upon) should the API be changed to void,
> or generally should actual error checking occur on these + potential rollback; but
> then question is what about restoring part from prior set_memory_ro() via set_memory_rw()?
> Kees/others, do you happen to have some more context on recommended use around this
> by any chance? (Would probably also help if we add some doc around assumptions into
> include/linux/set_memory.h for future users.)

If set_memory_* can fail, I think it needs to be __must_check, and all
the callers need to deal with it gracefully. Those markings aren't
"advisory": they're expected to actually do what they say.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply

* Re: Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net
From: Michael S. Tsirkin @ 2018-06-27  0:29 UTC (permalink / raw)
  To: Siwei Liu
  Cc: Samudrala, Sridhar, Cornelia Huck, Alexander Duyck, virtio-dev,
	aaron.f.brown, Jiri Pirko, Jakub Kicinski, Netdev, qemu-devel,
	virtualization, konrad.wilk, boris.ostrovsky, Joao Martins,
	Venu Busireddy, vijay.balakrishna
In-Reply-To: <CADGSJ204fgdnafRsrzMVh+SeBCj3-z_twS9B+7g8iiOaUQGW5g@mail.gmail.com>

On Tue, Jun 26, 2018 at 04:38:26PM -0700, Siwei Liu wrote:
> On Mon, Jun 25, 2018 at 6:50 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Mon, Jun 25, 2018 at 10:54:09AM -0700, Samudrala, Sridhar wrote:
> >> > > > > Might not necessarily be something wrong, but it's very limited to
> >> > > > > prohibit the MAC of VF from changing when enslaved by failover.
> >> > > > You mean guest changing MAC? I'm not sure why we prohibit that.
> >> > > I think Sridhar and Jiri might be better person to answer it. My
> >> > > impression was that sync'ing the MAC address change between all 3
> >> > > devices is challenging, as the failover driver uses MAC address to
> >> > > match net_device internally.
> >>
> >> Yes. The MAC address is assigned by the hypervisor and it needs to manage the movement
> >> of the MAC between the PF and VF.  Allowing the guest to change the MAC will require
> >> synchronization between the hypervisor and the PF/VF drivers. Most of the VF drivers
> >> don't allow changing guest MAC unless it is a trusted VF.
> >
> > OK but it's a policy thing. Maybe it's a trusted VF. Who knows?
> > For example I can see host just
> > failing VIRTIO_NET_CTRL_MAC_ADDR_SET if it wants to block it.
> > I'm not sure why VIRTIO_NET_F_STANDBY has to block it in the guest.
> 
> That's why I think pairing using MAC is fragile IMHO. When VF's MAC
> got changed before virtio attempts to match and pair the device, it
> ends up with no pairing found out at all.

Guest seems to match on the hardware mac and ignore whatever
is set by user. Makes sense to me and should not be fragile.


> UUID is better.
> 
> -Siwei
> 
> >
> > --
> > MST

^ permalink raw reply

* Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.
From: Cong Wang @ 2018-06-27  0:29 UTC (permalink / raw)
  To: Flavio Leitner
  Cc: Eric Dumazet, Linux Kernel Network Developers, Paolo Abeni,
	David Miller, Florian Westphal, NetFilter
In-Reply-To: <20180626233302.GU19565@plex.lan>

On Tue, Jun 26, 2018 at 4:33 PM Flavio Leitner <fbl@redhat.com> wrote:
>
> It is still isolated, the sk carries the netns info and it is
> orphaned when it re-enters the stack.

Then what difference does your patch make?

Before your patch:
veth orphans skb in its xmit

After your patch:
RX orphans it when re-entering stack (as you claimed, I don't know)

And for veth pair:
xmit from one side is RX for the other side

So, where is the queueing? Where is the buffer bloat? GRO list??

^ permalink raw reply

* Re: [net-next PATCH v4 5/7] net: Enable Tx queue selection based on Rx queues
From: Nambiar, Amritha @ 2018-06-27  0:36 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Network Development, David Miller, Alexander Duyck,
	Samudrala, Sridhar, Alexander Duyck, Eric Dumazet,
	Hannes Frederic Sowa, Tom Herbert
In-Reply-To: <CAF=yD-KQ60Ob2NekMvUhCQxCtLXcy-c5J1WR6stROQRmtANpOw@mail.gmail.com>

On 6/26/2018 4:04 AM, Willem de Bruijn wrote:
> On Mon, Jun 25, 2018 at 7:06 PM Amritha Nambiar
> <amritha.nambiar@intel.com> wrote:
>>
>> This patch adds support to pick Tx queue based on the Rx queue(s) map
>> configuration set by the admin through the sysfs attribute
>> for each Tx queue. If the user configuration for receive queue(s) map
>> does not apply, then the Tx queue selection falls back to CPU(s) map
>> based selection and finally to hashing.
>>
>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>> ---
> 
>> +static int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
>>  {
>>  #ifdef CONFIG_XPS
>>         struct xps_dev_maps *dev_maps;
>> -       struct xps_map *map;
>> +       struct sock *sk = skb->sk;
>>         int queue_index = -1;
>>
>>         if (!static_key_false(&xps_needed))
>>                 return -1;
>>
>>         rcu_read_lock();
>> -       dev_maps = rcu_dereference(dev->xps_cpus_map);
>> +       if (!static_key_false(&xps_rxqs_needed))
>> +               goto get_cpus_map;
>> +
>> +       dev_maps = rcu_dereference(dev->xps_rxqs_map);
>>         if (dev_maps) {
>> -               unsigned int tci = skb->sender_cpu - 1;
>> +               int tci = sk_rx_queue_get(sk);
> 
> What if the rx device differs from the tx device?
> 
I think I have 3 options here:
1. Cache the ifindex in sock_common which will introduce a new
additional field in sock_common.
2. Use dev_get_by_napi_id to get the device id. This could be expensive,
if the rxqs_map is set, this will be done on every packet and involves
walking through the hashlist for napi_id lookup.
3. Remove validating device id, similar to how it is in skb_tx_hash
where rx_queue recorded is used and if not, fall through to flow hash
calculation.
What do you think is suitable here?

^ permalink raw reply

* Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.
From: Flavio Leitner @ 2018-06-27  0:39 UTC (permalink / raw)
  To: Cong Wang
  Cc: Eric Dumazet, Linux Kernel Network Developers, Paolo Abeni,
	David Miller, Florian Westphal, NetFilter
In-Reply-To: <CAM_iQpVmbuVK+pfhad7HrQtaSxFbe7K=59_34f0BwxoP_=W0VQ@mail.gmail.com>

On Tue, Jun 26, 2018 at 05:29:51PM -0700, Cong Wang wrote:
> On Tue, Jun 26, 2018 at 4:33 PM Flavio Leitner <fbl@redhat.com> wrote:
> >
> > It is still isolated, the sk carries the netns info and it is
> > orphaned when it re-enters the stack.
> 
> Then what difference does your patch make?

Don't forget it is fixing two issues.

> Before your patch:
> veth orphans skb in its xmit
> 
> After your patch:
> RX orphans it when re-entering stack (as you claimed, I don't know)

ip_rcv, and equivalents.

> And for veth pair:
> xmit from one side is RX for the other side
> So, where is the queueing? Where is the buffer bloat? GRO list??

CPU backlog.

-- 
Flavio

^ permalink raw reply

* Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.
From: Cong Wang @ 2018-06-27  0:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Flavio Leitner, Linux Kernel Network Developers, Paolo Abeni,
	David Miller, Florian Westphal, NetFilter
In-Reply-To: <aae3fa15-598b-342d-20c7-23ed4b9faf84@gmail.com>

On Tue, Jun 26, 2018 at 4:53 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 06/26/2018 03:47 PM, Cong Wang wrote:
> >
> > You need to justify why you want to break the TSQ's scope here,
> > which is obviously not compatible with netns design.
>
> You have to explain why you do not want us to fix this buggy behavior.
>
> Right now TSQ (and more generally back pressure) is broken by this skb_orphan()
>
> So we want to restore TSQ (and back pressure)
>
> TSQ scope never mentioned netns.

Conceptually, it is limiting the pipeline from L4 to L2 within one stack, now
you are extending it to NS1 (L4...L2)...NS2 (L2...L4) which could cross
multiple stacks and _in theory_ could be infinitely long.

And TSQ setting is per-netns:

2180 static bool tcp_small_queue_check(struct sock *sk, const struct
sk_buff *skb,
2181                                   unsigned int factor)
2182 {
2183         unsigned int limit;
2184
2185         limit = max(2 * skb->truesize, sk->sk_pacing_rate >>
sk->sk_pacing_shift);
2186         limit = min_t(u32, limit,
2187                       sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
2188         limit <<= factor;

Should I expect to increase sysctl_tcp_limit_output_bytes based on the
number of stacks it crosses?

> We (TCP stack TSQ handler) want to be notified when this packet leaves the host,
> even if it had to traverse multiple netns (for whatever reasons).
>
> _If_ a packet is locally 'consumed' (like on loopback device, or veth pair),
> then the skb_orphan() will automatically be done.
>
> If you have a case where this skb_orphan() is needed, please add it at the needed place.


With this, a netns could totally throttle a TCP socket in a different
netns by holding the packets infinitely (e.g. putting them in a loop).
This is where the isolation breaks.

^ permalink raw reply

* Re: [rds-devel] [PATCH net-next] rds: clean up loopback rds_connections on netns deletion
From: David Miller @ 2018-06-27  1:12 UTC (permalink / raw)
  To: sowmini.varadhan; +Cc: netdev, rds-devel
In-Reply-To: <20180626162822.GA6394@oracle.com>

From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Tue, 26 Jun 2018 12:28:22 -0400

> On (06/26/18 10:53), Sowmini Varadhan wrote:
>> Date: Tue, 26 Jun 2018 10:53:23 -0400
>> From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
>> To: David Miller <davem@davemloft.net>
>> Cc: netdev@vger.kernel.org, rds-devel@oss.oracle.com
>> Subject: Re: [rds-devel] [PATCH net-next] rds: clean up loopback
>> 
>> and just to add, the fix itself is logically correct, so belongs in
>> net-next. What I don't have (and therefore did not target net) is
>> official confirmation that the syzbot failures are root-caused to the
>> absence of this patch (since there is no reproducer for many of these,
>> and no crash dumps available from syzbot).  
>> 
> 
> With help from Dmitry, I just got the confirmation from syzbot that
> "syzbot has tested the proposed patch and the reproducer did not trigger 
> crash:"
> 
> thus, we can mark this
> 
> Reported-and-tested-by: syzbot+4c20b3866171ce8441d2@syzkaller.appspotmail.com
> 
> and yes, it can target net.

Great, applied, thanks!

^ permalink raw reply

* Re: [PATCH net-next] net: preserve sock reference when scrubbing the skb.
From: Cong Wang @ 2018-06-27  1:28 UTC (permalink / raw)
  To: Flavio Leitner
  Cc: Eric Dumazet, Linux Kernel Network Developers, Paolo Abeni,
	David Miller, Florian Westphal, NetFilter
In-Reply-To: <20180627003925.GV19565@plex.lan>

On Tue, Jun 26, 2018 at 5:39 PM Flavio Leitner <fbl@redhat.com> wrote:
>
> On Tue, Jun 26, 2018 at 05:29:51PM -0700, Cong Wang wrote:
> > On Tue, Jun 26, 2018 at 4:33 PM Flavio Leitner <fbl@redhat.com> wrote:
> > >
> > > It is still isolated, the sk carries the netns info and it is
> > > orphaned when it re-enters the stack.
> >
> > Then what difference does your patch make?
>
> Don't forget it is fixing two issues.

Sure. I am only talking about TSQ from the very beginning.
Let me rephrase my above question:
What difference does your patch make to TSQ?


>
> > Before your patch:
> > veth orphans skb in its xmit
> >
> > After your patch:
> > RX orphans it when re-entering stack (as you claimed, I don't know)
>
> ip_rcv, and equivalents.


ip_rcv() is L3, we enter a stack from L1. So your above claim is incorrect. :)

>
> > And for veth pair:
> > xmit from one side is RX for the other side
> > So, where is the queueing? Where is the buffer bloat? GRO list??
>
> CPU backlog.

Yeah, but this is never targeted by TSQ:

        tcp_limit_output_bytes limits the number of bytes on qdisc
        or device to reduce artificial RTT/cwnd and reduce bufferbloat.

which means you have to update Documentation/networking/ip-sysctl.txt
too.

^ permalink raw reply

* [BUG] net: e100: possible data races in e100_watchdog()
From: Jia-Ju Bai @ 2018-06-27  1:30 UTC (permalink / raw)
  To: jeffrey.t.kirsher, jchapman
  Cc: intel-wired-lan, netdev, Linux Kernel Mailing List

The call paths in Linux 4.16.7 that may raise data races are:

CPU0:
e100_set_multicast_list
     e100_exec_cb
         line 879: spin_lock_irqsave()
         e100_configure
             line 1139: nic->flags [READ]
             line 1148: nic->flags [READ]

CPU1:
e100_watchdog:
     line 1758, 1756: nic->flags [WRITE]

The READ operations in CPU0 are performed while holding a spinlock (line 
879), but the WRITE operation in CPU1 is performed without holding this 
spinlock, so it may cause data races here.

A possible fix is to add spin_lock_irqsave() in e100_watchdog().
I am not sure whether this possible fix is correct, so I only 
report the data races.

Best wishes,
Jia-Ju Bai

^ permalink raw reply

* [PATCH net] strparser: Remove early eaten to fix full tcp receive buffer stall
From: Doron Roberts-Kedes @ 2018-06-27  1:33 UTC (permalink / raw)
  To: David S. Miller; +Cc: Dave Watson, Tom Herbert, netdev, Doron Roberts-Kedes

On receiving an incomplete message, the existing code stores the
remaining length of the cloned skb in the early_eaten field instead of
incrementing the value returned by __strp_recv. This defers invocation
of sock_rfree for the current skb until the next invocation of
__strp_recv, which returns early_eaten if early_eaten is non-zero.

This behavior causes a stall when the current message occupies the very
tail end of a massive skb, and strp_peek/need_bytes indicates that the
remainder of the current message has yet to arrive on the socket. The
TCP receive buffer is totally full, causing the TCP window to go to
zero, so the remainder of the message will never arrive.

Incrementing the value returned by __strp_recv by the amount otherwise
stored in early_eaten prevents stalls of this nature.

Signed-off-by: Doron Roberts-Kedes <doronrk@fb.com>
---
 net/strparser/strparser.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 373836615c57..625acb27efcc 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -35,7 +35,6 @@ struct _strp_msg {
 	 */
 	struct strp_msg strp;
 	int accum_len;
-	int early_eaten;
 };

 static inline struct _strp_msg *_strp_msg(struct sk_buff *skb)
@@ -115,20 +114,6 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 	head = strp->skb_head;
 	if (head) {
 		/* Message already in progress */
-
-		stm = _strp_msg(head);
-		if (unlikely(stm->early_eaten)) {
-			/* Already some number of bytes on the receive sock
-			 * data saved in skb_head, just indicate they
-			 * are consumed.
-			 */
-			eaten = orig_len <= stm->early_eaten ?
-				orig_len : stm->early_eaten;
-			stm->early_eaten -= eaten;
-
-			return eaten;
-		}
-
 		if (unlikely(orig_offset)) {
 			/* Getting data with a non-zero offset when a message is
 			 * in progress is not expected. If it does happen, we
@@ -297,9 +282,9 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				}

 				stm->accum_len += cand_len;
+				eaten += cand_len;
 				strp->need_bytes = stm->strp.full_len -
 						       stm->accum_len;
-				stm->early_eaten = cand_len;
 				STRP_STATS_ADD(strp->stats.bytes, cand_len);
 				desc->count = 0; /* Stop reading socket */
 				break;
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH v2] fib_rules: match rules based on suppress_* properties too
From: David Miller @ 2018-06-27  1:34 UTC (permalink / raw)
  To: Jason; +Cc: roopa, netdev
In-Reply-To: <20180625233932.11531-1-Jason@zx2c4.com>

From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 26 Jun 2018 01:39:32 +0200

> Two rules with different values of suppress_prefix or suppress_ifgroup
> are not the same. This fixes an -EEXIST when running:
> 
>    $ ip -4 rule add table main suppress_prefixlength 0
> 
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule")

Roopa, thanks for doing all of that analysis.

I think applying this patch makes the most sense at this point,
so that is what I have done.

^ permalink raw reply

* Re: [PATCH net-next] net/tls: Remove VLA usage on nonce
From: David Miller @ 2018-06-27  1:40 UTC (permalink / raw)
  To: keescook; +Cc: davejwatson, borisp, aviadye, netdev, linux-kernel
In-Reply-To: <20180625235505.GA22661@beast>

From: Kees Cook <keescook@chromium.org>
Date: Mon, 25 Jun 2018 16:55:05 -0700

> It looks like the prior VLA removal, commit b16520f7493d ("net/tls: Remove
> VLA usage"), and a new VLA addition, commit c46234ebb4d1e ("tls: RX path
> for ktls"), passed in the night. This removes the newly added VLA, which
> happens to have its bounds based on the same max value.
> 
> Signed-off-by: Kees Cook <keescook@chromium.org>

Applied, thanks Kees.

^ permalink raw reply

* Re: [PATCH net-next 0/6] Multipath tests for tunnel devices
From: David Miller @ 2018-06-27  1:42 UTC (permalink / raw)
  To: petrm; +Cc: netdev, linux-kselftest, shuah
In-Reply-To: <cover.1529971148.git.petrm@mellanox.com>

From: Petr Machata <petrm@mellanox.com>
Date: Tue, 26 Jun 2018 02:05:51 +0200

> This patchset adds a test for ECMP and weighted ECMP between two GRE
> tunnels.
> 
> In patches #1 and #2, the function multipath_eval() is first moved from
> router_multipath.sh to lib.sh for ease of reuse, and then fixed up.
> 
> In patch #3, the function tc_rule_stats_get() is parameterized to be
> useful for egress rules as well.
> 
> In patch #4, a new function __simple_if_init() is extracted from
> simple_if_init(). This covers the logic that needs to be done for the
> usual interface: VRF migration, upping and installation of IP addresses.
> 
> Patch #5 then adds the test itself.
> 
> Additionally in patch #6, a requirement to add diagrams to selftests is
> documented.

Looks good, series applied.

^ permalink raw reply

* Re: [PATCH net v2 0/2] nfp: MPLS and shared blocks TC offload fixes
From: David Miller @ 2018-06-27  1:47 UTC (permalink / raw)
  To: jakub.kicinski; +Cc: jiri, oss-drivers, netdev
In-Reply-To: <20180626033628.17660-1-jakub.kicinski@netronome.com>

From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 25 Jun 2018 20:36:26 -0700

> This series brings two fixes to TC filter/action offload code.
> Pieter fixes matching MPLS packets when the match is purely on
> the MPLS ethertype and none of the MPLS fields are used.
> John provides a fix for offload of shared blocks.  Unfortunately,
> with shared blocks there is currently no guarantee that filters
> which were added by the core will be removed before block unbind.
> Our simple fix is to not support offload of rules on shared blocks
> at all, a revert of this fix will be sent for -next once the
> reoffload infrastructure lands.  The shared blocks became important
> as we are trying to use them for bonding offload (managed from user
> space) and lack of remove calls leads to resource leaks.
> 
> v2:
>  - fix build error reported by kbuild bot due to missing
>    tcf_block_shared() helper.

Series applied and queued up for -stable, thanks.

^ permalink raw reply

* [BUG] net: tg3: two possible data races
From: Jia-Ju Bai @ 2018-06-27  1:47 UTC (permalink / raw)
  To: siva.kallam, prashant, mchan; +Cc: netdev, Linux Kernel Mailing List

The call paths in Linux 4.16.7 that may raise the first data race:

CPU0:
tg3_open
     tg3_start
         line 11611: spin_lock_bh()
         tg3_enable_ints
             line 1023: tp->tnapi->last_tag [READ]

CPU1:
tg3_poll
     line 7341: tnapi->last_tag [WRITE]

The READ operation in CPU0 is performed while holding a spinlock (line 
11611), but the WRITE operation in CPU1 is performed without holding 
this spinlock, so it may cause a data race here.
A possible fix may be to add spin_lock_bh() in tg3_poll().

-----------------------------------------------------------------------

The call paths in Linux 4.16.7 that may raise the second data race:

CPU0:
tg3_open
     tg3_start
         line 11611: spin_lock_bh()
         tg3_enable_ints
             line 1023: tp->irq_sync [WRITE]

CPU1:
tg3_interrupt_tagged
     tg3_irq_sync
         line 7341: tp->irq_sync [READ]

The WRITE operation in CPU0 is performed while holding a spinlock (line 
11611), but the READ operation in CPU1 is performed without holding this 
spinlock, so it may cause a data race here.
A possible fix may be to add spin_lock_bh() in tg3_irq_sync().

I am not sure whether the possible fixes are correct, so I only 
report the data races.

Best wishes,
Jia-Ju Bai

^ permalink raw reply

* [PATCH net-next] tcp: force cwnd at least 2 in tcp_cwnd_reduction
From: Lawrence Brakmo @ 2018-06-27  1:52 UTC (permalink / raw)
  To: netdev; +Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Eric Dumazet

When using dctcp and doing RPCs, if the last packet of a request is
ECN marked as having seen congestion (CE), the sender can decrease its
cwnd to 1. As a result, it will only send one packet when a new request
is sent. In some instances this results in high tail latencies.

For example, in one setup there are 3 hosts sending to a 4th one, with
each sender having 3 flows (1 stream, 1 1MB back-to-back RPCs and 1 10KB
back-to-back RPCs). The following table shows the 99% and 99.9%
latencies for both Cubic and dctcp

           Cubic 99%  Cubic 99.9%   dctcp 99%    dctcp 99.9%
 1MB RPCs    3.5ms       6.0ms         43ms          208ms
10KB RPCs    1.0ms       2.5ms         53ms          212ms

On 4.11, pcap traces indicate that in some instances the 1st packet of
the RPC is received but no ACK is sent before the packet is
retransmitted. On 4.11 netstat shows TCP timeouts, with some of them
spurious.

On 4.16, we don't see retransmits in netstat but the high tail latencies
are still there. Forcing cwnd to be at least 2 in tcp_cwnd_reduction
fixes the problem with the high tail latencies. The latencies now look
like this:

             dctcp 99%       dctcp 99.9%
 1MB RPCs      3.8ms             4.4ms
10KB RPCs      168us             211us

Another group working with dctcp saw the same issue with production
traffic and it was solved with this patch.

The only issue is if it is safe to always use 2 or if it is better to
use min(2, snd_ssthresh) (which could still trigger the problem).

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 76ca88f63b70..a9255c424761 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2477,7 +2477,7 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
 	}
 	/* Force a fast retransmit upon entering fast recovery */
 	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
-	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+	tp->snd_cwnd = max(tcp_packets_in_flight(tp) + sndcnt, 2);
 }

 static inline void tcp_end_cwnd_reduction(struct sock *sk)
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH net-next] tcp: force cwnd at least 2 in tcp_cwnd_reduction
From: kbuild test robot @ 2018-06-27  2:18 UTC (permalink / raw)
  To: Lawrence Brakmo
  Cc: kbuild-all, netdev, Kernel Team, Blake Matheny,
	Alexei Starovoitov, Eric Dumazet
In-Reply-To: <20180627015222.3269067-1-brakmo@fb.com>

[-- Attachment #1: Type: text/plain, Size: 2947 bytes --]

Hi Lawrence,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Lawrence-Brakmo/tcp-force-cwnd-at-least-2-in-tcp_cwnd_reduction/20180627-095533
config: x86_64-randconfig-x003-201825 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All warnings (new ones prefixed by >>):

   In file included from include/asm-generic/bug.h:18:0,
                    from arch/x86/include/asm/bug.h:83,
                    from include/linux/bug.h:5,
                    from include/linux/mmdebug.h:5,
                    from include/linux/mm.h:9,
                    from net/ipv4/tcp_input.c:67:
   net/ipv4/tcp_input.c: In function 'tcp_cwnd_reduction':
   include/linux/kernel.h:812:29: warning: comparison of distinct pointer types lacks a cast
      (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
                                ^
   include/linux/kernel.h:826:4: note: in expansion of macro '__typecheck'
      (__typecheck(x, y) && __no_side_effects(x, y))
       ^~~~~~~~~~~
   include/linux/kernel.h:836:24: note: in expansion of macro '__safe_cmp'
     __builtin_choose_expr(__safe_cmp(x, y), \
                           ^~~~~~~~~~
   include/linux/kernel.h:852:19: note: in expansion of macro '__careful_cmp'
    #define max(x, y) __careful_cmp(x, y, >)
                      ^~~~~~~~~~~~~
>> net/ipv4/tcp_input.c:2480:17: note: in expansion of macro 'max'
     tp->snd_cwnd = max(tcp_packets_in_flight(tp) + sndcnt, 2);
                    ^~~

vim +/max +2480 net/ipv4/tcp_input.c

  2455	
  2456	void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
  2457	{
  2458		struct tcp_sock *tp = tcp_sk(sk);
  2459		int sndcnt = 0;
  2460		int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
  2461	
  2462		if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
  2463			return;
  2464	
  2465		tp->prr_delivered += newly_acked_sacked;
  2466		if (delta < 0) {
  2467			u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
  2468				       tp->prior_cwnd - 1;
  2469			sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
  2470		} else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
  2471			   !(flag & FLAG_LOST_RETRANS)) {
  2472			sndcnt = min_t(int, delta,
  2473				       max_t(int, tp->prr_delivered - tp->prr_out,
  2474					     newly_acked_sacked) + 1);
  2475		} else {
  2476			sndcnt = min(delta, newly_acked_sacked);
  2477		}
  2478		/* Force a fast retransmit upon entering fast recovery */
  2479		sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
> 2480		tp->snd_cwnd = max(tcp_packets_in_flight(tp) + sndcnt, 2);
  2481	}
  2482	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 32495 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox